<h1>Data Pre-Processing</h1>

In [1]:
import pandas as pd

data = pd.read_csv('feedback_customer.csv')

print(data.head(10))

print(data.isnull().sum())  #print(pd.isna(data).sum())

data.dropna(inplace=True)

  Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score
0  "I love this product!", Positive, Twitter, 202...                     
1  "The service was terrible.", Negative, Yelp Re...                     
2  "This movie is amazing!", Positive, IMDb, 2023...                     
3  "I'm so disappointed with their customer suppo...                     
4  "Just had the best meal of my life!", Positive...                     
5  "The quality of this product is subpar.", Nega...                     
6  "I can't stop listening to this song. It's inc...                     
7  "Their website is so user-friendly. Love it!",...                     
8  "I loved the movie! It was fantastic!", Positi...                     
9  "The customer service was terrible.", Negative...                     
Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score    0
dtype: int64


In [2]:
updated_data = data['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'].str.split(',', expand=True)

data['Text'] = updated_data[0]
data['Sentiment'] = updated_data[1]
data['Source'] = updated_data[2]
data['Date/Time'] = updated_data[3]
data['User ID'] = updated_data[4]
data['Location'] = updated_data[5]
data['Confidence Score'] = updated_data[6]

new_data = data.iloc[:,1:]  #remove the first column  #data.drop(data.columns[0], axis=1, inplace=True) #data[data.columns[1:]]
new_data.to_csv('feedback_customer_cleaned.csv', index=False)
print(new_data)


                                                 Text  Sentiment  \
0                              "I love this product!"   Positive   
1                         "The service was terrible."   Negative   
2                            "This movie is amazing!"   Positive   
3   "I'm so disappointed with their customer suppo...   Negative   
4                "Just had the best meal of my life!"   Positive   
..                                                ...        ...   
91  "Just had the most amazing vacation! I can't w...   Positive   
92  "The food at this restaurant was awful. Never ...   Negative   
93  "I can't stop listening to this song. It's my ...   Positive   
94  "Their website is so confusing and poorly desi...   Negative   
95  "I had an incredible experience at the theme p...   Positive   

             Source             Date/Time             User ID      Location  \
0           Twitter   2023-06-15 09:23:14            @user123      New York   
1      Yelp Reviews   202

<p>Removing HTML Tags and Special Characters</p>

In [3]:
import re

def remove_html_tags(text):
    clean_text = re.sub('<.*?>','',text)
    return clean_text

def remove_special_characters(text):
    clean_text = re.sub('[^a-zA-Z0-9\\s]+','',text)
    return clean_text

<p>Lowercasing</p>

In [4]:
def convert_to_lowercase(text):
    return text.lower()

<p>Stopword Removal</p>

In [5]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    clean_text = [word for word in words if word not in stop_words]
    return ' '.join(clean_text)

"""
test_text = "This is a simple test sentence with some stop words."
cleaned_text = remove_stopwords(test_text)
print(cleaned_text)"
"""

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'\ntest_text = "This is a simple test sentence with some stop words."\ncleaned_text = remove_stopwords(test_text)\nprint(cleaned_text)"\n'

<p>Tokenization</p>

In [6]:
import nltk
#import pandas as pd
from nltk.tokenize import word_tokenize
nltk.download('punkt')

"""
try:
    nltk.data.find('tokenizers/punkt') #Check to see if punkt has been downloaded.
except LookupError:
    nltk.download('punkt') #if not, download it.
"""
def tokenize_text(text):
    return word_tokenize(text)

"""
# Example DataFrame (replace with your actual DataFrame)
data = {'Text': ["This is a test sentence.", "Another sentence here!"]}
new_data = pd.DataFrame(data)

# Apply the tokenize_text function
new_data['Text'] = new_data['Text'].apply(tokenize_text)

print(new_data['Text'])
"""

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'\n# Example DataFrame (replace with your actual DataFrame)\ndata = {\'Text\': ["This is a test sentence.", "Another sentence here!"]}\nnew_data = pd.DataFrame(data)\n\n# Apply the tokenize_text function\nnew_data[\'Text\'] = new_data[\'Text\'].apply(tokenize_text)\n\nprint(new_data[\'Text\'])\n'

<p>Dealing with Noisy Text</p>

In [7]:
from spellchecker import SpellChecker

def correct_spelling(text):
    spell = SpellChecker()
    words = text.split()
    corrected_words = [spell.correction(word) or word for word in words]
    return " ".join(corrected_words)

"""
text = "This sentense has some mispelled words."
corrected_text = correct_spelling(text)
print(corrected_text)"
"""

'\ntext = "This sentense has some mispelled words."\ncorrected_text = correct_spelling(text)\nprint(corrected_text)"\n'

<p>Lemmatization</p>

In [8]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<p1>Extra Whitespace Removal</p1>

In [9]:
def remove_whitespace(text):
    cleaned_text = ' '.join(text.split())
    return cleaned_text

In [10]:
new_data['Text'] = new_data['Text'].apply(remove_html_tags)
new_data['Text'] = new_data['Text'].apply(remove_special_characters)    
new_data['Text'] = new_data['Text'].apply(convert_to_lowercase)
new_data['Text'] = new_data['Text'].apply(remove_stopwords)
#new_data['Text'] = new_data['Text'].apply(tokenize_text)
new_data['Text'] = new_data['Text'].apply(correct_spelling)
new_data['Text'] = new_data['Text'].apply(lemmatize_text)
new_data['Text'] = new_data['Text'].apply(remove_whitespace)

new_data = new_data.drop(columns=['Source', 'Date/Time', 'User ID', 'Location'])
print(new_data)

                                         Text  Sentiment Confidence Score
0                                love product   Positive             0.85
1                            service terrible   Negative             0.65
2                               movie amazing   Positive             0.92
3             i disappointed customer support   Negative             0.78
4                              best meal life   Positive             0.88
..                                        ...        ...              ...
91         amazing vacation cant wait go back   Positive             0.93
92     food restaurant awful never going back   Negative             0.55
93      cant stop listening song new favorite   Positive             0.91
94          website confusing poorly designed   Negative             0.68
95  incredible experience theme park much fun   Positive             0.89

[96 rows x 3 columns]


<p>Labeling Sentiment</p>

In [11]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

array_text = np.array(new_data['Text'])
array_sentiment = np.array(new_data['Sentiment'])

le = LabelEncoder()
le.fit(array_sentiment)
numerical_sentiment = le.transform(array_sentiment)

print("Original sentiments:", array_sentiment)
print("Numerical sentiments:", numerical_sentiment)

Original sentiments: [' Positive' ' Negative' ' Positive' ' Negative' ' Positive' ' Negative'
 ' Positive' ' Positive' ' Positive' ' Negative' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Positive' ' Negative' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Positive' ' Positive' ' Negative' ' Negative'
 ' Negative' ' Positive' ' Positive' ' Positive' ' Negative' ' Positive'
 ' Negative' ' Positive' ' Positive' ' Positive' ' Negative' ' Positive'
 ' Positive' ' Negative' ' Positive' ' Negative' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Positive' ' Positive' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Negative' ' Positive' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Negative' ' Negative' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Positive' ' Negative' ' Positive' ' Negative'
 ' Positive' ' Negative' ' Positive' ' Positive' ' Positive' ' Negative'
 ' Positive' ' Positive' ' Negative' ' Negative' ' Negative' ' Positive'
 ' Negative' ' Positive' ' Neg

<h1>Data Training with LSTM</h1>

In [12]:
import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Hyperparameters 
max_words = 10000 # max number of words to use in the vocabulary
max_len = 100 # max length of each text (in terms of number of words)
embedding_dim = 500 # dimension of word embeddings
lstm_units = 176 # number of units in the LSTM layer
num_classes = len(set(numerical_sentiment)) # number of classes

# Tokenize the texts and create a vocabulary
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(array_text)
sequences = tokenizer.texts_to_sequences(array_text)

# Pad the sequences so they all have the same length
x = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
#x = keras.preprocessing.sequence.pad_sequences(sequences)

# Create one-hot encoded labels
y = keras.utils.to_categorical(numerical_sentiment, num_classes)

# Build the model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, embedding_dim))
model.add(tf.keras.layers.SpatialDropout1D(0.4))
#model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units,dropout=0.2, recurrent_dropout=0.2)))
model.add(tf.keras.layers.LSTM(lstm_units,dropout=0.2, recurrent_dropout=0.2))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=20)
#model.fit(x, y, epochs=20, batch_size=32, validation_split=0.2)

print(model.summary())

Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 398ms/step - accuracy: 0.6400 - loss: 0.6875
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 396ms/step - accuracy: 0.8351 - loss: 0.6578
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 426ms/step - accuracy: 0.9332 - loss: 0.6206
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 399ms/step - accuracy: 0.9620 - loss: 0.5693
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 389ms/step - accuracy: 0.9451 - loss: 0.5010
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 385ms/step - accuracy: 0.9817 - loss: 0.4003
Epoch 7/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 394ms/step - accuracy: 1.0000 - loss: 0.2881
Epoch 8/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 382ms/step - accuracy: 0.9712 - loss: 0.1801
Epoch 9/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

None


In [13]:
# Evaluate the model
model.evaluate(X_test,y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 719ms/step - accuracy: 0.9000 - loss: 0.1473


[0.14727488160133362, 0.8999999761581421]

<h1>Training with simple dense Neural Network</h1>

In [14]:
"""
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Hyperparameters
max_words = 10000 # max number of words to use in the vocabulary
max_len = 100 # max length of each text (in terms of number of words)
num_classes = len(set(numerical_sentiment)) # number of classes

# Tokenize the texts and create a vocabulary
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(array_text)
sequences = tokenizer.texts_to_sequences(array_text)

# Pad the sequences so they all have the same length
x = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
#x = keras.preprocessing.sequence.pad_sequences(sequences)

# Create one-hot encoded labels
y = tf.keras.utils.to_categorical(numerical_sentiment, num_classes)

input_shape = x.shape[1:]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=input_shape))  # Flatten the input sequences
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.SpatialDropout1D(0.4))
#model.add(tf.keras.layers.Dropout(0.5)) # Add dropout for regularization
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))  # Output layer

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=20)
#model.fit(x, y, epochs=40, batch_size=32, validation_split=0.2)

print(model.summary())"
"""

'\nimport tensorflow as tf\nfrom sklearn.model_selection import train_test_split\n\n# Hyperparameters\nmax_words = 10000 # max number of words to use in the vocabulary\nmax_len = 100 # max length of each text (in terms of number of words)\nnum_classes = len(set(numerical_sentiment)) # number of classes\n\n# Tokenize the texts and create a vocabulary\ntokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)\ntokenizer.fit_on_texts(array_text)\nsequences = tokenizer.texts_to_sequences(array_text)\n\n# Pad the sequences so they all have the same length\nx = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)\n#x = keras.preprocessing.sequence.pad_sequences(sequences)\n\n# Create one-hot encoded labels\ny = tf.keras.utils.to_categorical(numerical_sentiment, num_classes)\n\ninput_shape = x.shape[1:]\n\nmodel = tf.keras.models.Sequential()\nmodel.add(tf.keras.layers.Flatten(input_shape=input_shape))  # Flatten the input sequences\nmodel.add(tf.keras.layers.

<p>To predict using the trained model</p>

In [20]:
"""
new_text = "This product is worst!"
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_len)
predictions = model.predict(padded_sequence)
print(predictions)
"""

def predict_sentiment(text, model, tokenizer, max_len):
    """Predicts the sentiment of a given text."""

    # 1. Preprocess the text
    sequence = tokenizer.texts_to_sequences([text])  # Tokenize
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        sequence, maxlen=max_len
    )  # Pad

    # 2. Make predictions
    predictions = model.predict(padded_sequence)

    # 3. Interpret the results (binary classification)
    positive_probability = predictions[0][1]  # Assuming one output neuron
    #negative_probability = predictions[0][0]  # Assuming one output neuron

    if positive_probability >= 0.5:
        return "Positive", positive_probability, predictions
    else:
        return "Negative", positive_probability, predictions

# Example usage
new_text = "The service was amazing!"
sentiment, probability, prediction = predict_sentiment(new_text, model, tokenizer, max_len)
print(f"Sentiment: {sentiment}, Probability: {probability}, Prediction: {prediction}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
Sentiment: Positive, Probability: 0.920915961265564, Prediction: [[0.079084   0.92091596]]


<h1>Saving the trained model</h1>

In [None]:
# Add .keras extension
model_path = r"D:\My Workspace\Discord-ML-Bot\lstm_model.keras" #using raw string
#or
#model_path = "D:/My Workspace/Discord-ML-Bot/lstm_model.keras" #using forward slashes

# Assuming 'model' is your trained Keras model
model.save(model_path)

"""
import shutil
import base64
from IPython.display import HTML
from IPython.display import display

# Add .keras extension
model_path = r"D:\My Workspace\Discord-ML-Bot\lstm_model.keras" #using raw string
#or
#model_path = "D:/My Workspace/Discord-ML-Bot/lstm_model.keras" #using forward slashes

# Assuming 'model' is your trained Keras model
model.save(model_path)

# Create a zip archive
shutil.make_archive(model_path.replace(".keras",""), 'zip', model_path.replace(".keras","")) #remove the .keras extension for the zip file.

# Path to the zip file
zip_file_path = model_path.replace(".keras", "") + '.zip' #remove .keras to create correct zip file path.

# Read the zip file's content
with open(zip_file_path, 'rb') as f:
    zip_data = f.read()

# Encode the zip data in base64
b64 = base64.b64encode(zip_data).decode()

# Create the download link
href = f'<a href="data:file/zip;base64,{b64}" download="saved_model.zip">Download saved_model.zip</a>'

# Display the download link
display(HTML(href))"
"""

  """


'\nimport shutil\nimport base64\nfrom IPython.display import HTML\nfrom IPython.display import display\n\n# Add .keras extension\nmodel_path = r"D:\\My Workspace\\Discord-ML-Bot\\lstm_model.keras" #using raw string\n#or\n#model_path = "D:/My Workspace/Discord-ML-Bot/lstm_model.keras" #using forward slashes\n\n# Assuming \'model\' is your trained Keras model\nmodel.save(model_path)\n\n# Create a zip archive\nshutil.make_archive(model_path.replace(".keras",""), \'zip\', model_path.replace(".keras","")) #remove the .keras extension for the zip file.\n\n# Path to the zip file\nzip_file_path = model_path.replace(".keras", "") + \'.zip\' #remove .keras to create correct zip file path.\n\n# Read the zip file\'s content\nwith open(zip_file_path, \'rb\') as f:\n    zip_data = f.read()\n\n# Encode the zip data in base64\nb64 = base64.b64encode(zip_data).decode()\n\n# Create the download link\nhref = f\'<a href="data:file/zip;base64,{b64}" download="saved_model.zip">Download saved_model.zip</a>\'

In [21]:
import tensorflow as tf
import pickle
import numpy as np

training_sentences = np.array(new_data['Text'])

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(training_sentences)

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Tokenizer saved.")

Tokenizer saved.
