<a href="https://colab.research.google.com/github/abakm/ADDM/blob/master/Certified_Specialist_in_Machine_Learning_and_Artificial_Intelligence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraires

In [29]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load dataset

In [30]:
df = pd.read_csv('./tweet_emotions.csv')
df.drop(['tweet_id'], axis=1, inplace=True) # Remove tge tweet_id
df.head() # printthe datasets

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


# Preprocessing

In [31]:
STOPWORDS = set(stopwords.words('english'))
def preprocess(text):
  text = str(text).lower().replace('\n',' ').replace(',','').replace('.','') # convert to lowercase, replace ',' and '.' with empty string,
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  text = re.sub(r'@\w+|#\w+', '', text)
  text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
  text = ' '.join([text for text in text.split() if text.isalpha() and text not in STOPWORDS]) # Remove stop words
  return text


df.dropna(inplace=True) # Remove Null values
df.drop_duplicates(inplace=True) # Remove duplicates value
df['formatted_content'] = df['content'].apply(lambda row: preprocess(row))
sentiment_mapping = {'empty':0, 'sadness':1, 'enthusiasm':2, 'neutral':3, 'worry':4, 'surprise':4, 'love':5, 'fun':6, 'hate':7,
                     'happiness':8, 'boredom':9, 'relief':10, 'anger':11}
df['sentiment_label']  =  df['sentiment'].map(sentiment_mapping)
df.head()

Unnamed: 0,sentiment,content,formatted_content,sentiment_label
0,empty,@tiffanylue i know i was listenin to bad habi...,know listenin bad habit earlier started freaki...,0
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhhwaitin call,1
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday,1
3,enthusiasm,wants to hang out with friends SOON!,wants hang friends soon,2
4,neutral,@dannycastillo We want to trade with someone w...,want trade someone houston tickets one,3


#Tokenization

In [32]:
contents = df["formatted_content"].values.tolist()
tokenizer = Tokenizer() # create the tokenizer object
tokenizer.fit_on_texts(contents) # fit the contents
total_words = len(tokenizer.word_index) +1
print(total_words)
print(tokenizer.word_index)

33512


# Padding

In [33]:
max_len = 5
sequences  =  tokenizer.texts_to_sequences(contents)
pad_seq = pad_sequences(sequences, maxlen=max_len, padding='pre')

# Create the model

In [34]:
model = Sequential()
model.add(Embedding(total_words, 10,input_shape=(max_len,)))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(60))
model.add(Dense(50,activation= 'relu'))
model.add(Dense(len(sentiment_mapping), activation = 'softmax'))


  super().__init__(**kwargs)


# Complie the model

In [35]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [36]:
print(model.summary())

None


# Fit the model

In [37]:
model.fit(pad_seq, df['sentiment_label'], epochs=10, verbose=1)

Epoch 1/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.2806 - loss: 2.0502
Epoch 2/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.3827 - loss: 1.7780
Epoch 3/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.4529 - loss: 1.5870
Epoch 4/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.5191 - loss: 1.4097
Epoch 5/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.5797 - loss: 1.2549
Epoch 6/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.6257 - loss: 1.1333
Epoch 7/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.6633 - loss: 1.0217
Epoch 8/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.6918 - loss: 0.9347
Epoch 9/10
[1m1248

<keras.src.callbacks.history.History at 0x79db8f9cd9d0>

# Evaluate the model

In [38]:
loss, accuracy  = model.evaluate(pad_seq, df['sentiment_label'])

print("Test accuracy: ", accuracy)

[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7432 - loss: 0.7567
Test accuracy:  0.7438923716545105


# Predict the model

In [47]:
tweet = "Choked on her retainers"
tweet = preprocess(text=tweet)
tweet_sequence = tokenizer.texts_to_sequences([tweet])
tweet_padded = pad_sequences(tweet_sequence, maxlen=max_len, padding='pre')
prediction = model.predict(tweet_padded)

print(prediction)
predicted_label = np.argmax(prediction)
print(predicted_label)

print("Predicted Sentiment: ", list(sentiment_mapping.keys())[list(sentiment_mapping.values()).index(predicted_label)])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[[3.8058092e-06 2.0475777e-04 9.9501831e-06 1.8569821e-04 9.9931550e-01
  3.9231876e-05 1.4420220e-05 3.4623794e-05 3.6296500e-05 2.9945539e-07
  1.5541093e-04 5.2113286e-08 5.8197099e-17]]
4
Predicted Sentiment:  worry
