Import libraries


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import tensorflow


load the data

In [4]:
data = pd.read_csv('/content/tweet_emotions.csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Data preprocesasing

In [5]:
data.describe()

Unnamed: 0,tweet_id
count,10989.0
mean,1960200000.0
std,2146952.0
min,1956967000.0
25%,1957617000.0
50%,1960871000.0
75%,1962104000.0
max,1963082000.0


In [6]:
#check missing values
data.isnull().sum()

Unnamed: 0,0
tweet_id,0
sentiment,0
content,0


In [7]:
data.columns

Index(['tweet_id', 'sentiment', 'content'], dtype='object')

In [8]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [9]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
worry,3413
sadness,2458
neutral,2053
surprise,606
hate,581
happiness,507
love,410
relief,249
fun,229
empty,218


In [10]:
#rename column content as tweet
data.rename(columns={'content':'tweet'},inplace=True)

In [11]:
data.head()

Unnamed: 0,tweet_id,sentiment,tweet
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [12]:
# label encoding for sentiment column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['sentiment_label'] = le.fit_transform(data['sentiment'])

cleaning text data

In [13]:
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def clean_tweet(tweet):
    # Handle potential float values (e.g., NaN)
    if isinstance(tweet, float):
        return '' # Or any other appropriate handling for float values
    #convert to lowercase
    tweet = tweet.lower()
    #remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    #remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    #remove hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    #remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    #remove special characters
    tweet = re.sub(r'[^\w\s]', '', tweet)
    #remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    #remove stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    return tweet

data['cleaned_tweet'] = data['tweet'].apply(clean_tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
data.head()

Unnamed: 0,tweet_id,sentiment,tweet,sentiment_label,cleaned_tweet
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,2,know listenin bad habit earlier started freaki...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,10,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,10,funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,wants hang friends soon
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,8,want trade someone houston tickets one


test train split


In [15]:
from sklearn.model_selection import train_test_split
x = data['cleaned_tweet']
y = data['sentiment_label']
#split dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


text vectorization

In [16]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#using tf and idf
tfidf_vectorizer = TfidfVectorizer()
x_train = tfidf_vectorizer.fit_transform(x_train)
x_test = tfidf_vectorizer.transform(x_test)

In [17]:
#convert sparse matrices to array
x_train = x_train.toarray()
x_test = x_test.toarray()

In [18]:
# Get the minimum and the maximum length of reviews (number of features after vectorization)
print("Max length of a review (number of features):: ", max(len(x_train[0]), len(x_test[0])))
print("Min length of a review (number of features):: ", min(len(x_train[0]), len(x_test[0])))

Max length of a review (number of features)::  11614
Min length of a review (number of features)::  11614


simple RNN (deep learning model)

In [19]:
# creating a RNN model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout
#model initialisation
RNN_model = Sequential()
#first RNN layer
RNN_model.add(SimpleRNN(units=128, input_shape=(x_train.shape[1], 1), activation='relu', return_sequences=True))
#first dropout layer for reducing overfitting
RNN_model.add(Dropout(0.2))
#second RNN layer
RNN_model.add(SimpleRNN(units=64, activation='relu'))
#second dropout layer
RNN_model.add(Dropout(0.2))
#output layer classifies input into 13 sentiment categories
RNN_model.add(Dense(units=13, activation='softmax'))
#printing model summary
RNN_model.summary()


  super().__init__(**kwargs)


compile the model

In [20]:
RNN_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
y_train = tf.keras.utils.to_categorical(y_train, num_classes=13)
# Training the model
history = RNN_model.fit(x_train, y_train,batch_size=64, epochs=3,verbose=1)

Epoch 1/3
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 1s/step - accuracy: 0.2932 - loss: 2.2000
Epoch 2/3
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 1s/step - accuracy: 0.2943 - loss: 1.9844
Epoch 3/3
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 1s/step - accuracy: 0.3127 - loss: 1.9447


Evaluation

In [21]:
#test the model
y_test = tf.keras.utils.to_categorical(y_test, num_classes=13)
test_loss, test_accuracy = RNN_model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 422ms/step - accuracy: 0.3039 - loss: 1.9541
Test Accuracy: 0.30785563588142395


In [32]:
#sample prediction
sample_review1 = 'Funeral ceremony... gloomy friday...'

# Clean the sample review using the same function used for training data
sample_review1_cleaned = clean_tweet(sample_review1)

# Transform the cleaned review into a numerical representation using the same TfidfVectorizer
sample_review1_vec = tfidf_vectorizer.transform([sample_review1_cleaned])

# Convert to array (if necessary, based on your TfidfVectorizer output)
sample_review1_vec = sample_review1_vec.toarray()

# Make the prediction
prediction1 = RNN_model.predict(sample_review1_vec)
# Step 4: Interpret prediction
class_labels = ['anger','boredom','empty', 'enthusiasm','fun','happiness','hate','love','neutral','relief', 'sadness', 'surprise','worry', ]  # Same order as during training
# Fix: Use prediction1 instead of prediction
predicted_class_index = prediction1.argmax(axis=1)[0]  # Get the index of the highest probability
predicted_label = class_labels[predicted_class_index]

# Output
# Fix: Use prediction1 instead of prediction
print(f"Prediction probabilities: {prediction1}")
print(f"Predicted Sentiment: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 511ms/step
Prediction probabilities: [[0.00360481 0.00528722 0.01722695 0.01033528 0.01753866 0.04662166
  0.04932838 0.03144328 0.19443965 0.01761242 0.23814292 0.05105267
  0.31736606]]
Predicted Sentiment: worry


using RNN model predicts well
