In [3]:
import pandas as pd
import numpy as np


Loading the dataset

In [4]:
data=pd.read_csv("/content/tweet_emotions.csv")

Inspecting the dataset

In [5]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Getting the shape of dataset

In [6]:
data.shape

(40000, 3)

finding unique columns in target column

In [7]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

Data Processing

In [8]:
#removing punctuations
import string
def remove_punctuations(text):
  punc_free=''.join([i for i in text if i not in string.punctuation])
  return punc_free

In [9]:
#tokenzing
import nltk
nltk.download('punkt')
def tokenize(text):
  words=nltk.word_tokenize(text)
  return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
#removing stopwords
nltk.download('stopwords')
sw=nltk.corpus.stopwords.words('english')
def remove_sw(text):
  output=[i for i in text if i not in sw]
  return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
#Lemmatizing
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
word_lem=WordNetLemmatizer()
def lemm(text):
  lemm_text=[word_lem.lemmatize(word) for word in  text]
  return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# calling function for cleaning the data
def preprocess(df_col):
  corpus=[]
  for item in df_col:
    new_item=remove_punctuations(item)
    new_item=new_item.lower()
    new_item=tokenize(new_item)
    new_item=remove_sw(new_item)
    new_item=lemm(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [13]:
corpus=preprocess(data['content'])

Viewing the cleaned corpus

In [14]:
corpus

['tiffanylue know listenin bad habit earlier started freakin part',
 'layin n bed headache ughhhhwaitin call',
 'funeral ceremonygloomy friday',
 'want hang friend soon',
 'dannycastillo want trade someone houston ticket one',
 'repinging ghostridah14 didnt go prom bc bf didnt like friend',
 'sleep im thinking old friend want he married damn amp want 2 scandalous',
 'hmmm httpwwwdjherocom',
 'charviray charlene love miss',
 'kelcouch im sorry least friday',
 'cant fall asleep',
 'choked retainer',
 'ugh beat stupid song get next rude',
 'brodyjenner u watch hill london u realise tourture week week late watch itonlinelol',
 'got news',
 'storm electricity gone',
 'annarosekerr agreed',
 'sleepy even late fail',
 'perezhilton lady gaga tweeted impressed video leaking know',
 'convinced always wanted signal give offdamn think lost another friend',
 'raaaaaaek oh bad hope get better ive sleep issue lately',
 'wondering im awake 7amwriting new songplotting evil secret plot muahahahaoh damn 

 Checking for missing values

In [15]:
data.isna().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

As there is no missing value, we need not handle them

Feature Extraction: Word 2 Vec

In [16]:
from gensim.models import Word2Vec

In [23]:
word2vec_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

# Function to create document vectors by averaging word vectors
def document_vector(words, model):
    # Remove out-of-vocabulary words and handle empty texts
    words = [word for word in words if word in model.wv.index_to_key]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(model.wv[words], axis=0)

# Create document vectors
x= np.array([document_vector(text, word2vec_model) for text in corpus])




In [24]:

y=data['sentiment']

In [25]:
x

array([[-0.13177472, -0.01240805,  0.02279986, ...,  0.03032978,
        -0.01486854, -0.0601595 ],
       [-0.09605473, -0.02408435, -0.0004591 , ...,  0.06114985,
         0.00067852, -0.04123561],
       [-0.10158604, -0.02487048,  0.06113547, ...,  0.09996901,
         0.02248279, -0.08396704],
       ...,
       [-0.05714036, -0.09273102, -0.01999684, ...,  0.06991351,
         0.09144031, -0.08313614],
       [-0.15563956, -0.05063781,  0.01724872, ...,  0.0259412 ,
         0.05988463, -0.07056596],
       [-0.16698152, -0.00605428,  0.01944467, ...,  0.07563619,
         0.02431072,  0.0058636 ]])

In [26]:
y

0             empty
1           sadness
2           sadness
3        enthusiasm
4           neutral
            ...    
39995       neutral
39996          love
39997          love
39998     happiness
39999          love
Name: sentiment, Length: 40000, dtype: object

Label encoding the target column

In [27]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [28]:

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [29]:
y_encoded

array([ 2, 10, 10, ...,  7,  5,  7])

In [30]:
y_one_hot = to_categorical(y_encoded)

Split the dataset into training and testing sets

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)


Model Building

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [37]:
#Random Forest
rf_model = RandomForestClassifier(n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    random_state=42)
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy}")

Random Forest Accuracy: 0.2555


In [38]:

print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.00      0.00      0.00       338
   happiness       0.23      0.07      0.11      1028
        hate       0.00      0.00      0.00       268
        love       0.37      0.10      0.16       762
     neutral       0.28      0.49      0.36      1740
      relief       0.00      0.00      0.00       352
     sadness       0.16      0.03      0.04      1046
    surprise       0.00      0.00      0.00       425
       worry       0.24      0.61      0.34      1666

    accuracy                           0.26      8000
   macro avg       0.10      0.10      0.08      8000
weighted avg       0.20      0.26      0.18      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
y_pred_lr = lr_model.predict(x_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy}")
print("Logistic Regression Classification Report")
print(classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))

Logistic Regression Accuracy: 0.2555
Logistic Regression Classification Report
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.00      0.00      0.00       338
   happiness       0.10      0.01      0.02      1028
        hate       0.00      0.00      0.00       268
        love       0.29      0.11      0.16       762
     neutral       0.28      0.52      0.37      1740
      relief       0.00      0.00      0.00       352
     sadness       0.25      0.00      0.01      1046
    surprise       0.00      0.00      0.00       425
       worry       0.24      0.62      0.34      1666

    accuracy                           0.26      8000
   macro avg       0.09      0.10      0.07      8000
weighted avg       0.18      0.26      0.17      8000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
x_train_array.shape[1]

253419

In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM

In [41]:

# Artificial Neural Network
ann_model = Sequential()
ann_model.add(Dense(200, input_dim=x_train.shape[1], activation='relu'))
ann_model.add(Dropout(0.5))
ann_model.add(Dense(128, activation='relu'))
ann_model.add(Dropout(0.5))
ann_model.add(Dense(50, activation='relu'))
ann_model.add(Dropout(0.5))
ann_model.add(Dense(len(label_encoder.classes_), activation='softmax'))



In [42]:
ann_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
ann_model.fit(x_train, y_train, epochs=8, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.src.callbacks.History at 0x7c2e43fc5390>

In [45]:
#Evaluating the ANN model
ann_loss, ann_accuracy = ann_model.evaluate(x_test, y_test)
print(f"Artificial Neural Network Accuracy: {ann_accuracy}")

Artificial Neural Network Accuracy: 0.25200000405311584


In [46]:
# Predicting the ANN Model
y_pred_ann = ann_model.predict(x_test)
y_pred_ann_classes = y_pred_ann.argmax(axis=-1)



In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [48]:
# Recurrent Neural Network (LSTM)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus)
x_seq = tokenizer.texts_to_sequences(corpus)
x_padded = pad_sequences(x_seq, maxlen=100)

x_train_seq, x_test_seq, y_train_seq, y_test_seq = train_test_split(x_padded, y_encoded, test_size=0.2, random_state=42)

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=100, input_length=100))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))

lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm_model.fit(x_train_seq, y_train_seq, epochs=10, batch_size=32, validation_data=(x_test_seq, y_test_seq))

lstm_loss, lstm_accuracy = lstm_model.evaluate(x_test_seq, y_test_seq)
print(f"LSTM Model Accuracy: {lstm_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LSTM Model Accuracy: 0.2877500057220459


In [52]:


# Evaluating the Results
results = {
    "Model": [ "Logistic Regression","Random Forest" "Simple Neural Network", "LSTM"],
    "Accuracy": [ rf_accuracy, lr_accuracy, ann_accuracy, lstm_accuracy]
}

# results_df = pd.DataFrame(results)
print(results)

{'Model': ['Logistic Regression', 'Random ForestSimple Neural Network', 'LSTM'], 'Accuracy': [0.2555, 0.2555, 0.25200000405311584, 0.2877500057220459]}
