In [2]:
import pandas as pd 
import nltk
import numpy as np 
from nltk.corpus import stopwords 
import re
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier 
import pickle
from nltk.stem import WordNetLemmatizer

In [3]:
#!pip install emoji
import emoji

In [4]:
data=pd.read_csv("tweet_emotions.csv")
data

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [5]:
data['sentiment'].nunique()

13

In [6]:
sentiment_list=data['sentiment'].unique()
sentiment_list

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [7]:
data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [8]:
nltk.download('stopwords')
STOPWORDS = stopwords.words("english")
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SiYu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SiYu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def clean(text):
    text = text.lower() 
    text=re.sub(r'@\w+',"", text)
    text = re.sub("[^\w\s]"," ",text) # Remove punctuations 
    text = " ".join(w for w in text.split() if w not in STOPWORDS)
    text = " ".join(lemmatizer.lemmatize(w) for w in text.split())
    return text

In [10]:
data['text'] = data['content'].apply(lambda x : clean(x)) 

In [11]:
data

Unnamed: 0,tweet_id,sentiment,content,text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,know listenin bad habit earlier started freaki...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,want trade someone houston ticket one
...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,
39996,1753919001,love,Happy Mothers Day All my love,happy mother day love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...,happy mother day mommy woman man long momma so...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,wassup beautiful follow peep new hit single ww...


In [12]:
#https://raw.githubusercontent.com/carpedm20/emoji/master/emoji/unicode_codes/data_dict.py

'''
array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)
'''

dic={'empty':'tired_face',
    'sadness':"crying_face",
    'worry':'worried_face',
    'hate':'nauseated_face',
    'boredom':'flushed_face',
    'anger':'angry_face_with_horns',
    'neutral':'neutral_face',
    'relief':"relieved_face",
    'enthusiasm':"partying_face",
    'surprise':'face_with_hand_over_mouth',
    'love':'smiling_face_with_hearts',
    'fun':'face_with_tears_of_joy',
    'happiness':'kissing_face_with_closed_eyes'
    }

In [13]:
for key,value in dic.items():
    print(key,":",emoji.emojize(f':{value}:'))

empty : 😫
sadness : 😢
worry : 😟
hate : 🤢
boredom : 😳
anger : 👿
neutral : 😐
relief : 😌
enthusiasm : 🥳
surprise : 🤭
love : 🥰
fun : 😂
happiness : 😚


In [14]:
def sentiment2emoij(sentiment):
    return(emoji.emojize(f':{dic[sentiment]}:'))

In [15]:
data['emoji']=data['sentiment'].apply(lambda x :sentiment2emoij(x)) 

In [16]:
data[['emoji','sentiment','text']]

Unnamed: 0,emoji,sentiment,text
0,😫,empty,know listenin bad habit earlier started freaki...
1,😢,sadness,layin n bed headache ughhhh waitin call
2,😢,sadness,funeral ceremony gloomy friday
3,🥳,enthusiasm,want hang friend soon
4,😐,neutral,want trade someone houston ticket one
...,...,...,...
39995,😐,neutral,
39996,🥰,love,happy mother day love
39997,🥰,love,happy mother day mommy woman man long momma so...
39998,😚,happiness,wassup beautiful follow peep new hit single ww...


In [17]:
temp_data=data[['emoji','sentiment','text']].copy()

In [18]:
temp_data

Unnamed: 0,emoji,sentiment,text
0,😫,empty,know listenin bad habit earlier started freaki...
1,😢,sadness,layin n bed headache ughhhh waitin call
2,😢,sadness,funeral ceremony gloomy friday
3,🥳,enthusiasm,want hang friend soon
4,😐,neutral,want trade someone houston ticket one
...,...,...,...
39995,😐,neutral,
39996,🥰,love,happy mother day love
39997,🥰,love,happy mother day mommy woman man long momma so...
39998,😚,happiness,wassup beautiful follow peep new hit single ww...


In [19]:
x=np.array(data['text'])


Le = LabelEncoder()
y =  Le.fit_transform(np.array(data['sentiment']))

In [20]:
temp_data['Y_Encoder']=y

In [21]:
temp_data

Unnamed: 0,emoji,sentiment,text,Y_Encoder
0,😫,empty,know listenin bad habit earlier started freaki...,2
1,😢,sadness,layin n bed headache ughhhh waitin call,10
2,😢,sadness,funeral ceremony gloomy friday,10
3,🥳,enthusiasm,want hang friend soon,3
4,😐,neutral,want trade someone houston ticket one,8
...,...,...,...,...
39995,😐,neutral,,8
39996,🥰,love,happy mother day love,7
39997,🥰,love,happy mother day mommy woman man long momma so...,7
39998,😚,happiness,wassup beautiful follow peep new hit single ww...,5


In [22]:
print(list(zip([i for i in range(0,13)],list(Le.inverse_transform([i for i in range(0,13)])))))

[(0, 'anger'), (1, 'boredom'), (2, 'empty'), (3, 'enthusiasm'), (4, 'fun'), (5, 'happiness'), (6, 'hate'), (7, 'love'), (8, 'neutral'), (9, 'relief'), (10, 'sadness'), (11, 'surprise'), (12, 'worry')]


In [23]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [24]:
x_train.shape

(32000,)

In [25]:
y_train.shape

(32000,)

In [26]:
tf = TfidfVectorizer(analyzer='word',max_features=1000,ngram_range=(1,3))
x_tf = tf.fit_transform(x_train)
x_val_tf = tf.transform(x_test)

In [27]:
x_tf   = x_tf.toarray()
x_val_tf =  x_val_tf.toarray()

In [28]:
model =RandomForestClassifier()
model.fit(x_tf,y_train)

In [29]:
y_pred = model.predict(x_val_tf)

In [30]:
model.score(x_val_tf,y_test)

0.3185

In [31]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [80,100],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200,500]
}

Random_search_model = RandomizedSearchCV(estimator = model, 
                   param_distributions = param_grid, 
                   cv = 5, n_iter = 50)

In [32]:
Random_search_model.fit(x_tf,y_train)

KeyboardInterrupt: 

In [None]:
Random_search_model.best_estimator_

In [None]:
Random_search_model.best_params_

{'n_estimators': 200,
 'min_samples_split': 8,
 'min_samples_leaf': 3,
 'max_features': 3,
 'max_depth': 100,
 'bootstrap': True}

In [None]:
Random_search_model.best_score_

0.31074999999999997

In [None]:
best_model =RandomForestClassifier(n_estimators=200,min_samples_split=8,min_samples_leaf=3,max_features=3,max_depth=100,bootstrap=True)
best_model.fit(x_tf,y_train)

In [None]:
y_pred = best_model.predict(x_val_tf)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.299375

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split


In [None]:
temp_data

Unnamed: 0,emoji,sentiment,text,Y_Encoder
0,😫,empty,know listenin bad habit earlier started freaki...,2
1,😢,sadness,layin n bed headache ughhhh waitin call,10
2,😢,sadness,funeral ceremony gloomy friday,10
3,🥳,enthusiasm,want hang friend soon,3
4,😐,neutral,want trade someone houston ticket one,8
...,...,...,...,...
39995,😐,neutral,,8
39996,🥰,love,happy mother day love,7
39997,🥰,love,happy mother day mommy woman man long momma so...,7
39998,😚,happiness,wassup beautiful follow peep new hit single ww...,5


In [None]:
temp_data['Y_Encoder'].value_counts(normalize=True)

8     0.215950
12    0.211475
5     0.130225
10    0.129125
7     0.096050
11    0.054675
4     0.044400
9     0.038150
6     0.033075
2     0.020675
3     0.018975
1     0.004475
0     0.002750
Name: Y_Encoder, dtype: float64

In [None]:
X=temp_data['text'].array
y=temp_data['Y_Encoder']

In [None]:
from imblearn.over_sampling import SMOTEN
sampler = SMOTEN(random_state=0)
X_res, y_res = sampler.fit_resample(X.reshape(-1, 1), y)

In [None]:
X_res.shape

(112294, 1)

In [None]:
y_res.shape

(112294,)

In [None]:
corpus = [''.join(ele) for ele in X_res]


In [None]:
corpus

['know listenin bad habit earlier started freakin part',
 'layin n bed headache ughhhh waitin call',
 'funeral ceremony gloomy friday',
 'want hang friend soon',
 'want trade someone houston ticket one',
 'pinging go prom bc bf like friend',
 'sleep im thinking old friend want married damn amp want 2 scandalous',
 'hmmm http www djhero com',
 'charlene love miss',
 'sorry least friday',
 'cant fall asleep',
 'choked retainer',
 'ugh beat stupid song get next rude',
 'u watch hill london u realise tourture week week late watch itonlinelol',
 'got news',
 'storm electricity gone',
 'agreed',
 'sleepy even late fail',
 'lady gaga tweeted impressed video leaking know',
 'convinced always wanted signal give damn think lost another friend',
 'oh bad hope get better sleep issue lately',
 'wondering awake 7am writing new song plotting evil secret plot muahahaha oh damn secret anymore',
 'topic map talk balisage markup conference 2009 program online http tr im ml6z via topicmaps',
 'ate somethi

In [None]:
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[3405, 1383, 3992, 4054, 2039, 3127, 4899, 1248],
 [3193, 1829, 2696, 1829, 4905, 1177, 1685],
 [1781, 2613, 2548, 2824],
 [4260, 1638, 429, 44],
 [4260, 3892, 2644, 4853, 640, 1818],
 [4165, 4358, 3096, 1660, 4505, 2649, 429],
 [1860, 3653, 491, 2110, 429, 4260, 2791, 929, 3699, 4260, 4811, 3689],
 [950, 2552, 273, 2749, 4042],
 [4936, 231, 2493],
 [1000, 285, 2824],
 [605, 343, 3023],
 [3603, 2232],
 [4665, 543, 531, 1785, 2979, 3439, 4496],
 [3306, 906, 83, 1866, 3306, 624, 2740, 1609, 1609, 2588, 906, 755],
 [4272, 1937],
 [3056, 1189, 601],
 [905],
 [1382, 4904, 2588, 3011],
 [767, 870, 696, 2184, 527, 4781, 3405],
 [4418, 4869, 933, 4168, 4248, 929, 2925, 2332, 3122, 429],
 [4559, 3992, 1428, 2979, 2649, 1860, 43, 1618],
 [333,
  2719,
  4885,
  95,
  4779,
  1785,
  4244,
  2679,
  2090,
  1343,
  3408,
  4559,
  929,
  2090,
  272],
 [2067,
  2066,
  4473,
  2756,
  44,
  575,
  285,
  2,
  4194,
  2552,
  3398,
  3653,
  2084,
  3982,
  3781],
 [1609, 2789, 3405, 4465, 3778, 

In [None]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 3127 4899 1248]
 [   0    0    0 ... 4905 1177 1685]
 [   0    0    0 ... 2613 2548 2824]
 ...
 [   0    0    0 ... 4523 1136 1306]
 [   0    0    0 ... 4523 1136 1306]
 [   0    0    0 ... 4523 1136 1306]]


In [None]:
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(LSTM(100))
model.add(Dense(13,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_9 (LSTM)               (None, 200)               192800    
                                                                 
 dense_9 (Dense)             (None, 13)                2613      
                                                                 
Total params: 395,413
Trainable params: 395,413
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
X_final=np.array(embedded_docs)
y_final=np.array(y_res)

In [None]:
X_final.shape,y_final.shape

((112294, 20), (112294,))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [None]:
X_train.shape, y_train.shape

((75236, 20), (75236,))

In [None]:
X_test.shape, y_test.shape

((37058, 20), (37058,))

In [None]:
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
y_train_label = label_binarizer.fit_transform( y_train)
y_test_label = label_binarizer.fit_transform( y_test)

In [None]:
y_train_label

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
model.fit(X_train,y_train_label,validation_data=(X_test,y_test_label),epochs=10,batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x249055f8af0>