In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,plot_precision_recall_curve, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import re
import pickle

In [2]:
data = pd.read_csv('twitter_training.csv', names=["Tweet_ID", "Entity", "Sentiment", "Text"])
data.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
data.Entity.unique()

array(['Borderlands', 'CallOfDutyBlackopsColdWar', 'Amazon', 'Overwatch',
       'Xbox(Xseries)', 'NBA2K', 'Dota2', 'PlayStation5(PS5)',
       'WorldOfCraft', 'CS-GO', 'Google', 'AssassinsCreed', 'ApexLegends',
       'LeagueOfLegends', 'Fortnite', 'Microsoft', 'Hearthstone',
       'Battlefield', 'PlayerUnknownsBattlegrounds(PUBG)', 'Verizon',
       'HomeDepot', 'FIFA', 'RedDeadRedemption(RDR)', 'CallOfDuty',
       'TomClancysRainbowSix', 'Facebook', 'GrandTheftAuto(GTA)',
       'MaddenNFL', 'johnson&johnson', 'Cyberpunk2077',
       'TomClancysGhostRecon', 'Nvidia'], dtype=object)

In [4]:
data.Sentiment.unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

Intake the training data and only keep the necessary columns

In [5]:
data = data[['Text','Sentiment']]

data.head()

Unnamed: 0,Text,Sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [6]:
data.Sentiment.unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [5]:
data.shape

(74682, 4)

In [7]:
data[data.Sentiment == 'Positive'].shape

(20832, 4)

In [8]:
data[data.Sentiment == 'Negative'].shape

(22542, 4)

In [27]:
print(f'{data.iloc[412].Text} - {data.iloc[412].Sentiment}')

Perfect casting for first Lilith, most anticipated film - Positive


In [23]:
data.iloc[412].Sentiment

'Positive'

In [8]:
data.iloc[426].Text

'Yasss!!! Co-Stream with @jimmysgotya  twitch.tv/jimmysgotya'

Goal is to identify Positive and Negative tweets, drop everything else and keep only valid text
- remove all rows with neutral or irrelevant sentiment
- convert all text to lower case
- remove emojis and special characters

In [10]:
data = data[data.Sentiment != "Neutral"]
data = data[data.Sentiment != "Irrelevant"]
data.Text = data.Text.apply(lambda x: str(x).lower())
data.Text = data.Text.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

# tfidf stuff 
corpus=data['Text']




In [11]:
data.Sentiment.unique()

array(['Positive', 'Negative'], dtype=object)

In [12]:
class_labels = data.Sentiment.unique()[0]
class_labels = ['negative', 'positive']

In [13]:
tfidfvectorizer= TfidfVectorizer(stop_words='english', max_features=5000, min_df=5)
countvectorizer = CountVectorizer(analyzer='word', stop_words='english')

In [14]:

tfidf_matrix=tfidfvectorizer.fit_transform(corpus)
tfidf_data=tfidf_matrix.toarray()

countvectorizer_matrix = countvectorizer.fit_transform(corpus)
count_data = countvectorizer_matrix.toarray()

In [15]:
tfidf_data.shape

(43374, 5000)

In [16]:
count_data.shape

(43374, 22965)

In [17]:
count_tokens = countvectorizer.get_feature_names()
tfidf_tokens = tfidfvectorizer.get_feature_names()

In [46]:
le = LabelEncoder()

In [47]:
x=tfidf_data
columns= pd.get_dummies(data.Sentiment).columns
print(columns)
y= pd.get_dummies(data.Sentiment).values
y2=le.fit_transform(data.Sentiment)
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.33, random_state = 42)
X_train, X_test, Y2_train, Y2_test = train_test_split(x,y2, test_size = 0.33, random_state = 42)

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

Index(['Negative', 'Positive'], dtype='object')
(29060, 5000) (29060, 2)
(14314, 5000) (14314, 2)


In [48]:
y2
Y2_train

array([1, 0, 0, ..., 0, 1, 1])

In [96]:
# this is for the count vectorizer (not used)


# x2=count_data
# y2= pd.get_dummies(data.Sentiment).values
# X2_train, X2_test, Y2_train, Y2_test = train_test_split(x,y, test_size = 0.33, random_state = 42)
# print(X2_train.shape,Y2_train.shape)
# print(X2_test.shape,Y2_test.shape)

(29060, 5000) (29060, 2)
(14314, 5000) (14314, 2)


In [20]:
checkpointer=ModelCheckpoint(filepath='best_weights1.hdf5',verbose=0,save_best_only=True)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
model1=Sequential()
model1.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer='adam')




In [21]:
# model 2 
checkpointer=ModelCheckpoint(filepath='best_weights2.hdf5',verbose=0,save_best_only=True)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
model2=Sequential()
model2.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model2.add(Dense(50, input_dim=x.shape[1],activation='relu'))
model2.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model2.add(Dense(10, input_dim=x.shape[1],activation='relu'))

model2.add(Dense(2, activation='softmax'))
model2.compile(loss='categorical_crossentropy',optimizer='adam')

In [22]:
# model 3
# model3
embed_dim = 128
lstm_out = 196
max_features = 1000
model3 = Sequential([
     Embedding(max_features, embed_dim, input_length = x.shape[1]),
     SpatialDropout1D(0.4),
     LSTM(lstm_out, dropout=0.2),
     Dense(2, activation='softmax')
])

model3.compile(
     loss='categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
print(model3.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5000, 128)         128000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 5000, 128)         0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 394       
Total params: 383,194
Trainable params: 383,194
Non-trainable params: 0
_________________________________________________________________
None


In [49]:
# model 4 logistic regression
model4 = LogisticRegression(verbose=1, solver='liblinear',random_state=42, C=5, penalty='l2', max_iter=1000)


In [50]:
model4.fit(X_train, Y2_train)

[LibLinear]

LogisticRegression(C=5, max_iter=1000, random_state=42, solver='liblinear',
                   verbose=1)

In [55]:
print(model4.classes_)
print(model4.coef_)

[0 1]
[[ 0.18410028  1.59505115 -0.31588795 ...  1.77707589  1.60727889
  -2.85258206]]


In [51]:
pred4 = model4.predict(X_test)

In [52]:
pred4[0]

1

In [53]:

f1_score(Y2_test, pred4)

0.8723901299459775

In [57]:
# saving model
with open("model4.pkl", 'wb') as file:
    pickle.dump(model4, file)

In [58]:
with open("model4.pkl", 'rb') as file:
    test_model = pickle.load(file)

In [60]:
tes_pred = test_model.predict(X_test)

In [61]:
f1_score(Y2_test, tes_pred)

0.8723901299459775

In [131]:
# monitor3=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
# checkpointer3=ModelCheckpoint(filepath='weights3.hdf5',verbose=0,save_best_only=True)
# model3.fit(X_train, Y_train, epochs = 10, batch_size=32,validation_data=(X_test,Y_test),
#     callbacks=[monitor3, checkpointer3], verbose = 1)
# model3.save_weights("model3.hdf5")
model3.load_weights("weights.hdf5")

In [24]:
monitor2=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
checkpointer2=ModelCheckpoint(filepath='weights2.hdf5',verbose=0,save_best_only=True)
model2.fit(X_train,Y_train,validation_data=(X_test,Y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)
model2.save_weights("model2.hdf5")


Epoch 1/100
909/909 - 1s - loss: 0.0609 - val_loss: 0.2625
Epoch 2/100
909/909 - 1s - loss: 0.0564 - val_loss: 0.2942
Epoch 3/100
909/909 - 1s - loss: 0.0541 - val_loss: 0.3051
Epoch 00003: early stopping


In [133]:
monitor1=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
checkpointer1=ModelCheckpoint(filepath='weights1.hdf5',verbose=0,save_best_only=True)

model1.fit(X_train,Y_train,validation_data=(X_test,Y_test),callbacks=[monitor1,checkpointer1],verbose=2,epochs=100)
model1.save_weights("model1.hdf5")

Epoch 1/100
909/909 - 2s - loss: 0.4241 - val_loss: 0.3160
Epoch 2/100
909/909 - 1s - loss: 0.2639 - val_loss: 0.2801
Epoch 3/100
909/909 - 1s - loss: 0.2238 - val_loss: 0.2671
Epoch 4/100
909/909 - 1s - loss: 0.2005 - val_loss: 0.2616
Epoch 5/100
909/909 - 1s - loss: 0.1823 - val_loss: 0.2603
Epoch 6/100
909/909 - 1s - loss: 0.1678 - val_loss: 0.2625
Epoch 7/100
909/909 - 1s - loss: 0.1534 - val_loss: 0.2628
Epoch 00007: early stopping


In [140]:
pred1 = np.argmax(model1.predict(X_test), axis=1)
#pred = np.argmax(pred, axis=1)
pred2 = np.argmax(model2.predict(X_test), axis=1)
#pred2 = np.argmax(pred2, axis=1)
#pred3 = np.argmax(model3.predict(X_test), axis=1)

# Y_test = np.argmax(Y_test, axis=1)
Y_test = np.argmax(Y_test, axis=1)

In [141]:
# f1_score(Y_test, pred, average="weighted")
f1_model1 = f1_score(Y_test, pred1, average="weighted")
f1_model2 = f1_score(Y_test, pred2, average="weighted")
#f1_model3 = f1_score(Y_test, pred3, average="weighted")

recall_model1 = recall_score(Y_test, pred1)
recall_model2 = recall_score(Y_test, pred2)
#recall_model3 = recall_score(Y_test, pred3)


precision_model1 = precision_score(Y_test, pred1)
precision_model2 = precision_score(Y_test, pred2)
#precision_model3 = precision_score(Y_test, pred3)





In [142]:
print("f1 scores for all models")
print(f1_model1)
print(f1_model2)
# print(f1_model3)

f1 scores for all models
0.8952003335461977
0.9218411008747404


In [143]:
print('recall scores')
print(recall_model1)
print(recall_model2)
# print(recall_model3)

recall scores
0.915968855589834
0.8946672542970472


In [144]:
print('precision scores')
print(precision_model1)
print(precision_model2)
# print(precision_model3)

precision scores
0.870323841429369
0.938366718027735
