# Twitter sentiment
- this notebook was written by Austin Wilson and Paul Nguyen
- in the first section we prepare the data for ingestion by our models
    - we used two techniques for preprocessing ()

In [99]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score,plot_precision_recall_curve, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import re
import pickle

In [56]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/austinwilson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/austinwilson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/austinwilson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

twitter_training.csv has the following attributes
- Tweet_ID: unique identifier of the tweet (different for all tweets)
- Entity: some topic that the tweet is related to
- Sentiment: a view of or attitude toward a situation or event; an opinion
- Text: text data entered as tweet by user

In [49]:
data = pd.read_csv('twitter_training.csv', names=["Tweet_ID", "Entity", "Sentiment", "Text"])
data.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


Text is the dependent variable we will use to predict Sentiment

In [50]:
data.drop(['Tweet_ID', 'Entity'], axis=1, inplace=True)
data.head()

Unnamed: 0,Sentiment,Text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [51]:
data.Sentiment.value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: Sentiment, dtype: int64

we are only interested in positive and negative (binary classification)
- remove neutral
- remove irrelevant
- make all text lower case
- remove invaid text
- remove rt (indicates retweet - not relevant to our purposes)

In [52]:
data = data[data.Sentiment != "Neutral"]
data = data[data.Sentiment != "Irrelevant"]
data.Text = data.Text.apply(lambda x: str(x).lower())
data.Text = data.Text.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
data.Text = data.Text.replace('rt', ' ')

In [53]:
data.Sentiment.value_counts()

Negative    22542
Positive    20832
Name: Sentiment, dtype: int64

the data is balanced

## here we have two seperate preprocessing methods 
- one is using tfidf
- the other is using nltk to tokenize and lemmatize the words
- note that we will keep two datasets in memory to distinguish betwee the different approaches

In [54]:
data_copy = data.copy(deep=True)

In [57]:
lemmatiser = WordNetLemmatizer()
stopwords = set(stopwords.words())
def remove_stopwords(ls):
    # Removes stop words and lemmatises
    ls = [lemmatiser.lemmatize(word) for word in ls if word not in (stopwords) and (word.isalpha())]
    
    ls = " ".join(ls)
    return ls

data.Text = data.Text.apply(word_tokenize)
data.Text = data.Text.apply(remove_stopwords)

In [58]:
data.head()

Unnamed: 0,Sentiment,Text
0,Positive,getting borderland murder
1,Positive,coming border kill
2,Positive,getting borderland kill
3,Positive,coming borderland murder
4,Positive,getting borderland murder


In [64]:
max_features = 1000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data.Text.values)
X = tokenizer.texts_to_sequences(data.Text.values)
X = pad_sequences(X)

# save tokenizer for later 
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [71]:
Y = pd.get_dummies(data.Sentiment).values
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(29060, 99) (29060, 2)
(14314, 99) (14314, 2)


## tfidf
- create tfidf vectorizer
- convert text to tfidf
- one hot encode y
- create training and testing data 

In [92]:
tfidfvectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=5)
le = LabelEncoder()

In [61]:
corpus = data_copy.Text
tfidf_matrix=tfidfvectorizer.fit_transform(corpus)
tfidf_data=tfidf_matrix.toarray()
with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(tfidfvectorizer, fin)

In [79]:
X2=tfidf_data
y2= pd.get_dummies(data_copy.Sentiment).values
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2, test_size = 0.33, random_state = 42)

In [93]:
# data for logistic regression
y_lr=le.fit_transform(data.Sentiment)

In [105]:
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X,y_lr, test_size = 0.33, random_state = 42)

verify the size of the training and test data 

In [80]:
print(X_train2.shape,y_train2.shape)
print(X_test2.shape,y_test2.shape)

(29060, 5000) (29060, 2)
(14314, 5000) (14314, 2)


# models
- the first is a LSTM (Long Short Term Memory)
    - this is a recurrent nueral network which can improve accuracy of NLP problems
    - able the detect context of text
- Neural Network 1 
    - this neural network is very minimal
    - one dense layer with 25 notes and relu activation function
    - the output is 2 nodes and softmax activation function
- Neural Network 2
    - here we have 4 layers with 25, 50,25,10 fully connected layers, respectively and all are relu activation function
    - the output is 2 nodes and softmax activation function

In [72]:
lstm_monitor = EarlyStopping(monitor='accuracy', patience=1)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
checkpointer1=ModelCheckpoint(filepath='best_weights1.hdf5',verbose=0,save_best_only=True)
checkpointer2=ModelCheckpoint(filepath='best_weights2.hdf5',verbose=0,save_best_only=True)



In [81]:
max_features = 1000
embed_dim = 128
lstm_out = 196

model = Sequential([
     Embedding(max_features, embed_dim, input_length = X.shape[1]),
     SpatialDropout1D(0.4),
     LSTM(lstm_out, dropout=0.2),
     Dense(2, activation='softmax')
])
checkpointer1=ModelCheckpoint(filepath='best_weights1.hdf5',verbose=0,save_best_only=True)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
model1=Sequential()
model1.add(Dense(25, input_dim=X2.shape[1],activation='relu'))
model1.add(Dense(2, activation='softmax'))
# model 2 
model2=Sequential()
model2.add(Dense(25, input_dim=X2.shape[1],activation='relu'))
model2.add(Dense(50, input_dim=X2.shape[1],activation='relu'))
model2.add(Dense(25, input_dim=X2.shape[1],activation='relu'))
model2.add(Dense(10, input_dim=X2.shape[1],activation='relu'))
model2.add(Dense(2, activation='softmax'))
# model 3 logistic regression
model3 = LogisticRegression(verbose=1, solver='liblinear',random_state=42, C=5, penalty='l2', max_iter=1000)


In [76]:
model.compile(
     loss='categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.fit(X_train, y_train, epochs = 10, batch_size=32, callbacks=[lstm_monitor], verbose = 1)
model.save_weights("best_weights_lstm.hdf5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fead7655e80>

In [83]:
model1.compile(loss='categorical_crossentropy',optimizer='adam')
model1.fit(X_train2,y_train2,validation_data=(X_test2,y_test2),callbacks=[monitor,checkpointer1],verbose=2,epochs=100)
model1.save_weights("best_weights_nn1.hdf5")

Epoch 1/100
909/909 - 1s - loss: 0.4227 - val_loss: 0.3201
Epoch 2/100
909/909 - 1s - loss: 0.2635 - val_loss: 0.2785
Epoch 3/100
909/909 - 1s - loss: 0.2219 - val_loss: 0.2642
Epoch 4/100
909/909 - 1s - loss: 0.1972 - val_loss: 0.2621
Epoch 5/100
909/909 - 1s - loss: 0.1795 - val_loss: 0.2584
Epoch 6/100
909/909 - 1s - loss: 0.1631 - val_loss: 0.2606
Epoch 7/100
909/909 - 1s - loss: 0.1495 - val_loss: 0.2607
Epoch 00007: early stopping


In [85]:
model2.compile(loss='categorical_crossentropy',optimizer='adam')
model2.fit(X_train2,y_train2,validation_data=(X_test2,y_test2),callbacks=[monitor,checkpointer2],verbose=2,epochs=100)
model2.save_weights("best_weights_nn2.hdf5")

Epoch 1/100
909/909 - 1s - loss: 0.4660 - val_loss: 0.3796
Epoch 2/100
909/909 - 1s - loss: 0.2843 - val_loss: 0.2971
Epoch 3/100
909/909 - 1s - loss: 0.2043 - val_loss: 0.2681
Epoch 4/100
909/909 - 1s - loss: 0.1648 - val_loss: 0.2679
Epoch 5/100
909/909 - 1s - loss: 0.1453 - val_loss: 0.2828
Epoch 00005: early stopping


In [95]:
model3.fit(X_train_lr, y_train_lr)

[LibLinear]

LogisticRegression(C=5, max_iter=1000, random_state=42, solver='liblinear',
                   verbose=1)

# score training data

In [88]:
pred = model.predict(X_test)
pred1 = model1.predict(X_test2)
pred2 = model2.predict(X_test2)
pred3 = model3.predict(X_test_lr)

In [110]:
model_cm = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(pred, axis=1))
model1_cm = confusion_matrix(np.argmax(y_test2, axis=1), np.argmax(pred1, axis=1))
model2_cm = confusion_matrix(np.argmax(y_test2, axis=1), np.argmax(pred2, axis=1))


In [112]:
model_cr = classification_report(np.argmax(y_test, axis=1), np.argmax(pred, axis=1))
model1_cr = classification_report(np.argmax(y_test2, axis=1), np.argmax(pred1, axis=1))
model2_cr = classification_report(np.argmax(y_test2, axis=1), np.argmax(pred2, axis=1))

In [113]:
print(model_cm)
print(model1_cm)
print(model2_cm)

[[1226 6281]
 [ 826 5981]]
[[6608  899]
 [ 592 6215]]
[[6719  788]
 [ 590 6217]]


In [114]:
print(model_cr)
print(model1_cr)
print(model2_cr)

              precision    recall  f1-score   support

           0       0.60      0.16      0.26      7507
           1       0.49      0.88      0.63      6807

    accuracy                           0.50     14314
   macro avg       0.54      0.52      0.44     14314
weighted avg       0.55      0.50      0.43     14314

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      7507
           1       0.87      0.91      0.89      6807

    accuracy                           0.90     14314
   macro avg       0.90      0.90      0.90     14314
weighted avg       0.90      0.90      0.90     14314

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      7507
           1       0.89      0.91      0.90      6807

    accuracy                           0.90     14314
   macro avg       0.90      0.90      0.90     14314
weighted avg       0.90      0.90      0.90     14314

