In [1]:
import pandas as pd #Basic packages for creating dataframes and loading dataset
import numpy as np

import matplotlib.pyplot as plt #Package for visualization

import re #importing package for Regular expression operations

from sklearn.model_selection import train_test_split #Package for splitting the data

from sklearn.preprocessing import LabelEncoder #Package for conversion of categorical to Numerical

from keras.preprocessing.text import Tokenizer #Tokenization
from keras.preprocessing.sequence import pad_sequences #Add zeros or crop based on the length
from keras.models import Sequential #Sequential Neural Network
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D #For layers in Neural Network
from keras.utils.np_utils import to_categorical

In [2]:
data = pd.read_csv('Sentiment.csv') #Looading the dataset

data = data[['text','sentiment']] # Keeping only the neccessary columns

In [3]:
data

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
...,...,...
13866,RT @cappy_yarbrough: Love to see men who will ...,Negative
13867,RT @georgehenryw: Who thought Huckabee exceede...,Positive
13868,"RT @Lrihendry: #TedCruz As President, I will a...",Positive
13869,RT @JRehling: #GOPDebate Donald Trump says tha...,Negative


In [4]:
data['text'] = data['text'].apply(lambda x: x.lower()) #converting to lower case
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))) #only a-z,A-Z,0-9 would be remaining in the data, else special characters are removed

In [5]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') #Removing Retweets

In [6]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

In [7]:
X = pad_sequences(X)

embed_dim = 128
lstm_out = 196

In [8]:
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model
# print(model.summary())

In [9]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [10]:
batch_size = 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)

291/291 - 28s - loss: 0.8270 - accuracy: 0.6464
144/144 - 2s - loss: 0.7617 - accuracy: 0.6699
0.7616634368896484
0.6699432134628296


In [11]:
print(model.metrics_names)

['loss', 'accuracy']


## Save the model and use the saved model to predict on new text data (ex, “A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)

In [12]:
model.save('sentimentAnalysis.h5')

In [13]:
from keras.models import load_model
model= load_model('sentimentAnalysis.h5')

In [14]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [15]:
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence)
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0)
sentiment = model.predict_classes(sentence,batch_size=1,verbose = 2)[0]
print(sentiment)
if sentiment == 0:
  print("Neutral")
elif sentiment < 0:
  print("Negative")
elif sentiment > 0:
  print("Positive")
else:
  print("Can not be determined")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
1/1 - 0s
0
Neutral


## Apply GridSearchCV on the source code provided in the class

In [16]:
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=createmodel,verbose=2)
batch_size= [10, 20, 40]
epochs = [1, 2]
param_grid= {'batch_size':batch_size, 'epochs':epochs}
from sklearn.model_selection import GridSearchCV
grid  = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result= grid.fit(X_train,Y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

744/744 - 51s - loss: 0.8259 - accuracy: 0.6484
186/186 - 2s - loss: 0.7488 - accuracy: 0.6799
744/744 - 55s - loss: 0.8239 - accuracy: 0.6453
186/186 - 2s - loss: 0.7832 - accuracy: 0.6563
744/744 - 57s - loss: 0.8228 - accuracy: 0.6451
186/186 - 2s - loss: 0.7541 - accuracy: 0.6848
744/744 - 55s - loss: 0.8281 - accuracy: 0.6441
186/186 - 3s - loss: 0.7414 - accuracy: 0.6841
744/744 - 69s - loss: 0.8162 - accuracy: 0.6498
186/186 - 4s - loss: 0.7988 - accuracy: 0.6642
Epoch 1/2
744/744 - 119s - loss: 0.8282 - accuracy: 0.6492
Epoch 2/2
744/744 - 127s - loss: 0.6810 - accuracy: 0.7119
186/186 - 8s - loss: 0.7441 - accuracy: 0.6885
Epoch 1/2
744/744 - 118s - loss: 0.8217 - accuracy: 0.6478
Epoch 2/2
744/744 - 89s - loss: 0.6824 - accuracy: 0.7111
186/186 - 6s - loss: 0.7461 - accuracy: 0.6783
Epoch 1/2
744/744 - 122s - loss: 0.8240 - accuracy: 0.6484
Epoch 2/2
744/744 - 125s - loss: 0.6737 - accuracy: 0.7176
186/186 - 8s - loss: 0.7388 - accuracy: 0.6880
Epoch 1/2
744/744 - 123s - loss