In [64]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
import re


In [68]:
# import data
data = pd.read_csv("Sentiment.csv" ,usecols = ["text", "sentiment"])
data.text = data.text.apply(lambda x: x.lower()) 
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
data.head()


Unnamed: 0,sentiment,text
0,Neutral,rt nancyleegrahn how did everyone feel about t...
1,Positive,rt scottwalker didnt catch the full gopdebate ...
2,Neutral,rt tjmshow no mention of tamir rice and the go...
3,Positive,rt robgeorge that carly fiorina is trending h...
4,Positive,rt danscavino gopdebate w realdonaldtrump deli...


In [77]:
print( 'Positive: ', data[data['sentiment'] == 'Positive'].size)
print( 'Negative: ', data[data['sentiment'] == 'Negative'].size)
print( 'Neutral : ', data[data['sentiment'] == 'Neutral'].size)

Positive:  4472
Negative:  16986
Neutral :  6284


In [23]:
for idx, row in data.iterrows():
    row[1] = row[1].replace('rt', ' ')

In [24]:
data.head()

Unnamed: 0,sentiment,text
0,Neutral,nancyleegrahn how did everyone feel about th...
1,Positive,scottwalker didnt catch the full gopdebate l...
2,Neutral,tjmshow no mention of tamir rice and the gop...
3,Positive,robgeorge that carly fiorina is trending ho...
4,Positive,danscavino gopdebate w realdonaldtrump deliv...


In [25]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)


In [26]:
X

array([[   0,    0,    0, ...,   51, 1039,    1],
       [   0,    0,    0, ..., 1577, 1356,  847],
       [   0,    0,    0, ...,   10,  696,  518],
       ...,
       [   0,    0,    0, ...,   68,   62,    3],
       [   0,    0,    0, ..., 1112, 1588,   81],
       [   0,    0,    0, ...,  196,    3,  880]])

In [27]:
embed_dim = 128
lstm_out = 196
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model

In [28]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [114]:
batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 2)
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)
print(model.metrics_names)


Epoch 1/10
291/291 - 31s - loss: 0.8239 - accuracy: 0.6440
Epoch 2/10
291/291 - 30s - loss: 0.6753 - accuracy: 0.7103
Epoch 3/10
291/291 - 29s - loss: 0.6122 - accuracy: 0.7459
Epoch 4/10
291/291 - 29s - loss: 0.5624 - accuracy: 0.7672
Epoch 5/10
291/291 - 29s - loss: 0.5158 - accuracy: 0.7880
Epoch 6/10
291/291 - 29s - loss: 0.4746 - accuracy: 0.8024
Epoch 7/10
291/291 - 29s - loss: 0.4360 - accuracy: 0.8233
Epoch 8/10
291/291 - 29s - loss: 0.4084 - accuracy: 0.8327
Epoch 9/10
291/291 - 29s - loss: 0.3790 - accuracy: 0.8438
Epoch 10/10
291/291 - 29s - loss: 0.3512 - accuracy: 0.8574
144/144 - 1s - loss: 1.2330 - accuracy: 0.6540
1.2330067157745361
0.6539973616600037
['loss', 'accuracy']


## Question 1

In [119]:
model.save('model.h5')
m = load_model('model.h5')

In [134]:
text = [['A lot of good things are happening. We are respected again throughout the world, and thats a great '
         'thing. @realDonaldTrump']]

df  = pd.DataFrame(text, index=range(0,1,1), columns = ['text'])
df['text'] = df.text.apply(lambda x : x.lower()).apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
print(df.text[0])

a lot of good things are happening we are respected again throughout the world and thats a great thing realdonaldtrump


In [135]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df.text.values)
X = tokenizer.texts_to_sequences(df.text.values)
X = pad_sequences(X, maxlen=28)

In [136]:
pred_value = m.predict(X)
pred_class = m.predict_classes(X)
pred_probability = m.predict_proba(X)
print("Predicted value  :" +str(pred_class))
print("Predicted probability  :" +str(pred_probability))
print()

Predicted value  :[0]
Predicted probability  :[[0.6787184  0.06578439 0.25549722]]

