# Train and save the model

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
import re
import os

In [None]:
# Work from the project folder so relative paths such as 'Sentiment.csv' resolve.
# NOTE(review): this looks like a Colab Google Drive mount point — adjust when running elsewhere.
project_dir = "/content/drive/MyDrive/LOG8415_tps/personal_project"
os.chdir(project_dir)

In [None]:
# Load the labelled dataset and keep only the necessary columns:
# the raw text and its sentiment label.
data = pd.read_csv('Sentiment.csv')[['text', 'sentiment']]

In [None]:
# Preview the first rows before any text cleaning is applied.
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [None]:
def preProcess_data(text):
    """Normalise a raw text string before tokenisation.

    Lower-cases the text, removes every character that is not an ASCII
    letter, digit or whitespace, and removes the standalone token ``rt``
    (the retweet marker once lower-cased).

    The same normalisation has to be used for training and for prediction;
    retrain and re-save the model after changing these rules.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so removed
        tokens can leave leading or doubled spaces.
    """
    text = text.lower()
    # Raw string avoids the invalid "\s" escape warning. The range is "A-Z",
    # not "A-z": the latter also matches the ASCII punctuation between "Z"
    # and "a" ([ \ ] ^ _ `), which would then survive the clean-up.
    new_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Word boundaries so only the token "rt" is dropped, not the letters
    # "rt" inside words such as "party" or "start".
    new_text = re.sub(r'\brt\b', '', new_text)
    return new_text

# Clean every text in place, element by element.
data['text'] = data['text'].map(preProcess_data)


In [None]:
# Preview the first rows after text cleaning.
data.head()

Unnamed: 0,text,sentiment
0,nancyleegrahn how did everyone feel about the...,Neutral
1,scottwalker didnt catch the full gopdebate la...,Positive
2,tjmshow no mention of tamir rice and the gopd...,Neutral
3,robgeorge that carly fiorina is trending hou...,Positive
4,danscavino gopdebate w realdonaldtrump delive...,Positive


In [None]:
# Vocabulary size: the tokenizer keeps only the most frequent words up to this limit.
max_fatures = 2000

# Build the word index from the cleaned texts (words are split on spaces; English text).
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
texts = data['text'].values
tokenizer.fit_on_texts(texts)

# Turn each text into a list of word ids, then pad/truncate every list to 28 ids.
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=28)

# One-hot encode the sentiment labels (one column per distinct label).
Y = pd.get_dummies(data['sentiment']).values


In [None]:
# Inspect the one-hot encoded label matrix.
Y

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       ...,
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1]], dtype=uint8)

In [None]:
# List the distinct sentiment labels present in the data.
data['sentiment'].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [None]:
# Show the one-hot frame with its column names; the column order is the class index order of Y.
pd.get_dummies(data['sentiment'])

Unnamed: 0,Negative,Neutral,Positive
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1
...,...,...,...
13866,1,0,0
13867,0,0,1
13868,0,0,1
13869,1,0,0


In [None]:
# Hold out 20% of the samples. The seed is fixed so the same rows land in the
# train and test sets on every run (e.g. after Restart Kernel -> Run All).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# Sanity check: number of held-out samples and number of classes.
Y_test.shape

(2775, 3)

In [None]:
embed_dim = 128  # size of each word-embedding vector
lstm_out = 196   # units in the first LSTM layer

# Embedding -> spatial dropout -> two stacked LSTMs -> 3-way softmax.
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    # return_sequences=True so the second LSTM receives the whole sequence.
    LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.2, return_sequences=True),
    LSTM(128, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
batch_size = 512

# Train for 10 epochs; the held-out split is passed as validation data.
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=10,
    validation_data=(X_test, Y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe96fb56490>

In [None]:
# Saving is currently disabled. predict() further down loads 'sentiment.h5' from the
# working directory, so that file must already exist; uncomment the line below to
# overwrite it with the model trained in this session.
#model.save('sentiment.h5')


In [None]:
# List the working directory (shell command) to check which files are present.
!ls -lh


total 13M
-rw------- 1 root root 1.7K Dec  1 03:42 app.py
-rw------- 1 root root 1.8K Dec  1 02:10 lambda_function.py
-rw------- 1 root root 765K Dec  1 02:40 my-deployment-package.zip
-rw------- 1 root root 1.6K Nov 29 23:58 predict.py
-rw------- 1 root root   42 Nov 29 23:31 requirements.txt
drwx------ 2 root root 4.0K Dec  3 15:13 results
-rw------- 1 root root 3.8M Nov 29 06:01 Sentiment.csv
-rw------- 1 root root 7.9M Nov 29 06:12 sentiment.h5


# Load model for prediction

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Record the TensorFlow version in use.
print(tf.__version__)

2.7.0


In [None]:
# data = pd.read_csv('Sentiment.csv')
# tokenizer = Tokenizer(num_words=2000, split=' ')
# tokenizer.fit_on_texts(data['text'].values)

def preProcess_data(text):
    """Normalise a raw text string before tokenisation.

    Lower-cases the text, removes every character that is not an ASCII
    letter, digit or whitespace, and removes the standalone token ``rt``
    (the retweet marker once lower-cased).

    The same normalisation has to be used for training and for prediction;
    retrain and re-save the model after changing these rules.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so removed
        tokens can leave leading or doubled spaces.
    """
    text = text.lower()
    # Raw string avoids the invalid "\s" escape warning. The range is "A-Z",
    # not "A-z": the latter also matches the ASCII punctuation between "Z"
    # and "a" ([ \ ] ^ _ `), which would then survive the clean-up.
    new_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Word boundaries so only the token "rt" is dropped, not the letters
    # "rt" inside words such as "party" or "start".
    new_text = re.sub(r'\brt\b', '', new_text)
    return new_text

def my_pipeline(text):
    """Convert one raw text string into a padded sequence of word ids.

    Cleans the text with ``preProcess_data``, maps it to word ids with the
    module-level ``tokenizer`` (which must already be fitted), and pads or
    truncates the result to 28 ids.

    Args:
        text (str): Raw text.

    Returns:
        The padded id array for this single text, as produced by
        ``pad_sequences``.
    """
    cleaned = preProcess_data(text)
    # The tokenizer expects a collection of texts, so wrap the single string.
    sequences = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(sequences, maxlen=28)


In [None]:
# Class names in model-output index order (matches the original 0/1/2 mapping).
_SENTIMENT_LABELS = ('Negative', 'Neutral', 'Positive')
_MODEL_PATH = 'sentiment.h5'
# path -> (file modification time, loaded model)
_loaded_models = {}


def _get_model(path=_MODEL_PATH):
    """Return the Keras model saved at ``path``, reading it from disk only when needed.

    The model is kept in memory between calls and reloaded only when the
    file's modification time changes (e.g. after the model was re-saved).
    """
    mtime = os.path.getmtime(path)
    entry = _loaded_models.get(path)
    if entry is None or entry[0] != mtime:
        entry = (mtime, tf.keras.models.load_model(path))
        _loaded_models[path] = entry
    return entry[1]


def predict(text:str="it is a rainy cold day in seattle"):
    """Classify the sentiment of a single text with the saved model.

    Args:
        text: Raw text to classify. It is printed, then cleaned and
            tokenised through ``my_pipeline``.

    Returns:
        dict: ``{"message": <original text>, "sentiment": <label>,
        "score": <model output for the winning class>}`` where the label is
        one of 'Negative', 'Neutral' or 'Positive'.

    Raises:
        OSError: If 'sentiment.h5' cannot be found in the working directory.
    """
    print(text)
    clean_text = my_pipeline(text)
    # Loading from disk is slow; reuse the in-memory model across calls.
    loaded_model = _get_model()
    predictions = loaded_model.predict(clean_text)
    # A single text was passed in, so only the first row is relevant.
    class_scores = predictions[0]
    sentiment = int(np.argmax(class_scores))
    return {
        "message": text,
        # Lookup table instead of an if/elif chain: an unexpected index now
        # raises immediately rather than leaving the label undefined.
        "sentiment": _SENTIMENT_LABELS[sentiment],
        "score": float(class_scores[sentiment]),
    }


In [None]:
# Example call: classify one raw tweet with the model loaded from 'sentiment.h5'.
predict("RT @warriorwoman91: I liked her and was happy when I heard she was going to be the moderator. Not anymore. #GOPDebate @megynkelly  https://â€¦")

RT @warriorwoman91: I liked her and was happy when I heard she was going to be the moderator. Not anymore. #GOPDebate @megynkelly  https://â€¦


{'message': 'RT @warriorwoman91: I liked her and was happy when I heard she was going to be the moderator. Not anymore. #GOPDebate @megynkelly  https://â€¦',
 'score': 0.9829199910163879,
 'sentiment': 'Negative'}

# Manual test


In [None]:
# Manual check of a single sentence.
# NOTE(review): this uses the in-memory `model` from the training section, not the
# model loaded from 'sentiment.h5' inside predict() — confirm that is intended.
#text = "RT @hutchinsjohne: D.C. Attorney General @AGKarlRacine Announces New Role as Affordable Housing Advocate https://t.co/9FLJ3BfCnn via @allys…"
text = "Please vote for TXT on 🍎 Music too!"
clean_text = my_pipeline(text)
predictions = model.predict(clean_text)

new_text:  please vote for txt on  music too


In [None]:
# Raw model output: one score per class for the single input text.
predictions

array([[0.85676926, 0.04944487, 0.09378583]], dtype=float32)

In [None]:
# Index of the highest-scoring class (predict() maps 0/1/2 to Negative/Neutral/Positive).
sentiment = int(np.argmax(predictions))
sentiment

0

In [None]:
# Score of the winning class for the single input text.
max(predictions.tolist()[0])

0.9947063326835632