In [2]:
import pandas as pd
import numpy
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import TensorBoard

data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Sentiment.csv')
data = data[['text', 'sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

print(data[data['sentiment'] == 'Positive'].size)
print(data[data['sentiment'] == 'Negative'].size)
print(data[data['sentiment'] == 'Neutral'].size)

for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, maxlen=28)

embed_dim = 128
lstm_out = 196

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

batch_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
tb = TensorBoard(log_dir="logs/{}", histogram_freq=0, write_graph=True, write_images=True)
model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2, callbacks=[tb])

model.save('/content/drive/MyDrive/Colab Notebooks/model.h5')
m = load_model('/content/drive/MyDrive/Colab Notebooks/model.h5')

print(m.summary())

text = [['A lot of good things are happening. We are respected again throughout the world, and thats a great '
         'thing.@realDonaldTrump']]
df = pd.DataFrame(text, index=range(0, 1, 1), columns=list('t'))
df['t'] = df['t'].apply(lambda x: x.lower())
df['t'] = df['t'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['t'].values)
X = tokenizer.texts_to_sequences(df['t'].values)
X = pad_sequences(X, maxlen=28)

output = m.predict(X)
print(output)
print(numpy.where(max(output[0])), ":", (max(output[0])))
print(numpy.argmax(output))
print(model.summary())

4472
16986
6284
tokenizer.texts_to_sequences [[52, 78, 341, 456, 22, 2, 420, 365, 95, 29, 51, 1039, 1], [351, 125, 1954, 2, 597, 1, 29, 51, 228, 35, 198, 5, 175, 1417, 10, 1577, 1356, 847], [61, 498, 5, 8, 2, 1, 21, 1797, 10, 696, 518], [17, 278, 238, 6, 736, 96, 160, 24, 132, 5, 2, 179, 10, 1, 214, 786, 16], [1239, 1, 291, 23, 2, 1685, 236, 10, 2, 669, 5, 176, 188, 403, 563], [118, 16, 47, 211, 333, 9, 70, 274, 421, 1357, 1955, 1194, 63, 1956, 192, 1, 56], [9, 1240, 160, 8, 21, 1418, 64, 9, 606, 184, 21, 168, 4, 32, 2, 632, 20, 1008, 1, 49, 822], [168, 16, 759, 404, 42, 135, 633, 182, 1578, 1], [10, 2, 167, 102, 535, 32, 2, 101, 607, 45, 57, 1798, 7, 16, 658, 1], [29, 116, 24, 881, 14, 1, 915], [23, 10, 48, 17], [1799, 38, 1957, 46, 4, 598, 19, 65, 1, 175, 608, 5, 2, 51, 161, 325], [34, 1148, 47, 457, 22, 52, 153, 2, 1, 21], [1149, 346, 293, 1, 1958], [619, 9, 609, 316, 174, 55, 1959, 10, 1500, 183, 14, 46, 1], [338, 382, 2, 5, 1040, 13, 86, 473, 283, 6, 265, 1], [250, 519, 938, 13, 4