In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

In [2]:
data = pd.read_csv("lstm.csv")
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
data.isnull().sum()

clean_text    4
category      7
dtype: int64

In [4]:
data.shape

(162980, 2)

In [8]:
data = data.dropna()

In [11]:
data = data.reset_index()

In [12]:
data

Unnamed: 0,index,clean_text,category
0,0,the world sees new india huge improvements int...,1.0
1,1,pappu can announce bribe 72k but modi being ca...,-1.0
2,2,only objective oppose modi and criticize whate...,1.0
3,3,dont want for higher and good education still ...,1.0
4,4,jada nacre nahi definitely modi should take al...,0.0
...,...,...,...
162964,162964,now chamchas and antimodi gang will start atta...,0.0
162965,162965,terrorists pakistan want lose opposition win m...,1.0
162966,162966,just tell one thing modi let mallya get out th...,1.0
162967,162967,body knows them only motto life famous opposin...,1.0


In [13]:
labels = pd.get_dummies(data.category)
labels.columns = ["negative", "neutral", "positive"]
labels.head(5)

Unnamed: 0,negative,neutral,positive
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,1,0


In [14]:
data = data.drop(columns = ['category'])

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
tokenizer = Tokenizer(num_words = 8150, lower = True, split = " ", oov_token = "~")
tokenizer.fit_on_texts(data["clean_text"])

In [17]:
word_index = tokenizer.word_index
len(word_index)

113679

In [18]:
data["clean_text"] = tokenizer.texts_to_sequences(data["clean_text"])

In [19]:
data.head()

Unnamed: 0,index,clean_text
0,0,"[3, 128, 2763, 117, 10, 476, 1, 691, 1, 87, 1,..."
1,1,"[240, 32, 565, 2332, 1499, 16, 2, 138, 163, 38..."
2,2,"[29, 3112, 1168, 2, 4, 1313, 581, 156, 305, 28..."
3,3,"[40, 72, 5, 1592, 4, 78, 616, 149, 72, 67, 206..."
4,4,"[1, 1, 472, 685, 2, 51, 101, 17, 3, 91]"


In [20]:
tweets = pad_sequences(data["clean_text"]) 

In [21]:
tweets.shape

(162969, 52)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size = 0.15)

In [24]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim = 8150, output_dim = 32),
    keras.layers.LSTM(128),
    keras.layers.Dense(128, activation = "leaky_relu", kernel_initializer = "he_normal", kernel_regularizer = "l1"),
    keras.layers.Dropout(0.35),
    keras.layers.Dense(3, activation = "softmax", kernel_initializer = "glorot_normal")
])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          260800    
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 3)                 387       
                                                                 
Total params: 360,131
Trainable params: 360,131
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [26]:
history = model.fit(
    X_train, y_train, 
    epochs = 20, validation_data = (X_test, y_test),
    callbacks = [keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True)]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [27]:
model.evaluate(X_test, y_test)



[0.19169634580612183, 0.9662112593650818]

In [28]:
model.save('text_emotion.h5')