# Bhaav: A Multilingual Sentiment Analyzer for Indian Languages

In [2]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from textblob import Word
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split 

In [3]:
#Loading the dataset
# Reads Test.csv from the IMDB sentiment-analysis dataset directory into a
# DataFrame. Later cells use its 'text' and 'label' columns.
# NOTE(review): this is the dataset's Test.csv, and it is split below into both
# the training and the evaluation sets — confirm that is intended.
data = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv') 

In [4]:
# Show the first few rows of the loaded frame as a quick sanity check.
data.head()

In [5]:
# Summary statistics of the loaded frame. describe must be *called*: the bare
# attribute only displays the bound method's repr, not the statistics.
data.describe()

In [6]:
#Data Cleaning
def cleaning(df, stop_words):
    """Normalise the ``text`` column of *df* ahead of tokenisation.

    The following steps are applied, in order, to every entry of
    ``df['text']``:

    1. lowercase the text,
    2. delete every digit,
    3. drop whitespace-separated tokens that appear in *stop_words*,
    4. lemmatize each remaining token with ``Word(token).lemmatize()``.

    Surviving tokens are re-joined with single spaces.

    Args:
        df: DataFrame with a string-valued ``text`` column. The column is
            overwritten in place.
        stop_words: Iterable of words to remove. Matching happens after
            lowercasing, so entries should be lowercase.

    Returns:
        The same ``df`` object that was passed in, with ``text`` replaced by
        the cleaned text.
    """
    # Build the lookup once: the caller passes a list, which would otherwise
    # be scanned linearly for every token of every document.
    stop_set = set(stop_words)

    text = df['text'].str.lower()

    # Replacing the digits/numbers.
    # The pattern has to be the regex class \d with regex=True; the bare
    # string 'd' removes the letter "d" from every word and keeps the digits.
    text = text.str.replace(r'\d+', '', regex=True)

    def _filter_and_lemmatize(doc):
        # Removing stop words and lemmatizing in a single pass over the tokens.
        return ' '.join(
            Word(token).lemmatize()
            for token in doc.split()
            if token not in stop_set
        )

    df['text'] = text.apply(_filter_and_lemmatize)

    return df

# English stop-word list taken from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

# cleaning() rewrites the 'text' column of the frame it receives and returns
# that same frame, so `data_v1` and `data` refer to one DataFrame from here on.
data_v1 = cleaning(data, stop_words)

In [7]:
# Display the cleaned frame.
data_v1

In [123]:
#Tokenizing and padding the text
# These are integer word-index sequences, not embeddings; the embedding vectors
# are learned later by the model's Embedding layer.
# num_words=500 caps the vocabulary used when converting text to sequences.
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(data_v1['text'].values)
# Encode the same cleaned frame the tokenizer was fitted on. Reading from `data`
# only gave cleaned text because cleaning() happens to mutate its argument.
X = tokenizer.texts_to_sequences(data_v1['text'].values)
# With no maxlen given, every sequence is padded to the length of the longest one.
X = pad_sequences(X)

In [124]:
# Display the padded integer-sequence matrix produced by pad_sequences.
X

In [10]:
#Model Building
# Stack: Embedding -> SpatialDropout1D -> LSTM -> Dense -> 2-way softmax.
model = Sequential()
# The embedding's input dimension has to cover every index the tokenizer can
# emit, so it is read from the tokenizer's num_words cap rather than repeated
# as a separate literal that could drift out of sync.
model.add(Embedding(tokenizer.num_words, 120, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
# NOTE(review): the activation string 'LeakyReLU' is presumably resolved only by
# some Keras versions — confirm against the version this notebook targets.
model.add(Dense(352, activation='LeakyReLU'))
# Two output units to match the two one-hot label columns fed to fit().
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [11]:
#Splitting the data into training and testing
# One-hot encode the labels: one indicator column per distinct label value.
y = pd.get_dummies(data_v1['label'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
#Model Training
# Fit on the training split for 20 epochs in mini-batches of 32 samples.
# NOTE(review): no validation data is passed here, so over-fitting is not
# visible during training; only the later evaluate() call uses held-out data.
model.fit(X_train, y_train, epochs = 20, batch_size=32, verbose =1)

In [13]:
# Score the trained model on the held-out split. The model was compiled with
# metrics=['accuracy'], so this reports the loss and the accuracy.
model.evaluate(X_test,y_test)

In [16]:
#Saving the model
# Writes the trained model to the path "Bhaav". No file extension is given, so
# the on-disk format is presumably the default of the installed Keras version —
# TODO confirm.
model.save("Bhaav")


In [15]:
# Draw a diagram of the model, annotated with tensor shapes and layer names.
# NOTE(review): plot_model presumably needs pydot and Graphviz installed —
# confirm in the target environment.
from keras.utils.vis_utils import plot_model
plot_model(model, show_shapes=True, show_layer_names=True)

In [160]:
#Saving the tokenizer
# The fitted tokenizer is persisted next to the model so that new text can be
# mapped to the same word indices the model was trained on.
# Only unpickle this file when it comes from a trusted source.
import pickle

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )