In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import count 
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import label_binarize
from keras.preprocessing.text import Tokenizer
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout, GlobalMaxPool1D, Conv1D, BatchNormalization, MaxPool1D

You can find the data for this competition here: https://www.kaggle.com/c/movie-review-sentiment-analysis-kernels-only/data

In [None]:
# Read the training and test splits from the zipped, tab-separated
# competition files.
train = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv.zip',
    sep='\t',
)
test = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv.zip',
    sep='\t',
)

In [None]:
# Preview the first rows of the test split.
test.head()

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Class balance of the sentiment labels in the training split.
# The column is named by keyword: seaborn deprecated passing the data vector
# positionally, and newer releases bind the first positional argument to
# `data` instead of `x`.
ax = sns.countplot(x='Sentiment', data=train)
ax.set_title("Sentimental");

In [None]:
def create_vocabulary(df):
    """Build a lemma -> integer id lookup from the ``Phrase`` column of ``df``.

    Each phrase is lower-cased and split on single spaces. Every token is
    lemmatized with ``WordNetLemmatizer`` and, the first time a lemma is
    seen, it receives the next free id. Ids start at 2 because
    ``preprocess_df`` uses 0 for padding and 1 for tokens that are not in
    the vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``.

    Returns
    -------
    dict
        Mapping from lemma to id; ids are consecutive and start at 2.
        Empty when ``df`` has no rows.
    """
    next_id = count(2)
    vocabulary = {}
    lemmatizer = WordNetLemmatizer()
    for phrase in df['Phrase']:
        for token in phrase.lower().split(' '):
            lemma = lemmatizer.lemmatize(token)
            if lemma not in vocabulary:
                vocabulary[lemma] = next(next_id)
    # Ids are consecutive from 2, so the largest id equals len + 1. Deriving
    # it this way instead of calling max() keeps an empty frame from raising
    # ValueError while printing the same number for non-empty input.
    print('Vocabulary Length: {}'.format(len(vocabulary) + 1))
    return vocabulary

In [None]:
def preprocess_df(df, vocabulary, max_sentence_length):
    """Encode phrases as fixed-length id sequences and one-hot encode labels.

    Each phrase in ``df['Phrase']`` is lower-cased, split on single spaces
    and truncated to ``max_sentence_length`` tokens. Every kept token is
    lemmatized and looked up in ``vocabulary``; lemmas that are missing map
    to id 1. Shorter phrases are right-padded with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Phrase`` column and, optionally, an integer
        ``Sentiment`` column holding classes 0-4.
    vocabulary : dict
        Lemma -> id lookup, as produced by ``create_vocabulary``.
    max_sentence_length : int
        Number of ids emitted per phrase.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is an array with one row of
        ``max_sentence_length`` ids per phrase, and ``Y`` is the
        ``label_binarize`` encoding of ``Sentiment`` over classes 0-4, or
        ``None`` when ``df`` has no ``Sentiment`` column.
    """
    if 'Sentiment' in df:
        # to_numpy() hands scikit-learn a plain ndarray; the earlier
        # to_xarray() call required the optional xarray package for no gain.
        Y = label_binarize(df['Sentiment'].to_numpy(), classes=[0, 1, 2, 3, 4])
    else:
        Y = None

    lemmatizer = WordNetLemmatizer()
    width = max(max_sentence_length, 0)
    X = []
    # Iterate the column directly; iterrows() would build a Series per row.
    for phrase in df['Phrase']:
        tokens = phrase.lower().split(" ")[:width]
        ids = [vocabulary.get(lemmatizer.lemmatize(token), 1) for token in tokens]
        # Right-pad short phrases with the padding id 0.
        ids.extend([0] * (width - len(ids)))
        X.append(ids)
    return np.asarray(X), Y

In [None]:
# Build the lemma -> id lookup from the training phrases, then encode every
# training phrase as a 52-id sequence together with its label encoding.
vocabulary = create_vocabulary(df=train)
X, Y = preprocess_df(df=train, vocabulary=vocabulary, max_sentence_length=52)

In [None]:
# Hold out 20% of the encoded phrases for validation; the fixed seed keeps
# the split reproducible across runs.
train_X, valid_X, train_Y, valid_Y = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def build_model(vocab_size):
    """Assemble and compile the Conv1D + bidirectional-LSTM sentiment classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table; must be larger than the
        highest token id fed to the model.

    Returns
    -------
    keras.Model
        Compiled Sequential model ending in a 5-unit softmax layer.
    """
    # Every layer is taken from the same `keras` namespace as the Sequential
    # container instead of mixing it with layer classes imported from a
    # separate Keras package.
    layers = keras.layers

    net = keras.models.Sequential()
    # NOTE(review): Conv1D is not mask-aware, so the mask created by
    # mask_zero=True probably never reaches the LSTM layers - confirm.
    net.add(layers.Embedding(input_dim=vocab_size, output_dim=10, mask_zero=True))

    # Five convolution blocks; each MaxPool1D(2) halves the sequence length.
    conv_blocks = [(128, 0.3), (128, 0.3), (128, 0.15), (64, 0.15), (64, 0.15)]
    for filters, drop_rate in conv_blocks:
        net.add(layers.Conv1D(filters, 3, activation='relu', padding='same'))
        net.add(layers.MaxPool1D(2))
        net.add(layers.BatchNormalization())
        net.add(layers.Dropout(drop_rate))

    # Stacked bidirectional LSTMs; all return sequences so that the global
    # max-pool below can reduce over the time axis.
    for units in (1280, 640, 640, 320):
        net.add(layers.Bidirectional(
            layers.LSTM(units, recurrent_dropout=0.5, dropout=0.2, return_sequences=True)
        ))

    net.add(layers.GlobalMaxPool1D())

    # Dense classification head.
    for units in (64, 32, 32):
        net.add(layers.Dense(units, activation='relu'))
        net.add(layers.BatchNormalization())
        net.add(layers.Dropout(0.15))

    net.add(layers.Dense(5, activation='softmax'))

    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return net


# Token ids run from 0 (padding) up to the largest id stored in `vocabulary`,
# so the embedding table needs one more row than that maximum. Deriving the
# size here replaces a hard-coded number that silently goes stale whenever
# the vocabulary changes.
model = build_model(max(vocabulary.values()) + 1)
model.summary()

In [None]:
# Train for 5 epochs in batches of 500, evaluating on the held-out split
# after every epoch; the returned History feeds the plots below.
history = model.fit(
    x=train_X,
    y=train_Y,
    batch_size=500,
    epochs=5,
    validation_data=(valid_X, valid_Y),
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    history : object
        Object whose ``history`` attribute is a dict containing the keys
        ``string`` and ``"val_" + string`` (e.g. the return value of
        ``model.fit``).
    string : str
        Name of the metric to plot, such as ``'acc'`` or ``'loss'``.
    """
    plt.plot(history.history[string])
    plt.plot(history.history["val_" + string])
    plt.xlabel("Epochs")
    # The metric goes on the y-axis. A second xlabel() call here used to
    # overwrite "Epochs" and leave the y-axis unlabelled.
    plt.ylabel(string)
    plt.legend([string, "val_" + string])
    plt.show()
# Draw the accuracy curves first, then the loss curves.
for metric_name in ('acc', 'loss'):
    plot_graphs(history, metric_name)

In [None]:
# Encode the test phrases with the vocabulary built from the training data;
# lemmas that are not in it map to id 1. test_Y is None unless the test
# frame carries a Sentiment column.
test_X, test_Y = preprocess_df(df=test, vocabulary=vocabulary, max_sentence_length=52)

In [None]:
# Score the test phrases, take the highest-scoring class per row and attach
# it to the test frame as a Sentiment column (rows are aligned by index).
predictions = model.predict(x=np.asarray(test_X))
predicted_labels = pd.DataFrame(np.argmax(predictions, axis=1), columns=['Sentiment'])
prediction_results = pd.concat([test, predicted_labels], axis=1)

In [None]:
# Keep only the id and predicted label, then write them without the index.
submission_columns = ['PhraseId', 'Sentiment']
submission = prediction_results[submission_columns]
submission.to_csv('submission.csv', index=False)