In [29]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)


from nltk import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords


%matplotlib inline
train = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/train.tsv.zip', sep="\t")
test = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/test.tsv.zip', sep="\t")
train_original = train.copy()
test_original = test.copy()

### Text Preprocessing

In [31]:
def Preprocess(df):
    for i in df['Phrase']:
        tokenizer = nltk.RegexpTokenizer(r"\w+")
        i = tokenizer.tokenize(i)  
    tokenized_review_1 = df['Phrase'].apply(lambda x: x.split())
    ps = PorterStemmer()
    WL = WordNetLemmatizer()
    stemmed_review = tokenized_review_1.apply(lambda x: [ps.stem(i) for i in x])
    lemmatized_review = tokenized_review_1.apply(lambda x: [WL.lemmatize(i) for i in x])
    stop = stopwords.words('english')
    stemmed_review = stemmed_review.apply(lambda x: [item for item in x if item not in stop])
    lemmatized_review = lemmatized_review.apply(lambda x: [item for item in x if item not in stop])
    for i in range(len(stemmed_review)):
        stemmed_review[i] = ' '.join(stemmed_review[i])
    df['stemmed_review'] = stemmed_review
    for i in range(len(lemmatized_review)):
        lemmatized_review[i] = ' '.join(lemmatized_review[i])
    df['lemmatized_review'] = lemmatized_review
    df = df[df["stemmed_review"] != '']
    df = df[df["lemmatized_review"] != '']


In [32]:
Preprocess(train)

In [33]:
print("normal")
print("-"*100)

print(train['Phrase'][0])
print("\nafter stemming")
print("-"*100)

print(train['stemmed_review'][0])

print("\nafter lemmatizing")
print("-"*100)
print(train['lemmatized_review'][0])


normal
----------------------------------------------------------------------------------------------------
A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .

after stemming
----------------------------------------------------------------------------------------------------
A seri escapad demonstr adag good goos also good gander , occasion amus none amount much stori .

after lemmatizing
----------------------------------------------------------------------------------------------------
A series escapade demonstrating adage good goose also good gander , occasionally amuses none amount much story .


### LSTM 

In [34]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [35]:
y_train_NN = train["Sentiment"]
x_Train_stemmed_NN = train["stemmed_review"]
x_Train_lemmatized_NN = train["lemmatized_review"]

In [36]:
x_Train_stemmed_NN.shape, y_train_NN.shape 

((156060,), (156060,))

In [37]:
tokenize = Tokenizer()
tokenize.fit_on_texts(x_Train_stemmed_NN.values)

X_train_stemmed = tokenize.texts_to_sequences(x_Train_stemmed_NN)
tokenize.fit_on_texts(x_Train_lemmatized_NN.values)
X_train_lemmatized = tokenize.texts_to_sequences(x_Train_lemmatized_NN)

In [38]:
X_train_stemmed = pad_sequences(X_train_stemmed).astype(float)
X_train_lemmatized = pad_sequences(X_train_lemmatized)

In [39]:
EMBEDDING_DIM = 100
unknown = len(tokenize.word_index)+1
model = Sequential()
model.add(Embedding(unknown, EMBEDDING_DIM))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2 ))
model.add(Dense(5, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1904500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
Total params: 2,022,393
Trainable params: 2,022,393
Non-trainable params: 0
_________________________________________________________________


In [41]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_train_stemmed, y_train_NN, test_size=0.2)
x_train = pad_sequences(x_train)
x_test = pad_sequences(x_test)
model.fit(x_train, y_train, batch_size=128, epochs=7, verbose=1)
y_pred = model.predict_classes(x_test)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [42]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.56      0.31      0.40      1391
           1       0.54      0.57      0.55      5469
           2       0.74      0.81      0.77     15893
           3       0.59      0.54      0.57      6625
           4       0.60      0.34      0.44      1834

    accuracy                           0.66     31212
   macro avg       0.60      0.52      0.55     31212
weighted avg       0.65      0.66      0.65     31212



In [46]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_train_lemmatized, y_train_NN, test_size=0.2)
x_train = pad_sequences(x_train)
x_test = pad_sequences(x_test)
model.fit(x_train, y_train, batch_size=128, epochs=7, verbose=1)
y_pred = model.predict_classes(x_test)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [47]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.48      0.41      0.44      1443
           1       0.54      0.52      0.53      5444
           2       0.75      0.77      0.76     15835
           3       0.56      0.60      0.58      6671
           4       0.57      0.39      0.46      1819

    accuracy                           0.65     31212
   macro avg       0.58      0.54      0.55     31212
weighted avg       0.65      0.65      0.65     31212

