In [44]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd


#nltk
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag
from nltk.corpus import movie_reviews

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# BeautifulSoup libraray
from bs4 import BeautifulSoup

import re # regex

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report

#preprocessing scikit

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
 
#stop-words
stop_words=set(nltk.corpus.stopwords.words('english'))

#keras
from tensorflow import keras
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Flatten ,Embedding,Input,LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import text_to_word_sequence

#gensim w2v
#word2vec
from gensim.models import Word2Vec

In [99]:
rev_frame=pd.read_csv(r'./imdb.csv')
df=rev_frame.copy()


In [46]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [47]:
df= df[["review","sentiment"]]
df["review"]=df["review"]
df["rating"]=df["sentiment"].apply(lambda x: 1 if x=="positive" else 0)
df.drop("sentiment",axis=1,inplace=True)
df.head()

Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [48]:
print(df.shape)
df.head()

(299, 2)


Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [49]:
df["rating"].isnull().sum()
df.drop_duplicates(subset=["rating","review"],keep="first",inplace=True)
print(df.shape)
df.head()


(299, 2)


Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [50]:
for review in df['review'][:5]:
    print(review+'\n'+'\n')

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [51]:
df.head()

Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [52]:
df["sentiment"]=df["rating"]

In [53]:
df.drop("rating",axis=1,inplace=True)

In [54]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [55]:
df["sentiment"].value_counts()  

sentiment
0    161
1    138
Name: count, dtype: int64

In [56]:
def clean_reviews(review):  
    
    # 1. Removing html tags
    review_text = BeautifulSoup(review,"lxml").get_text()
    
    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    
    # 3. Converting to lower case and splitting
    word_tokens= review_text.lower().split()
    
    # 4. Remove stopwords
    le=WordNetLemmatizer()
    stop_words= set(stopwords.words("english"))     
    word_tokens= [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    
    cleaned_review=" ".join(word_tokens)
    return cleaned_review

In [57]:
pos_df=df.loc[df.sentiment==1,:][:50000]
neg_df=df.loc[df.sentiment==0,:][:50000]

In [58]:
pos_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1


In [59]:
neg_df.head()

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,0
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
10,Phil the Alien is one of those quirky films wh...,0
11,I saw this movie when I was about 12 when it c...,0


In [60]:
df=pd.concat([pos_df,neg_df],ignore_index=True)

In [61]:
print(df.shape)
df.head()

(299, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,"Petter Mattei's ""Love in the Time of Money"" is...",1
4,"Probably my all-time favorite movie, a story o...",1


In [62]:
df = df.sample(frac=1).reset_index(drop=True)
print(df.shape) 
df.head()

(299, 2)


Unnamed: 0,review,sentiment
0,"Nicholas Walker is Paul, the local town Revera...",0
1,"Maybe it was the title, or the trailer (certai...",0
2,Nice character development in a pretty cool mi...,1
3,I am not a golf fan by any means. On May 26 ab...,1
4,"I had heard good things about ""States of Grace...",0


In [63]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences=[]
sum=0
for review in df['review']:
    sents=tokenizer.tokenize(review.strip())
    sum+=len(sents)
    for sent in sents:
        cleaned_sent=clean_reviews(sent)
        sentences.append(cleaned_sent.split()) # can use word_tokenize also.
print(sum)
print(len(sentences)) 

3074
3074


In [64]:
for te in sentences[:5]:
    print(te,"\n")

['nicholas', 'walker', 'paul', 'local', 'town', 'reverand', 'married', 'martha', 'ally', 'sheedy', 'also', 'habitual', 'womanizer', 'decides', 'fake', 'death', 'run', 'away', 'current', 'affair', 'veronica', 'dara', 'tomanovich'] 

['however', 'get', 'bout', 'amnesia', 'hence', 'name', 'film'] 

['sally', 'kirkland', 'also', 'hand', 'crazy', 'old', 'coot', 'pine', 'good', 'reverand', 'shade', 'misery', 'type', 'way'] 

['sad', 'see', 'pretty', 'good', 'cast', 'wasted', 'like'] 

['least', 'bit', 'john', 'savage', 'horridly', 'forgettable', 'role', 'shoddy', 'private', 'investigator'] 



In [65]:
import gensim
w2v_model=gensim.models.Word2Vec(sentences=sentences,vector_size=300,window=10,min_count=1)

In [66]:
w2v_model.train(sentences,epochs=10,total_examples=len(sentences))

(339948, 356490)

In [67]:
w2v_model.wv.get_vector('like')

array([ 0.06569339,  0.9521061 ,  0.11548917,  0.4258007 ,  0.03242154,
       -0.7712088 ,  0.5788869 ,  1.5357063 ,  0.35029316, -0.20203252,
       -0.10399392, -0.6153211 , -0.01026566, -0.02313555, -0.5600405 ,
       -0.6065037 ,  0.36840317, -0.25274125,  0.170356  , -0.34891775,
       -0.53054756, -0.09185503,  0.6308288 ,  0.15072033,  0.6777105 ,
        0.20596495, -0.58892244, -0.09038148, -0.34430766, -0.66739905,
       -0.01071539, -0.45277512, -0.00966297, -0.1204746 , -0.21174681,
        0.27347007,  0.50031686, -0.8033316 ,  0.04401803, -0.26409885,
       -0.46483883,  0.13654856,  0.14124721, -0.6807424 ,  0.3105085 ,
        0.785298  ,  0.24934553,  0.34408015, -0.06064266,  0.88819003,
        0.11927018, -0.07060729, -0.5066343 , -0.07347614, -0.24474396,
        0.6426907 ,  0.30355987, -0.06062137,  0.18441862, -0.14712624,
       -0.30907133, -0.35710052,  0.0520143 ,  0.3008788 ,  0.30269837,
        0.06726049, -0.19259965,  0.2945421 , -0.30008936, -0.21

In [68]:
w2v_model.wv.similarity('good','like')

0.9998647

In [69]:
vocab=list(w2v_model.wv.key_to_index.keys())
print("The total number of words are : ",len(vocab))

The total number of words are :  8310


In [70]:
w2v_model.wv.most_similar('like')

[('look', 0.9999458193778992),
 ('get', 0.9999440908432007),
 ('point', 0.9999410510063171),
 ('could', 0.9999406933784485),
 ('sure', 0.9999405741691589),
 ('way', 0.9999405741691589),
 ('make', 0.9999405741691589),
 ('first', 0.9999405145645142),
 ('may', 0.9999402165412903),
 ('left', 0.9999396800994873)]

In [71]:
w2v_model.wv.similarity('good','like')

0.9998647

In [72]:
print("The no of words :",len(vocab))

The no of words : 8310


In [73]:
word_vec_dict={}
for word in vocab:
  word_vec_dict[word]=w2v_model.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict))


The no of key-value pairs :  8310


In [74]:

for word in vocab[:5]:
  print(word_vec_dict[word])

[ 0.05977252  0.8337216   0.10976542  0.36348465  0.02302567 -0.6690695
  0.5180043   1.3355737   0.2970722  -0.18464443 -0.08126971 -0.5365142
 -0.01518593 -0.01027349 -0.4910026  -0.5360433   0.3193925  -0.1986805
  0.13189892 -0.30331755 -0.4455245  -0.097418    0.5639801   0.13058676
  0.5868306   0.17390239 -0.5130116  -0.08153793 -0.3035508  -0.5819596
 -0.01086684 -0.37812197 -0.00576689 -0.1249825  -0.17744602  0.23063433
  0.4308188  -0.70327306  0.0363547  -0.23221052 -0.4086513   0.11733863
  0.12458585 -0.5901146   0.26629725  0.68112427  0.2088517   0.30091712
 -0.0506034   0.77161163  0.09488811 -0.05809205 -0.44701067 -0.05666011
 -0.20698686  0.5645266   0.26723355 -0.05548063  0.1588139  -0.12131637
 -0.2650333  -0.30887687  0.03797302  0.26410398  0.24464484  0.06458479
 -0.16219422  0.26920736 -0.27348977 -0.18735404 -0.11101606  0.3840447
  0.6500882  -0.63154745  0.02647547  0.2995403  -0.53862035  0.08421961
 -0.16555308  0.47416288 -0.2320645  -0.79532856  0.0162

In [75]:
df['clean_review']=df['review'].apply(clean_reviews)

In [76]:
maxi=-1
for i,rev in enumerate(df['clean_review']):
    tokens=rev.split()
    if(len(tokens)>maxi):
        maxi=len(tokens)
print(maxi)

455


In [77]:
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(df['clean_review'])

In [78]:
max_rev_len=1565  # max lenght of a review
vocab_size = len(tok.word_index) + 1  # total no of words
embed_dim=300 # embedding dimension as choosen in word2vec constructor

In [79]:
pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape 

(299, 1565)

In [80]:
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
  embed_vector=word_vec_dict.get(word)
  if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
    embed_matrix[i]=embed_vector

In [81]:
print(embed_matrix[14])

[ 0.05653781  0.78084302  0.09102517  0.33659336  0.02492869 -0.63183379
  0.47118786  1.24981368  0.28339848 -0.16488972 -0.08179015 -0.50740945
 -0.0123985  -0.01444817 -0.45808718 -0.5109874   0.30070269 -0.20652786
  0.13537645 -0.27852017 -0.42502365 -0.07800104  0.52577454  0.12367938
  0.54673386  0.17385848 -0.48768422 -0.08152618 -0.27416494 -0.55040282
 -0.01340769 -0.37257954 -0.00650043 -0.10268549 -0.1696758   0.22505562
  0.40403828 -0.6547752   0.03086614 -0.22048157 -0.38893089  0.10818976
  0.12516741 -0.55248344  0.24855003  0.64226973  0.20297481  0.27352807
 -0.0507049   0.72208476  0.09628251 -0.053627   -0.40998846 -0.05571096
 -0.1939158   0.5284161   0.24927111 -0.05226457  0.14598984 -0.12573306
 -0.24910277 -0.29560885  0.03448204  0.25589949  0.24580097  0.05857323
 -0.1568896   0.2441323  -0.24791658 -0.16944148 -0.10013677  0.35049582
  0.61903769 -0.5935604   0.02259468  0.28327468 -0.52092409  0.07521256
 -0.14640166  0.44378629 -0.22287062 -0.7506392   0

In [82]:
# prepare train and val sets first
Y=keras.utils.to_categorical(df['sentiment'])  # one hot target as required by NN.
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.20,random_state=42)


In [83]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout
model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_rev_len,embeddings_initializer=Constant(embed_matrix)))
model.add(LSTM(64,return_sequences=False))
model.add(Flatten())
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.50))
model.add(Dropout(0.20))
model.add(Dense(2,activation='sigmoid'))

In [84]:
from keras.optimizers import RMSprop
model.compile(optimizer="RMSprop",loss='binary_crossentropy',metrics=['accuracy'])


In [85]:
epochs=10
batch_size=64

In [86]:
model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test))

Epoch 1/10


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 731ms/step - accuracy: 0.5726 - loss: 0.6931 - val_accuracy: 0.4667 - val_loss: 0.6934
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 563ms/step - accuracy: 0.5627 - loss: 0.6927 - val_accuracy: 0.4667 - val_loss: 0.6935
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 556ms/step - accuracy: 0.5429 - loss: 0.6927 - val_accuracy: 0.4667 - val_loss: 0.6936
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 560ms/step - accuracy: 0.5663 - loss: 0.6922 - val_accuracy: 0.4667 - val_loss: 0.6938
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 545ms/step - accuracy: 0.5210 - loss: 0.6928 - val_accuracy: 0.4667 - val_loss: 0.6939
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 530ms/step - accuracy: 0.5554 - loss: 0.6920 - val_accuracy: 0.4667 - val_loss: 0.6940
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x7b4b4b5e9070>

In [87]:
model.evaluate(x_test,y_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.4361 - loss: 0.6954


[0.6943997740745544, 0.46666666865348816]

In [100]:
model.summary()

In [93]:
test_sent="i hate this movie"
test_sent=clean_reviews(test_sent)
model.predict(pad_sequences(tok.texts_to_sequences([test_sent]),maxlen=max_rev_len,padding='post'))

# print label
label = ['Negative','Positive']
prediction = model.predict(pad_sequences(tok.texts_to_sequences([test_sent]),maxlen=max_rev_len,padding='post'))

print(label[np.argmin(prediction[0])])



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Positive
