In [35]:
import numpy as np
import pandas as pd

In [36]:
data=pd.read_csv('C:/Users/Aman Raj/data_science/stanford_sentiment/IMDB Dataset.csv')

In [37]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### converting positive and negative to 1's & 0's

In [38]:
data['sentiment']=np.where(data['sentiment']=='positive',1,0)

In [39]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [40]:
np.unique(data['sentiment'])

array([0, 1])

In [41]:
data['sentiment'].sum()

25000

In [42]:
len(data['sentiment'])

50000

### Now we know we have 5000 data and we have converted 'sentiments' into 1 and 0

### we will now remove punctuations and stopwords

In [43]:
from string import punctuation

In [44]:
data['review']=data['review'].apply(lambda x: ''.join(wd for wd in x.lower() if wd not in punctuation))

In [45]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [46]:
from nltk.corpus import stopwords

In [47]:
stop=stopwords.words('english')

In [48]:
data['review']=data['review'].apply(lambda x: ' '.join(wd for wd in x.split() if wd not in stop))

In [49]:
data.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


## splitting data into train and test split

In [50]:
x=data['review']
y=data['sentiment']

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
x_train,x_test,y_train,y_test=train_test_split(x,y)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(37500,)
(12500,)
(37500,)
(12500,)


In [53]:
np.random.seed(1000)
num_most_freq_word_to_include=5000
max_rev_len_keras=500
embedding_vec_len=32

In [54]:
train_rev_list=x_train.tolist()
test_rev_list=x_test.tolist()

In [55]:
all_rev_list=x_train.tolist() + x_test.tolist()

## Tokenizing the x data and padding it

In [56]:
from tensorflow import keras

In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [61]:
tokenizer= Tokenizer(num_words=5000)
tokenizer.fit_on_texts(all_rev_list)

In [64]:
train_rev_tokenized=tokenizer.texts_to_sequences(train_rev_list)
x_train=pad_sequences(train_rev_tokenized,maxlen=500)
test_rev_tokenized=tokenizer.texts_to_sequences(test_rev_list)

In [65]:
x_test=pad_sequences(test_rev_tokenized,maxlen=500)


In [66]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(37500, 500)
(12500, 500)
(37500,)
(12500,)


In [67]:
train_rev_list[0]

'insisting martin luther kings inspirational spirit resides american civil liberties inside hearts minds people everywhere danish helmer niels arden oplev transplants belief 1969 danish middle school specifically works way crusade young boy named frits janus dissing rathke oppressively rigid churlishly abusive headmaster svendsen bent mejding adapted true story performances executed certain aplomb refreshing command varied characters keeps involving battle ideologies 13 yearold demented disciplinarian gives way inherent humour awkward shifts mood disorients despite keeping shrewdly cynical vein dead poets society matilda treads familiar path continued precise service young protagonist including personal subplot rounds frits young boy becoming young man manages raise film rousing family film nose right money'

In [68]:
x_train[0]
x_train.shape

(37500, 500)

# Build the model

In [69]:
from tensorflow.keras.layers import Embedding,Dropout,Conv1D,MaxPool1D,LSTM,Dense

In [70]:
from tensorflow.keras.models import Sequential

In [71]:
model=Sequential()

model.add(Embedding(num_most_freq_word_to_include, max_rev_len_keras, input_length=32))
          
model.add(LSTM(units=100,activation='tanh'))
model.add(Dense(1,activation='sigmoid'))
         
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [72]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 500)           2500000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               240400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 2,740,501
Trainable params: 2,740,501
Non-trainable params: 0
_________________________________________________________________


# Training

In [73]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [74]:
print(x_train.shape)
print(x_test.shape)


(37500, 500)
(12500, 500)


In [75]:
print(y_train.shape)
print(y_test.shape)

(37500,)
(12500,)


In [76]:
model.fit(x_train,y_train,epochs=3,batch_size=100,validation_data=[x_test,y_test])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x2b9b9aca308>

# Model Evaluation

In [77]:
from sklearn.metrics import roc_auc_score

In [78]:
y_pred=model.predict(x_test)



In [79]:
print("AUC score: %.4f" %roc_auc_score(y_test,y_pred))

AUC score: 0.9501


## Testing the model

In [80]:
pred=np.where(model.predict(x_test))

In [83]:
samp=pad_sequences(tokenizer.texts_to_sequences(['I hate this movie.There is no story in it']),maxlen=max_rev_len_keras)

In [84]:
np.where(model.predict(samp)>0.5,'Positive review','Negative review')[0][0]

'Negative review'

In [85]:
samp_2=pad_sequences(tokenizer.texts_to_sequences(['I love this movie.There is nice story in it']),maxlen=max_rev_len_keras)

In [86]:
np.where(model.predict(samp_2)>0.5,'Positive review','Negative review')[0][0]

'Positive review'