In [None]:
# Mount Google Drive at /content/drive so the data files read later in this
# notebook are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the tab-separated training and test review files from the mounted Drive.
train = pd.read_csv(
    '/content/drive/MyDrive/Project data/Movie review sentiment analysis/labeledTrainData.tsv',
    sep="\t",
)
test = pd.read_csv(
    '/content/drive/MyDrive/Project data/Movie review sentiment analysis/testData.tsv',
    sep="\t",
)

In [None]:
# Show the first rows to check which columns were loaded.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


Data Cleaning and text preprocessing

In [None]:
# Built once instead of once per token: the original evaluated
# stopwords.words('english') and searched the resulting list for every single
# word of every review, and created a new stemmer for every review.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_NON_LETTER_RE = re.compile('[^a-zA-Z]')
_PORTER_STEMMER = PorterStemmer()


def preprocessor(review):
    """Clean one raw review and return it as a space-joined string of stems.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lower-case, split on whitespace, drop English stop words,
    Porter-stem the remaining words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # Keep letters only, then normalise case and tokenize on whitespace.
    tokens = _NON_LETTER_RE.sub(" ", review_text).lower().split()

    # Stop words are filtered before stemming, exactly as before.
    stems = [
        _PORTER_STEMMER.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(stems)
    
    

In [None]:
# Clean every training review, keeping the original row order.
corpus = [preprocessor(train['review'][row]) for row in range(len(train))]

KeyboardInterrupt: ignored

Using Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts limited to 5000 features, converted straight to a dense array.
vectorizer = CountVectorizer(max_features=5000)
X_bow = vectorizer.fit_transform(corpus).toarray()

In [None]:
# Sanity check: (number of reviews, number of bag-of-words features).
X_bow.shape

(25000, 5000)

In [None]:
# Target labels: the 'sentiment' column of the training frame.
y = train['sentiment']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the bag-of-words rows for evaluation, with a fixed seed.
X_bowtrain, X_bowtest, y_train, y_test = train_test_split(X_bow, y, test_size=0.30, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the metrics reported below can be reproduced on a re-run;
# 42 matches the seed already used for the train/test split.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(X_bowtrain, y_train)

In [None]:
# Predict labels for the held-out bag-of-words rows.
y_pred = forest.predict(X_bowtest)

In [None]:
from sklearn import metrics
# Per-class precision/recall/F1, followed by the confusion matrix, for the
# bag-of-words random forest.
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      3738
           1       0.85      0.84      0.85      3762

    accuracy                           0.85      7500
   macro avg       0.85      0.85      0.85      7500
weighted avg       0.85      0.85      0.85      7500

[[3201  537]
 [ 602 3160]]


Using Keras Word Embedding

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

In [None]:
# Vocabulary size: upper bound handed to one_hot() and used as the first
# argument of the Embedding layers further down.
voc_size=10000

In [None]:
# Inspect one cleaned review to eyeball the preprocessing result.
corpus[1]

'classic war world timothi hine entertain film obvious goe great effort length faith recreat h g well classic book mr hine succe watch film appreci fact standard predict hollywood fare come everi year e g spielberg version tom cruis slightest resembl book obvious everyon look differ thing movi envis amateur critic look critic everyth other rate movi import base like entertain peopl never agre critic enjoy effort mr hine put faith h g well classic novel found entertain made easi overlook critic perceiv shortcom'

In [None]:
# Turn each cleaned review into a list of integer word indices bounded by voc_size.
# NOTE(review): despite its name, Keras' one_hot appears to be hash-based, so
# different words may end up sharing an index -- confirm against the Keras docs.
onehot_repr = [one_hot(document, voc_size) for document in corpus]

In [None]:
# Number of cleaned reviews.
len(corpus)

25000

In [None]:
# Token count of the longest encoded review.
max(len(encoded) for encoded in onehot_repr)

1416

In [None]:
# Bring every encoded review to sent_length entries; padding='pre' puts the
# zero padding in front of the word indices.
sent_length = 200
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[9879 3724 8513 ... 4378 2674 4914]
 [   0    0    0 ... 6799 3648 6528]
 [6410 4477 1794 ... 7219 4594 6958]
 ...
 [   0    0    0 ... 8513 7219 9906]
 [   0    0    0 ...    5 1714 3778]
 [   0    0    0 ... 3543 2058 6065]]


In [None]:

# Features and labels as NumPy arrays for the Keras models.
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [None]:
from sklearn.model_selection import train_test_split
# Same 30% hold-out and seed as the bag-of-words split, now on the padded
# index sequences. Note that this rebinds y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.30, random_state=42)

In [None]:
# Embedding -> LSTM -> single sigmoid unit, trained with binary cross-entropy.
embedding_vector_features = 100
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 100)          1000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [None]:

# Callback that ends training early once the monitored quantity stops improving.
# NOTE(review): no `monitor` argument is given, so the Keras default applies --
# presumably val_loss; confirm.
early_stopping = EarlyStopping(
    min_delta=0.01, # minimum amount of change to count as an improvement
    patience=10, # how many epochs to wait before stopping
    restore_best_weights=True,
)

In [None]:
# Train the LSTM model with early stopping.
# NOTE(review): the held-out split (X_test, y_test) is also used as validation
# data, so early stopping is driven by the same rows the model is scored on later.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=1024,
    epochs=100,
    callbacks=[early_stopping], # put your callbacks in a list
    verbose=2,  # one log line per epoch (see the output below)
)

Epoch 1/100
18/18 - 4s - loss: 0.6900 - accuracy: 0.6375 - val_loss: 0.6215 - val_accuracy: 0.6552
Epoch 2/100
18/18 - 2s - loss: 0.5670 - accuracy: 0.7636 - val_loss: 0.4837 - val_accuracy: 0.8003
Epoch 3/100
18/18 - 2s - loss: 0.3683 - accuracy: 0.8544 - val_loss: 0.3500 - val_accuracy: 0.8573
Epoch 4/100
18/18 - 2s - loss: 0.2668 - accuracy: 0.8967 - val_loss: 0.3190 - val_accuracy: 0.8691
Epoch 5/100
18/18 - 2s - loss: 0.2078 - accuracy: 0.9253 - val_loss: 0.3380 - val_accuracy: 0.8692
Epoch 6/100
18/18 - 2s - loss: 0.1678 - accuracy: 0.9441 - val_loss: 0.3573 - val_accuracy: 0.8672
Epoch 7/100
18/18 - 2s - loss: 0.1427 - accuracy: 0.9559 - val_loss: 0.3914 - val_accuracy: 0.8635
Epoch 8/100
18/18 - 2s - loss: 0.1223 - accuracy: 0.9637 - val_loss: 0.4227 - val_accuracy: 0.8575
Epoch 9/100
18/18 - 2s - loss: 0.1034 - accuracy: 0.9721 - val_loss: 0.4189 - val_accuracy: 0.8555
Epoch 10/100
18/18 - 2s - loss: 0.0914 - accuracy: 0.9753 - val_loss: 0.5049 - val_accuracy: 0.8495
Epoch 11/

In [None]:
# Sequential.predict_classes is no longer available in recent TensorFlow
# releases. Thresholding the sigmoid output at 0.5 gives the same
# (n_samples, 1) int32 class labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.86      0.88      0.87      3738
           1       0.88      0.86      0.87      3762

    accuracy                           0.87      7500
   macro avg       0.87      0.87      0.87      7500
weighted avg       0.87      0.87      0.87      7500

[[3297  441]
 [ 541 3221]]


Adding Dropout Layer

In [None]:

## Creating model
# Same Embedding -> LSTM -> sigmoid stack as before, with a Dropout(0.3) layer
# after the embedding and another after the LSTM.
embedding_vector_features = 100
dropout_model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
dropout_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the dropout variant; `dropout` holds the value returned by fit().
# NOTE(review): reuses the early_stopping callback object from the first model
# and, again, validates on the held-out split that is scored afterwards.
dropout = dropout_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=100,
    callbacks=[early_stopping], # put your callbacks in a list
    verbose=2,  # one log line per epoch (see the output below)
)

Epoch 1/100
69/69 - 5s - loss: 0.5608 - accuracy: 0.7063 - val_loss: 0.4109 - val_accuracy: 0.8115
Epoch 2/100
69/69 - 3s - loss: 0.3147 - accuracy: 0.8740 - val_loss: 0.3099 - val_accuracy: 0.8684
Epoch 3/100
69/69 - 3s - loss: 0.2177 - accuracy: 0.9190 - val_loss: 0.3166 - val_accuracy: 0.8723
Epoch 4/100
69/69 - 3s - loss: 0.1725 - accuracy: 0.9377 - val_loss: 0.3396 - val_accuracy: 0.8608
Epoch 5/100
69/69 - 3s - loss: 0.1382 - accuracy: 0.9534 - val_loss: 0.4378 - val_accuracy: 0.8576
Epoch 6/100
69/69 - 3s - loss: 0.1211 - accuracy: 0.9572 - val_loss: 0.3997 - val_accuracy: 0.8539
Epoch 7/100
69/69 - 3s - loss: 0.1049 - accuracy: 0.9648 - val_loss: 0.4514 - val_accuracy: 0.8511
Epoch 8/100
69/69 - 3s - loss: 0.0847 - accuracy: 0.9722 - val_loss: 0.5127 - val_accuracy: 0.8519
Epoch 9/100
69/69 - 3s - loss: 0.0673 - accuracy: 0.9774 - val_loss: 0.5792 - val_accuracy: 0.8449
Epoch 10/100
69/69 - 3s - loss: 0.0665 - accuracy: 0.9782 - val_loss: 0.5331 - val_accuracy: 0.8460
Epoch 11/

In [None]:
# Sequential.predict_classes is no longer available in recent TensorFlow
# releases; threshold the sigmoid probabilities at 0.5 instead. The result
# keeps the (n_samples, 1) int32 shape shown in the recorded output.
y_pred_dropout = (dropout_model.predict(X_test) > 0.5).astype("int32")
y_pred_dropout



array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

In [None]:
# Classification report and confusion matrix for the dropout model.
print(metrics.classification_report(y_test, y_pred_dropout))
print(metrics.confusion_matrix(y_test, y_pred_dropout))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      3738
           1       0.86      0.87      0.87      3762

    accuracy                           0.87      7500
   macro avg       0.87      0.87      0.87      7500
weighted avg       0.87      0.87      0.87      7500

[[3224  514]
 [ 473 3289]]



## Using Word2Vec model trained on unlabeled data

### Vector Averaging

In [None]:
from keras.preprocessing.text import Tokenizer
import gensim

In [None]:
# Load a previously saved gensim Word2Vec model from Drive.
w2v = gensim.models.Word2Vec.load("/content/drive/MyDrive/Project data/Movie review sentiment analysis/Word2Vec_model")

In [None]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Parameters
    ----------
    words : str or iterable of str
        The review's tokens. A plain string (which is what ``preprocessor``
        returns) is split on whitespace first; iterating over it directly
        would walk through single characters instead of words.
    model : object
        Either a model exposing its vectors as ``model.wv`` or the vector
        store itself. It must support ``word in store`` and ``store[word]``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of shape ``(num_features,)``. It is all zeros when no
        word is in the vocabulary; previously that case divided by zero and
        produced NaNs.
    """
    if isinstance(words, str):
        words = words.split()

    # Look words up through the vector store rather than indexing the model
    # object, and test membership on it directly instead of rebuilding a
    # set of the entire vocabulary on every call.
    vectors = getattr(model, "wv", model)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in vectors:
            nwords += 1
            featureVec = np.add(featureVec, vectors[word])

    # Nothing matched: return the zero vector instead of dividing by zero.
    if nwords == 0:
        return featureVec
    return np.divide(featureVec, nwords)

In [None]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Every review is passed to makeFeatureVec and the result is stored in the
    row matching the review's position. A progress line is printed for every
    1000th review, starting with review 0.

    Returns a float32 array of shape (len(reviews), num_features).
    """
    total = len(reviews)
    # Allocate the whole result up front and fill it row by row.
    feature_matrix = np.zeros((total, num_features), dtype="float32")
    for position, tokens in enumerate(reviews):
        if position % 1000 == 0:
            print("Review %d of %d" % (position, total))
        feature_matrix[position] = makeFeatureVec(tokens, model, num_features)
    return feature_matrix

In [None]:
from sklearn.model_selection import train_test_split
# Split the raw review text (not the encoded sequences) 70/30 with the same seed.
# Note that this rebinds X_train, X_test, y_train and y_test from the Keras section.
X_train, X_test, y_train, y_test = train_test_split(train['review'], train['sentiment'], test_size=0.30, random_state=42)

In [None]:
# Clean every review of the training split.
train_reviews = [preprocessor(raw_text) for raw_text in X_train]

In [None]:
# Clean every review of the held-out split.
test_reviews = [preprocessor(raw_text) for raw_text in X_test]

In [None]:
# Replace the training texts with their averaged word vectors.
# NOTE(review): 300 is presumably the vector size of the loaded w2v model -- confirm.
X_train = getAvgFeatureVecs( train_reviews, w2v, 300)

Review 0 of 17500




Review 1000 of 17500
Review 2000 of 17500
Review 3000 of 17500
Review 4000 of 17500
Review 5000 of 17500
Review 6000 of 17500
Review 7000 of 17500
Review 8000 of 17500
Review 9000 of 17500
Review 10000 of 17500
Review 11000 of 17500
Review 12000 of 17500
Review 13000 of 17500
Review 14000 of 17500
Review 15000 of 17500
Review 16000 of 17500
Review 17000 of 17500


In [None]:
# Replace the held-out texts with their averaged word vectors.
# NOTE(review): 300 is presumably the vector size of the loaded w2v model -- confirm.
X_test = getAvgFeatureVecs( test_reviews, w2v, 300)

Review 0 of 7500




Review 1000 of 7500
Review 2000 of 7500
Review 3000 of 7500
Review 4000 of 7500
Review 5000 of 7500
Review 6000 of 7500
Review 7000 of 7500


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the metrics for the averaged-vector features can be
# reproduced on a re-run; 42 matches the seed used for the train/test split.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit the forest on the averaged word-vector features of the training split.
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Predict the held-out split and report per-class metrics plus the confusion matrix.
y_pred = forest.predict(X_test)
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.59      0.59      3738
           1       0.59      0.59      0.59      3762

    accuracy                           0.59      7500
   macro avg       0.59      0.59      0.59      7500
weighted avg       0.59      0.59      0.59      7500

[[2195 1543]
 [1535 2227]]
