# Case Study - NLP - Finding review is a positive or negative

In [3]:
# Importing Important pakages

import numpy as np  
import re  
import nltk  
from sklearn.datasets import load_files  
nltk.download('stopwords')  
import pickle  
from nltk.corpus import stopwords  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achintj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\achintj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Importing Machine Learning and preprocessing packages

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import pandas as pd

In [6]:
# Importing movie data.
# Data was extracted and was put into simple txt format.
# the folder txt_sentoken contains two seperate folders 'neg' for negative review and 'pos' for positive review.

movie_data = load_files(r"D:\MyWork\DSWork\Project - Resume Ranker\txt_sentoken")

In [61]:
# Looking at first review
movie_data.data[0:1]

[b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is

In [8]:
# Seperating  feature data from target data
X,Y = movie_data.data,movie_data.target

In [9]:
# Text preprocessing on feature data 
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document) 

In [62]:
# Checking if text is cleaned or not
documents[0][:500]

'arnold schwarzenegger ha been an icon for action enthusiast since the late 80 but lately his film have been very sloppy and the one liner are getting worse nit hard seeing arnold a mr freeze in batman and robin especially when he say ton of ice joke but hey he got 15 million what it matter to him nonce again arnold ha signed to do another expensive blockbuster that can compare with the like of the terminator series true lie and even eraser nin this so called dark thriller the devil gabriel byrne'

In [11]:
# Count Vectorization for feature data, this step will remove all stop words like 'is', 'the', 'a'.

from sklearn.feature_extraction.text import CountVectorizer  
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(documents).toarray()  

In [12]:
print(vectorizer.vocabulary_)

{'action': 18, 'since': 1194, 'late': 703, '80': 7, 'liner': 735, 'getting': 539, 'worse': 1483, 'nit': 888, 'hard': 578, 'seeing': 1152, 'mr': 830, 'batman': 113, 'robin': 1110, 'especially': 411, 'say': 1130, 'ice': 618, 'joke': 671, 'got': 555, 'million': 807, 'matter': 781, 'nonce': 902, 'another': 62, 'series': 1163, 'true': 1377, 'lie': 728, 'even': 413, 'nin': 886, 'called': 180, 'dark': 300, 'thriller': 1341, 'devil': 331, 'come': 242, 'upon': 1398, 'earth': 376, 'woman': 1471, 'happens': 576, 'every': 417, 'year': 1494, 'basically': 112, 'world': 1482, 'apparently': 72, 'god': 549, 'man': 765, 'nwith': 935, 'help': 588, 'kevin': 681, 'stop': 1259, 'nothing': 906, 'let': 726, 'take': 1301, 'actually': 22, 'would': 1487, 'fit': 485, 'right': 1105, 'nyes': 936, 'weak': 1439, 'better': 130, 'make': 762, 'enough': 401, 'look': 744, 'star': 1245, 'definitely': 317, 'seem': 1153, 'type': 1390, 'see': 1151, 'nsure': 916, 'gave': 529, 'well': 1445, 'known': 693, 'seemed': 1154, 'confus

In [13]:
# Tfidf Transformer will give more weights to less occuring words

from sklearn.feature_extraction.text import TfidfTransformer  
tfidfconverter = TfidfTransformer()  
X = tfidfconverter.fit_transform(X).toarray() 

In [16]:
print(tfidfconverter.idf_[:10])

[4.06880284 3.37565566 4.2193757  3.99623215 3.94744198 4.10159266
 4.11276596 4.09054283 3.71105321 3.48941455]


In [17]:
# Spliting feature data and target data into training data and testing data

from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0) 

In [18]:
# Importing grid search for best params 
from sklearn.model_selection import GridSearchCV

In [21]:
params = {
    'n_estimators': [10,100,500,1000,5000],
    'random_state' : [1,2,3,4,5]
}

classifier = RandomForestClassifier()  

In [23]:
gs = GridSearchCV(classifier,param_grid=params,cv = 5,n_jobs= -1)

In [25]:
# Training Data (this takes time)
gs.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [10, 100, 500, 1000, 5000], 'random_state': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
# Finding best params
gs.best_params_

{'n_estimators': 5000, 'random_state': 3}

In [27]:
# Best cv score
gs.best_score_

0.82125

In [28]:
# Predicting results for test data
y_pred = gs.predict(X_test)  

In [29]:
# Confusion matrix and score for Random Forest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred)) 

[[176  32]
 [ 32 160]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       208
           1       0.83      0.83      0.83       192

   micro avg       0.84      0.84      0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

0.84


In [30]:
# Checking and comparing with MultinomialNB
classifier = MultinomialNB()  
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test)  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred)) 

[[160  48]
 [ 34 158]]
              precision    recall  f1-score   support

           0       0.82      0.77      0.80       208
           1       0.77      0.82      0.79       192

   micro avg       0.80      0.80      0.80       400
   macro avg       0.80      0.80      0.79       400
weighted avg       0.80      0.80      0.80       400

0.795


In [31]:
# Checking and comparing with SVC

classifier = SVC(kernel='rbf',C=1000, gamma='auto')  
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test)  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred)) 

[[168  40]
 [ 28 164]]
              precision    recall  f1-score   support

           0       0.86      0.81      0.83       208
           1       0.80      0.85      0.83       192

   micro avg       0.83      0.83      0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

0.83


In [32]:
# Checking and comparing with LogisticRegression
classifier = LogisticRegression(solver = 'lbfgs')  
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test)  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred)) 

[[168  40]
 [ 28 164]]
              precision    recall  f1-score   support

           0       0.86      0.81      0.83       208
           1       0.80      0.85      0.83       192

   micro avg       0.83      0.83      0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

0.83


# Finding similar reviews with cosine similarity, this can be used to find profiles with similar details

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
res = cosine_similarity(X)

In [36]:
res.shape

(2000, 2000)

In [50]:
d = pd.Series(data =res[99:100][0])

In [51]:
d.sort_values(ascending=False)[:10]

99      1.000000
1869    0.443045
1355    0.411150
173     0.382429
1635    0.287857
1044    0.265687
1961    0.260260
1750    0.256967
242     0.250926
481     0.240499
dtype: float64

In [None]:
# Most simolar review to review at 99 is review at 1869 with a 44% match

In [54]:
print(movie_data.data[99])

b'if there were a subject just screaming to be made into a film , it was studio 54 , the late steve rubell\'s infamous new york discotheque , which came to symbolize the taboo-smashing excesses of the disco era\'s heyday : easy drugs and even easier sex between everyone , all set to a thumping dance beat . \nso when miramax started production on _54_ last year , the buzz of hype and resulting anticipation began : an edgy film as down-and-dirty as the club itself , written and directed by a promising first-timer ( mark christopher ) and starring some hot young talent ( salma hayek , neve campbell , newcomer ryan phillippe , and mike myers in his dramatic debut ) . \nmy , how easily does the worm turn . \nhype turned into damage control when word got out about 11th-hour reshoots ( wrapped only a month ago ) , a rash of studio-imposed edits that left virtually the entire cast and crew ( especially christopher ) unhappy , and the very likely possibility that miramax would not screen the fi

In [55]:
movie_data.data[1869]

b'for a movie about disco-era excess , " 54 " comes up surprisingly short on the sleazy happenings at the titular late 1970s and early 1980s manhattan dance club . \nthink of it as a sort of " boogie nights " -lite - where that similarly-structured and -set portrayal of the porn industry was loaded with salacious goings-on and skidded on a destructive midpoint tone shift , " 54 " leaves the kinkier details to your imagination . \nit never needlessly revels in its seedy subject matter , but it thankfully never resorts to preaching , either . \nin fact , were it not for the rampant drug use and the intermittent dark moment , " 54 " would be about as feathery as the time period\'s popular female hairstyle . \nstudio 54 was a haven for the rich , famous and generally good-looking before it closed its doors forever in 1986 . \ninside , anything went and , we are told , everything did . \nthe film\'s fictional main character is shane o\'shea ( ryan phillippe , the jerky jock from " i know wh