In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import en_core_web_sm
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
from spacy.lang.en import English
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
#np.random.seed(42)

In [2]:
movie_reviews = pd.read_csv("C:/Users/Ankur/Desktop/Data_Science_Statistics/Mani/Projects/movie_reviews.csv")

In [3]:
movie_reviews

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
movie_reviews.shape

(50000, 2)

In [5]:
#Describing the dataset
movie_reviews.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [6]:
movie_reviews.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [7]:
movie_reviews['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [8]:
movie_reviews.isna().sum()

review       0
sentiment    0
dtype: int64

In [9]:
import re

In [10]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the html strips and square brackets
def rem_strips_sq_brackets(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

movie_reviews['review']=movie_reviews['review'].apply(rem_strips_sq_brackets)

In [11]:
movie_reviews['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [12]:
# Text tokenization
from spacy.lang.en import English
nlp =English()
moview_reviews_string = str(movie_reviews['review'])
movie_reviews.review.head().tolist
   
doc = nlp(moview_reviews_string)

token_list=[]
for token in doc:
    token_list.append(token.text)
print(token_list)

['0', '       ', 'One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', '...', '\n', '1', '       ', 'A', 'wonderful', 'little', 'production', '.', 'The', 'filming', 'tec', '...', '\n', '2', '       ', 'I', 'thought', 'this', 'was', 'a', 'wonderful', 'way', 'to', 'spend', 'ti', '...', '\n', '3', '       ', 'Basically', 'there', "'s", 'a', 'family', 'where', 'a', 'little', 'boy', '...', '\n', '4', '       ', 'Petter', 'Mattei', "'s", '"', 'Love', 'in', 'the', 'Time', 'of', 'Money', '"', 'is', '...', '\n                               ', '...', '                       \n', '49995', '   ', 'I', 'thought', 'this', 'movie', 'did', 'a', 'down', 'right', 'good', 'job', '...', '\n', '49996', '   ', 'Bad', 'plot', ',', 'bad', 'dialogue', ',', 'bad', 'acting', ',', 'idiotic', 'di', '...', '\n', '49997', '   ', 'I', 'am', 'a', 'Catholic', 'taught', 'in', 'parochial', 'elementary', '...', '\n', '49998', '   ', 'I', "'m", 'going', 'to', 'have', 'to', 'disagree', 'with', 'the', 'previo

In [13]:
# removing special characters and numbers
movie_reviews['review'].str.lower()
movie_reviews['review'] = movie_reviews['review'].apply(lambda x: re.sub(r'[^a-zA-z0-9\s]',"",x))
movie_reviews['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production The filming tech...
2        I thought this was a wonderful way to spend ti...
3        Basically theres a family where a little boy J...
4        Petter Matteis Love in the Time of Money is a ...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I am a Catholic taught in parochial elementary...
49998    Im going to have to disagree with the previous...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [14]:
# removing stopwords
#nltk.download('stopwords')
from nltk.corpus import stopwords
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [15]:
movie_reviews['review'] = movie_reviews['review'].apply( lambda x : " ".join(word for word in x.split() if word not in stopword_list))

In [16]:
movie_reviews['review']

0        One reviewers mentioned watching 1 Oz episode ...
1        A wonderful little production The filming tech...
2        I thought wonderful way spend time hot summer ...
3        Basically theres family little boy Jake thinks...
4        Petter Matteis Love Time Money visually stunni...
                               ...                        
49995    I thought movie right good job It wasnt creati...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I Catholic taught parochial elementary schools...
49998    Im going disagree previous comment side Maltin...
49999    No one expects Star Trek movies high art fans ...
Name: review, Length: 50000, dtype: object

In [17]:
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(movie_reviews['sentiment'])
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [18]:
#split the train and test dataset
from sklearn.model_selection import train_test_split
review =movie_reviews["review"]
#sentiment = movie_reviews["sentiment"]
train_review, test_review, train_sentiment, test_sentiment = train_test_split(review, sentiment_data,stratify=sentiment_data, test_size=0.2, random_state=0)

In [19]:
cv = CountVectorizer(binary=False,min_df=0,max_df=1,ngram_range=(1,3))

# transformed train reviews
cv_train_reviews = cv.fit_transform(train_review)

#print(cv.get_feature_names)

# transformed test reviews
cv_test_reviews = cv.transform(test_review)

In [20]:
print('Shape of transformed train reviews ',cv_train_reviews.shape)
print('Shape of transformed test reviews ',cv_test_reviews.shape)

Shape of transformed train reviews  (40000, 6857860)
Shape of transformed test reviews  (10000, 6857860)


In [21]:
train_review

38414    The notion marital fidelity portrayed film see...
24010    What good film Made Men great action movie lot...
29873    Joe Don Baker He great Walking Tall good bitpa...
2868     Monarch Cove one best Friday nights drama show...
15107    This film unbelievable whole premise bunkum fa...
                               ...                        
26219    In Luchino Viscontis film Death Venice beauty ...
46599    Total disgrace Truly awful The screenplay dial...
36788    This yet another gem pen Daniele Thompson fact...
5038     I saw movie first came It official selection T...
6732     This show full action everything needed make a...
Name: review, Length: 40000, dtype: object

In [22]:
# Modelling the dataset using logistic regression

model = LogisticRegression(max_iter=1000,C=0.1)

In [23]:
model.fit(cv_train_reviews,train_sentiment)

  return f(**kwargs)


LogisticRegression(C=0.1, max_iter=1000)

model.fit(cv_train_reviews,train_sentiment)

In [24]:
predictions = model.predict(cv_test_reviews)
print(predictions)

[0 1 1 ... 1 0 0]


In [25]:
print(classification_report(test_sentiment, predictions))

              precision    recall  f1-score   support

           0       0.64      0.91      0.75      5000
           1       0.85      0.49      0.62      5000

    accuracy                           0.70     10000
   macro avg       0.74      0.70      0.69     10000
weighted avg       0.74      0.70      0.69     10000



In [None]:
# Modelling the dataset using Naive Bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
gnb = MultinomialNB()

In [28]:
gnb.fit(cv_train_reviews,train_sentiment)

  return f(**kwargs)


MultinomialNB()

In [29]:
gnb_predictions = gnb.predict(cv_test_reviews)
print(gnb_predictions)

[0 1 1 ... 1 0 1]


In [30]:
print(classification_report(test_sentiment, gnb_predictions))

              precision    recall  f1-score   support

           0       0.73      0.78      0.75      5000
           1       0.76      0.72      0.74      5000

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



In [31]:
# Modelling the dataset using SVM
#from sklearn.svm import SVC
#svc_model = SVC(C=0.1)


In [None]:
#svc_model.fit(cv_train_reviews,train_sentiment)

  return f(**kwargs)


In [None]:
#svc_predictions = svc_model.predict(cv_test_reviews)
#print(svc_predictions)

In [None]:
#print(classification_report(test_sentiment, svc_predictions))

In [31]:
# Modelling the dataset using Random Forest
#from sklearn.ensemble import RandomForestClassifier
#clf_random_forest = RandomForestClassifier(n_estimators=150)


In [None]:
#clf_random_forest.fit(cv_train_reviews,train_sentiment)


  clf_random_forest.fit(cv_train_reviews,train_sentiment)


In [None]:
#rf_pred = clf_random_forest.predict(cv_test_reviews)
#print(rf_pred)

In [None]:
#print(classification_report(test_sentiment, rf_pred))