In [1]:
# Setting Path

import os
path = os.getcwd()

# Import Packages

import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

import spacy
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('punkt')

nlp = spacy.load("en_core_web_sm")
punct = string.punctuation
stem = PorterStemmer()
lemma = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import pickle

import warnings
warnings.filterwarnings('ignore')


In [2]:
root = pd.read_csv('data/reviews.csv')

In [3]:
root.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Pre Processing

In [4]:
root.shape

(50000, 2)

In [5]:
pos_data = root[root['sentiment'] == 'positive']
neg_data = root[root['sentiment'] == 'negative']

In [6]:
pos_data.shape

(25000, 2)

In [7]:
neg_data.shape

(25000, 2)

In [8]:
pos_sam = pos_data.sample(5000)
neg_sam = neg_data.sample(5000)

In [9]:
pos_sam.shape

(5000, 2)

In [10]:
neg_sam.shape

(5000, 2)

In [11]:
data = pd.concat([pos_sam,neg_sam]).reset_index(drop=True)

In [12]:
data.shape

(10000, 2)

In [13]:
data.head()

Unnamed: 0,review,sentiment
0,Peaches is truly a marvelous film. I write thi...,positive
1,I liked this movie. I saw it to a packed house...,positive
2,I tell you although it is funny how how this m...,positive
3,Mina Kumari exhibits more style and grace just...,positive
4,Alain Chabat claims this movie as his original...,positive


In [14]:
data.isnull().sum()


review       0
sentiment    0
dtype: int64

In [15]:
x = data['review']
y = data['sentiment']

In [16]:
y.value_counts()

sentiment
positive    5000
negative    5000
Name: count, dtype: int64

In [17]:
def convert(text):
    if text == 'positive':
        return 1
    else:
        return 0

In [18]:
y = y.apply(convert)

In [19]:
x.head()

0    Peaches is truly a marvelous film. I write thi...
1    I liked this movie. I saw it to a packed house...
2    I tell you although it is funny how how this m...
3    Mina Kumari exhibits more style and grace just...
4    Alain Chabat claims this movie as his original...
Name: review, dtype: object

In [20]:
x[0]

"Peaches is truly a marvelous film. I write this to refute a review from someone called 'Auscrit', that has appeared on this site. First of all the idea that either Monahans first film 'The Interview' is somehow TV is an extraordinary statement. Here is a film that has been significantly praised around the world as is simply one of the best Australian Films ever made. It fully deserved to win best picture. Peaches is a brave, bold and courageous departure. For me it works on every level and I have now seen it twice. Monahan is a filmmaker who is demonstrating great skill and incredible sensitivity. For 'Auscrit' to make the comment that it is another TV movie etc and that Hugo Weaving is no good simply does not 'get' the film. Or more particularly does not want to get it. Frankly it is the sort of comment that one expects from either another filmmaker who is jealous or bitter or both. Or someone from inside the industry either distribution, exhibition or bureaucracy. Your average punte

In [21]:
def replace_text(rev):
    
    reviews = re.sub(r"what's", "what is ", rev)
    reviews = re.sub(r"\'s", " is", reviews)
    reviews = re.sub(r"\'ve", " have ", reviews)
    reviews = re.sub(r"can't", "cannot ", reviews)
    reviews = re.sub(r"n't", " not ", reviews)
    reviews = re.sub(r"i'm", "i am ", reviews)
    reviews = re.sub(r"\'re", " are ", reviews)
    reviews = re.sub(r"\'d", " would ", reviews)
    reviews = re.sub(r"\'ll", " will ", reviews)
    reviews = re.sub(r"\'scuse", " excuse ", reviews)
    reviews = re.sub('\W', ' ', reviews)
    reviews = re.sub('\s+', ' ', reviews)
    reviews = reviews.strip(' ')
    
    return reviews

In [22]:
def cleaned_text(rev):
      
    reviews = re.sub(r'\[[0-9]*\]', ' ',rev)
    reviews = re.sub(r'\s+', ' ', reviews)
    reviews = re.sub('[^a-zA-Z]', ' ', reviews )
    reviews = re.sub(r'\s+', ' ', reviews)
    reviews = re.sub(r'\W*\b\w{1,3}\b', "",reviews)
    reviews = reviews.strip()
    
  
    return reviews

In [23]:
def remove_stopwords(rev):
    
    stop_words = set(stopwords.words('english'))
    
    tokens = word_tokenize(rev)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in stop_words]
    reviews = ' '.join(tokens)
    
    return reviews

In [24]:
def lemmatize(rev):
    
    doc = nlp(rev)
    reviews = [words.lemma_ for words in doc]
    reviews = ' '.join(reviews)
    
    return reviews

In [25]:
x = x.apply(replace_text)

In [26]:
x = x.apply(cleaned_text)

In [27]:
x = x.apply(remove_stopwords)

In [28]:
x = x.apply(lemmatize)

In [29]:
x = x.apply(cleaned_text)

In [30]:
x[0]

'peach truly marvelous film write refute review someone call auscrit appear site first idea either monahan first film interview somehow extraordinary statement film significantly praise around world simply good australian film ever make fully deserve good picture peach brave bold courageous departure work every level twice monahan filmmaker demonstrate great skill incredible sensitivity auscrit make comment another movie hugo weave good simply film particularly want frankly sort comment expect either another filmmaker jealous bitter someone inside industry either distribution exhibition bureaucracy average punter find write comment like notice comment site reference film sommersault wonder people think look unfortunately australia time release push like something wrong film manipulation medium pretty common reality similarity film right passage film unfortunately film nothing could tell minute dimensional film anxiety peach comparison master piece personally wait monahan next clearly a

In [31]:
x.head()

0    peach truly marvelous film write refute review...
1    like movie packed house toronto international ...
2    tell although funny many swear word sure numbe...
3    mina kumari exhibit style grace move stand flo...
4    alain chabat claim movie original idea theme r...
Name: review, dtype: object

In [32]:
vectorizer = TfidfVectorizer()

In [33]:
X = vectorizer.fit_transform(x).toarray()

In [34]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
X_train.shape
y_test.shape

(2000,)

In [37]:
model = RandomForestClassifier()

In [38]:
model.fit(X_train,y_train)

In [39]:
y_pred = model.predict(X_test)

In [40]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.826


In [41]:
neg_data.iloc[45].review

"Oh God, I must have seen this when I was only 11 or twelve, (don't ask how) I may have been young, but I wasn't stupid. Anyone could see that this is a bad movie, nasty, gross, unscary and very silly. I've seen more impressive effects at Disneyland, I've seen better performances at a school play, And I've seen more convincing crocodiles at the zoo, where they do nothing but sit in the water, ignoring the children tapping on the glass.<br /><br />The story is set in northern Australia. A handful of ambitious young people, are trying out a new water sport, surfing in shark filled waters. It soon becomes evident that something more dangerous is in the water. After they learn what, they get the help of a grizzly middle aged fisherman, who wants to kill the animal to avenge the eating of his family.<br /><br />I think I have seen every crocodile film made in the last fifteen years, the best of which is Lake Placid, and the worse of which is its sequel. Blood Surf would have to be the secon

In [42]:
pos_data.iloc[45].review

'Marion Davies stars in this remarkable comedy "Show People" released by MGM in 1928. Davies plays a hick from Savannah, Georgia, who arrives in Hollywood with her father (Dell Henderson). The jalopy they arrive in is a hoot - as is Davies outrageous southern costume. Davies lands a job in slapstick comedy, not what she wants, but it brings her success. She meets fellow slapstick star William Haines, who is immediately smitten with her. Well, Davies then gets a job at a more prestigious studio ("High Art Studios") and lands a job in stuffy period pieces. A handsome but fake actor (Andre Telefair) shows her the ropes of how to be the typical pretentious Hollywood star. Davies abandons her slapstick friend and father for the good life, but of course learns that is not who she really is. Marion Davies is wonderful throughout, as she - outrageously - runs the gamut of emotions required of a "serious" actress. William Haines is his usual wonderful comedic self, and there are cameos by Charl

In [43]:
review = 'Marion Davies stars in this remarkable comedy "Show People" released by MGM in 1928. Davies plays a hick from Savannah, Georgia, who arrives in Hollywood with her father (Dell Henderson). The jalopy they arrive in is a hoot - as is Davies outrageous southern costume. Davies lands a job in slapstick comedy, not what she wants, but it brings her success. She meets fellow slapstick star William Haines, who is immediately smitten with her. Well, Davies then gets a job at a more prestigious studio ("High Art Studios") and lands a job in stuffy period pieces. A handsome but fake actor (Andre Telefair) shows her the ropes of how to be the typical pretentious Hollywood star.'
review_vec = vectorizer.transform([review]).toarray()

In [44]:
predictions = model.predict(review_vec)

for i in range(len(predictions)):

    sentiment = "positive" if predictions[i] > 0.5 else "negative"
    print(f"Predicted sentiment: {sentiment}")
    print("-----------------------------")


Predicted sentiment: positive
-----------------------------


In [45]:
pickle.dump(vectorizer,open('vectorizer.pkl','wb'))

In [46]:
pickle.dump(model,open('model.pkl','wb'))