# Sentimental Analysis

## Imports

In [55]:
import pandas as pd
import pickle

In [56]:
data_path = "../data/imdb_shuffle.csv"

In [57]:
df=pd.read_csv(data_path)
# df=df[:40000]
df = df[:500]
df.head()

Unnamed: 0,Review,Rating,Sentiment,Label
0,I've long heard that to get their start in 'le...,4,0,1
1,i was having a horrid day but this movie grabb...,8,1,2
2,I was very interested in seeing this movie des...,1,0,0
3,"I think that ""Key West"" might do well as a DVD...",10,1,3
4,I have recently become a huge fan of Patton Os...,1,0,0


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     500 non-null    object
 1   Rating     500 non-null    int64 
 2   Sentiment  500 non-null    int64 
 3   Label      500 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 15.8+ KB


In [59]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import contextualSpellCheck
nlp= spacy.load('en_core_web_sm')
# contextualSpellCheck.add_to_pipe(nlp)

# Build a list of stopwords to use to filter

In [60]:
stopwords = list(STOP_WORDS)

In [61]:
docx = nlp("This is how John Walker was walking. He was also running beside the lawn.")

# Lemmatizing of tokens

In [62]:
for word in docx:
    print(word.text,"Lemma =>",word.lemma_)

This Lemma => this
is Lemma => be
how Lemma => how
John Lemma => John
Walker Lemma => Walker
was Lemma => be
walking Lemma => walk
. Lemma => .
He Lemma => he
was Lemma => be
also Lemma => also
running Lemma => run
beside Lemma => beside
the Lemma => the
lawn Lemma => lawn
. Lemma => .


# Lemma that are not pronouns

In [63]:
for word in docx:
    if word.lemma_ != "-PRON-":
        print(word.lemma_.lower().strip())

this
be
how
john
walker
be
walk
.
he
be
also
run
beside
the
lawn
.


# List Comprehensions of our Lemma

In [64]:
[word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in docx]

['this',
 'be',
 'how',
 'john',
 'walker',
 'be',
 'walk',
 '.',
 'he',
 'be',
 'also',
 'run',
 'beside',
 'the',
 'lawn',
 '.']

# Filtering out Stopwords and Punctuations

In [65]:

for word in docx:
    if word.is_stop == False and not word.is_punct:
#     if word.is_stop != True and not word.is_punct:
        print(word)

John
Walker
walking
running
lawn


# Stop words and Punctuation In List Comprehension

In [66]:
[ word for word in docx if word.is_stop == False and not word.is_punct ]

[John, Walker, walking, running, lawn]

In [67]:
# Use the punctuations of string module
import string
punctuations = string.punctuation

In [68]:
# Creating a Spacy Parser
from spacy.lang.en import English
parser = English()

In [69]:
def spacy_tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations]
    return mytokens

In [70]:
# spacy_tokenizer("This is how John Walker was walking. He was also running beside the lawn.")

# ML Packages

In [71]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [72]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    return text.strip().lower()

In [73]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [74]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [75]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [76]:
# Features and Labels
X = df['Review']
ylabels = df['Sentiment']

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=0)

In [78]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [79]:
# Fit our data
pipe.fit(X_train,y_train)

In [81]:
test_load = pickle.load(open("test.pkl","rb"))

In [82]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

In [83]:
# Prediction Results
# 1 = Positive review
# 0 = Negative review
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)

one of my favorite lines in Shakespeare.<br /><br />i.e. *we're not finished with you by a long shot* so not only does Shylock not get his pound of flesh, or the 3,000 or the 6,000 or the 36,000 (each of the 6 parts were a ducat) ducats, in a matter of minutes he is ruined by having to forfeit all his possessions. and his daughter has long abandoned him already.<br /><br />vengeance is a dish best served cold. but Shylock's attempt at revenge totally backfires. <br /><br />I suspect this play was and is popular because it caters to the wish we have for justice. but the hard reality is the world is engulfed in injustice and most of it stands. a few big names get tossed in jail, sme gang punks lose their turf to the 'good guys' but in reality most of the time it's the other way around.<br /><br />but not in this play. the long howls of racism and antisemitism forgets that it could well have been any other social outcast group that gets the comeuppance, it's just that the money lenders of

In [84]:
# Accuracy
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

Accuracy:  0.75
Accuracy:  1.0


In [85]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  1.0


In [86]:
# Another random review
pipe.predict(["This was a great movie"])

array([1])

In [87]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!"]

In [88]:
pipe.predict(example)

array([1, 1, 1])

In [89]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe_tfid = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [90]:
pipe_tfid.fit(X_train,y_train)

In [91]:
sample_prediction1 = pipe_tfid.predict(X_test)

In [92]:
for (sample,pred) in zip(X_test,sample_prediction1):
    print(sample,"Prediction=>", pred)

one of my favorite lines in Shakespeare.<br /><br />i.e. *we're not finished with you by a long shot* so not only does Shylock not get his pound of flesh, or the 3,000 or the 6,000 or the 36,000 (each of the 6 parts were a ducat) ducats, in a matter of minutes he is ruined by having to forfeit all his possessions. and his daughter has long abandoned him already.<br /><br />vengeance is a dish best served cold. but Shylock's attempt at revenge totally backfires. <br /><br />I suspect this play was and is popular because it caters to the wish we have for justice. but the hard reality is the world is engulfed in injustice and most of it stands. a few big names get tossed in jail, sme gang punks lose their turf to the 'good guys' but in reality most of the time it's the other way around.<br /><br />but not in this play. the long howls of racism and antisemitism forgets that it could well have been any other social outcast group that gets the comeuppance, it's just that the money lenders of

In [93]:
print("Accuracy: ",pipe_tfid.score(X_test,y_test))
print("Accuracy: ",pipe_tfid.score(X_test,sample_prediction1))

Accuracy:  0.78
Accuracy:  1.0


In [95]:
# Another random review
pipe_tfid.predict(["This was a great movie"])

array([1])

In [96]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!"]

In [97]:
pipe_tfid.predict(example)

array([1, 0, 1])

In [98]:
pipe_tfid.predict(["Donald Trump will participate in the LIV Golf Bedminster pro-am alongside Caitlyn Jenner, Charles Barkley, and two NFL icons"])

array([1])