# Felipe Castillo
# Bag of Words Meets Bags of Popcorn Assignment
# Data Mining
# 03/29/2022

In [191]:
import pandas as pd 
from textblob import TextBlob,classifiers
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import os 
import sys
import nltk
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
  

#Change path to week one folder for DSC-550
#Current directory in documents
os.chdir('C:\DataScience_DSC_550\Week3\labeledTrainData.tsv')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cast6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cast6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
#!{sys.executable} -m pip install vaderSentiment

# 1. Import the movie review data 

In [180]:
# Import data
#Load tsv file
BWMBP_df = pd.read_csv("labeledTrainData.tsv", sep='\t')

#copying an instance of BWMBP_df to be used for vader data set.
BWMBP_vader_df = BWMBP_df.copy()

In [78]:
#Ensure that the data is loaded properly.
#Loading firt 3 rows.
BWMBP_df.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


# 2. How many of each positive and negative reviews are there?

In [115]:
#Postive and negative reviews based off sentiment score 
#There are 12500 negative and 12500 postive reviews
print(BWMBP_df['sentiment'].value_counts())

0    12500
1    12500
Name: sentiment, dtype: int64


# 3.TextBlob to classify each review postive or negative

Sections points:
1. If polartiy score is greater then or eqaul to 0 its postive.
2. If polartiy score is lower then 0 its negative.

In [4]:
#Function to return polarity
def getPolarity(text):
    return TextBlob(text).polarity

In [108]:
#Function returns the classification score 
def getAnalysis(sentimentScore):
    #Initializing score to nothing, and returning it after score is evaluated.
    classification_score = ''
    #If score is greater then or equalto zero then 1
    if float(sentimentScore) >= 0:
        #postive
        classification_score = 1
    #Score has to be lower then 0 at this point
    else:
        #negative
        classification_score = 0
    #Return 0 or 1
    return classification_score
        
        
        
        

In [81]:
#Making a new column with polarity score 
BWMBP_df['score_polarity'] = BWMBP_df['review'].apply(getPolarity)

In [83]:
#polarity score 
BWMBP_df.head(2)

Unnamed: 0,id,sentiment,review,score_polarity
0,5814_8,1,With all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349


In [84]:
#Getting the analysis which returns the classification sentiment score.
#Returns a 0 or 1
BWMBP_df['sentiment_score'] = BWMBP_df['score_polarity'].apply(getAnalysis)

In [85]:
#sentiment score output
BWMBP_df.head(3)

Unnamed: 0,id,sentiment,review,score_polarity,sentiment_score
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0


In [86]:
#Overall there 19017 postive and 5983 negative values.
print(BWMBP_df['sentiment_score'].value_counts())

1    19017
0     5983
Name: sentiment_score, dtype: int64


# 4.Check the accuracy of this model. 

In [87]:
#Accuracy of textblob is 68.
accuracy_score(BWMBP_df['sentiment'],BWMBP_df['sentiment_score'])

0.68524

#Is this model better than random guessing?

This model is better than random guessing. In random guessing the assumption is that half of the data is negative, and half is positive. That would be equivalent to guessing someone gender. There is 50 / 50 chance that its either a female/male or positive/negative. Rather the model that is specifically looking for descriptions and classifying based off those descriptions.  For example, if one of the descriptions was long hair the classification model could be set to output female. There are men with long hair so it wouldn’t be 100 percent accurate.  The point is the model is not randomly guessing. Overall, the model should be better. 

# Vader Extra Credit

# TextBlob to classify each review postive or negative


In [99]:
#BWMBP_vader_df
#Instantiating SentimentIntensityAnalyzer class to be used to find polarity.
sid_obj = SentimentIntensityAnalyzer()
def getVaderPolarity(text):
    #returns a polarity object
    return sid_obj.polarity_scores(text)
    

In [100]:
#Making a new column with Varder polarity.
#Applying function getVaderPolarity
BWMBP_vader_df['varder_polarity_dic'] = BWMBP_vader_df['review'].apply(getVaderPolarity)

In [101]:
#The object contains neg,neu, ps, compound
BWMBP_vader_df.head(3)

Unnamed: 0,id,sentiment,review,varder_polarity_dic
0,5814_8,1,With all this stuff going down at the moment w...,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co..."


In [102]:
#The object on its on is not useful, taking the compound score.
#Compound scores give you whether its leaning negative or positive. Uses ratio from the pos,neg, neu to determine that.
#Making a new column compound_score 
#For each review sentence, apply a function that returns the sid_obj compound score
BWMBP_vader_df['compound_score'] = BWMBP_vader_df['review'].apply(lambda x:sid_obj.polarity_scores(x)['compound'])

In [103]:
#Gives compound score
BWMBP_vader_df.head(2)

Unnamed: 0,id,sentiment,review,varder_polarity_dic,compound_score
0,5814_8,1,With all this stuff going down at the moment w...,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",-0.8879
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.9736


In [109]:
#Making a new column with vader sentiment score based of compound score
#Already, have a function that classifies each value. getAnalysis returns a 0 or 1 based off the value.

BWMBP_vader_df['vr_sentiment_score'] = BWMBP_vader_df['compound_score'].apply(getAnalysis)

In [113]:
#Shows vr(Vader) sentiment score
BWMBP_vader_df.head(2)

Unnamed: 0,id,sentiment,review,varder_polarity_dic,compound_score,vr_sentiment_score
0,5814_8,1,With all this stuff going down at the moment w...,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",-0.8879,0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.9736,1


In [114]:
#Overall there are 16611 postive and 8389 negative.

print(BWMBP_vader_df['vr_sentiment_score'].value_counts())

1    16611
0     8389
Name: vr_sentiment_score, dtype: int64


# Check the accuracy of this model. 

In [112]:
#The overall accuracy is 69
accuracy_score(BWMBP_vader_df['sentiment'],BWMBP_vader_df['vr_sentiment_score'])

0.69404

#Is this model better than random guessing?
The Vader model is better than random guessing. In random guessing the assumption is that half of the data is negative, and half is positive. The Vader model works like text blob. And uses classification to outline a score. The score between both is similar to one another. Overall this model is also better then random guessing.

# Part 2: Prepping Text For a Custom Model 

In [140]:
BWMBP_MOD_DF = BWMBP_df.copy()
#1.Convert all text to lowercase letters.
BWMBP_MOD_DF['review'] = BWMBP_MOD_DF['review'].str.lower()
#2.Remove punctuation and special characters from the text.
BWMBP_MOD_DF['review'] = BWMBP_MOD_DF['review'].str.replace(r'[^\w\s]+', '')

  BWMBP_MOD_DF['review'] = BWMBP_MOD_DF['review'].str.replace(r'[^\w\s]+', '')


In [141]:
#3. Removing stop words
#Seting stop_words to instance of stopwords class.
#words set to english
stop_words = stopwords.words('english')
#Updateing current review column
#For each review apply function if word is not in stop words then we keep the word
BWMBP_MOD_DF['review'] = BWMBP_MOD_DF['review'].apply(lambda x:[word for word in x.split() if word not in (stop_words)])


In [142]:
#Words instead of sentences
BWMBP_MOD_DF.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,"[stuff, going, moment, mj, ive, started, liste..."
1,2381_9,1,"[classic, war, worlds, timothy, hines, enterta..."
2,7759_3,0,"[film, starts, manager, nicholas, bell, giving..."


In [143]:
#Setter stemmer object.
stemmer = PorterStemmer()

In [144]:
#Wanting the whole sentence and not just the individual word
#Wanting the porter stemmer and words to be returned stemmed
def PorterStemmerSentenceConversion(sentence):
    #takes in sentence 
    updatedSentence = ''
    #Each word will be stem individually
    for word in sentence:
        #Setting a temp value to store stem word
        temp_sentence = (stemmer.stem(word))
        #adding it to one sentence instead of keeping them seperate
        updatedSentence = updatedSentence + ' '+ temp_sentence
    #returning the combind words
    return updatedSentence

In [145]:
#Taking the index count of each sentence in review
for i in range(len(BWMBP_MOD_DF['review'])):
    #Applying function to retrieve updated sentence
    #Gets the sentence at [i] index
    temp_stemmer_converted = PorterStemmerSentenceConversion(BWMBP_MOD_DF['review'][i])
    #Updates the review column at index i with updated stemmed sentence
    BWMBP_MOD_DF['review'][i] = temp_stemmer_converted
    
        
    
                                 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BWMBP_MOD_DF['review'][i] = temp_stemmer_converted


In [146]:
#Shows updated sentences
BWMBP_MOD_DF.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff go moment mj ive start listen music wat...
1,2381_9,1,classic war world timothi hine entertain film...
2,7759_3,0,film start manag nichola bell give welcom inv...
3,3630_4,0,must assum prais film greatest film opera eve...
4,9495_8,1,superbl trashi wondrous unpretenti 80 exploit...
5,8196_8,1,dont know peopl think bad movi got pretti goo...
6,7166_2,0,movi could good come way short cheesi special...
7,10633_1,0,watch video friend hous im glad wast money bu...
8,319_1,0,friend mine bought film 1 even grossli overpr...
9,8713_10,1,br br movi full refer like mad max ii wild on...


In [157]:
#Will hold all review sentences
#Easier to work with at this state
sentence_list = []

In [158]:
#taking sentence and inserting in sentence list
for sentence in BWMBP_MOD_DF['review']:
    sentence_list.append(sentence)
    
  
    

In [192]:
#sentence list holds all sentences 
print(sentence_list[:2])

[' stuff go moment mj ive start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad mkaybr br visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice himbr br actual featur film bit final start 20 minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj musicbr br lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scenebr br bottom line movi peopl like mj

In [159]:
#Setting vectorizer to instance of CountVectorizer class
vectorizer = CountVectorizer()


In [166]:
#fitting 
bag_of_words = vectorizer.fit(sentence_list)
#transforming 
bag_of_words = vectorizer.transform(sentence_list)

In [179]:
# rows remain the the same from both data sets
#making sure all columns are the same
print("Original Data frame {}".format(BWMBP_df.shape))
print("Bag of word dimension {}".format(bag_of_words.shape))


Original Data frame (25000, 3)
Bag of word dimension (25000, 92532)


# 6. Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text.

In [194]:
#Setting tfidf to instance of TfidfVectorizer class
tfidf = TfidfVectorizer()
#Inputting the argument sentence list 
featured_matrix = tfidf.fit_transform(sentence_list)

In [195]:
featured_matrix

<25000x92532 sparse matrix of type '<class 'numpy.float64'>'
	with 2439335 stored elements in Compressed Sparse Row format>

In [205]:
#The shape is the same as before
featured_matrix.shape

(25000, 92532)

In [197]:
#Outputing words
tfidf.vocabulary_

{'stuff': 78431,
 'go': 34036,
 'moment': 53274,
 'mj': 53050,
 'ive': 42552,
 'start': 77281,
 'listen': 47712,
 'music': 54873,
 'watch': 88643,
 'odd': 57954,
 'documentari': 23638,
 'wiz': 90414,
 'moonwalk': 53593,
 'mayb': 50836,
 'want': 88391,
 'get': 33406,
 'certain': 15018,
 'insight': 41469,
 'guy': 35732,
 'thought': 81954,
 'realli': 66772,
 'cool': 18641,
 'eighti': 25912,
 'make': 49560,
 'mind': 52491,
 'whether': 89628,
 'guilti': 35533,
 'innoc': 41392,
 'part': 60697,
 'biographi': 10300,
 'featur': 29368,
 'film': 29887,
 'rememb': 67579,
 'see': 71778,
 'cinema': 16357,
 'origin': 59156,
 'releas': 67461,
 'subtl': 78716,
 'messag': 51856,
 'feel': 29421,
 'toward': 83357,
 'press': 64213,
 'also': 4693,
 'obviou': 57850,
 'drug': 24753,
 'bad': 7981,
 'mkaybr': 53055,
 'br': 11884,
 'visual': 87820,
 'impress': 40707,
 'cours': 19169,
 'michael': 52070,
 'jackson': 42639,
 'unless': 85930,
 'remot': 67617,
 'like': 47408,
 'anyway': 5947,
 'hate': 36899,
 'find':