In [1]:
import pandas as pd
from textblob import TextBlob
from sklearn.metrics import accuracy_score

Part 1: Using the TextBlob Sentiment Analyzer

In [2]:
#Import the movie review data as a data frame and ensure that the data is loaded properly.
data = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [3]:
#Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or equal to zero 
#is a positive sentiment and less than 0 is a negative sentiment.
def find_polarity(txt):
    blob = TextBlob(txt)
    return blob.sentences[0].sentiment.polarity

In [4]:
data['polarity'] = data['review'].apply(find_polarity)
data

Unnamed: 0,id,sentiment,review,polarity
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",-0.107407
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",0.356667
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",0.800000
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",1.000000
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",1.000000
...,...,...,...,...
24995,"""3453_3""",0,"""It seems like more consideration has gone int...",0.102083
24996,"""5064_1""",0,"""I don't believe they made this film. Complete...",0.000000
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui...",0.000000
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the...",0.050000


In [5]:
def find_sentiment(txt):
        if find_polarity(txt) >= 0:
            return 1
        else:
            return 0

In [6]:
data['textblobsentiments'] = data['review'].apply(find_sentiment)
data

Unnamed: 0,id,sentiment,review,polarity,textblobsentiments
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",-0.107407,0
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",0.356667,1
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",0.800000,1
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",1.000000,1
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",1.000000,1
...,...,...,...,...,...
24995,"""3453_3""",0,"""It seems like more consideration has gone int...",0.102083,1
24996,"""5064_1""",0,"""I don't believe they made this film. Complete...",0.000000,1
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui...",0.000000,1
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the...",0.050000,1


Positive Reviews: 19,001
Negative Reviews: 5,982

In [15]:
#Check the accuracy of this model. Is this model better than random guessing?
accuracy_score(data['sentiment'], data['textblobsentiments'])

0.60296

According to this it was 60% accurate, i would say it's more time efficient then random guessing, but 60% isn't that accurate. 

Part 2: Prepping Text for a Custom Model

In [3]:
import unicodedata
import sys
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
review = data["review"]

In [5]:
#Convert all text to lowercase letters.
def decapitalizer(string: str) -> str:
    return string.lower()

In [6]:
noncapital = [decapitalizer(string) for string in review]

In [7]:
#Remove punctuation and special characters from the text.
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P'))

In [8]:
noperiods = [string.translate(punctuation) for string in noncapital]

In [9]:
#Remove stop words.
stop_words = stopwords.words('english')

In [10]:
nostopwords = [word for word in noperiods if word not in stop_words]

In [11]:
#Apply NLTK’s PorterStemmer.
porter = PorterStemmer()

In [12]:
appliedstemmer = [porter.stem(word) for word in nostopwords]

In [15]:
#Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector for a single movie
#review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). Display the dimensions of your bag-of-words 
#matrix. The number of rows in this matrix should be the same as the number of rows in your original data frame.
count = CountVectorizer()
bag_of_words = count.fit_transform(appliedstemmer)

In [16]:
count.get_feature_names()



['00',
 '000',
 '0000000000001',
 '000001',
 '00000110',
 '0001',
 '00015',
 '001',
 '0010',
 '002',
 '00383042',
 '006',
 '007',
 '0079',
 '0080',
 '0083',
 '00s',
 '01',
 '010',
 '01000',
 '010makes',
 '0110',
 '012310',
 '0130',
 '013007',
 '02',
 '029',
 '03',
 '0310',
 '03oct2009',
 '04',
 '041',
 '048',
 '05',
 '050',
 '0510',
 '053105',
 '06',
 '06th',
 '07',
 '079',
 '07kiloton',
 '08',
 '081006',
 '087',
 '089',
 '08th',
 '09',
 '09082009',
 '091505',
 '0f',
 '0ne',
 '0r',
 '0s',
 '0stars',
 '0when',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000000',
 '10000000',
 '1000000000000',
 '1000000000000010000000000000',
 '10002000',
 '1000lb',
 '1000s',
 '1001',
 '100200',
 '100am',
 '100b',
 '100hell',
 '100kin',
 '100m',
 '100min',
 '100minute',
 '100mph',
 '100percent',
 '100plus',
 '100s',
 '100square',
 '100th',
 '100thgrade',
 '100x',
 '100yards',
 '100year',
 '100yearold',
 '101',
 '1010',
 '1010seek',
 '1011',
 '1012',
 '1013',
 '1014',
 '101499',
 '1015',
 '101503',
 '

In [18]:
#Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews 
#(see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions of your tf-idf matrix. These dimensions 
#should be the same as your bag-of-words matrix.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transfrom(appliedstemmer)

In [19]:
tfidf.vocabulary_

{'with': 110216,
 'all': 4694,
 'this': 100105,
 'stuff': 95933,
 'going': 41889,
 'down': 29564,
 'at': 7957,
 'the': 99556,
 'moment': 64914,
 'mj': 64627,
 'ive': 52172,
 'started': 94555,
 'listening': 58227,
 'to': 101158,
 'his': 46831,
 'music': 66781,
 'watching': 108290,
 'odd': 70365,
 'documentary': 28825,
 'here': 46003,
 'and': 5684,
 'there': 99794,
 'watched': 108278,
 'wiz': 110297,
 'moonwalker': 65284,
 'again': 3914,
 'maybe': 62006,
 'just': 53645,
 'want': 107977,
 'get': 40959,
 'certain': 17597,
 'insight': 50746,
 'into': 51302,
 'guy': 43743,
 'who': 109555,
 'thought': 100261,
 'was': 108169,
 'really': 81330,
 'cool': 22292,
 'in': 49724,
 'eighties': 31523,
 'make': 60429,
 'up': 105585,
 'my': 66968,
 'mind': 63926,
 'whether': 109359,
 'he': 45328,
 'is': 51712,
 'guilty': 43514,
 'or': 71563,
 'innocent': 50633,
 'part': 73602,
 'biography': 11819,
 'feature': 36022,
 'film': 36669,
 'which': 109365,
 'remember': 82571,
 'see': 87873,
 'cinema': 19187,
 '