In [1]:
#Import general python packages
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Part 1: Using the TextBlob Sentiment Analyzer

### 1. Import the movie review data as a data frame and ensure that the data is loaded properly.

In [2]:
# Read the "labeledTrainData.tsv" file 
df =pd.read_csv('C:\\Users\\Yousof\\Desktop\\DSC 550\\Week 3\\labeledTrainData.tsv', delimiter='\t')
df

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


### 2. How many of each positive and negative reviews are there?

In [3]:
# Import textblob
from textblob import TextBlob
# Import the nltk library
import nltk

In [4]:
# Displaying shape of dataset
# Displaying posetive and negetive reviews with sentiment analysis
print('df size:' ,df.shape)
df.groupby("sentiment").count()

df size: (25000, 3)


Unnamed: 0_level_0,id,review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12500,12500
1,12500,12500


### 3. Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

In [5]:
#! pip install textblob


In [6]:
from textblob import TextBlob

In [7]:

def sentiment_func(review):
    try:
        return TextBlob(review).sentiment 
    except:
        return None 

In [8]:
# create 'review_pol_sub' to display apply sentiment_func
df['review_pol_sub'] = df['review'].apply(sentiment_func)


In [9]:
# Create new column 'review_pol_sub'
# Review polarity and subjectivity of each reviews 
df['polarity'] = df['review_pol_sub'].apply(lambda x: x[0])
df['subjectivity'] = df['review_pol_sub'].apply(lambda x: x[1])

In [10]:
df.head()

Unnamed: 0,id,sentiment,review,review_pol_sub,polarity,subjectivity
0,5814_8,1,With all this stuff going down at the moment w...,"(0.001276742581090417, 0.6067460317460317)",0.001277,0.606746
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","(0.2563492063492064, 0.5311111111111111)",0.256349,0.531111
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"(-0.05394123606889564, 0.5629331306990881)",-0.053941,0.562933
3,3630_4,0,It must be assumed that those who praised this...,"(0.1347530864197531, 0.4929012345679012)",0.134753,0.492901
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"(-0.024841720779220786, 0.45981782106782115)",-0.024842,0.459818


### 4. Check the accuracy of this model. Is this model better than random guessing?

In [11]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
nb_samples = 1000
x, y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(xtrain, ytrain)

LogisticRegression()

In [12]:
print(accuracy_score(ytest, model.predict(xtest)))

0.985


Based on the accuricy of model result I think this model is better than random guessing

In [13]:
# Counting posetive and negetive result
#df.TR.value_count()

# Part 2: Prepping Text for a Custom Model

### 1. Convert all text to lowercase letters.

In [14]:
# Convert 'review' column to lowercase
df['review'].str.lower()



0        with all this stuff going down at the moment w...
1        \the classic war of the worlds\" by timothy hi...
2        the film starts with a manager (nicholas bell)...
3        it must be assumed that those who praised this...
4        superbly trashy and wondrously unpretentious 8...
                               ...                        
24995    it seems like more consideration has gone into...
24996    i don't believe they made this film. completel...
24997    guy is a loser. can't get girls, needs to buil...
24998    this 30 minute documentary buñuel made in the ...
24999    i saw this movie as a child and it broke my he...
Name: review, Length: 25000, dtype: object

In [15]:
def lower(string: str):
    return string.lower()
df["newReview"] = df["review"].apply(lambda d : lower(d))
df["newReview"] 

0        with all this stuff going down at the moment w...
1        \the classic war of the worlds\" by timothy hi...
2        the film starts with a manager (nicholas bell)...
3        it must be assumed that those who praised this...
4        superbly trashy and wondrously unpretentious 8...
                               ...                        
24995    it seems like more consideration has gone into...
24996    i don't believe they made this film. completel...
24997    guy is a loser. can't get girls, needs to buil...
24998    this 30 minute documentary buñuel made in the ...
24999    i saw this movie as a child and it broke my he...
Name: newReview, Length: 25000, dtype: object

### 2. Remove punctuation and special characters from the text.


In [16]:
# Remove punctuation and keep the lower case
df['review'].str.replace('[^\w\s]','')
#df['review'].str.replace('[^\w\s]','').str.lower()


  df['review'].str.replace('[^\w\s]','')


0        With all this stuff going down at the moment w...
1        The Classic War of the Worlds by Timothy Hines...
2        The film starts with a manager Nicholas Bell g...
3        It must be assumed that those who praised this...
4        Superbly trashy and wondrously unpretentious 8...
                               ...                        
24995    It seems like more consideration has gone into...
24996    I dont believe they made this film Completely ...
24997    Guy is a loser Cant get girls needs to build u...
24998    This 30 minute documentary Buñuel made in the ...
24999    I saw this movie as a child and it broke my he...
Name: review, Length: 25000, dtype: object

### 3. Remove stop words.

In [17]:
# Remove stop words
#Convert 'review' column to lowercase
# Remove punctuation and special characters from the 'review' column
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
df["review"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

#df["review"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))\
#.str.replace('[^\w\s]','').str.lower()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yousof\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        With stuff going moment MJ i've started listen...
1        \The Classic War Worlds\" Timothy Hines entert...
2        The film starts manager (Nicholas Bell) giving...
3        It must assumed praised film (\the greatest fi...
4        Superbly trashy wondrously unpretentious 80's ...
                               ...                        
24995    It seems like consideration gone IMDb reviews ...
24996    I believe made film. Completely unnecessary. T...
24997    Guy loser. Can't get girls, needs build up, pi...
24998    This 30 minute documentary Buñuel made early 1...
24999    I saw movie child broke heart! No story unfini...
Name: review, Length: 25000, dtype: object

### 4. Apply NLTK’s PorterStemmer.

In [18]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [19]:
import string
import re
pd.set_option('display.max_colwidth', 100)
stopwords = nltk.corpus.stopwords.words('english')


In [20]:
def clean_text(text):
    text = "".join([c for c in text if c not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [21]:
#df['review_nostop'] = df['review'].apply(lambda  x: clean_text(x.lower()))
#df.head()
df['review_nostop'] = df['review'].apply(lambda  x: clean_text(x))
df.head()

Unnamed: 0,id,sentiment,review,review_pol_sub,polarity,subjectivity,newReview,review_nostop
0,5814_8,1,"With all this stuff going down at the moment with MJ i've started listening to his music, watchi...","(0.001276742581090417, 0.6067460317460317)",0.001277,0.606746,"with all this stuff going down at the moment with mj i've started listening to his music, watchi...","[With, stuff, going, moment, MJ, ive, started, listening, music, watching, odd, documentary, wat..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hines is a very entertaining film that obviously goe...","(0.2563492063492064, 0.5311111111111111)",0.256349,0.531111,"\the classic war of the worlds\"" by timothy hines is a very entertaining film that obviously goe...","[The, Classic, War, Worlds, Timothy, Hines, entertaining, film, obviously, goes, great, effort, ..."
2,7759_3,0,The film starts with a manager (Nicholas Bell) giving welcome investors (Robert Carradine) to Pr...,"(-0.05394123606889564, 0.5629331306990881)",-0.053941,0.562933,the film starts with a manager (nicholas bell) giving welcome investors (robert carradine) to pr...,"[The, film, starts, manager, Nicholas, Bell, giving, welcome, investors, Robert, Carradine, Prim..."
3,3630_4,0,"It must be assumed that those who praised this film (\the greatest filmed opera ever,\"" didn't I...","(0.1347530864197531, 0.4929012345679012)",0.134753,0.492901,"it must be assumed that those who praised this film (\the greatest filmed opera ever,\"" didn't i...","[It, must, assumed, praised, film, greatest, filmed, opera, ever, didnt, I, read, somewhere, eit..."
4,9495_8,1,"Superbly trashy and wondrously unpretentious 80's exploitation, hooray! The pre-credits opening ...","(-0.024841720779220786, 0.45981782106782115)",-0.024842,0.459818,"superbly trashy and wondrously unpretentious 80's exploitation, hooray! the pre-credits opening ...","[Superbly, trashy, wondrously, unpretentious, 80s, exploitation, hooray, The, precredits, openin..."


In [22]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()


In [23]:
# Stem the text
def stemming (tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [24]:
df['review_stemmed'] = df['review_nostop'].apply(lambda x: stemming(x))
df.head()

Unnamed: 0,id,sentiment,review,review_pol_sub,polarity,subjectivity,newReview,review_nostop,review_stemmed
0,5814_8,1,"With all this stuff going down at the moment with MJ i've started listening to his music, watchi...","(0.001276742581090417, 0.6067460317460317)",0.001277,0.606746,"with all this stuff going down at the moment with mj i've started listening to his music, watchi...","[With, stuff, going, moment, MJ, ive, started, listening, music, watching, odd, documentary, wat...","[with, stuff, go, moment, MJ, ive, start, listen, music, watch, odd, documentari, watch, the, wi..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hines is a very entertaining film that obviously goe...","(0.2563492063492064, 0.5311111111111111)",0.256349,0.531111,"\the classic war of the worlds\"" by timothy hines is a very entertaining film that obviously goe...","[The, Classic, War, Worlds, Timothy, Hines, entertaining, film, obviously, goes, great, effort, ...","[the, classic, war, world, timothi, hine, entertain, film, obvious, goe, great, effort, length, ..."
2,7759_3,0,The film starts with a manager (Nicholas Bell) giving welcome investors (Robert Carradine) to Pr...,"(-0.05394123606889564, 0.5629331306990881)",-0.053941,0.562933,the film starts with a manager (nicholas bell) giving welcome investors (robert carradine) to pr...,"[The, film, starts, manager, Nicholas, Bell, giving, welcome, investors, Robert, Carradine, Prim...","[the, film, start, manag, nichola, bell, give, welcom, investor, robert, carradin, primal, park,..."
3,3630_4,0,"It must be assumed that those who praised this film (\the greatest filmed opera ever,\"" didn't I...","(0.1347530864197531, 0.4929012345679012)",0.134753,0.492901,"it must be assumed that those who praised this film (\the greatest filmed opera ever,\"" didn't i...","[It, must, assumed, praised, film, greatest, filmed, opera, ever, didnt, I, read, somewhere, eit...","[It, must, assum, prais, film, greatest, film, opera, ever, didnt, I, read, somewher, either, do..."
4,9495_8,1,"Superbly trashy and wondrously unpretentious 80's exploitation, hooray! The pre-credits opening ...","(-0.024841720779220786, 0.45981782106782115)",-0.024842,0.459818,"superbly trashy and wondrously unpretentious 80's exploitation, hooray! the pre-credits opening ...","[Superbly, trashy, wondrously, unpretentious, 80s, exploitation, hooray, The, precredits, openin...","[superbl, trashi, wondrous, unpretenti, 80, exploit, hooray, the, precredit, open, sequenc, some..."


### 5. Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as the number of rows in your original data frame.

In [25]:
# Import library
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
phrases = df['review_stemmed']
phrases


0        [with, stuff, go, moment, MJ, ive, start, listen, music, watch, odd, documentari, watch, the, wi...
1        [the, classic, war, world, timothi, hine, entertain, film, obvious, goe, great, effort, length, ...
2        [the, film, start, manag, nichola, bell, give, welcom, investor, robert, carradin, primal, park,...
3        [It, must, assum, prais, film, greatest, film, opera, ever, didnt, I, read, somewher, either, do...
4        [superbl, trashi, wondrous, unpretenti, 80, exploit, hooray, the, precredit, open, sequenc, some...
                                                        ...                                                 
24995    [It, seem, like, consider, gone, imdb, review, film, went, sourcebr, br, here, review, without, ...
24996    [I, dont, believ, made, film, complet, unnecessari, the, first, film, okay, but, need, sequel, c...
24997    [guy, loser, cant, get, girl, need, build, pick, stronger, success, guy, etc, seen, saw, move, I...
24998    [thi, 30, 

In [27]:
count = CountVectorizer()
# Create dictionry vectorizer
dicvectorizer = DictVectorizer(sparse=True)

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer

count_vec = MultiLabelBinarizer()
mlb = count_vec.fit(df["review_stemmed"])
pd.DataFrame(mlb.transform(df["review_stemmed"]), columns=[mlb.classes_])

Unnamed: 0,Unnamed: 1,0,00,000,0000000000001,000001,00000110,0001,00015,001,...,éveri,êxtase,ís,ísnt,ø,østbye,über,überannoy,überspi,üvegtigri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(df['newReview'])
bag_of_words

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3445861 stored elements in Compressed Sparse Row format>

In [30]:
# Show Feature matrix
bag_of_words.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
# Show feature names
count.get_feature_names()

['00',
 '000',
 '0000000000001',
 '00001',
 '00015',
 '000s',
 '001',
 '003830',
 '006',
 '007',
 '0079',
 '0080',
 '0083',
 '0093638',
 '00am',
 '00pm',
 '00s',
 '01',
 '01pm',
 '02',
 '020410',
 '029',
 '03',
 '04',
 '041',
 '05',
 '050',
 '06',
 '06th',
 '07',
 '08',
 '087',
 '089',
 '08th',
 '09',
 '0f',
 '0ne',
 '0r',
 '0s',
 '10',
 '100',
 '1000',
 '1000000',
 '10000000000000',
 '1000lb',
 '1000s',
 '1001',
 '100b',
 '100k',
 '100m',
 '100min',
 '100mph',
 '100s',
 '100th',
 '100x',
 '100yards',
 '101',
 '101st',
 '102',
 '102nd',
 '103',
 '104',
 '1040',
 '1040a',
 '1040s',
 '105',
 '1050',
 '105lbs',
 '106',
 '106min',
 '107',
 '108',
 '109',
 '10am',
 '10lines',
 '10mil',
 '10min',
 '10minutes',
 '10p',
 '10pm',
 '10s',
 '10star',
 '10th',
 '10x',
 '10yr',
 '11',
 '110',
 '1100',
 '11001001',
 '1100ad',
 '111',
 '112',
 '1138',
 '114',
 '1146',
 '115',
 '116',
 '117',
 '11f',
 '11m',
 '11th',
 '12',
 '120',
 '1200',
 '1200f',
 '1201',
 '1202',
 '123',
 '12383499143743701',
 '1

### 6. Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews (see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions of your tf-idf matrix. These dimensions should be the same as your bag-of-words matrix.


In [32]:
# Load libraries
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
tfidf = TfidfVectorizer()

In [34]:
feature_matrix = tfidf.fit_transform(df["newReview"])
feature_matrix

<25000x74849 sparse matrix of type '<class 'numpy.float64'>'
	with 3445861 stored elements in Compressed Sparse Row format>

In [35]:
# Show tf.idf feature matrix as dense matrix
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04823248, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [36]:
# Show feature names
tfidf.vocabulary_

{'with': 73342,
 'all': 2662,
 'this': 66562,
 'stuff': 63783,
 'going': 27963,
 'down': 19854,
 'at': 4753,
 'the': 66339,
 'moment': 43526,
 'mj': 43300,
 've': 70920,
 'started': 62903,
 'listening': 38991,
 'to': 67125,
 'his': 31095,
 'music': 44529,
 'watching': 72259,
 'odd': 46634,
 'documentary': 19380,
 'here': 30670,
 'and': 3258,
 'there': 66432,
 'watched': 72253,
 'wiz': 73394,
 'moonwalker': 43761,
 'again': 2148,
 'maybe': 41519,
 'just': 35787,
 'want': 72088,
 'get': 27304,
 'certain': 11378,
 'insight': 33819,
 'into': 34255,
 'guy': 29233,
 'who': 72904,
 'thought': 66621,
 'was': 72196,
 'really': 53839,
 'cool': 14695,
 'in': 33004,
 'eighties': 21129,
 'make': 40421,
 'up': 70331,
 'my': 44639,
 'mind': 42844,
 'whether': 72768,
 'he': 30211,
 'is': 34585,
 'guilty': 29057,
 'or': 47142,
 'innocent': 33729,
 'part': 48525,
 'biography': 7404,
 'feature': 24077,
 'film': 24536,
 'which': 72773,
 'remember': 54776,
 'see': 58585,
 'cinema': 12453,
 'when': 72753,
 