# Part 1: Using the TextBlob Sentiment Analyzer

In [24]:
#Import the movie review data as a data frame and ensure that the data is loaded properly.
import pandas as pd
movie_review = pd.read_csv('labeledTrainData.tsv', sep='\t', header=0)
movie_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [25]:
#How many of each positive and negative reviews are there?
movie_review.loc[:,['sentiment']].value_counts()

#there are 12500 positive and negarive reviews in the dataset.

sentiment
0            12500
1            12500
dtype: int64

In [26]:
pip install -U textblob

Note: you may need to restart the kernel to use updated packages.


In [27]:
#Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or equal 
#to zero is a positive sentiment and less than 0 is a negative sentiment.


from textblob import TextBlob
movie_review['polarity'] = movie_review['review'].apply(lambda movie_review: TextBlob(movie_review).sentiment.polarity)
movie_review.head()

Unnamed: 0,id,sentiment,review,polarity
0,5814_8,1,With all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941
3,3630_4,0,It must be assumed that those who praised this...,0.134753
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842


In [28]:
def func(row):
    if row['polarity'] >= 0 :
        return 1
    else:
        return 0

movie_review['sentiment_tb'] = movie_review.apply(func, axis=1)
movie_review.head()

Unnamed: 0,id,sentiment,review,polarity,sentiment_tb
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0


In [29]:
from sklearn.metrics import classification_report

print(classification_report(movie_review['sentiment'].values, movie_review['sentiment_tb'].values))

              precision    recall  f1-score   support

           0       0.89      0.42      0.57     12500
           1       0.62      0.95      0.75     12500

    accuracy                           0.69     25000
   macro avg       0.75      0.69      0.66     25000
weighted avg       0.75      0.69      0.66     25000



In [30]:
from sklearn.metrics import accuracy_score

print("Accuracy: ", accuracy_score(movie_review['sentiment'].values, movie_review['sentiment_tb'].values))

Accuracy:  0.68524


In [31]:
#For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).


In [32]:
pip install vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [33]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
movie_review['polarity_vader'] = movie_review['review'].apply(lambda movie_review: sa.polarity_scores(movie_review))
movie_review.head()

Unnamed: 0,id,sentiment,review,polarity,sentiment_tb,polarity_vader
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co..."
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co..."


In [34]:
def func_vader(row):
    if row['polarity_vader']['compound'] >= 0 :
        return 1
    else:
        return 0

movie_review['sentiment_vader'] = movie_review.apply(func, axis=1)
movie_review.head()

Unnamed: 0,id,sentiment,review,polarity,sentiment_tb,polarity_vader,sentiment_vader
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0


In [35]:
print("Accuracy from VADER: ", accuracy_score(movie_review['sentiment'].values, movie_review['sentiment_vader'].values))

Accuracy from VADER:  0.68524


# Part 2: Prepping Text for a Custom Model

In [36]:
#Convert all text to lowercase letters.
movie_review = pd.read_csv('labeledTrainData.tsv', sep='\t', header=0)
#movie_review.head()
movie_review['review'] = movie_review['review'].apply(lambda x: x.lower())
movie_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi..."
2,7759_3,0,the film starts with a manager (nicholas bell)...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


In [37]:
#Remove punctuation and special characters from the text.
import re
movie_review['review'] = movie_review['review'].apply(lambda x: re.sub('\W+',' ',x))
movie_review.head()


Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,the classic war of the worlds by timothy hine...
2,7759_3,0,the film starts with a manager nicholas bell g...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


In [38]:
import nltk

In [39]:
#remove stop words
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def remove_stop_word(s):
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(s)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = ''

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence = filtered_sentence+w+' '
    return filtered_sentence
            

movie_review['review_wo_stop'] = movie_review['review'].apply(lambda x: remove_stop_word(x))
movie_review.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id,sentiment,review,review_wo_stop
0,5814_8,1,with all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,the classic war of the worlds by timothy hine...,classic war worlds timothy hines entertaining ...
2,7759_3,0,the film starts with a manager nicholas bell g...,film starts manager nicholas bell giving welco...
3,3630_4,0,it must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious 80 ex...


In [40]:
#Apply NLTK’s PorterStemmer.

# import these modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

def apply_porterstemmer(s):
    word_tokens = word_tokenize(s)
    stemmed_sentence = ''
    for w in word_tokens:
        if w.isalpha():
            stemmed_sentence=stemmed_sentence+ps.stem(w)+' '
    return stemmed_sentence

movie_review['stemmed_sentence'] = movie_review['review_wo_stop'].apply(lambda x: apply_porterstemmer(x))
movie_review.head()


Unnamed: 0,id,sentiment,review,review_wo_stop,stemmed_sentence
0,5814_8,1,with all this stuff going down at the moment w...,stuff going moment mj started listening music ...,stuff go moment mj start listen music watch od...
1,2381_9,1,the classic war of the worlds by timothy hine...,classic war worlds timothy hines entertaining ...,classic war world timothi hine entertain film ...
2,7759_3,0,the film starts with a manager nicholas bell g...,film starts manager nicholas bell giving welco...,film start manag nichola bell give welcom inve...
3,3630_4,0,it must be assumed that those who praised this...,must assumed praised film greatest filmed oper...,must assum prais film greatest film opera ever...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious 80 ex...,superbl trashi wondrous unpretenti exploit hoo...


5.Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as the number of rows in your original data frame.

In [41]:
movie_review.loc[:,['stemmed_sentence']].shape 

(25000, 1)

In [42]:
movie_review.loc[:,['stemmed_sentence']].head()

Unnamed: 0,stemmed_sentence
0,stuff go moment mj start listen music watch od...
1,classic war world timothi hine entertain film ...
2,film start manag nichola bell give welcom inve...
3,must assum prais film greatest film opera ever...
4,superbl trashi wondrous unpretenti exploit hoo...


In [45]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Use the stemmed_sentence column
matrix = vectorizer.fit_transform(movie_review.stemmed_sentence)
print(matrix.shape)
#counts = pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names(), index=movie_review.stemmed_sentence)
#counts.head()


(25000, 49856)


6. Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews (see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions of your tf-idf matrix. These dimensions should be the same as your bag-of-words matrix.

In [46]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
 
# Initialize the model
vectorizer = TfidfVectorizer()
 
# Train the model
matrix =  vectorizer.fit_transform(movie_review.stemmed_sentence)
print(matrix.shape)
#tf_idf = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names(), index=movie_review.stemmed_sentence)
#tf_idf.head()

(25000, 49856)
