## Exercise 3.2: Sentiment Analysis and Preprocessing Text

### Part1: Using the TextBlob Sentiment Analyzer

In [4]:
# Import the required packages

import pandas as pd 
import numpy as np
import re as re
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import html

In [5]:
# 1. Import the movie review data as a data frame and ensure that the data is loaded properly.

reviews = pd.read_csv("/Users/anjanibonda/Data-Science/DSC550/Week3_Sentiment_Analysis/labeledTrainData.tsv",sep="\t")
reviews.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [6]:
# 2. How many of each positive and negative reviews are there?

#Display the counts for each sentiment, positive and negative
reviews.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [7]:
# 3. Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than 
#    or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

from textblob import TextBlob

# Create a new column 'sentiment_score' using the TextBlob polarity score
reviews['sentiment_score'] = reviews['review'].apply(lambda text: TextBlob(text).sentiment.polarity)

In [8]:
# Create a function to analyze the polarity scores to determine positivity, neutrality or negativity
def getAnalysis(score):
  if score < 0:
    return '0'
  else:
    return '1'

In [9]:
# Create a new column by applying the function to the sentiment score

reviews['sentiment_textblob'] = reviews['sentiment_score'].apply(getAnalysis)
reviews.head()

Unnamed: 0,id,sentiment,review,sentiment_score,sentiment_textblob
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0


In [10]:
reviews.sentiment_textblob.value_counts()

1    19017
0     5983
Name: sentiment_textblob, dtype: int64

In [11]:
# 4. Check the accuracy of this model. Is this model better than random guessing?

print("Accurate positive sentiment prediction by textBlob :", sum((reviews['sentiment'] > 0) & (reviews['sentiment_score'] >= 0)))
print("Accurate negative sentiment prediction by textBlob :", sum((reviews['sentiment'] <= 0) & (reviews['sentiment_score'] < 0)))

Accurate positive sentiment prediction by textBlob : 11824
Accurate negative sentiment prediction by textBlob : 5307


Total # of agreements by textBlob: 11824 + 5307 = 18483

Total # of samples: 25000

Accuracy of textBlob = (18483/25000)*100 = 73.932%

Accuracy of textBlob model is about 74%. This is definitely better than random guessing which would be of 50% accuracy  with either yes or no.

In [12]:
# 5. For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps
#   (3) and (4).

#Importing vader module
import nltk
nltk.downloader.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anjanibonda/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anjanibonda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anjanibonda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
analyzer = SentimentIntensityAnalyzer()
reviews['negative'] = [analyzer.polarity_scores(x)['neg'] for x in reviews['review']]
reviews['neutral'] = [analyzer.polarity_scores(x)['neu'] for x in reviews['review']]
reviews['positive'] = [analyzer.polarity_scores(x)['pos'] for x in reviews['review']]
reviews['compound'] = [analyzer.polarity_scores(x)['compound'] for x in reviews['review']]
reviews['total'] = reviews['positive'] - reviews['negative']

In [14]:
#Calculating positive and negative review sentiment analysis count by Vader
print("Number of rows in the data set with positive reviews in dataset per vader Analysis :", sum(reviews['compound'] >= 0))
print("Number of rows in the data set with Negative reviews in dataset per vader Analysis :", sum(reviews['compound'] < 0))

Number of rows in the data set with positive reviews in dataset per vader Analysis : 16475
Number of rows in the data set with Negative reviews in dataset per vader Analysis : 8525


In [15]:
#Calculating Accuracy of textBlob where labelled test data and VADER preduction for sentiment are matching
print("Accurate positive sentiment prediction by vader :", sum((reviews['sentiment'] > 0) & (reviews['compound'] >= 0)))
print("Accurate negative sentiment prediction by vader :", sum((reviews['sentiment'] <= 0) & (reviews['compound'] < 0)))

Accurate positive sentiment prediction by vader : 10657
Accurate negative sentiment prediction by vader : 6682


Total # of agreements by VADER: 10657+6682 = 17339

Total # of samples: 25000

Accuracy of VADER = (17339/25000)*100 = 69.356%

Accuracy of VADER model is about 70%. This is definitely better than random guessing which would be of 50% accuracy with either yes or no.

### Part 2: Prepping Text for a Custom Model

In [16]:
# 1. Convert all text to lowercase letters.
# 2. Remove punctuation and special characters from the text.
# 3. Remove stop words.

#creating function to clean text
def cleantext(text):
    text = BeautifulSoup(text).get_text() #beautifying text
    letters_only = re.sub("[^a-zA-Z]", " ", text) # clean the html charecters (non text)
    words = letters_only.lower().split()   # convert to lower text                        
    stops = set(stopwords.words("english")) # setting stop words to remove                  
    main_words = [w for w in words if not w in stops]   
    return( " ".join( main_words )) 

In [17]:
#applying clean function on the data frame and creating a new column with clean text
reviews['cleanreview'] = reviews['review'].apply(cleantext)



In [18]:
# 4. Apply NLTK’s PorterStemmer.

# import these modules
from nltk.stem import PorterStemmer

In [19]:
#Applying porterstemmer on clean_review
ps = PorterStemmer()
reviews['cleanreview'] = reviews['cleanreview'].apply(lambda review: ps.stem(review))

In [20]:
# importing word_tokenize

from nltk import word_tokenize

# extracting and prinitng tokenized values sample
corpora = reviews['cleanreview'].values
tokenized = [word_tokenize(corpus) for corpus in corpora]

print(tokenized[1111])

['call', 'episode', 'brilliant', 'feels', 'like', 'little', 'say', 'keeps', 'excellent', 'work', 'season', 'premiere', 'reductive', 'cause', 'never', 'far', 'great', 'sopranos', 'episode', 'far', 'fact', 'title', 'might', 'smug', 'invitation', 'real', 'fans', 'yet', 'join', 'club', 'picking', 'junior', 'left', 'putting', 'bullet', 'nephew', 'gut', 'mistaking', 'crook', 'killed', 'first', 'season', 'story', 'begins', 'tony', 'absolutely', 'fine', 'recollection', 'whatsoever', 'happened', 'attending', 'kind', 'convention', 'speaking', 'normal', 'accent', 'seems', 'something', 'wrong', 'papers', 'apparently', 'tony', 'soprano', 'kevin', 'finnerty', 'least', 'group', 'people', 'think', 'mess', 'sorted', 'leave', 'hotel', 'naturally', 'pure', 'sopranos', 'tradition', 'turns', 'nothing', 'dream', 'tony', 'actually', 'coma', 'doctors', 'uncertain', 'regarding', 'fate', 'family', 'friends', 'worried', 'sick', 'junior', 'refusing', 'believe', 'whole', 'thing', 'actually', 'happened', 'unfortuna

In [21]:
#shape of train data frame
reviews.shape

(25000, 11)

In [22]:
# 5. Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector 
#    for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). 
#    Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as 
#    the number of rows in your original data frame.

#Creating bag_of_words matrix from clean review
count = CountVectorizer()
bag_of_words = count.fit_transform(reviews['cleanreview'])


In [23]:
bag_of_words #Size of bag_of_words

<25000x75529 sparse matrix of type '<class 'numpy.int64'>'
	with 2446144 stored elements in Compressed Sparse Row format>

In [24]:
# 6. Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews 
#    (see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions of your tf-idf matrix. 
#    These dimensions should be the same as your bag-of-words matrix.

# Import tf-idf encoding from sklearn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Define some hiperparameters of encoded
vectorizer = TfidfVectorizer()

# Create the training set with the words encoded as features of the reviews
train_data_features = vectorizer.fit_transform(reviews['cleanreview'])

print(train_data_features.shape)

(25000, 75529)


In [25]:
# Import the logistic regression model from sklearn 

from sklearn.linear_model import LogisticRegression

# Define the model
model = LogisticRegression(random_state=0, solver='lbfgs',
                            multi_class='multinomial')
# Train model
model.fit(train_data_features, reviews['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(multi_class='multinomial', random_state=0)

In [26]:
# Testing the model against entire train data from original trained data

# Read the test data
test = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", \
                   quoting=3 )
print(test.shape)

# Clean the text of all reviews in the training set
print("Cleaning and parsing the test set movie reviews...\n")
test['clean_review'] = test['review'].apply(cleantext)

# Create the test set with the words encoded as features of the reviews
test_data_features = vectorizer.transform(test['clean_review'])


# Use the logistic regression model to make sentiment label predictions
result = model.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output = pd.DataFrame( data={"id":test["id"],"original_sentiment":test["sentiment"] ,"sentiment_custom":result})
output.head()

(25000, 3)
Cleaning and parsing the test set movie reviews...





Unnamed: 0,id,original_sentiment,sentiment_custom
0,"""5814_8""",1,1
1,"""2381_9""",1,1
2,"""7759_3""",0,0
3,"""3630_4""",0,0
4,"""9495_8""",1,0


In [27]:
# Calculating positive and negative review sentiment analysis count by custom model
print("Number of rows in the data set with positive reviews in dataset per custom model :", sum(output['sentiment_custom'] == 1))
print("Number of rows in the data set with negative reviews in dataset per custom model :", sum(output['sentiment_custom'] == 0))

Number of rows in the data set with positive reviews in dataset per custom model : 12611
Number of rows in the data set with negative reviews in dataset per custom model : 12389


In [28]:
# Calculating Accuracy of custom model where labelled test data and VADER preduction for sentiment are matching
print("Accurate positive sentiment prediction by custom model :", sum((output['original_sentiment'] == 1) & (output['sentiment_custom'] == 1)))
print("Accurate negative sentiment prediction by custom model :", sum((output['original_sentiment'] == 0) & (output['sentiment_custom'] == 0)))

Accurate positive sentiment prediction by custom model : 11997
Accurate negative sentiment prediction by custom model : 11886


Total # of agreements by custom model: 11997+11886 = 23883'

Total # of samples: 25000

Accuracy of Custom Model = (23883/25000)*100 = 95.532%

Accuracy of Custom model is about 95.5%. This is definitely better than random guessing which would be of 50% accuracy with either yes or no.


In [29]:
# Read the test data
test2 = pd.read_csv("/Users/anjanibonda/Data-Science/DSC550/Week3_Sentiment_Analysis/labeledTrainData.tsv", header=0, delimiter="\t", \
                   quoting=3 )
print(test2.shape)


# Clean the text of all reviews in the training set
print("Cleaning and parsing the test set movie reviews...\n")
test2['clean_review'] = test2['review'].apply(cleantext)

# Create the test set with the words encoded as features of the reviews
test_data_features = vectorizer.transform(test2['clean_review'])


# Use the logistic regression model to make sentiment label predictions
result = model.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output_test = pd.DataFrame( data={"id":test2["id"],"sentiment":result})
output_test.head()

(25000, 3)
Cleaning and parsing the test set movie reviews...





Unnamed: 0,id,sentiment
0,"""5814_8""",1
1,"""2381_9""",1
2,"""7759_3""",0
3,"""3630_4""",0
4,"""9495_8""",0


In [30]:
output_test.to_csv("test_result.csv", index=False, quoting=3 )