In [2]:
%pip install vaderSentiment


Collecting vaderSentimentNote: you may need to restart the kernel to use updated packages.

  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# Depending on which device is being used, this script could take as long as 2.5 
# hours to finish running. 
# This script requires the processed data as "clean_reviews.csv"

In [5]:
# imports 
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
# Load the processed reviews. The path is relative, so 'clean_reviews.csv'
# must be in the working directory. Later cells read the 'reviewTime' and
# 'reviewText' columns of this frame.
data_df = pd.read_csv('clean_reviews.csv')

In [7]:
# Single VADER analyzer instance, created once and reused for every review;
# get_sent() reads this notebook-level name.
analyzer = SentimentIntensityAnalyzer()

In [8]:
# making a function to split on years 
def make_df(year, source_df=None):
    '''Return the reviews from one year, tagged with a ``year`` column.

    Parameters
    ----------
    year : str
        Year to select, e.g. ``'2005'``. A row is kept when its
        ``reviewTime`` value contains this text.
    source_df : pandas.DataFrame, optional
        Frame to filter; it must have a ``reviewTime`` column. Defaults to
        the notebook-level ``data_df`` so existing ``make_df(year)`` calls
        keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with an added ``year``
        column holding ``year`` on every row. The source frame is not
        modified.
    '''
    reviews = data_df if source_df is None else source_df
    # na=False: a missing reviewTime counts as "no match" instead of putting
    # NaN into the mask, which boolean indexing would reject.
    in_year = reviews['reviewTime'].str.contains(year, na=False)
    # .copy() so the column added here (and any added by later cells) is
    # written to a real frame rather than to a slice of the source, which is
    # what triggers pandas' SettingWithCopyWarning.
    year_df = reviews[in_year].copy()
    # A scalar is broadcast to every row; no per-row list is needed.
    year_df['year'] = f'{year}'
    return year_df

In [None]:
# Build one DataFrame per review year, 2000 through 2018. Each frame keeps
# its own name because the scoring and concatenation cells below refer to
# the frames individually.
year_list = [
    '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
    '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',
]
# The names on the left are in the same order as year_list, so the n-th name
# receives the frame for the n-th year.
(
    thousand_df, one_df, two_df, three_df, four_df,
    five_df, six_df, seven_df, eight_df, nine_df,
    ten_df, eleven_df, twelve_df, thirteen_df, fourteen_df,
    fifteen_df, sixteen_df, seventeen_df, eighteen_df,
) = [make_df(year) for year in year_list]


In [12]:
# sentiment function
def get_sent(dataframe, sentiment_analyzer=None):
    '''Score every review in ``dataframe`` and return the scores as a list.

    Each ``reviewText`` value is converted with ``str()`` and passed whole to
    the analyzer's ``polarity_scores``. No chunking or averaging is done.
    The ``'compound'`` value of the result is rescaled with
    ``(compound + 1) / 2``; for VADER, whose compound score lies in
    [-1, 1], this gives a value in [0, 1].

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``reviewText`` column.
    sentiment_analyzer : object, optional
        Any object with a ``polarity_scores(text)`` method that returns a
        mapping containing ``'compound'``. Defaults to the notebook-level
        ``analyzer`` so existing ``get_sent(df)`` calls keep working
        unchanged.

    Returns
    -------
    list of float
        One rescaled score per row, in row order.
    '''
    scorer = analyzer if sentiment_analyzer is None else sentiment_analyzer
    scores = []
    for review in dataframe['reviewText'].tolist():
        # str() guards against non-string cells; note that a missing value
        # (NaN) is scored as the literal text 'nan'.
        compound_score = scorer.polarity_scores(str(review))['compound']
        scores.append((compound_score + 1) / 2)
    return scores

In [None]:
# Score the 2000-2006 reviews. The years are processed in separate batches
# so the work can run on a laptop.
test_list = [thousand_df, one_df, two_df, three_df, four_df, five_df, six_df]
for year_frame in test_list:
    # get_sent returns one score per row, in row order; store it as a new
    # column on the same frame.
    year_frame['sentimentScore'] = get_sent(year_frame)

In [None]:
# Score the 2007-2012 reviews.
test_list = [seven_df, eight_df, nine_df, ten_df, eleven_df, twelve_df]
for year_frame in test_list:
    # get_sent returns one score per row, in row order; store it as a new
    # column on the same frame.
    year_frame['sentimentScore'] = get_sent(year_frame)

In [None]:

# Score the 2013-2018 reviews.
test_list = [thirteen_df, fourteen_df, fifteen_df, sixteen_df, seventeen_df, eighteen_df]
for year_frame in test_list:
    # get_sent returns one score per row, in row order; store it as a new
    # column on the same frame.
    year_frame['sentimentScore'] = get_sent(year_frame)

In [26]:
# Concatenate the per-year frames, oldest year first, into one results frame.
concat_list = [
    thousand_df, one_df, two_df, three_df, four_df,
    five_df, six_df, seven_df, eight_df, nine_df,
    ten_df, eleven_df, twelve_df, thirteen_df, fourteen_df,
    fifteen_df, sixteen_df, seventeen_df, eighteen_df,
]
results_df = pd.concat(objs=concat_list)
results_df.head()

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText,year,sentimentScore
143,negative,"12 22, 2000",AE95Z3K6GVIC3,B00003JAU7,,It is worst piece of crap I ever had to instal...,2000,0.0367
144,positive,"08 23, 2000",ARXU3FESTWMJJ,B00003JAU7,,"I got tired of the Win98 crashes, so decided t...",2000,0.98755
145,positive,"06 26, 2000",A2G0O4Y8QE10AE,B00004TYCR,,I bought this program 2.5 years ago and have b...,2000,0.9814
127,positive,"02 1, 2001",A1P4RH7KMJ1SV2,B00003IRBU,{'Format:': ' Video Game'},I have now played all 3 of the Nancy Drew myst...,2001,0.9164
128,positive,"04 10, 2001",A1IU7S4HCK1XK0,B00003JAU9,,I couldn't wait to ditch the Windows ME that c...,2001,0.5871


In [27]:
# Turn each scaled sentiment score into a class label: scores below 0.4 are
# 'negative'; everything else (including exactly 0.4) is 'positive'.
sent_list = results_df['sentimentScore'].to_list()
classes = ['negative' if score < 0.4 else 'positive' for score in sent_list]
# The list is in row order, so it lines up with results_df positionally.
results_df['classPrediction'] = classes

In [28]:
# Preview the first rows to confirm the classPrediction column was added.
results_df.head()

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText,year,sentimentScore,classPrediction
143,negative,"12 22, 2000",AE95Z3K6GVIC3,B00003JAU7,,It is worst piece of crap I ever had to instal...,2000,0.0367,negative
144,positive,"08 23, 2000",ARXU3FESTWMJJ,B00003JAU7,,"I got tired of the Win98 crashes, so decided t...",2000,0.98755,positive
145,positive,"06 26, 2000",A2G0O4Y8QE10AE,B00004TYCR,,I bought this program 2.5 years ago and have b...,2000,0.9814,positive
127,positive,"02 1, 2001",A1P4RH7KMJ1SV2,B00003IRBU,{'Format:': ' Video Game'},I have now played all 3 of the Nancy Drew myst...,2001,0.9164,positive
128,positive,"04 10, 2001",A1IU7S4HCK1XK0,B00003JAU9,,I couldn't wait to ditch the Windows ME that c...,2001,0.5871,positive


In [29]:
# Write the scored and classified reviews to 'results.csv' in the working
# directory; index = False leaves the DataFrame index out of the file.
results_df.to_csv('results.csv', index = False)