In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import operator
from nltk import tokenize

# 1. Load Data
We load a total of 6 data files: 2 random samples each of 5K, 10K, and 15K reviews, drawn from a dataset of reviews longer than 40 characters for the top 100 most-reviewed businesses.

In [2]:
def load_data(file_path):
    """Read a JSON file of reviews into a DataFrame and record each review's length.

    The file is opened in text mode with undecodable bytes dropped, parsed with
    ``pd.read_json``, and its index is moved into a regular column through
    ``reset_index``. A ``text_len`` column holding ``len()`` of every entry of
    the ``text`` column is added before the frame is returned.
    """
    with open(file_path, 'r', errors='ignore') as handle:
        reviews = pd.read_json(handle).reset_index()
    reviews['text_len'] = list(map(len, reviews['text']))
    return reviews

In [3]:
# Load the 6 review datasets: two random samples at each size (5K, 10K and 15K reviews)
data_1, data_2, data_3, data_4, data_5, data_6 = map(load_data, (
    './files/2sample_5000_1.json',
    './files/2sample_5000_2.json',
    './files/2sample_10000_1.json',
    './files/2sample_10000_2.json',
    './files/2sample_15000_1.json',
    './files/2sample_15000_2.json',
))

In [4]:
# Keep only the review text of each sample, as a one-column DataFrame.
(data_1_reviews, data_2_reviews, data_3_reviews,
 data_4_reviews, data_5_reviews, data_6_reviews) = (
    sample.loc[:, ['text']]
    for sample in (data_1, data_2, data_3, data_4, data_5, data_6)
)

In [5]:
# Split every review into a list of sentences, stored in a 'tokenized_text' column.
for _reviews in (data_1_reviews, data_2_reviews, data_3_reviews,
                 data_4_reviews, data_5_reviews, data_6_reviews):
    _reviews['tokenized_text'] = _reviews['text'].apply(tokenize.sent_tokenize)

# 2. Use Out-of-box sentiment analyzers to find sentiment of review text

We used VADER from the NLTK library and TextBlob.
I attempted to use StanfordCoreNLP but decided against it because it took too long to run. The sentiment tool in StanfordCoreNLP provides scores per sentence, which would make it a comparable package, especially to VADER, as the VADER documentation suggests first tokenizing text into sentences and then running the sentiment analyzer on each sentence.
Ref: https://github.com/cjhutto/vaderSentiment/blob/master/vaderSentiment/vaderSentiment.py
We found the mean of sentiment scores from each sentence of a review using both the VADER and TextBlob package.

In [6]:
# Lazily-built VADER analyzer shared by every call. Constructing one loads the
# VADER lexicon, so it must not be rebuilt for each sentence.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


# function that finds mean sentiment score of a list of sentences
def avg_sentence_sentiment(sentence_list, analyzer):
    """Return the mean per-sentence sentiment score of one review.

    Parameters
    ----------
    sentence_list : list of str
        The sentences of a single review.
    analyzer : str
        'vader' to average VADER's "compound" score, or 'textblob' to average
        TextBlob's polarity.

    Returns
    -------
    float
        Arithmetic mean of the sentence scores; 0.0 (neutral) when
        ``sentence_list`` is empty.

    Raises
    ------
    ValueError
        If ``analyzer`` is neither 'vader' nor 'textblob'.
    """
    if analyzer == 'vader':
        vader = _get_vader_analyzer()

        def score(sentence):
            return vader.polarity_scores(sentence)["compound"]
    elif analyzer == 'textblob':
        def score(sentence):
            return TextBlob(sentence).sentiment.polarity
    else:
        # An unknown name used to fall through and report 0.0 for every review.
        raise ValueError(
            "analyzer must be 'vader' or 'textblob', got {!r}".format(analyzer)
        )

    # A review with no sentences has nothing to average; treat it as neutral
    # rather than dividing by zero.
    if len(sentence_list) == 0:
        return 0.0

    return sum((score(sentence) for sentence in sentence_list), 0.0) / len(sentence_list)

# function that adds one sentiment-score column per package (vader, textblob)
def find_sentiment(dataset):
    """Add 'vader_tok' and 'textblob_tok' columns to ``dataset`` in place.

    Each new column holds ``avg_sentence_sentiment`` of the row's
    'tokenized_text' entry for the matching analyzer. Nothing is returned.
    """
    sentence_lists = dataset['tokenized_text']
    for column, analyzer_name in (('vader_tok', 'vader'), ('textblob_tok', 'textblob')):
        dataset[column] = sentence_lists.apply(avg_sentence_sentiment, args=(analyzer_name,))

In [7]:
# Find sentiments for all data samples, one sample per cell.
# Sample 1 (first 5000-review file): adds 'vader_tok' and 'textblob_tok' in place.
find_sentiment(data_1_reviews)

In [8]:
# Sample 2 (second 5000-review file): adds 'vader_tok' and 'textblob_tok' in place.
find_sentiment(data_2_reviews)

In [9]:
# Sample 3 (first 10000-review file): adds 'vader_tok' and 'textblob_tok' in place.
find_sentiment(data_3_reviews)

In [10]:
# Sample 4 (second 10000-review file): adds 'vader_tok' and 'textblob_tok' in place.
find_sentiment(data_4_reviews)

In [11]:
# Sample 5 (first 15000-review file): adds 'vader_tok' and 'textblob_tok' in place.
find_sentiment(data_5_reviews)

In [12]:
# Sample 6 (second 15000-review file): adds 'vader_tok' and 'textblob_tok' in place.
find_sentiment(data_6_reviews)

## Add the stars column onto the sentiment dataframes and save them as CSVs

In [13]:
# Start one frame per sample from that sample's own 'stars' column. Each frame
# must come from the matching data_N so the ratings line up row-for-row with
# the reviews joined onto it afterwards.
df_stars_sent_1 = pd.DataFrame(data = data_1.loc[:,'stars'], columns=['stars'])
df_stars_sent_2 = pd.DataFrame(data = data_2.loc[:,'stars'], columns=['stars'])
df_stars_sent_3 = pd.DataFrame(data = data_3.loc[:,'stars'], columns=['stars'])
df_stars_sent_4 = pd.DataFrame(data = data_4.loc[:,'stars'], columns=['stars'])
df_stars_sent_5 = pd.DataFrame(data = data_5.loc[:,'stars'], columns=['stars'])
df_stars_sent_6 = pd.DataFrame(data = data_6.loc[:,'stars'], columns=['stars'])

In [14]:
# Attach each sample's review columns to its stars frame (index-aligned join).
(df_stars_sent_1, df_stars_sent_2, df_stars_sent_3,
 df_stars_sent_4, df_stars_sent_5, df_stars_sent_6) = (
    stars.join(reviews)
    for stars, reviews in zip(
        (df_stars_sent_1, df_stars_sent_2, df_stars_sent_3,
         df_stars_sent_4, df_stars_sent_5, df_stars_sent_6),
        (data_1_reviews, data_2_reviews, data_3_reviews,
         data_4_reviews, data_5_reviews, data_6_reviews),
    )
)

In [15]:
# Write each sample's stars + sentiment frame to its own CSV file
# (data_N.csv holds df_stars_sent_N).
df_stars_sent_1.to_csv("./files/data_1.csv")
df_stars_sent_2.to_csv("./files/data_2.csv")
df_stars_sent_3.to_csv("./files/data_3.csv")
df_stars_sent_4.to_csv("./files/data_4.csv")
df_stars_sent_5.to_csv("./files/data_5.csv")
df_stars_sent_6.to_csv("./files/data_6.csv")