This notebook contains our implementation of a Sentiment Analysis model built on scikit-learn's Random Forest classifier. We use this model to classify Yelp reviews as expressing positive or negative sentiment.

This implementation is heavily based on the Sentiment Analysis model implementation in this GitHub repository: https://github.com/asathiya007/nba-trending-teams

In [None]:
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
import statistics 
import time 

In [None]:
#Kaggle direct access
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

#Downloads file online
!kaggle datasets download -d yelp-dataset/yelp-dataset
!unzip yelp-dataset.zip

Saving kaggle.json to kaggle.json
ref                                                             title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
meirnizri/covid19-dataset                                       COVID-19 Dataset                                      5MB  2022-11-13 15:47:17           6607        195  1.0              
madhurpant/world-deaths-and-causes-1990-2019                    World Deaths and Causes (1990 - 2019)               442KB  2022-11-29 07:09:27           1214         30  1.0              
thedevastator/jobs-dataset-from-glassdoor                       Salary Prediction                                     3MB  2022-11-16 13:52:31           4358        100  1.0              
thedevastator/how-much-sle

In [None]:
def _get_data(): 
    # read data from JSON file 
    size = 150000
    review = pd.read_json('yelp_academic_dataset_review.json', lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)
    
    #Gets reviews 100000-150000 
    lst = []
    for chunk_review in review:
      lst.append(chunk_review)
      break
    df_review = pd.concat(lst)
    df_review = df_review.iloc[100000:150000]

    # extract review text and labels  
    dataset = df_review[['stars', 'text']].copy(deep=True)
    dataset['label'] = np.where(dataset['stars'] >= 4, 1, 0)
    
    # return dataset 
    return dataset 

def get_clean_tokens(review):
    """Split a review on whitespace and strip non-alphanumeric characters.

    Every character outside a-z, A-Z and 0-9 is removed from each
    whitespace-separated piece; pieces that end up empty are dropped.

    Args:
        review: The review text as a string.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    non_alphanumeric = re.compile('[^a-zA-Z0-9]')
    stripped = (non_alphanumeric.sub('', piece) for piece in review.split())
    return [piece for piece in stripped if piece]
    
def _tokenize_reviews(dataset):
    """Replace each review string in dataset['text'] with its clean tokens.

    The 'text' column of the passed-in frame is overwritten in place with the
    token lists produced by get_clean_tokens; the same frame is returned.
    """
    token_lists = dataset['text'].apply(get_clean_tokens)
    dataset['text'] = token_lists
    return dataset

def _normalize_reviews(dataset):
    """Stem every token in dataset['text'] with the English Snowball stemmer.

    Expects the 'text' column to hold lists of tokens. The column is
    overwritten in place and the same frame is returned.
    """
    stem = SnowballStemmer('english').stem

    def _stem_all(tokens):
        return [stem(token) for token in tokens]

    dataset['text'] = dataset['text'].apply(_stem_all)
    return dataset

def _remove_stopwords_from_reviews(dataset):
    """Drop English stopwords from the token lists in dataset['text'].

    Expects the 'text' column to hold lists of tokens. The column is
    overwritten in place and the same frame is returned.
    """
    # Fetch the corpus only when it is actually missing, instead of calling
    # nltk.download on every invocation.
    try:
        stopword_list = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopword_list = stopwords.words('english')

    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list for every single token.
    eng_stopwords = frozenset(stopword_list)
    dataset['text'] = dataset['text'].apply(lambda tokens:
        [token for token in tokens if token not in eng_stopwords])

    return dataset

def _vectorize_reviews(x_train, x_test):
    """Turn review strings into TF-IDF feature matrices.

    Both the count vectorizer and the TF-IDF transformer are fitted on the
    training reviews only and then applied to the test reviews.

    Returns:
        A tuple (train_tfidf, test_tfidf, count_vectorizer,
        tfidf_transformer) containing the two feature matrices and the two
        fitted transformers.
    """
    count_vectorizer = CountVectorizer(stop_words='english',
                                       max_features=10000)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)

    # fit both stages on the training split
    train_counts = count_vectorizer.fit_transform(x_train)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)

    # apply the fitted stages to the test split
    test_counts = count_vectorizer.transform(x_test)
    test_tfidf = tfidf_transformer.transform(test_counts)

    return train_tfidf, test_tfidf, count_vectorizer, tfidf_transformer

def _save_count_vectorizer(count_vectorizer):
    """Pickle the given count vectorizer to ./count_vectorizer.pkl.

    Any existing file at that path is overwritten.
    """
    with open('./count_vectorizer.pkl', 'wb') as pickle_file:
        pickle.dump(count_vectorizer, file=pickle_file)

def load_count_vectorizer():
    """Unpickle and return the object stored in ./count_vectorizer.pkl."""
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on files written by this notebook.
    with open('./count_vectorizer.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

def _save_tfidf_transformer(tfidf_transformer):
    """Pickle the given TF-IDF transformer to ./tfidf_transformer.pkl.

    Any existing file at that path is overwritten.
    """
    with open('./tfidf_transformer.pkl', 'wb') as pickle_file:
        pickle.dump(tfidf_transformer, file=pickle_file)

def load_tfidf_transformer():
    """Unpickle and return the object stored in ./tfidf_transformer.pkl."""
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on files written by this notebook.
    with open('./tfidf_transformer.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

def process_data(holdout=0.2, random_state=None):
    """Build TF-IDF train/test features from the Yelp review sample.

    Pipeline: load reviews, tokenize, stem, remove stopwords, re-join the
    tokens into strings, split into train/test, then vectorize. The fitted
    count vectorizer and TF-IDF transformer are pickled to their fixed paths,
    replacing whatever an earlier call saved.

    Args:
        holdout: Fraction of the reviews placed in the test split.
        random_state: Optional seed forwarded to train_test_split. The
            default of None keeps the previous behavior, where every call
            produces a different shuffle; pass an int for a repeatable split.

    Returns:
        A tuple (x_train, x_test, y_train, y_test) where the x values are the
        TF-IDF matrices and the y values are the label series.
    """
    # get data from file
    dataset = _get_data()

    # tokenize reviews
    dataset = _tokenize_reviews(dataset)

    # normalize review text
    dataset = _normalize_reviews(dataset)

    # remove stopwords
    dataset = _remove_stopwords_from_reviews(dataset)

    # join tokens back into strings, split data into train and test sets
    dataset['text'] = dataset['text'].apply(' '.join)
    x_train, x_test, y_train, y_test = train_test_split(
        dataset['text'], dataset['label'], test_size=holdout, shuffle=True,
        random_state=random_state)

    # vectorize features
    x_train, x_test, count_vectorizer, tfidf_transformer = _vectorize_reviews(
        x_train, x_test)

    # persist the fitted transformers for use at prediction time
    _save_count_vectorizer(count_vectorizer)
    _save_tfidf_transformer(tfidf_transformer)

    return x_train, x_test, y_train, y_test

def transform_review(review):
    """Convert one raw review string into a TF-IDF feature row.

    The review goes through the same preprocessing that process_data applies
    before fitting the vectorizers: clean tokens, Snowball stemming and
    stopword removal, re-joined into one string. The saved count vectorizer
    and TF-IDF transformer are loaded from disk on every call.

    Args:
        review: The raw review text.

    Returns:
        The TF-IDF matrix (one row) produced by the saved transformers.
    """
    # load stopwords, downloading the corpus only if it is missing
    try:
        stopword_list = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopword_list = stopwords.words('english')
    eng_stopwords = frozenset(stopword_list)

    # load stemmer and vectorizers
    snowball_stemmer = SnowballStemmer(language='english')
    count_vectorizer = load_count_vectorizer()
    tfidf_transformer = load_tfidf_transformer()

    # preprocess review
    stemmed = (snowball_stemmer.stem(token)
               for token in get_clean_tokens(review))
    processed_review = ' '.join(
        token for token in stemmed if token not in eng_stopwords)

    # Vectorize the PREPROCESSED text. The original computed the tokens and
    # then vectorized the raw review, so inference inputs did not match the
    # stemmed, stopword-filtered text the vectorizers were fitted on.
    review_counts = count_vectorizer.transform([processed_review])
    review_tfidf = tfidf_transformer.transform(review_counts)

    return review_tfidf

In [None]:
def save_random_forest(random_forest):
    """Persist the given model to ./random_forest.joblib using joblib."""
    model_path = './random_forest.joblib'
    joblib.dump(random_forest, model_path)

def load_random_forest():
    """Load and return the model stored in ./random_forest.joblib."""
    model_path = './random_forest.joblib'
    return joblib.load(model_path)

def fit_random_forest(x_train, x_test, y_train, y_test, n_estimators=10,
                      max_depth=100, random_state=None):
    """Fit, evaluate and save a random forest sentiment classifier.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix used for evaluation.
        y_train: Training labels.
        y_test: Test labels.
        n_estimators: Number of trees (default 10, the previously hard-coded
            value).
        max_depth: Maximum tree depth (default 100, the previously hard-coded
            value).
        random_state: Optional seed for the forest. None keeps the previous
            unseeded behavior.

    Returns:
        A tuple (random_forest, f1, accuracy) with the fitted model and its
        F1 score and accuracy on the test split.
    """
    # fit random forest model
    random_forest = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth,
        random_state=random_state)
    random_forest.fit(x_train, y_train)

    # evaluate random forest model
    predictions = random_forest.predict(x_test)
    f1 = f1_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    # save random forest model (overwrites any previously saved model)
    save_random_forest(random_forest)

    return random_forest, f1, accuracy

def predict_random_forest(review):
    """Return the saved model's class probabilities for a single review.

    The review is vectorized with transform_review, the saved random forest
    is loaded from disk, and the first (only) row of predict_proba is
    returned.
    """
    features = transform_review(review)
    model = load_random_forest()
    class_probabilities = model.predict_proba(features)
    return class_probabilities[0]

def predict_multiple_random_forest(reviews, prob=False):
    """Predict sentiment for many reviews and time each prediction.

    Each review is preprocessed exactly like the training data (clean tokens,
    Snowball stemming, stopword removal, re-joined into one string), then
    vectorized with the saved transformers and scored by the saved model.

    Args:
        reviews: Iterable of raw review strings.
        prob: When True, collect the predict_proba row for each review;
            otherwise collect the predicted label.

    Returns:
        A tuple (preds, stats). preds holds one prediction per review. stats
        is [mean, median, range, variance, stdev] of the per-review
        wall-clock time in seconds (preprocessing + vectorizing +
        predicting). Entries that are undefined for the number of reviews
        given (all five for no reviews; variance and stdev for a single
        review) are NaN instead of raising.
    """
    # load stemmer, vectorizers, and model
    snowball_stemmer = SnowballStemmer(language='english')
    count_vectorizer = load_count_vectorizer()
    tfidf_transformer = load_tfidf_transformer()
    random_forest = load_random_forest()

    # Load the stopwords once, outside the timed loop; the original re-read
    # the corpus for every review, which inflated every timing sample.
    try:
        stopword_list = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopword_list = stopwords.words('english')
    eng_stopwords = frozenset(stopword_list)

    predict = random_forest.predict_proba if prob else random_forest.predict

    # make predictions
    preds = []
    pred_times = []
    for review in reviews:
        start = time.time()

        # preprocess review
        stemmed = (snowball_stemmer.stem(token)
                   for token in get_clean_tokens(review))
        processed_review = ' '.join(
            token for token in stemmed if token not in eng_stopwords)

        # Vectorize the PREPROCESSED text. The original vectorized the raw
        # review and silently discarded the tokens computed above, so the
        # model saw inputs unlike the text it was trained on.
        review_counts = count_vectorizer.transform([processed_review])
        review_tfidf = tfidf_transformer.transform(review_counts)

        # make prediction on review
        preds.append(predict(review_tfidf)[0])
        pred_times.append(time.time() - start)

    # compute timing stats; `rng` avoids shadowing the builtin `range`
    nan = float('nan')
    if pred_times:
        mean = statistics.mean(pred_times)
        median = statistics.median(pred_times)
        rng = max(pred_times) - min(pred_times)
    else:
        mean = median = rng = nan
    if len(pred_times) >= 2:
        variance = statistics.variance(pred_times)
        stdev = statistics.stdev(pred_times)
    else:
        # sample variance/stdev need at least two data points
        variance = stdev = nan

    stats = [mean, median, rng, variance, stdev]
    return preds, stats

In [None]:
# define constants
# Number of repetitions of each timed experiment below. The experiments pass
# the per-iteration results to statistics.variance / statistics.stdev, which
# require at least two data points, so keep this >= 2.
ITERATIONS = 5

def test_data_processing_model_fitting(holdout):
    """Time data processing and model fitting over ITERATIONS runs.

    Each iteration calls process_data(holdout) and fit_random_forest, and
    records the elapsed seconds of each step plus the model's F1 and
    accuracy. Every metric row is extended with its mean, median, range,
    variance and stdev, and the table is written to
    ./data_processing_model_fitting_holdout=<holdout>.csv.
    """
    def _with_summary(values):
        # per-iteration values followed by their summary statistics
        return values + [
            statistics.mean(values),
            statistics.median(values),
            max(values) - min(values),
            statistics.variance(values),
            statistics.stdev(values),
        ]

    processing_secs, fit_secs, f1_scores, accuracies = [], [], [], []

    # run iterations and record results
    print('Testing data processing time and model fitting')
    for _ in range(ITERATIONS):
        # process data
        tic = time.time()
        x_train, x_test, y_train, y_test = process_data(holdout)
        toc = time.time()
        processing_secs.append(toc - tic)

        # fit random forest model (timing includes evaluation and saving)
        tic = time.time()
        _, f1, accuracy = fit_random_forest(x_train, x_test, y_train, y_test)
        toc = time.time()
        fit_secs.append(toc - tic)
        f1_scores.append(f1)
        accuracies.append(accuracy)

    # one row per metric, iteration values first and summary stats after
    rows = [
        _with_summary(series)
        for series in (processing_secs, fit_secs, f1_scores, accuracies)
    ]
    row_labels = [
        'data processing time (s)',
        'random forest fit time (s)',
        'random forest f1',
        'random forest accuracy',
    ]
    column_labels = [
        'iteration_' + str(number) for number in range(1, ITERATIONS + 1)
    ] + ['mean', 'median', 'range', 'variance', 'stdev']

    # save result as CSV
    results = pd.DataFrame(rows, row_labels, column_labels)
    results.to_csv(
        f'./data_processing_model_fitting_holdout={holdout}.csv')

def test_load_times(holdout):
    """Time loading of the preprocessing objects and of the saved model.

    Measures, ITERATIONS times each, how long it takes to (a) construct the
    stemmer and load both pickled vectorizers and (b) load the saved random
    forest. Each row is extended with its mean, median, range, variance and
    stdev and written to ./load_times_holdout=<holdout>.csv. ``holdout`` is
    only used in the output file name.
    """
    def _with_summary(values):
        # per-iteration values followed by their summary statistics
        return values + [
            statistics.mean(values),
            statistics.median(values),
            max(values) - min(values),
            statistics.variance(values),
            statistics.stdev(values),
        ]

    # test load times for stemmer and vectorizers
    print('Testing load time for stemmer and vectorizers')
    preprocessing_secs = []
    for _ in range(ITERATIONS):
        tic = time.time()
        SnowballStemmer(language='english')
        load_count_vectorizer()
        load_tfidf_transformer()
        toc = time.time()
        preprocessing_secs.append(toc - tic)
    preprocessing_row = _with_summary(preprocessing_secs)

    # test load times for random forest model
    print('Testing load time for random forest model')
    model_secs = []
    for _ in range(ITERATIONS):
        tic = time.time()
        load_random_forest()
        toc = time.time()
        model_secs.append(toc - tic)
    model_row = _with_summary(model_secs)

    # save result as CSV
    row_labels = [
        'stemmer and vectorizers load time (s)',
        'random forest load time (s)',
    ]
    column_labels = [
        'iteration_' + str(number) for number in range(1, ITERATIONS + 1)
    ] + ['mean', 'median', 'range', 'variance', 'stdev']
    load_times = pd.DataFrame(
        [preprocessing_row, model_row], row_labels, column_labels)
    load_times.to_csv(f'./load_times_holdout={holdout}.csv')

def test_predictions_reviews(holdout):
    """Score the saved model on the first 35000 reviews of the JSON file.

    Reads the first chunk of the review file, labels a review positive when
    stars >= 4, predicts every review with predict_multiple_random_forest,
    and writes accuracy, F1 and the prediction-time statistics to
    ./prediction_stats_holdout=<holdout>.csv. ``holdout`` is only used in the
    output file name.
    """
    # load data from the JSON-lines file, one chunk of `sample_size` rows
    sample_size = 35000
    column_types = {'review_id': str, 'user_id': str,
                    'business_id': str, 'stars': int,
                    'date': str, 'text': str, 'useful': int,
                    'funny': int, 'cool': int}
    reader = pd.read_json('yelp_academic_dataset_review.json', lines=True,
                          dtype=column_types, chunksize=sample_size)

    # keep only the first chunk
    first_chunk = []
    for chunk in reader:
        first_chunk.append(chunk)
        break
    sample = pd.concat(first_chunk)
    sample['label'] = np.where(sample['stars'] >= 4, 1, 0)

    texts = sample['text']
    truth = sample['label']

    # make predictions with random forest model
    preds, timing_stats = predict_multiple_random_forest(texts)
    accuracy = accuracy_score(truth, preds)
    f1 = f1_score(truth, preds)
    result_row = [accuracy, f1] + timing_stats

    # save result as CSV
    row_labels = ['random forest']
    column_labels = [
        'accuracy',
        'f1',
        'mean_prediction_time',
        'median_prediction_time',
        'range_prediction_time',
        'variance_prediction_time',
        'stdev_prediction_time',
    ]
    prediction_stats = pd.DataFrame([result_row], row_labels, column_labels)
    prediction_stats.to_csv(f'./prediction_stats_holdout={holdout}.csv')

In [None]:
# run experiments
# For each holdout fraction: time data processing + fitting, time artifact
# loading, then score the saved model on a review sample.
# NOTE: process_data and fit_random_forest write the vectorizers and the model
# to fixed file paths on every iteration, so the load-time and prediction
# tests for a holdout use the artifacts from that holdout's final iteration,
# and each holdout overwrites the artifacts of the previous one.
holdouts = [0.4, 0.3, 0.2]
for holdout in holdouts: 
    test_data_processing_model_fitting(holdout)
    test_load_times(holdout)
    test_predictions_reviews(holdout)

Testing data processing time and model fitting


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Testing load time for stemmer and vectorizers
Testing load time for random forest model
Testing data processing time and model fitting


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Testing load time for stemmer and vectorizers
Testing load time for random forest model
Testing data processing time and model fitting


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Testing load time for stemmer and vectorizers
Testing load time for random forest model


In [None]:
def predict_sentiment(data):
    """Add random forest sentiment probabilities and labels to a review frame.

    Every review in data['text'] is preprocessed exactly like the training
    data (clean tokens, Snowball stemming, stopword removal, re-joined into
    one string), vectorized with the saved transformers and scored by the
    saved model. The columns 'RF Positive', 'RF Negative' and 'Output'
    (1 when positive >= negative, else 0) are added to ``data`` in place.

    Args:
        data: DataFrame with a 'text' column of raw review strings.

    Returns:
        A tuple (data, percent_positive). percent_positive is the mean of the
        'Output' column, i.e. a fraction between 0 and 1 rather than a value
        on a 0-100 scale.
    """
    # load stemmer, vectorizers and model
    snowball_stemmer = SnowballStemmer(language='english')
    count_vectorizer = load_count_vectorizer()
    tfidf_transformer = load_tfidf_transformer()
    random_forest = load_random_forest()

    # load the stopwords once, downloading the corpus only if it is missing
    try:
        stopword_list = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopword_list = stopwords.words('english')
    eng_stopwords = frozenset(stopword_list)

    def _preprocess(review):
        # clean -> stem -> drop stopwords -> single string
        stemmed = (snowball_stemmer.stem(token)
                   for token in get_clean_tokens(review))
        return ' '.join(
            token for token in stemmed if token not in eng_stopwords)

    # Vectorize the PREPROCESSED text. The original vectorized the raw review
    # and discarded the tokens it had just computed, so the model saw inputs
    # unlike the text it was trained on.
    processed_reviews = [_preprocess(review) for review in data['text']]

    if processed_reviews:
        # score all reviews in one batch instead of one call per row
        review_counts = count_vectorizer.transform(processed_reviews)
        review_tfidf = tfidf_transformer.transform(review_counts)
        probabilities = random_forest.predict_proba(review_tfidf)
        rf_predictions_neg = probabilities[:, 0]
        rf_predictions_pos = probabilities[:, 1]
    else:
        # nothing to score; keep the empty columns the original produced
        rf_predictions_neg = []
        rf_predictions_pos = []

    # append prediction probabilities into dataframe
    data["RF Positive"] = rf_predictions_pos
    data["RF Negative"] = rf_predictions_neg

    data["Output"] = np.where(data['RF Positive'] >= data["RF Negative"], 1, 0)

    # print share of positive reviews
    percent_positive = data["Output"].mean()
    print("Percentage of positive reviews: ", percent_positive)

    return data, percent_positive