In [58]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast

from datetime import datetime

# Date stamp (YYYYMMDD) taken at execution time. It is embedded in both the
# input and output file names below, so the intermediary CSV must carry the
# stamp of the day this notebook is run.
current_date = datetime.today().strftime('%Y%m%d')

INPUT_DATA_DIR = "../data/input"
OUTPUT_DATA_DIR = "../data/output"

# CSV that the scoring cell reads its reviews from (note: it lives in the
# output directory, not the input one).
INTERMEDIARY_CSV = os.path.join(OUTPUT_DATA_DIR, current_date + "_merged_preprocessed.csv")

# NRC emotion lexicon: one file per emotion, each under OneFilePerEmotion/.
NRC_DIR = os.path.join(INPUT_DATA_DIR, "nrc")
NRC_POSITIVE = os.path.join(NRC_DIR, "OneFilePerEmotion", "positive-NRC-Emotion-Lexicon.txt")
NRC_NEGATIVE = os.path.join(NRC_DIR, "OneFilePerEmotion", "negative-NRC-Emotion-Lexicon.txt")

# Lists so that further lexicon files can be added to either polarity.
NRC_ALL_POSITIVE = [NRC_POSITIVE]
NRC_ALL_NEGATIVE = [NRC_NEGATIVE]


# Destination of the scored reviews.
OUTPUT_CSV_FILE = os.path.join(OUTPUT_DATA_DIR, current_date + "_predicted_rating.csv")

# Read the NRC emotion lexicon

In [59]:
def _load_lexicon_words(paths):
    """Collect the words flagged with 1 across the given lexicon files.

    Each line is split on whitespace; the first token is taken as the word
    and the second as an integer flag. Only words whose flag equals 1 are
    kept. Lines with fewer than two tokens (e.g. blank lines) are skipped
    instead of raising IndexError.

    Args:
        paths: Iterable of lexicon file paths.

    Returns:
        A set of words. A set is used because the words are only ever
        tested for membership, which is O(1) on a set versus a linear scan
        on a list.

    Raises:
        ValueError: If the second token of a line is not an integer.
    """
    words = set()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue
                if int(fields[1]) == 1:
                    words.add(fields[0])
    return words


# Words associated with positive / negative sentiment.
positive = _load_lexicon_words(NRC_ALL_POSITIVE)
negative = _load_lexicon_words(NRC_ALL_NEGATIVE)

# Calculate the predicted rating for each review

In [60]:
# Load the intermediary CSV of reviews.
df = pd.read_csv(INTERMEDIARY_CSV)
# 'cleaned_text' is stored in the CSV as the string form of a Python literal;
# parse every cell back into the object it represents.
df['cleaned_text'] = df['cleaned_text'].apply(ast.literal_eval)
def calculate_score(row):
    """Estimate a rating for one review from its sentiment-bearing words.

    Every word of ``row['cleaned_text']`` that appears only in ``positive``
    contributes +5 and every word that appears only in ``negative``
    contributes -1. Words found in both lexicons, or in neither, are ignored.
    The estimate is the mean contribution per counted word, floored at 1.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            'cleaned_text' entry holding an iterable of words.

    Returns:
        -1 if no word matched exactly one lexicon; otherwise the mean
        contribution, or 1 when that mean falls below 1.
    """
    total = 0
    matched = 0
    for token in row['cleaned_text']:
        is_positive = token in positive
        is_negative = token in negative
        if is_positive == is_negative:
            # In both lexicons (ambiguous) or in neither: no contribution.
            continue
        matched += 1
        total += 5 if is_positive else -1

    if not matched:
        # Sentinel: the review carries no usable sentiment word.
        return -1
    average = total / matched
    return 1 if average < 1 else average

def score_to_category(rating, threshold=2.5):
    """Bucket a numeric rating into a 'bad' / 'good' label.

    Args:
        rating: Numeric rating (actual or predicted).
        threshold: Cut-off between the two labels. Defaults to 2.5, the
            value that used to be hard-coded, so existing single-argument
            calls behave exactly as before.

    Returns:
        'bad' when ``rating`` is strictly below ``threshold``, otherwise
        'good' (a rating equal to the threshold is 'good').
    """
    return 'bad' if rating < threshold else 'good'

# Score every review row by row.
df['predicted_rating'] = df.apply(calculate_score, axis = 1)
# Drop the reviews flagged with the -1 sentinel. The explicit .copy() makes
# the filtered frame independent of the original, so the column assignments
# below do not write into a slice (pandas' SettingWithCopyWarning).
df = df[df['predicted_rating'] != -1].copy()

# Label the actual rating ...
df['category'] = df['rating'].apply(score_to_category)

# ... and the predicted one with the same bucketing, so they can be compared.
df['predicted_category'] = df['predicted_rating'].apply(score_to_category)

def rating_round(rating):
    """Round a rating to the nearest integer, with halves rounded up.

    Python 3's built-in ``round`` rounds halves to the nearest even integer
    (2.5 -> 2, 3.5 -> 4, 4.5 -> 4), which makes half-way ratings land
    unevenly. Rounding half up (2.5 -> 3, 4.5 -> 5) is uniform and agrees
    with ``score_to_category``, which already treats 2.5 as 'good'.

    Args:
        rating: Numeric rating.

    Returns:
        The rounded rating as a Python ``int``.
    """
    return int(np.floor(rating + 0.5))

# Whole-number version of the predicted rating, one value per review.
df['predicted_rating_discrete'] = df['predicted_rating'].map(rating_round)

# Write the results to `data/output/<YYYYMMDD>_predicted_rating.csv`

In [62]:

# Persist the scored reviews; index=False keeps the row index out of the file.
df.to_csv(path_or_buf=OUTPUT_CSV_FILE, index=False)