## Neighbourhood-based Method

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from collections import Counter
import os
import pathlib
import time
import operator
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
from sklearn.metrics import classification_report
import zipfile
import gdown

In [None]:
# Fetch the NLTK resources this notebook requests, one download call per
# resource, in the same order as before.
for resource_name in ('wordnet', 'punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource_name)

In [None]:
# Determines whether two words are considered a phrase according to 'Thumbs Up or Thumbs Down? 
# Semantic Orientation Applied to Unsupervised Classification of Reviews' by Peter D. Turney, 2002.

def is_phrase(word_index, words):
    """Return True if the word pair starting at word_index forms a phrase.

    The pair (words[word_index], words[word_index + 1]) is matched against
    the five part-of-speech patterns of Turney (2002):

        first word        second word           third word (not extracted)
        JJ                NN / NNS              anything
        RB / RBR / RBS    JJ                    not NN nor NNS
        JJ                JJ                    not NN nor NNS
        NN / NNS          JJ                    not NN nor NNS
        RB / RBR / RBS    VB / VBD / VBN / VBG  anything

    Every word is tagged on its own with nltk.pos_tag, i.e. without its
    sentence context.

    Args:
        word_index: Position in `words` of the first word of the pair.
        words: Sequence of word tokens.

    Returns:
        True if the tags match one of the patterns, False otherwise. A pair
        whose second word would lie past the end of `words` is not a phrase.
        When there is no third word, the "not NN nor NNS" constraint counts
        as satisfied.
    """
    noun_tags = ('NN', 'NNS')
    adverb_tags = ('RB', 'RBR', 'RBS')
    verb_tags = ('VB', 'VBD', 'VBN', 'VBG')

    # Without a second word there is no pair to classify.
    if word_index + 1 >= len(words):
        return False

    first_tag = nltk.pos_tag([words[word_index]])[0][1]
    second_tag = nltk.pos_tag([words[word_index + 1]])[0][1]

    # Patterns that accept any following word.
    if first_tag == 'JJ' and second_tag in noun_tags:
        return True
    if first_tag in adverb_tags and second_tag in verb_tags:
        return True

    # Remaining patterns all end in an adjective that must not be followed
    # by a noun.
    if second_tag == 'JJ' and (first_tag == 'JJ'
                               or first_tag in adverb_tags
                               or first_tag in noun_tags):
        # The pair closes the word list: nothing follows, so the constraint
        # on the third word holds trivially.
        if word_index + 2 >= len(words):
            return True
        # The third word is only tagged when a pattern actually needs it.
        third_tag = nltk.pos_tag([words[word_index + 2]])[0][1]
        return third_tag not in noun_tags

    # If no correct combination is found, the word pair is not a phrase.
    return False

In [None]:
# For each utterance that wasn't assigned a score (either by not containing a phrase at all
# or by not containing any with enough positive and negative words surrounding it), the mean
# of the distance-discounted scores of the nearest scored previous and next utterances is used.
# If no utterance has a score at all, the value 1 is used instead.

def fill_nans(df):
    """Fill missing values in df['predicted_score'] from neighbouring rows.

    For every row whose score is NaN, the nearest scored row before it and
    the nearest scored row after it are looked up. Each of those scores is
    discounted by 1 / (distance + 1), where distance is the number of rows
    between the two positions, and the mean of the discounted values that
    exist replaces the NaN. If there is no scored row on either side, the
    value 1 is used.

    Only scores that were present on entry are used as sources; values
    filled in by this call never feed into other rows.

    Rows are addressed by position, so the frame's index labels do not
    matter.

    Args:
        df: DataFrame with a numeric 'predicted_score' column.

    Returns:
        The same DataFrame object, with 'predicted_score' replaced in place.
    """
    scores = df['predicted_score'].to_numpy(dtype=float)
    n_rows = len(scores)
    has_score = ~np.isnan(scores)

    # nearest_before[i] / nearest_after[i]: position of the closest scored
    # row strictly before / after i, or -1 when there is none. Two linear
    # passes replace the per-row outward scans.
    nearest_before = [-1] * n_rows
    last_seen = -1
    for i in range(n_rows):
        nearest_before[i] = last_seen
        if has_score[i]:
            last_seen = i

    nearest_after = [-1] * n_rows
    last_seen = -1
    for i in range(n_rows - 1, -1, -1):
        nearest_after[i] = last_seen
        if has_score[i]:
            last_seen = i

    filled = scores.copy()
    for i in range(n_rows):
        if has_score[i]:
            continue
        discounted = []
        for j in (nearest_before[i], nearest_after[i]):
            if j >= 0:
                discounted.append(1 / (abs(i - j) + 1) * scores[j])
        # No scored row anywhere in the frame: fall back to 1.
        filled[i] = np.mean(discounted) if discounted else 1

    df['predicted_score'] = filled
    return df

In [None]:
# Assigns a binary sentiment label (0.0 or 1.0) to each utterance in the dataframe

def add_scores(df, NEAR, pos_words, neg_words):
    """Add a binary 'predicted_score' column (0.0 or 1.0) to df.

    All utterances in df.text are tokenised and concatenated into one word
    list (the tokens '.', ',', '?', '!' and "'" are dropped). Every
    two-word phrase accepted by is_phrase is scored from the positive and
    negative words found among the NEAR words before and the NEAR words
    after it:

        score = log((pos_near * neg_total) / (neg_near * pos_total))

    where each count has 0.01 added to avoid division by zero. A phrase only
    receives a score when its neighbourhood contains at least two positive
    and at least two negative words. A phrase belongs to the utterance that
    contains its first word, and an utterance's score is the mean of its
    phrase scores. Utterances without any score are filled by fill_nans.
    Scores are then rescaled by the largest absolute score and thresholded
    at 0.5.

    Args:
        df: DataFrame with a 'text' column of utterance strings. It is
            modified in place.
        NEAR: Number of words taken on each side of a phrase.
        pos_words: Positive words; matched against lower-cased tokens.
        neg_words: Negative words; matched against lower-cased tokens.

    Returns:
        df with the 'predicted_score' column set.
    """
    punctuation = {'.', ',', '?', '!', '\''}
    df['predicted_score'] = np.nan
    if len(df) == 0:
        # Nothing to score; also avoids max() on an empty column below.
        return df

    # Flat list of all words, plus for every word the position of the
    # utterance it came from, so phrase scores land on the right row even
    # when an utterance contributes no tokens.
    all_words = []
    word_utt = []
    for utt_pos, utt in enumerate(df.text):
        tokens = [w for w in nltk.word_tokenize(utt) if w not in punctuation]
        all_words += tokens
        word_utt += [utt_pos] * len(tokens)

    # Lower-cased copy for word-list matching; is_phrase still sees the
    # original casing.
    lowered = [w.lower() for w in all_words]

    # Checks how many positive and negative words the entire text contains
    word_counter = Counter(lowered)
    pos_hits = sum(word_counter[p] for p in pos_words) + 0.01
    neg_hits = sum(word_counter[n] for n in neg_words) + 0.01

    phrase_scores = [[] for _ in range(len(df))]
    for i in range(len(all_words) - 1):
        if not is_phrase(i, all_words):
            continue
        # Up to NEAR words before the phrase and up to NEAR words after it;
        # slicing clips at the ends of the word list on its own.
        neighbourhood = lowered[max(0, i - NEAR):i] + lowered[i + 2:i + 2 + NEAR]
        neighbourhood_counter = Counter(neighbourhood)
        pos_neigh_hits = sum(neighbourhood_counter[p] for p in pos_words) + 0.01
        neg_neigh_hits = sum(neighbourhood_counter[n] for n in neg_words) + 0.01
        # Applies the function from the paper to calculate the phrase's score
        if pos_neigh_hits > 2 and neg_neigh_hits > 2:
            score = np.log((pos_neigh_hits * neg_hits) / (neg_neigh_hits * pos_hits))
            phrase_scores[word_utt[i]].append(score)

    # One value per row, assigned as a whole column rather than through
    # chained indexing.
    df['predicted_score'] = [np.mean(s) if s else np.nan for s in phrase_scores]

    # Normalizes each predicted sentiment score
    df = fill_nans(df)
    abs_max = df['predicted_score'].abs().max()
    if abs_max:
        df.loc[:, 'predicted_score'] += abs_max
        df.loc[:, 'predicted_score'] /= (2 * abs_max)
    df.loc[df.predicted_score >= 0.5, 'predicted_score'] = 1.0
    df.loc[df.predicted_score < 0.5, 'predicted_score'] = 0.0

    return df

In [None]:
# Returns a classification report that shows the performance of the predicted sentiment scores
# when compared to the actual sentiment scores. Takes n randomly chosen podcast episodes with at
# least min_len utterances from a folder with labeled csvs, or all of them if n is too large
# and outputs the classification report.

def compare_scores(labeled_folder, NEAR, pos_words, neg_words, n, min_len, output_dict):
    """Score labeled episodes and report how well the predictions match.

    Files directly inside labeled_folder are read in random order as
    tab-separated CSVs. The first n files with at least min_len rows are
    kept (all qualifying files if there are fewer than n). Each kept frame
    is scored with add_scores, and the 'sentiment_score' column is compared
    with the resulting 'predicted_score' column over all kept frames
    together.

    Args:
        labeled_folder: Path of the folder holding the labeled files.
        NEAR: Neighbourhood size passed on to add_scores.
        pos_words: Positive word list passed on to add_scores.
        neg_words: Negative word list passed on to add_scores.
        n: Maximum number of episodes to evaluate.
        min_len: Minimum number of rows an episode needs to be used.
        output_dict: Passed to classification_report as `output_dict`.

    Returns:
        The result of sklearn's classification_report.
    """
    # Hides warnings
    pd.options.mode.chained_assignment = None

    # Gets all file names within the folder
    filenames = [f for f in listdir(labeled_folder) if isfile(join(labeled_folder, f))]
    np.random.shuffle(filenames)

    # Looks for up to n podcast episodes with at least min_len utterances
    dfs = []
    for filename in filenames:
        candidate = pd.read_csv(join(labeled_folder, filename), sep='\t')
        if len(candidate) >= min_len:
            dfs.append(candidate)
            if len(dfs) == n:
                break

    # Every selected episode goes through the same path. A single selected
    # episode is just the one-element case, so the frame that is scored is
    # always one that passed the min_len filter.
    total_scores = []
    total_predicted = []
    for episode in dfs:
        episode = add_scores(episode, NEAR, pos_words, neg_words)
        total_scores += list(episode.sentiment_score.values)
        total_predicted += list(episode.predicted_score.values)
    return classification_report(total_scores, total_predicted, zero_division=0,
                                 output_dict=output_dict)

In [None]:
# Obtains the best performing NEAR value and samples of the positive and negative word lists
# by randomly generating n parameter sets, doing a brief performance check on each, and then
# more extensively testing those that initially performed well. Returns the best
# parameter set together with its accuracy.

def optimize_params(n, pos_words, neg_words, labeled_folder='separate_csv_files'):
    """Randomly search for a well-performing NEAR value and word samples.

    n candidate parameter sets are generated at random. Each is checked on a
    single episode (at least 50 utterances); candidates that predict both
    classes and pass the accuracy and per-class recall thresholds are then
    evaluated on up to 30 episodes, and the one with the highest accuracy
    wins.

    Args:
        n: Number of random parameter sets to try.
        pos_words: Positive words to sample from.
        neg_words: Negative words to sample from.
        labeled_folder: Folder with the labeled files handed to
            compare_scores.

    Returns:
        A pair (best_params, best_accuracy). best_params is a tuple
        (NEAR, pos_sample, neg_sample), or an empty list when no candidate
        reached an accuracy above 0.0; best_accuracy is then 0.0.
    """
    best_params = []
    best_accuracy = 0.0
    good_params = []

    for _ in range(n):
        # NEAR is a multiple of 50 between 50 and 950.
        NEAR = np.random.randint(1, 20) * 50
        # The sample size is drawn from the positive list; the negative
        # sample is capped at the length of the negative list so that
        # sampling without replacement cannot ask for more words than exist.
        sample_size = np.random.randint(len(pos_words) - 3, len(pos_words))
        pos_sample = np.random.choice(pos_words, sample_size, replace=False)
        neg_sample = np.random.choice(neg_words, min(sample_size, len(neg_words)),
                                      replace=False)
        metrics = compare_scores(labeled_folder, NEAR,
                                 pos_sample, neg_sample, 1, 50, True)
        # Parameter sets that predict both positive and negative sentiments and have decent
        # metrics in this initial test are considered for more extensive testing
        if '0.0' in metrics and '1.0' in metrics:
            if (metrics['accuracy'] > 0.6
                    and metrics['0.0']['recall'] > 0.45
                    and metrics['1.0']['recall'] > 0.45):
                good_params.append((NEAR, pos_sample, neg_sample))

    # Every shortlisted set is evaluated once more on up to 30 episodes, and
    # the set with the best accuracy in that evaluation is returned.
    for params in good_params:
        NEAR, pos_sample, neg_sample = params
        accuracy = compare_scores(labeled_folder, NEAR,
                                  pos_sample, neg_sample, 30, 50, True)['accuracy']
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = (NEAR, pos_sample, neg_sample)
    return best_params, best_accuracy

In [None]:
# Downloads the labeled .csv-files from Google Drive to use for validation.

# Fetch the zip archive with the labeled files from Google Drive ...
gdown.download(
    'https://drive.google.com/uc?id=1aqE8yS7Lf8GfljmFEuW5pd3i5S2raW1B',
    'separate_csv_files.zip',
    quiet=False,
)
# ... and unpack all of its members, using '' as the extraction path.
with zipfile.ZipFile('separate_csv_files.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='')

In [None]:
# The search that produced the parameter set below is kept for reference
# but left disabled:
#
#   pos_words = ['good', 'great', 'nice', 'happy', 'easy', 'yes', 'yeah', 'love',
#                'big', 'right', 'awesome', 'thank', 'thanks']
#   neg_words = ['bad', 'horrible', 'tough', 'sad', 'hard', 'no', 'shit', 'hate',
#                'little', 'wrong', 'stupid', 'sorry']
#   optimized_params, best_accuracy = optimize_params(2000, pos_words, neg_words)

# (NEAR, positive word sample, negative word sample)
optimized_params = (
    350,
    ['yes', 'yeah', 'love', 'easy', 'great', 'nice', 'happy', 'awesome',
     'thanks', 'thank'],
    ['little', 'wrong', 'hard', 'stupid', 'horrible', 'shit', 'sad',
     'sorry', 'no', 'tough'],
)

# Text report over up to 5 episodes that have at least 150 utterances each.
print(compare_scores('separate_csv_files', *optimized_params, 5, 150, False))