In [1]:
import sys
# Put the parent directory on the import path. The `utils.*` imports in the
# next cell presumably rely on this — TODO confirm the notebook's location
# relative to the `utils` package.
sys.path.append('../')

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

from utils.utils import Utils
from utils.preprocess import Preprocess

In [None]:
%%time
# Instantiate the two project helpers used by every later cell:
# `utils` provides read_data, get_dict and get_essay_empathy_distress_scores;
# `pre` provides clean_text.
utils = Utils()
# NOTE(review): the meaning of mode="clean" is defined in utils/preprocess.py,
# which is not visible from this notebook — confirm.
pre = Preprocess(mode="clean")

In [None]:
def compute_correlation(y_true, y_pred):
    """Compute the Pearson correlation between gold and predicted scores.

    Both inputs are converted to NumPy arrays and flattened to 1-D before
    being compared, so column vectors of shape (n, 1), flat arrays, plain
    (nested) lists and pandas Series are all accepted.

    Parameters
    ----------
    y_true : array-like
        Gold scores. Must hold the same number of elements as ``y_pred``.
    y_pred : array-like
        Predicted scores.

    Returns
    -------
    The object returned by ``scipy.stats.pearsonr``; its first element is
    the correlation coefficient and its second the p-value.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.pearsonr``, e.g. when the flattened
        inputs differ in length.
    """
    # np.asarray(...).ravel() rather than .flatten(): the latter only exists
    # on ndarrays, so lists and pandas Series would raise AttributeError.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return pearsonr(y_true, y_pred)

In [None]:
# Load the training CSV through the project helper and display its shape.
# NOTE(review): read_data's return type is not visible here; later cells read
# `.essay` and `.gold_empathy` from it, so it is presumably a DataFrame.
train_data = utils.read_data("../dataset/train/train-final.csv")
train_data.shape

In [None]:
# Load the empathy lexicon as a lookup: keys come from the "words" column and
# values from the "weights" column of the CSV. The cell output is the number
# of entries loaded.
word_weights = utils.get_dict(
    "../resources/word-weights/empathy_word_weights.csv",
    key_column="words",
    value_column="weights",
)
len(word_weights)

# Normal Weights

## Training Data

In [None]:
# Clean every training essay (stop words kept, lemmatization on), then score
# the cleaned essays against the plain word weights.
train_essay = [
    pre.clean_text(raw_essay, remove_stopwords=False, lemmatize=True)
    for raw_essay in train_data.essay.values.tolist()
]
train_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    train_essay, word_weights
)
train_essay_empathy_scores.shape

In [None]:
# Sanity check: show the first ten training-essay scores.
train_essay_empathy_scores[:10]

In [None]:
# Gold empathy labels as an (n, 1) column vector.
# The column is converted to a list only once; -1 lets NumPy infer the row
# count instead of running a second .values.tolist() just to take its length.
train_empathy = np.asarray(train_data.gold_empathy.values.tolist()).reshape(-1, 1)
train_empathy.shape

In [None]:
# Correlate the gold empathy labels with the lexicon-based essay scores on the
# training data. compute_correlation returns the scipy pearsonr result as-is.
train_cor = compute_correlation(train_empathy, train_essay_empathy_scores)
train_cor

## Dev Data

In [None]:
# Load the dev CSV through the project helper and display its shape.
# The shape is left as the cell's last expression (not print()) so it is
# rendered the same way as the training-data cell.
dev_data = utils.read_data("../dataset/dev/dev-final.csv")
dev_data.shape

In [None]:
# Clean every dev essay (stop words kept, lemmatization on), then score the
# cleaned essays against the plain word weights.
dev_essay = [
    pre.clean_text(raw_essay, remove_stopwords=False, lemmatize=True)
    for raw_essay in dev_data.essay.values.tolist()
]
dev_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    dev_essay, word_weights
)
dev_essay_empathy_scores.shape

In [None]:
# Sanity check: show the first ten dev-essay scores.
dev_essay_empathy_scores[:10]

In [None]:
# Gold empathy labels of the dev data as an (n, 1) column vector.
# The column is converted to a list only once; -1 lets NumPy infer the row
# count instead of running a second .values.tolist() just to take its length.
dev_empathy = np.asarray(dev_data.gold_empathy.values.tolist()).reshape(-1, 1)
dev_empathy.shape

In [None]:
# Correlate the gold empathy labels with the lexicon-based essay scores on the
# dev data. compute_correlation returns the scipy pearsonr result as-is.
dev_cor = compute_correlation(dev_empathy, dev_essay_empathy_scores)
dev_cor

# Tan-Inverse Weights

In [None]:
# Load the same lexicon CSV again, this time taking the values from the
# "weights-arctan" column. The cell output is the number of entries loaded.
arctan_word_weights = utils.get_dict(
    "../resources/word-weights/empathy_word_weights.csv",
    key_column="words",
    value_column="weights-arctan",
)
len(arctan_word_weights)

## Training Data

In [None]:
# Re-clean the training essays with the same settings as in the
# "Normal Weights" section, then score them with the arctan weights.
# This overwrites train_essay_empathy_scores from the earlier section.
train_essay = [
    pre.clean_text(raw_essay, remove_stopwords=False, lemmatize=True)
    for raw_essay in train_data.essay.values.tolist()
]
train_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    train_essay, arctan_word_weights
)
train_essay_empathy_scores.shape

In [None]:
# Sanity check: show the first ten training-essay scores. When the notebook is
# run top to bottom, the variable holds the arctan-weight scores at this point.
train_essay_empathy_scores[:10]

In [None]:
# Gold empathy labels of the training data as an (n, 1) column vector.
# The column is converted to a list only once; -1 lets NumPy infer the row
# count instead of running a second .values.tolist() just to take its length.
train_empathy = np.asarray(train_data.gold_empathy.values.tolist()).reshape(-1, 1)
train_empathy.shape

In [None]:
# Correlate the gold empathy labels with the essay scores on the training
# data. When run in order, the scores here come from the arctan weights, and
# train_cor from the "Normal Weights" section is overwritten.
train_cor = compute_correlation(train_empathy, train_essay_empathy_scores)
train_cor

## Dev Data

In [None]:
# Re-clean the dev essays with the same settings as in the "Normal Weights"
# section, then score them with the arctan weights.
# This overwrites dev_essay_empathy_scores from the earlier section.
dev_essay = [
    pre.clean_text(raw_essay, remove_stopwords=False, lemmatize=True)
    for raw_essay in dev_data.essay.values.tolist()
]
dev_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    dev_essay, arctan_word_weights
)
dev_essay_empathy_scores.shape

In [None]:
# Sanity check: show the first ten dev-essay scores. When the notebook is run
# top to bottom, the variable holds the arctan-weight scores at this point.
dev_essay_empathy_scores[:10]

In [None]:
# Gold empathy labels of the dev data as an (n, 1) column vector.
# The column is converted to a list only once; -1 lets NumPy infer the row
# count instead of running a second .values.tolist() just to take its length.
dev_empathy = np.asarray(dev_data.gold_empathy.values.tolist()).reshape(-1, 1)
dev_empathy.shape

In [None]:
# Correlate the gold empathy labels with the essay scores on the dev data.
# When run in order, the scores here come from the arctan weights, and
# dev_cor from the "Normal Weights" section is overwritten.
dev_cor = compute_correlation(dev_empathy, dev_essay_empathy_scores)
dev_cor