In [101]:
import string
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# nltk.download('wordnet')

In [111]:
data = 'data/WASSA23_conv_level_with_labels_train.tsv'
df = pd.read_table(data, header=0)
new_col = []
for names in df.columns:
    new_col.append(names.strip())
df.columns = new_col
df.drop(["conversation_id", "turn_id", "speaker_number", "article_id", "speaker_id", "essay_id"], axis=1, inplace=True)

X_data, y_data = df.loc[:, 'text'], df.drop('text', axis=1)
X_train, X_test, y_train , y_test = train_test_split(X_data, y_data, train_size=0.8)
#reset index of training examples
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

- tokenization
- remove stop word and punctuatuons, numbers
- lematization
- vectorization

In [113]:
def word_preprocessor(sentence):
    stop_words = set(stopwords.words('english'))
    punctuations = set(string.punctuation)
    lem = WordNetLemmatizer().lemmatize
    sentence = word_tokenize(sentence)
    sentence = [word for word in sentence if word not in stop_words]
    sentence = [word for word in sentence if word not in punctuations]
    sentence_str = ' '.join(sentence)
    sentence = lem(sentence_str)
    return sentence #sentence

In [114]:
X_train = X_train.apply(word_preprocessor)
X_test = X_test.apply(word_preprocessor)

#convert labels to array
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train[['EmotionalPolarity', 'Emotion', 'Empathy']]), np.array(y_test[['EmotionalPolarity', 'Emotion', 'Empathy']])


In [106]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((7020,), (7020, 3), (1756,), (1756, 3))

In [115]:
regressor = make_pipeline(  
                        TfidfVectorizer(max_features=2048),
                        MultiOutputRegressor(Ridge())
                    )
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'MeanSquaredError: \t {mse} \nMeanAbsoluteError: \t {mae}')

MeanSquaredError: 	 0.374645740814843 
MeanAbsoluteError: 	 0.47661186823869245


In [116]:
y_pred[0:8], y_test[0:8]

(array([[1.66722709, 2.70281438, 2.72173451],
        [1.107097  , 1.71396771, 2.06135491],
        [0.83060879, 1.06831171, 0.7853409 ],
        [0.87883834, 1.06893675, 0.82497452],
        [1.22578631, 1.74892441, 1.12033538],
        [0.96092011, 2.30867005, 2.10327623],
        [1.37128991, 2.22366788, 1.77892981],
        [1.51079045, 2.53741374, 2.97419865]]),
 array([[2.    , 3.    , 3.6667],
        [2.    , 3.    , 3.3333],
        [1.    , 1.    , 0.6667],
        [1.    , 1.    , 1.    ],
        [1.    , 1.    , 2.    ],
        [1.6667, 3.    , 2.    ],
        [1.3333, 2.3333, 2.3333],
        [1.6667, 3.    , 2.6667]]))