In [168]:
import string
from matplotlib import transforms
import pandas as pd
import numpy as np
from torchtext.data.utils import get_tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# nltk.download('wordnet')

In [227]:
data = 'data/WASSA23_conv_level_with_labels_train.tsv'
df = pd.read_table(data, header=0)
new_col = []
for names in df.columns:
    new_col.append(names.strip())
df.columns = new_col
df.drop(["conversation_id", "turn_id", "speaker_number", "article_id", "speaker_id", "essay_id"], axis=1, inplace=True)

X_data, y_data = df.loc[:, 'text'], df.drop('text', axis=1) #df.loc[:,'Emotion']
X_train, X_test, y_train , y_test = train_test_split(X_data, y_data, train_size=0.8)
#reset index of training examples
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [199]:
X_train, X_test, y_train, y_test

(0       yeah they'd probably just switch to knives    ...
 1       I understood your point.  I just didn't really...
 2       it's terrible. i would most likely not anymore...
 3       Yea. I find that pacifying to an extent. I don...
 4       The whole thing is sad. I wonder what protocol...
                               ...                        
 7015    Look at all the rules at the airport and the p...
 7016    yeah I think they're up there with the worst c...
 7017    Yes, it's pretty draining.  I think people wou...
 7018    bye                                           ...
 7019    that's so crazy to try to wrap my head around....
 Name: text, Length: 7020, dtype: object,
 0       expand on what? how women deserve less pay?   ...
 1       I agree with you                              ...
 2       true it is amazing to read about              ...
 3       It changes how you feel and how you live.   Fo...
 4       I really wish I can do more                   ...
              

- tokenization
- remove stop word and punctuatuons, numbers
- lematization
- vectorization

In [200]:
def word_preprocessor(sentence):
    tok = get_tokenizer("basic_english")
    stop_words = set(stopwords.words('english'))
    punctuations = set(string.punctuation)
    lem = WordNetLemmatizer().lemmatize

    sentence = tok(sentence)
    sentence = [word for word in sentence if word not in stop_words]
    sentence = [word for word in sentence if word not in punctuations]
    sentence_str = ' '.join(sentence)
    sentence = lem(sentence_str)
    return sentence

In [228]:
X_train = X_train.apply(word_preprocessor)
X_test = X_test.apply(word_preprocessor)

#convert labels to array
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train[['EmotionalPolarity', 'Emotion', 'Empathy']]), np.array(y_test[['EmotionalPolarity', 'Emotion', 'Empathy']])


X_train = np.reshape(X_train, (X_train.shape[0],1))
X_test = np.reshape(X_test, (X_test.shape[0],1))
X_train

In [234]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((7020,), (7020, 3), (1756,), (1756, 3))

In [231]:
regressors = []

for i in range(y_train.shape[1]):
    regressor = make_pipeline(  
                        TfidfVectorizer(max_features=128),
                        LinearRegression()
                    )
    regressor.fit(X_train, y_train[:,i])
    regressors.append(regressor)
    
y_preds = [regressor.predict(X_test) for regressor in regressors]
y_pred = np.column_stack(y_preds)


In [233]:
y_test, y_pred

(array([[1.6667, 2.    , 1.    ],
        [2.    , 3.3333, 2.3333],
        [0.    , 2.    , 1.3333],
        ...,
        [1.6667, 2.6667, 2.3333],
        [1.3333, 2.6667, 2.3333],
        [0.3333, 1.6667, 2.6667]]),
 array([[1.27934404, 2.05607   , 1.90572553],
        [1.08407758, 2.36075325, 1.67917248],
        [1.17731092, 2.67748412, 2.5350846 ],
        ...,
        [1.54336128, 2.57310631, 2.1721677 ],
        [1.02477609, 1.95025561, 1.90657692],
        [1.07962897, 2.29965639, 2.35280091]]))

regressor = make_pipeline(  
                        TfidfVectorizer(max_features=128),
                        LinearRegression()
                    )
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

y_pred