In [107]:
import string
from matplotlib import transforms
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
# from torchvision.transforms import ToTensor
from torchtext.data.utils import get_tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# nltk.download('wordnet')

In [99]:
# Path to the tab-separated training file; its first row is the header.
data = 'data/WASSA23_conv_level_with_labels_train.tsv'
df = pd.read_table(data, header=0)

# Strip surrounding whitespace from the header names so columns can be
# addressed by their plain names below.
new_col = [names.strip() for names in df.columns]
df.columns = new_col

# Drop the identifier columns; only the utterance text and the label columns remain.
df = df.drop(
    columns=["conversation_id", "turn_id", "speaker_number", "article_id", "speaker_id", "essay_id"]
)

# Features are the raw text; everything else is treated as a label column.
X_data, y_data = df.loc[:, 'text'], df.drop(columns='text')

# Fixed random_state so the 80/20 split is identical on every run
# (same seed value the notebook passes to torch.manual_seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, train_size=0.8, random_state=69
)

# Reset the index of both splits so rows are numbered 0..n-1 after shuffling.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [94]:
# Display the four split objects (train/test features, train/test labels) as one tuple.
X_train, X_test, y_train, y_test

(0       I always do that                              ...
 1       I remember hearing he jumped into the water he...
 2       This story is horrible.                       ...
 3       bye                                           ...
 4       it was safe to bathe and like brush your teeth...
                               ...                        
 7015    they need better overall infrastructure. reach...
 7016    bye                                           ...
 7017    yeah no doubt!                                ...
 7018    Hello! How do you feel about this story?      ...
 7019    Is he the one that said he thought muslim secu...
 Name: text, Length: 7020, dtype: object,
 0       I feel awful for them too. I hope scientists a...
 1       They are not easy to come by.  My son-in-law i...
 2       Yes!  It would be absolutely horrible to lost ...
 3       right? It's definitely a luxury that we take f...
 4       why not?                                      ...
              

- tokenization
- remove stop words, punctuation, and numbers
- lemmatization
- vectorization

In [100]:
def word_preprocessor(sentence):
    """Tokenize a sentence, drop stop words and punctuation, and lemmatize each token.

    Args:
        sentence: The raw text string to clean.

    Returns:
        A single string of the remaining lemmatized tokens joined by spaces.
    """
    tok = get_tokenizer("basic_english")
    stop_words = set(stopwords.words('english'))
    punctuations = set(string.punctuation)
    lem = WordNetLemmatizer().lemmatize

    # Lemmatize token by token. Passing the whole joined sentence to the
    # lemmatizer makes it look the sentence up as one "word", which leaves
    # any multi-word input unchanged.
    # NOTE(review): the notebook's preprocessing outline also lists number
    # removal; numeric tokens are still kept here — confirm whether they
    # should be filtered as well.
    tokens = [
        lem(word)
        for word in tok(sentence)
        if word not in stop_words and word not in punctuations
    ]
    return ' '.join(tokens)

In [101]:
# Run the text clean-up over every entry of both feature splits.
X_train, X_test = X_train.apply(word_preprocessor), X_test.apply(word_preprocessor)

# Keep only the three target columns and convert each split to a NumPy array.
label_columns = ['EmotionalPolarity', 'Emotion', 'Empathy']
y_train = np.array(y_train[label_columns])
y_test = np.array(y_test[label_columns])

In [102]:
# Fit the TF-IDF vocabulary (capped at 128 features, the same size the model
# is constructed with) on the training text only, then reuse the fitted
# vectorizer to encode the test text.
tfidf = TfidfVectorizer(max_features=128)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [103]:
batch_size = 8

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def collate_batch(text_data, label_data):
    """Stack paired feature vectors and labels into two batch tensors on ``device``.

    Args:
        text_data: Iterable of equally sized numeric feature vectors, one per sample.
        label_data: Iterable of equally sized label vectors, paired with ``text_data``.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a float32 tensor and
        ``labels`` an int64 tensor, each with one row per sample, both moved
        to ``device``.

    Raises:
        ValueError: If there are no (text, label) pairs to stack.
    """
    text_list, label_list = [], []
    for text, label in zip(text_data, label_data):
        # float32 keeps fractional TF-IDF weights; an integer dtype would
        # truncate them to 0 and cannot be fed to nn.Linear.
        text_list.append(torch.tensor(text, dtype=torch.float32))
        # NOTE(review): labels are cast to int64 (class indices); fractional
        # label values would be truncated — confirm against the dataset.
        label_list.append(torch.tensor(label, dtype=torch.int64))

    if not text_list:
        raise ValueError("collate_batch received no (text, label) pairs")

    # Stack only after the loop has consumed every sample, so the whole
    # batch is returned rather than just the first item.
    return torch.stack(text_list).to(device), torch.stack(label_list).to(device)

In [110]:
class TextClassification(nn.Module):
    """One linear layer whose output is normalized with log-softmax over dim 1."""

    def __init__(self, embedding_size, num_classes):
        """Create the linear projection.

        Args:
            embedding_size: Number of input features per sample.
            num_classes: Number of output scores per sample.
        """
        super().__init__()
        self.fc = nn.Linear(in_features=embedding_size, out_features=num_classes)

    def forward(self, text):
        """Project ``text`` through the linear layer and return log-softmax along dim 1."""
        return F.log_softmax(self.fc(text), dim=1)

# Seed torch before building the model so its initial weights are repeatable.
torch.manual_seed(69)
model = TextClassification(embedding_size=128, num_classes=3)
# eval() returns the module itself, so as the cell's last expression it also
# displays the model summary.
model.eval()


TextClassification(
  (fc): Linear(in_features=128, out_features=3, bias=True)
)

In [111]:
# Loss function and optimizer.
# The model's forward() already returns log-probabilities (log_softmax), so the
# matching criterion is NLLLoss; CrossEntropyLoss would apply log-softmax to
# them a second time.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

TypeError: expected string or bytes-like object, got 'NoneType'