In [251]:
import string
from matplotlib import transforms
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from torchtext.data.utils import get_tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# nltk.download('wordnet')

In [89]:
data = 'data/WASSA23_conv_level_with_labels_train.tsv'
df = pd.read_table(data, header=0)
new_col = []
for names in df.columns:
    new_col.append(names.strip())
df.columns = new_col
df.drop(["conversation_id", "turn_id", "speaker_number", "article_id", "speaker_id", "essay_id"], axis=1, inplace=True)
df.head()

Unnamed: 0,text,EmotionalPolarity,Emotion,Empathy
0,I feel very sad for the people. ...,2.0,3.0,3.3333
1,It's terrible. Not only the people but the ani...,2.0,4.0,3.3333
2,I felt really sorry for the sister that now ha...,2.0,3.6667,2.6667
3,"Yeah, it's going to be tough but i am sure she...",0.6667,3.0,2.0
4,"Yeah, we never know what we can do unless we a...",0.3333,2.3333,1.3333


In [226]:
X_data, y_data = df.loc[:, 'text'], df.drop('text', axis=1)
X_train, X_test, y_train , y_test = train_test_split(X_data, y_data, train_size=0.8)
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
X_test.shape[0]

1756

- tokenization
- remove stop word and punctuatuons, numbers
- lematization
- vectorization

In [228]:
def word_preprocessor(sentence):
    tok = get_tokenizer("basic_english")
    stop_words = set(stopwords.words('english'))
    punctuations = set(string.punctuation)
    lem = WordNetLemmatizer().lemmatize

    sentence = tok(sentence)
    sentence = [word for word in sentence if word not in stop_words]
    sentence = [word for word in sentence if word not in punctuations]
    sentence_str = ' '.join(sentence)
    sentence = lem(sentence_str)
    return sentence

In [229]:
X_train = X_train.apply(word_preprocessor) 
X_test = X_test.apply(word_preprocessor) 

In [230]:
tfidf = TfidfVectorizer(max_features=128)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
# X_train.data()

In [247]:
X_train

<7020x128 sparse matrix of type '<class 'numpy.float64'>'
	with 23678 stored elements in Compressed Sparse Row format>

In [249]:
batch_size = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def collate_batch(batch):
    text_list, label_list = [] , []
    for text, label in batch:
        processed_text = torch.tensor(text, dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(label)
        
        return text_list.to(device), label_list.to(device)

In [250]:
class TextClassification(nn.Module):
    def __init__(self, embedding_size, num_classes):
        super(TextClassification, self).__init__()
        self.fc = nn.Linear(embedding_size, num_classes)
        
    def forward(self, text):
        return F.log_softmax(self.fc(text))