In [209]:
import pandas as pd
import re
import torch

from sklearn.model_selection import train_test_split

from emotion_detection.loader import create_data_loader, split_data
from emotion_detection.models import TextClassificationParsBert, pytorch_model

from transformers import BertConfig, BertTokenizer, BertModel

## hyper-parameters

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

max_len = 128
train_batch_size = 128
valid_batch_size = 128
test_batch_size = 128

epoch = 3
EEVERY_EPOCH = 1000
lr = 2e-5
CLIP = 0.0


##  dataset process

In [162]:
from hazm import Stemmer, word_tokenize, Normalizer
data = pd.read_csv('/mnt/disk2/arshia.yousefinezhad/emotion_detection/data/preprocess_labelencoding_data.csv')

def preprocess_text(df):
    def process_text(text):
      text = re.sub(r'[^ا-ی!\s]', '', text)
      text = re.sub('[0-9]','',text)
    
      stemmer = Stemmer()
      text_tokens = word_tokenize(text)
      texts_clean = [stemmer.stem(word) for word in text_tokens]
      text = " ".join(texts_clean)
      return text
    df.combined_text = df.combined_text.apply(process_text)
    return df

data_prepared = preprocess_text(data)
data_prepared["tokens"] = data_prepared.combined_text.map(lambda text: text.split())

### Splitting Dataset

In [163]:
# Split dataset to train test validation
call_data, data_val = train_test_split(data,test_size=0.15,  random_state=42 , stratify=data.emotion)
data_train, data_test = train_test_split(call_data,test_size=0.1 ,  random_state=42 , stratify=call_data.emotion)

## Config

In [164]:
label_list = list(sorted(data_prepared['emotion'].unique()))

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {v: k for k, v in label2id.items()}

## Tokenizers

In [165]:
# tokenization
from collections import defaultdict

MAX_VOCAB = 30_000
token_freq = defaultdict(int)

for example in data_train.tokens:
    for token in example:
        token_freq[token] += 1

print('maximum vocab numbers: ', max(list(token_freq.values())))

token_freq = {key: value for key, value in token_freq.items() if value >= 3}

sorted_tokens = ['<pad>', '<unk>'] + sorted(token_freq.keys(), key=lambda token: token_freq[token], reverse=True)[:MAX_VOCAB - 2]

token2id = {token: idx for idx, token in enumerate(sorted_tokens)}
id2token = {idx: token for token, idx in token2id.items()}

maximum vocab numbers:  30517


In [166]:
import torch.nn.functional as F
import torch.nn as nn

class TextClassificationLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):
        super(TextClassificationLSTM, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        output = self.fc(lstm_out.view(len(sentence), -1))
        return F.log_softmax(output, dim=1)

In [167]:
vocab_size = len(token2id) + 1
embedding_dim = 100
hidden_dim = 128
output_dim = len(label_list)

model = TextClassificationLSTM(embedding_dim, hidden_dim, vocab_size, output_dim)

In [178]:
def tokenize_function(example):
  return [token2id.get(token, token2id['<unk>']) for token in example]

data_train['tokens_ids'] = data_train.tokens.apply(tokenize_function)

In [201]:
label2id['عصبانی']

1

In [188]:
data_train

Unnamed: 0,emotion,combined_text,tokens,tokens_ids
8890,عصبانی,نمیدون چرا باید اینقد عصاب خورد داشته_با تو خی...,"[نمیدون, چرا, باید, اینقد, عصاب, خورد, داشته_ب...","[124, 27, 32, 550, 2615, 234, 807, 6, 12, 258,..."
10203,معمولی,خاله دیگه باشگاه نمیر نه یک ماهه نرف چراخوب می...,"[خاله, دیگه, باشگاه, نمیر, نه, یک, ماهه, نرف, ...","[591, 14, 678, 642, 17, 28, 1189, 1152, 8086, ..."
10186,معمولی,از برنامه ه بگو کدو برنامه برنامه دیگه حالا که...,"[از, برنامه, ه, بگو, کدو, برنامه, برنامه, دیگه...","[7, 371, 16, 131, 321, 371, 371, 14, 44, 4, 15..."
9060,غمگین و مضطرب,به هرچ خواس گف ناراح نبا نمیتون خیل ناراح باید...,"[به, هرچ, خواس, گف, ناراح, نبا, نمیتون, خیل, ن...","[3, 620, 367, 74, 211, 310, 191, 12, 211, 32, ..."
1663,عصبانی,صد بار به نگف وقت تو ماشین میشین درارو قفل کن ...,"[صد, بار, به, نگف, وقت, تو, ماشین, میشین, درار...","[488, 185, 3, 564, 136, 6, 171, 944, 8087, 170..."
...,...,...,...,...
3484,معمولی,زن تو چرا اینقدر میر پ رمال و دعا نویس میگ شای...,"[زن, تو, چرا, اینقدر, میر, پ, رمال, و, دعا, نو...","[242, 6, 27, 209, 118, 92, 12552, 5, 590, 1347..."
10272,معمولی,تازه بیدار شد نه خیل وقته همیشه همین موقعا بید...,"[تازه, بیدار, شد, نه, خیل, وقته, همیشه, همین, ...","[305, 379, 42, 17, 12, 621, 115, 88, 2344, 379..."
9509,معمولی,دا یه تحقیق ارایه میداد راجب رنگ واسه دانشگ ن...,"[دا, یه, تحقیق, ارایه, میداد, راجب, رنگ, واسه,...","[117, 23, 1622, 11782, 905, 965, 369, 127, 1, ..."
4254,معمولی,میشه بگ شما دو نفر چرا مدا با ه بحث م کنین چون...,"[میشه, بگ, شما, دو, نفر, چرا, مدا, با, ه, بحث,...","[36, 99, 39, 93, 286, 27, 633, 13, 16, 778, 11..."


In [207]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for _, _row in batch.iterrows():
        label_list.append(_row.emotion)
        processed_text = torch.tensor(_row.tokens_ids, dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label2id[_row.emotion], dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [213]:
from torch.utils.data import Dataset, DataLoader
train_loader = DataLoader(data_train, batch_size=64, collate_fn=collate_batch, drop_last=True)

In [215]:
next(iter(train_loader))

KeyError: 0

: 

In [212]:
token_ids = [item['token_ids'] for item in data_train]
emotion_labels = [item['emotion'] for item in data_train]

max_lenght =  max([len(item['emotion_labels']) for item in data_train])

padded_token_ids = [ids + [0] * (max_lenght - len(ids)) for ids in token_ids]
padded_emotion_labels = [ids + [-1] * (max_lenght - len(ids)) for ids in emotion_labels]

TypeError: string indices must be integers, not 'str'

In [58]:
from torch.utils.data import Dataset, DataLoader
train_loader = DataLoader(train_data, batch_size=64, collate_fn=collate_fn, drop_last=True)
dev_loader = DataLoader(dev_ds, batch_size=64, collate_fn=collate_fn, drop_last=True)
test_loader = DataLoader(test_ds, batch_size=64, collate_fn=collate_fn, drop_last=True)


[' عاشقانه و خوشحال', 'عصبانی', 'غمگین و مضطرب', 'معمولی', 'هیجانی و متعجب']

In [None]:
train_data_loader = create_data_loader(train['combined_text'].to_numpy(), train['emotion'].to_numpy(), tokenizer, max_len, train_batch_size, label_list)
valid_data_loader = create_data_loader(valid['combined_text'].to_numpy(), valid['emotion'].to_numpy(), tokenizer, max_len, valid_batch_size, label_list)
test_data_loader = create_data_loader(test['combined_text'].to_numpy(), test['emotion'].to_numpy(), tokenizer, max_len, test_batch_size, label_list)

In [None]:
model = TextClassificationLSTM(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

## ParsBert Model

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

torch_model = pytorch_model(model, criterion, optimizer, scheduler)
torch_model.trainer(train_data_loader, valid_data_loader, test_data_loader, epoch=1)

100%|██████████| 73/73 [01:12<00:00,  1.00it/s]s]
100%|██████████| 9/9 [00:03<00:00,  2.29it/s]
Epochs... : 100%|██████████| 1/1 [01:16<00:00, 76.62s/it]


-----------------------------------------------------------
| end of epoch   1 | time: 76.62s | valid accuracy    0.644 
-----------------------------------------------------------
Checking the results of test dataset.


100%|██████████| 9/9 [00:04<00:00,  2.02it/s]


test accuracy    0.644


In [None]:
text = "خیلی کیفت کوکه! چی شده؟,آره ، آزمون رانندگیمو قبول شدم \
اولین بار بود؟,نه بابا 5 بار رد شده بودم \
پس باید بهم شیرینی بدی,چشم، اینقد خوشحالم که میتونم یه شهرو شیرینی بدم \
حالا نمیخواد ولخرجی کنی ، شکم مارو سیر کنی کافیه,بزن  بریم تجریش یه ناهار مشتی بهت بدم \
بابا شرمنده میکنی,بیا بریم امروز فقط عشق و حاله"

model_torch_model = torch_model.model

model = model.to("cpu")

print("این هست یک حالت %s " % id2label[torch_model.predict(text, tokenizer)])

این هست یک حالت معمولی 
