In [112]:
import spacy
import json
import numpy as np
import pandas as pd
import nltk
import torch.nn as nn
import torch
import torch.utils
import torch.utils.data
import textacy.preprocessing as tp
import fasttext
import fasttext.util
import tqdm
import pickle
import sklearn.metrics as sm

from keras.preprocessing.sequence import pad_sequences
from gensim.models import FastText

from abc import ABC, abstractmethod

In [2]:
class TextDistorter:
    def __init__(self, corpora):
        self._corpora = corpora
        self._wordfreq = nltk.FreqDist(
            [word.lower() for text in self._corpora for word in text]
        )

    def distort(self, text, k, multiple=False, char="*", digit="#"):
        word_set = [w[0] for w in self._wordfreq.most_common(k)]

        for word, i in zip(text, range(len(text))):
            if word.lower() not in word_set:
                text[i] = self._encode(word, multiple, char, digit)

        return text

    def _encode(self, word, multiple=False, char="*", digit="#"):
        result = ""

        char_found = False
        digit_found = False

        for c in word:
            if c.isalpha():
                if multiple or (not char_found and  multiple):
                    result += char
                    char_found = True
                    digit_found = False
            elif c.isdigit():
                if multiple or (not digit_found and not multiple):
                    result += digit
                    digit_found = True
                    char_found = False
            else:
                result += c
                char_found = False
                digit_found = False

        return result


texts = ["This is a test sentence", "This is another test sentence"]
words = [nltk.word_tokenize(text) for text in texts]

distorter = TextDistorter(words)
distorter.distort(words[0], 2, multiple=True)


['This', 'is', '*', '****', '********']

In [14]:
class Embeddings(ABC):
    @abstractmethod
    def __getitem__(self, token):
        pass

    @abstractmethod
    def get_vocab(self):
        pass

    @abstractmethod
    def get_embeddings(self):
        pass

    def word2vec(self, word):
        return self[word]
    
    def sentence2vec(self, sentence):
        return [self[word] for word in sentence]
    
    def doc2vec(self, doc):
        return [self.sentence2vec(sentence) for sentence in doc]


class CustomFastTextEmbeddings(Embeddings):
    def __init__(self, vocab_size=10000, embedding_size = 300):
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size

        self._model = FastText(
            vector_size=self._embedding_size, 
            window=5, 
            max_vocab_size=vocab_size,
            min_count=1, 
            workers=4, 
            sg=1
        )

    @staticmethod
    def load(input):
        ft = CustomFastTextEmbeddings()
        ft._model = FastText.load(input)
        return ft

    def train(self, corpus, epochs=10):
        self._model.build_vocab(corpus)
        self._model.train(corpus, total_examples=self._model.corpus_count, epochs=epochs)

    def save(self, output):
        self._model.save(output)

    def __getitem__(self, token):
        return self._model.wv[token]
    
    def get_vocab(self):
        return self._model.wv.index_to_key
    
    def get_embeddings(self):
        return self._model.wv.vectors


class FastTextEmbeddings(Embeddings):
    def __init__(self):
        fasttext.util.download_model('en', if_exists='ignore')
        self._model = fasttext.load_model('cc.en.300.bin')

    def __getitem__(self, token):
        return self._model.get_word_vector(token)
    
    def get_vocab(self):
        return self._model.words
    
    def get_embeddings(self):
        embeddings = []
        for word in self._model.words:
            embeddings.append(self[word])

        return embeddings

In [90]:
class Corpus():
    def __init__(self, max_sent_len=None, max_doc_len=None):
        self._max_sent_len = max_sent_len
        self._max_doc_len = max_doc_len
        
        self._tokenizer = spacy.load("en_core_web_lg")

        self._docs_L = []
        self._docs_R = []
        self._labels = []
        
    def save(self, file:str):
        l = [
            {
                "doc_L": self._docs_L[i],
                "doc_R": self._docs_R[i],
                "label": self._labels[i]
            }
            for i in range(len(self._docs_L))
        ]

        json.dump(l, open(file, "w"))

    def open(self, file:str, preprocessed=False):
        if preprocessed:
            data = json.load(open(file, "r"))
            
            for item in data:
                self._docs_L.append(item["doc_L"])
                self._docs_R.append(item["doc_R"])
                self._labels.append(item["label"])

            return True

        self._df = pd.read_csv(file)

        columns = self._df.columns
        df_docs_L = self._df[columns[0]]
        df_docs_R = self._df[columns[1]]
        df_labels = self._df[columns[2]]

        self._docs_L = []
        for i, doc in enumerate(df_docs_L):
            if not isinstance(doc, str):
                doc = ""
            doc = self._preprocess_doc(doc)
            # doc = self._pad_sentences(doc)
            # doc = self._pad_doc(doc)
            self._docs_L.append(doc)
        
        self._docs_R = []
        for i, doc in enumerate(df_docs_R):
            if not isinstance(doc, str):
                doc = ""
            doc = self._preprocess_doc(doc)
            # doc = self._pad_sentences(doc)
            # doc = self._pad_doc(doc)
            self._docs_R.append(doc)
        
        self._labels = df_labels.tolist()

        return True
    
    def split(self, ratio=0.8):
        n = len(self._docs_L)
        m = int(n * ratio)

        train = Corpus()
        train._docs_L = self._docs_L[:m]
        train._docs_R = self._docs_R[:m]
        train._labels = self._labels[:m]

        test = Corpus()
        test._docs_L = self._docs_L[m:]
        test._docs_R = self._docs_R[m:]
        test._labels = self._labels[m:]

        return train, test
    
    @property
    def docs(self):
        return self._docs_L, self._docs_R
    
    @property
    def labels(self):
        return self._labels
    
    def get_all_docs(self):
        return self._docs_L + self._docs_R
    
    def get_all_sentences(self):
        return [sent for doc in self.get_all_docs() for sent in doc]

    def _preprocess_doc(self, doc):
        doc = tp.normalize.whitespace(doc)
        doc = tp.normalize.quotation_marks(doc)
        doc = tp.normalize.unicode(doc)

        doc = self._tokenizer(doc)
        doc = [[token.text for token in sent] for sent in doc.sents]

        return doc     

    def _pad_sentences(self, doc):
        padded = pad_sequences(
            doc, 
            maxlen=self._max_sent_len, 
            padding="post",
            truncating="post",
            dtype=object,
            value="<PAD>"
        ).tolist()
    
    def _pad_doc(self, doc):
        if len(doc) < self._max_doc_len:
            doc = doc + [['<PAD>'] * self._max_sent_len] * (self._max_doc_len - len(doc))
        else:
            doc = doc[:self._max_doc_len]
        return doc

    def _add_special_tokens(self, doc):
        result = []
        
        for sent in doc:
            sent = ['<SOS>'] + sent
            if len(sent) < self._max_sent_len:
                sent = sent + ['<PAD>'] * (self._max_sent_len - len(sent) - 1) + ['<EOS>']
                result.append(sent)
            else:
                while(len(sent) > 1):
                    if len(sent) < self._max_sent_len:
                        sent = sent + ['<PAD>'] * (self._max_sent_len - len(sent) - 1) + ['<EOS>']
                        result.append(sent)
                    else:
                        sent = sent[:self._max_sent_len - 1] + ['<ELB>']
                        result.append(sent)
                    sent = ['<SLB>'] + sent[self._max_sent_len - 1:]

        if len(result) < self._max_doc_len:
            result = result + [['<PAD>'] * self._max_sent_len] * (self._max_doc_len - len(result))

        return result

    def _make_vocabularies(self, docs):
        for doc in docs:
            for sent in doc:
                self._word_vocab.add(sent)
                for token in sent:
                    self._char_vocab.add(token)

In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, corpus: Corpus):    
        self._docs_L, self._docs_R = corpus.docs
        self._labels = corpus.labels

    def __len__(self):
        return len(self._docs_L)

    def __getitem__(self, idx):
        return {
            "doc_L": self._docs_L[idx],
            "doc_R": self._docs_R[idx],
            "label": self._labels[idx]
        }


In [91]:
train_corpus = Corpus()
train_corpus.open("data/train.csv")
train_corpus.save("data/train_no_pad.json")

dev_corpus = Corpus()
dev_corpus.open("data/dev.csv")
dev_corpus.save("data/dev_no_pad.json")

In [6]:
train_corpus = Corpus()
train_corpus.open("data/train.json", preprocessed=True)

dev_corpus = Corpus()
dev_corpus.open("data/dev.json", preprocessed=True)

True

In [26]:
cft = CustomFastTextEmbeddings()
cft.train(train_corpus.get_all_sentences())
cft.save("output/models/fasttext.model")

In [15]:
ft = FastTextEmbeddings()

In [105]:
class TextCNN(nn.Module):
    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tensor):
        embedded = tensor.unsqueeze(1)
        
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)
    
class Doc2VecCNN(nn.Module):
    def __init__(self, word_embedding_dim, sentence_embedding_dim, doc_embedding_dim, sentence_dropout=0.5, doc_dropout=0.5):
        super().__init__()

        self.sentence_cnn = TextCNN(word_embedding_dim, 100, [3, 4, 5], sentence_embedding_dim, sentence_dropout)
        self.doc_cnn = TextCNN(sentence_embedding_dim, 100, [3, 4, 5], doc_embedding_dim, doc_dropout)
        

    def forward(self, doc):
        sentence_tensors = [self.sentence_cnn(sentence) for sentence in doc]

        document_tensor = torch.stack(sentence_tensors)

        document_output = self.doc_cnn(document_tensor)
        return document_output


class AV(nn.Module):
    def __init__(self, doc_embedding_dim):
        super().__init__()

        self.doc2vec = Doc2VecCNN(300, 200, doc_embedding_dim)
    
    def forward(self, doc1, doc2):
        doc1_output = self.doc2vec(doc1)
        doc2_output = self.doc2vec(doc2)

        return torch.cosine_similarity(doc1_output, doc2_output)

In [31]:
# return metrics
def metrics(y_true, y_pred):
    acc = sm.accuracy_score(y_true, y_pred)
    macro_p = sm.precision_score(y_true, y_pred, average='macro')
    macro_r = sm.recall_score(y_true, y_pred, average='macro')
    macro_f1 = sm.f1_score(y_true, y_pred, average='macro')
    w_macro_p = sm.precision_score(y_true, y_pred, average='weighted')
    w_macro_r = sm.recall_score(y_true, y_pred, average='weighted')
    w_macro_f1 = sm.f1_score(y_true, y_pred, average='weighted')
    mcc = sm.matthews_corrcoef(y_true, y_pred)

    return {
        "Accuracy":acc, 
        "Macro-P":macro_p, 
        "Macro-R":macro_r, 
        "Macro-F1":macro_f1, 
        "W Macro-P":w_macro_p, 
        "W Macro-R":w_macro_r, 
        "W Macro-F1":w_macro_f1, 
        "MCC":mcc
    }

In [107]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    train_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm.tqdm(dataloader):
        
        docL = batch["doc_L"]
        docR = batch["doc_R"]
        labels = batch["label"]

        docL = docL.to(device)
        docR = docR.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        output = model(docL, docR)
        loss = criterion(output, labels.float())

        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        total_predictions += labels.size(0)
        correct_predictions += (output > 0.5).sum().item()

    train_loss /= len(dataloader)
    train_accuracy = correct_predictions / total_predictions

    return train_loss, train_accuracy

def evaluate(model, dataloader, criterion, device):
    model.eval()
    eval_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    y_true = []
    y_pred = []

    with torch.inference_mode():
        for batch in tqdm.tqdm(dataloader):
            docL = batch["doc_L"]
            docR = batch["doc_R"]
            labels = batch["label"]

            docL = docL.to(device)
            docR = docR.to(device)
            labels = labels.to(device)

            output = model(docL, docR)
            loss = criterion(output, labels.float())
            
            eval_loss += loss.item()
            total_predictions += labels.size(0)
            correct_predictions += (output > 0.5).sum().item()

            y_true.extend(labels.cpu().numpy())
            y_pred.extend((output > 0.5).cpu().numpy())

    eval_loss /= len(dataloader)
    eval_accuracy = correct_predictions / total_predictions

    eval_metrics = metrics(y_true, y_pred)

    return eval_loss, eval_accuracy, eval_metrics

In [70]:
def collate(batch):
    doc_L = [ft.doc2vec(item["doc_L"]) for item in batch]
    doc_R = [ft.doc2vec(item["doc_R"]) for item in batch]
    labels = [item["label"] for item in batch]

    doc_L = torch.tensor(doc_L)
    doc_R = torch.tensor(doc_R)
    labels = torch.tensor(labels)

    return {
        "doc_L": doc_L,
        "doc_R": doc_R,
        "label": labels
    }


In [33]:
BATCH_SIZE = 32
LEARING_RATE = 5e-5
EPOCHS = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [35]:
dev_dataset = Dataset(dev_corpus)
dev_dataset[0]

{'doc_L': [['Carol',
   ',',
   'Congratulations',
   '.',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>'],
  ['Vince',
   'Carol',
   'Coats',
   '10/12/2000',
   '04:56',
   'PM',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
  

In [37]:
train_dataset = Dataset(train_corpus)
train_dataset[0]

{'doc_L': [['Nick',
   '(',
   'Kevin',
   'Anderson',
   ')',
   'goes',
   'back',
   'to',
   'his',
   'hometown',
   'to',
   'take',
   'care',
   'of',
   'his',
   'dying',
   'mother',
   '(',
   'Kim',
   'Novak',
   ')',
   '.',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>'],
  ['There',
   'he',
   'encounters',
   'an',
   'old',
   'college',
   'buddy',
   '(',
   'Bill',
   'Pullman',
   ')',
   'and',
   'his',
   'beautiful',
   'wife',
   '(',
   'Pamela',
   'Gidley',
   ')',
   '.',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>',
   '<PAD>'

In [71]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)

In [108]:
model = AV(300).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARING_RATE)
criterion = nn.BCELoss()

results = []

for epoch in range(EPOCHS):
    train_loss, train_accuracy = train(model, train_dataloader, optimizer, criterion, device)
    
    torch.save(model.state_dict(), f"./output/models/approach1_epoch_{epoch}.model")

    eval_loss, eval_accuracy, eval_metrics = evaluate(model, dev_dataloader, criterion, device)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f} - Train Accuracy: {train_accuracy:.4f}")
    print(f"Eval Loss: {eval_loss:.4f} - Eval Accuracy: {eval_accuracy:.4f}")
    print(eval_metrics)

    results.append({
        "epoch": epoch,
        "train_loss": train_loss,
        "train_accuracy": train_accuracy,
        "eval_loss": eval_loss,
        "eval_accuracy": eval_accuracy,
        "eval_metrics": eval_metrics
    })

  0%|          | 0/938 [00:00<?, ?it/s]

100%|██████████| 938/938 [2:07:27<00:00,  8.15s/it]  
100%|██████████| 188/188 [24:15<00:00,  7.74s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/10
Train Loss: 0.6990 - Train Accuracy: 0.5495
Eval Loss: 1.7605 - Eval Accuracy: 1.0000
{'Accuracy': 0.5018333333333334, 'Macro-P': 0.2509166666666667, 'Macro-R': 0.5, 'Macro-F1': 0.33414715347908114, 'W Macro-P': 0.25183669444444445, 'W Macro-R': 0.5018333333333334, 'W Macro-F1': 0.33537235970850443, 'MCC': 0.0}


100%|██████████| 938/938 [2:03:27<00:00,  7.90s/it]  
100%|██████████| 188/188 [24:12<00:00,  7.72s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/10
Train Loss: 0.6974 - Train Accuracy: 0.5462
Eval Loss: 1.7294 - Eval Accuracy: 1.0000
{'Accuracy': 0.5018333333333334, 'Macro-P': 0.2509166666666667, 'Macro-R': 0.5, 'Macro-F1': 0.33414715347908114, 'W Macro-P': 0.25183669444444445, 'W Macro-R': 0.5018333333333334, 'W Macro-F1': 0.33537235970850443, 'MCC': 0.0}


100%|██████████| 938/938 [2:04:41<00:00,  7.98s/it]  
100%|██████████| 188/188 [24:15<00:00,  7.74s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/10
Train Loss: 0.6954 - Train Accuracy: 0.5446
Eval Loss: 1.5539 - Eval Accuracy: 1.0000
{'Accuracy': 0.5018333333333334, 'Macro-P': 0.2509166666666667, 'Macro-R': 0.5, 'Macro-F1': 0.33414715347908114, 'W Macro-P': 0.25183669444444445, 'W Macro-R': 0.5018333333333334, 'W Macro-F1': 0.33537235970850443, 'MCC': 0.0}


 76%|███████▋  | 716/938 [1:34:24<29:16,  7.91s/it]  


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [110]:
results

[{'epoch': 0,
  'train_loss': 0.699,
  'train_accuracy': 0.5495,
  'eval_loss': 1.7605,
  'eval_accuracy': 1.0,
  'eval_metrics': {'Accuracy': 0.5018333333333334,
   'Macro-P': 0.2509166666666667,
   'Macro-R': 0.5,
   'Macro-F1': 0.33414715347908114,
   'W Macro-P': 0.25183669444444445,
   'W Macro-R': 0.5018333333333334,
   'W Macro-F1': 0.33537235970850443,
   'MCC': 0.0}},
 {'epoch': 1,
  'train_loss': 0.6974,
  'train_accuracy': 0.5462,
  'eval_loss': 1.7294,
  'eval_accuracy': 1.0,
  'eval_metrics': {'Accuracy': 0.5018333333333334,
   'Macro-P': 0.2509166666666667,
   'Macro-R': 0.5,
   'Macro-F1': 0.33414715347908114,
   'W Macro-P': 0.25183669444444445,
   'W Macro-R': 0.5018333333333334,
   'W Macro-F1': 0.33537235970850443,
   'MCC': 0.0}},
 {'epoch': 2,
  'train_loss': 0.6954,
  'train_accuracy': 0.5446,
  'eval_loss': 1.5539,
  'eval_accuracy': 1.0,
  'eval_metrics': {'Accuracy': 0.5018333333333334,
   'Macro-P': 0.2509166666666667,
   'Macro-R': 0.5,
   'Macro-F1': 0.33414

In [113]:
with open("./output/performance/result.pkl", "wb") as f:
    pickle.dump(results, f)