In [1]:
%pip install datasets
%pip install seqeval

Note: you may need to restart the kernel to use updated packages.



In [16]:
import pandas as pd
import numpy as np
import nltk
import torch.nn as nn
import torch
import utils

from datasets import Dataset
from datasets import load_dataset

In [7]:
class TextDistorter:
    """Mask every token that is not among the ``k`` most frequent corpus words.

    Word frequencies are counted once, case-insensitively, over the tokenized
    corpus passed to the constructor. Masking replaces letters with ``char``
    and digits with ``digit``; every other character is kept as-is.
    """

    def __init__(self, corpora):
        """Count lower-cased token frequencies.

        Args:
            corpora: iterable of tokenized texts (each an iterable of str).
        """
        self._corpora = corpora
        self._wordfreq = nltk.FreqDist(
            word.lower() for text in self._corpora for word in text
        )

    def distort(self, text, k, multiple=False, char="*", digit="#"):
        """Mask the tokens of ``text`` that are outside the top-``k`` vocabulary.

        Args:
            text: mutable list of tokens. It is modified IN PLACE.
            k: number of most frequent (lower-cased) words left untouched.
            multiple: if True emit one mask symbol per character; if False
                emit a single symbol per run of letters / run of digits.
            char: symbol that replaces letters.
            digit: symbol that replaces digits.

        Returns:
            The same list object that was passed in, after masking.
        """
        # A set gives O(1) membership tests instead of scanning a list per token.
        keep = {word for word, _count in self._wordfreq.most_common(k)}

        for index, word in enumerate(text):
            if word.lower() not in keep:
                text[index] = self._encode(word, multiple, char, digit)

        return text

    def _encode(self, word, multiple=False, char="*", digit="#"):
        """Return the masked form of a single token.

        With ``multiple=True`` "ab-12" becomes "**-##"; with ``multiple=False``
        consecutive letters (or digits) collapse to one symbol: "*-#".
        """
        pieces = []
        previous = None  # kind of the preceding character: "alpha", "digit" or None

        for c in word:
            if c.isalpha():
                kind, mask = "alpha", char
            elif c.isdigit():
                kind, mask = "digit", digit
            else:
                # Punctuation and other symbols pass through and end the current run.
                pieces.append(c)
                previous = None
                continue

            # The original letter branch tested `not char_found and multiple`,
            # which dropped every letter when multiple=False. A new run must
            # emit exactly one symbol, mirroring the digit branch.
            if multiple or kind != previous:
                pieces.append(mask)
            previous = kind

        return "".join(pieces)


# Smoke test: tokenize two sentences, build the distorter from them, and
# distort the first sentence while keeping only the 2 most frequent words.
texts = ["This is a test sentence", "This is another test sentence"]
words = [nltk.word_tokenize(text) for text in texts]

# NOTE: distort() assigns into the list it receives, so words[0] itself is
# overwritten by this call (the returned list is the same object).
distorter = TextDistorter(words)
distorter.distort(words[0], 2, multiple=True)


['This', 'is', '*', '****', '********']

In [8]:
# Open data/dev.json (presumably the development split) through the
# project-local utils.Corpus helper.
# NOTE(review): the meaning of preprocessed=True is defined in utils and is
# not visible in this notebook — confirm against that module.
corpus = utils.Corpus()
corpus.open("data/dev.json", preprocessed=True)

True

In [15]:
# NOTE(review): in the recorded run this attribute lookup raised
# AttributeError (utils has no CustomFastTextEmbeddings), so Restart & Run All
# stops at this cell. Either add the class to utils or remove this probe.
utils.CustomFastTextEmbeddings


AttributeError: module 'utils' has no attribute 'CustomFastTextEmbeddings'

In [None]:
class TextCNN(nn.Module):
    """Convolution-over-time encoder for a sequence of embedding vectors.

    One ``Conv2d`` per filter size slides a window of ``fs`` consecutive rows
    across the whole embedding width, each feature map is max-pooled over
    time, and the pooled features are concatenated, passed through dropout
    and projected to ``output_dim``.

    Args:
        embedding_dim: width of every input vector.
        n_filters: number of feature maps per filter size.
        filter_sizes: iterable of window heights (in sequence positions).
        output_dim: size of the returned vector.
        dropout: dropout probability applied before the final projection.
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        # Materialize once so a generator argument is not consumed twice.
        filter_sizes = list(filter_sizes)

        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tensor):
        """Encode a batch of sequences.

        Args:
            tensor: float tensor of shape ``(batch, seq_len, embedding_dim)``.
                Sequences shorter than the largest filter size are
                right-padded with zeros so that every convolution has at
                least one valid window.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # A convolution cannot be applied when its kernel is taller than the
        # input, so pad the time axis up to the tallest kernel.
        min_len = max((conv.kernel_size[0] for conv in self.convs), default=0)
        missing = min_len - tensor.shape[1]
        if missing > 0:
            # The pad spec is read from the last dimension backwards:
            # (emb_left, emb_right, seq_front, seq_back).
            tensor = nn.functional.pad(tensor, (0, 0, 0, missing))

        embedded = tensor.unsqueeze(1)  # add the single input channel
        # Each kernel spans the full embedding width, so the last dim is 1 and is dropped.
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over all time positions of every feature map.
        pooled = [nn.functional.max_pool1d(fmap, fmap.shape[2]).squeeze(2) for fmap in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)
    
class AVCNN(nn.Module):
    """Hierarchical CNN: words -> sentence vectors -> document vector.

    A sentence-level ``TextCNN`` turns every sentence (a sequence of word
    embeddings) into one vector; a document-level ``TextCNN`` then encodes
    the sequence of sentence vectors.

    Args:
        word_embedding_dim: width of the input word embeddings.
        sentence_embedding_dim: size of each sentence vector.
        doc_embedding_dim: size of the returned document vector.
        n_filters: feature maps per filter size in both CNNs.
        filter_sizes: window heights used by both CNNs.
        dropout: dropout probability used by both CNNs.
    """

    def __init__(self, word_embedding_dim, sentence_embedding_dim, doc_embedding_dim,
                 n_filters=100, filter_sizes=(3, 4, 5), dropout=0.5):
        super().__init__()

        filter_sizes = list(filter_sizes)
        self.sentence_cnn = TextCNN(word_embedding_dim, n_filters, filter_sizes, sentence_embedding_dim, dropout)
        self.doc_cnn = TextCNN(sentence_embedding_dim, n_filters, filter_sizes, doc_embedding_dim, dropout)

    def forward(self, doc):
        """Encode one document (or a batch of sentence-aligned documents).

        Args:
            doc: iterable of sentence tensors, each either
                ``(n_words, word_embedding_dim)`` or
                ``(batch, n_words, word_embedding_dim)``. All sentences must
                share the same batch size; ``n_words`` may differ.

        Returns:
            Tensor of shape ``(batch, doc_embedding_dim)``; ``batch`` is 1
            when the sentences were given without a batch dimension.
        """
        sentence_vectors = []
        for sentence in doc:
            if sentence.dim() == 2:
                # The sentence CNN needs a leading batch axis.
                sentence = sentence.unsqueeze(0)
            sentence_vectors.append(self.sentence_cnn(sentence))

        # Stack along a NEW axis so the sentences become the sequence dimension:
        # (batch, n_sentences, sentence_embedding_dim). Concatenating on dim 0
        # produced a 2-D tensor that the document CNN cannot consume as a batch.
        document_tensor = torch.stack(sentence_vectors, dim=1)

        return self.doc_cnn(document_tensor)
