# Useful packages, installations, and imports for this exercise

In [None]:
# Run This Cell!
# This just imports some packages which we will be using for this exercise

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import linear_model

from scipy import sparse

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

from gensim.corpora.dictionary import Dictionary
from collections import Counter, defaultdict



import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# The Dataset

In [None]:
# Download the train / dev / test splits (tab-separated columns: id, label, text)
!wget https://raw.githubusercontent.com/dbamman/nlp23/main/HW1/train.txt
!wget https://raw.githubusercontent.com/dbamman/nlp23/main/HW1/dev.txt
!wget https://raw.githubusercontent.com/dbamman/nlp23/main/HW1/test.txt

--2025-02-10 04:00:29--  https://raw.githubusercontent.com/dbamman/nlp23/main/HW1/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1322055 (1.3M) [text/plain]
Saving to: ‘train.txt’


2025-02-10 04:00:29 (85.4 MB/s) - ‘train.txt’ saved [1322055/1322055]

--2025-02-10 04:00:29--  https://raw.githubusercontent.com/dbamman/nlp23/main/HW1/dev.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1309909 (1.2M) [text/plain]
Saving to: ‘dev.txt’


2025-02-10 04:00:29 (94.0 MB/s) - ‘dev.txt’ saved [1309909/1309909]

--2025-02-10 0

In [None]:
# Local paths of the three dataset splits downloaded by the wget cell above.
# FeaturizedDataLoader.__init__ reads these notebook-level names directly.
trainingFile = "train.txt"
evaluationFile = "dev.txt"
testFile = "test.txt"

In [None]:
# Based on Info159 - Natural Language Processing HW 1 by Prof. David Bamman

class FeaturizedDataLoader:
    """Reads the train/dev/test files and turns each document into a sparse feature row.

    `feature_method` maps a document's text to a dict of {feature name: feature value}.
    The feature vocabulary is built from the training split only: a feature is kept when
    it occurs in at least `min_feature_count` training documents, and features that only
    appear in other splits are ignored when building their matrices.

    Attributes set by __init__:
        feature_vocab: dict mapping feature name -> column index.
        trainX / devX / testX: scipy.sparse.dok_matrix with one row per document.
        trainY / devY: list of label strings aligned with the matrix rows.
        trainOrig / devOrig / testOrig: list of the raw (idd, label, text) tuples.
    """

    def __init__(self, feature_method, min_feature_count=1):
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_count = min_feature_count
        # The file paths are notebook-level globals defined in an earlier cell.
        self.trainX, self.trainY, self.trainOrig = self.process(trainingFile, training=True)
        self.devX, self.devY, self.devOrig = self.process(evaluationFile, training=False)
        self.testX, _, self.testOrig = self.process(testFile, training=False)

    def load_data(self, filename):
        """Read a tab-separated file and return a list of (idd, label, text) tuples.

        The text column is returned exactly as read, including the trailing newline.
        """
        data = []
        with open(filename, encoding="utf8") as file:
            for line in file:
                cols = line.split("\t")
                data.append((cols[0], cols[1], cols[2]))
        return data

    def featurize(self, data):
        """Apply `feature_method` to every document; returns a list of (label, feats)."""
        return [(label, self.feature_method(text)) for _idd, label, text in data]

    def _build_vocab(self, featurized_data):
        """Rebuild `feature_vocab` from the document frequency of each feature."""
        feature_doc_count = Counter()
        for _label, feats in featurized_data:
            for feat in feats:
                feature_doc_count[feat] += 1

        # Start from an empty vocabulary: ids are assigned from 0, so keeping entries
        # from a previous training pass would leave stale features with colliding ids.
        self.feature_vocab.clear()
        for feat, doc_count in feature_doc_count.items():
            if doc_count >= self.min_feature_count:
                self.feature_vocab[feat] = len(self.feature_vocab)

    def _vectorize(self, feature_dicts):
        """Build a (documents x vocabulary) dok_matrix; out-of-vocabulary features are skipped."""
        X = sparse.dok_matrix((len(feature_dicts), len(self.feature_vocab)))
        for row, feats in enumerate(feature_dicts):
            for feat in feats:
                col = self.feature_vocab.get(feat)
                if col is not None:
                    X[row, col] = feats[feat]
        return X

    def process(self, dataFile, training = False):
        """Read a split and return (X, Y, original_data).

        X is the sparse feature matrix, Y the list of labels and original_data the raw
        (idd, label, text) tuples. With `training=True` the vocabulary is (re)built from
        this split before the matrix is created.
        """
        original_data = self.load_data(dataFile)
        data = self.featurize(original_data)

        if training:
            self._build_vocab(data)

        X = self._vectorize([feats for _label, feats in data])
        Y = [label for label, _feats in data]
        return X, Y, original_data

    def load_test(self, dataFile):
        """Read a split and return (X, ids), where ids are the first column of each row.

        The ids are taken from the raw rows: `featurize` only keeps the label, so
        unpacking its output as (data_id, feats) would return labels instead of ids.
        """
        raw_rows = self.load_data(dataFile)
        X = self._vectorize([self.feature_method(text) for _idd, _label, text in raw_rows])
        ids = [idd for idd, _label, _text in raw_rows]
        return X, ids

# Building a Simple Classifier

## Encoding the Input

### Bag of Words

In [None]:
def bag_of_words(text):
    """Binary bag-of-words featurizer.

    Tokenizes `text` with nltk and returns a dict whose keys are the lowercased
    tokens and whose values are all 1 (presence, not counts).
    """
    return {token.lower(): 1 for token in nltk.word_tokenize(text)}


In [None]:
BoW_Loader = FeaturizedDataLoader(bag_of_words)

In [None]:
# Inspect the training matrix produced by the loader.

# trainX has one row per training document and one column per vocabulary feature.
# NOTE: D and F become notebook-level names; later cells reuse F as the model input width.
D, F = BoW_Loader.trainX.shape
print("number of rows in train X:", D)
print("number of features:", F)

number of rows in train X: 1000
number of features: 21078


### Embeddings

In [None]:
# Fetch embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2025-02-10 04:00:56--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-10 04:00:56--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-10 04:00:57--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Parse the GloVe text file into a {word: vector} lookup.
embeddings_index = {}
# `with` closes the file even if a line fails to parse, and the explicit encoding
# keeps non-ASCII tokens readable regardless of the platform's default encoding.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.rstrip('\n').split(' ')
        word = values[0]  # The first entry is the word
        coefs = np.asarray(values[1:], dtype='float32')  # The remaining entries are the embedding vector
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# To make use of these embeddings, we need to ensure we have a mapping of words to indices
# Let's define a simple word based tokenizer to work with these embeddings

class WordTokenizer:
    """Word-level tokenizer that maps lowercased nltk tokens to integer ids.

    The token <-> id mapping is held in a gensim `Dictionary` (`self.id2token`).
    Tokens missing from the dictionary are mapped to an index equal to the current
    vocabulary size.
    """

    def __init__(self, documents=None, vocab_size=100000):
        self.vocab_size = vocab_size
        if not documents:
            # No corpus supplied: start with an empty dictionary.
            self.id2token = Dictionary(prune_at=vocab_size)
            return

        self.id2token = Dictionary(self._tokenize_corpus(documents), prune_at=vocab_size)
        self.id2token.filter_extremes(no_below=1, no_above=1, keep_n=vocab_size)

    @staticmethod
    def _tokenize_corpus(documents):
        """Lowercase and word-tokenize every document; returns a list of token lists."""
        return [nltk.word_tokenize(document.lower()) for document in documents]

    def __call__(self, text):
        return self.tokenize(text)

    def tokenize(self, text):
        """Return the list of token ids for `text`."""
        lowered_tokens = nltk.word_tokenize(text.lower())
        unknown_index = self.get_vocab_size()
        return self.id2token.doc2idx(lowered_tokens, unknown_word_index=unknown_index)

    def get_vocab(self):
        """Return the dictionary's token -> id mapping."""
        return self.id2token.token2id

    def get_vocab_size(self):
        """Number of tokens currently in the vocabulary."""
        vocab = self.get_vocab()
        return len(vocab)

    def get_word_id(self, word):
        """Id of `word`; raises KeyError when the word is not in the vocabulary."""
        vocab = self.get_vocab()
        return vocab[word]

    def get_word(self, idx):
        """Token stored under id `idx`, with None passed as the fallback value."""
        return self.id2token.get(idx, None)

    def add_documents(self, documents):
        """Extend the dictionary with more documents, then re-apply the size cap."""
        tokenized = self._tokenize_corpus(documents)
        self.id2token.add_documents(tokenized, prune_at=self.vocab_size)
        self.id2token.filter_extremes(no_below=1, no_above=1, keep_n=self.vocab_size)

In [None]:
# Now let's build an embedding matrix based on this tokenizer and the embeddings we loaded earlier

def create_embedding_matrix(tokenizer: WordTokenizer, embeddings_index, embedding_dim=100):
    """Build a float32 matrix whose row i holds the pretrained vector of token id i.

    The matrix has one row per vocabulary entry plus one extra final row. Rows for
    words missing from `embeddings_index`, and the extra row, stay all-zero.
    """
    row_count = tokenizer.get_vocab_size() + 1  # +1 for OOV token if needed
    matrix = np.zeros((row_count, embedding_dim), dtype='float32')

    words_with_vectors = (w for w in tokenizer.get_vocab() if w in embeddings_index)
    for w in words_with_vectors:
        matrix[tokenizer.get_word_id(w)] = embeddings_index[w]

    return matrix



In [None]:
# The loader keeps the raw training rows in `trainOrig` as (idd, label, text) tuples,
# so index 2 of each tuple is the document text.

# Extract just the text of every training document.

trainOrigDocuments = [x[2] for x in BoW_Loader.trainOrig]

# Build the vocabulary from the training text only (vocab_size=10000), then place
# each vocabulary word's pretrained vector in the matching row of the matrix.
word_tokenizer = WordTokenizer(trainOrigDocuments, 10000)
embedding_matrix = create_embedding_matrix(word_tokenizer, embeddings_index)

In [None]:
embedding_matrix.shape

(10001, 100)

In [None]:
(embedding_matrix)

array([[ 0.38472 ,  0.49351 ,  0.49096 , ...,  0.026263,  0.39052 ,
         0.52217 ],
       [ 0.58854 , -0.2025  ,  0.73479 , ..., -0.94475 ,  0.61802 ,
         0.39591 ],
       [ 0.19247 ,  0.36617 ,  0.52301 , ..., -1.2276  ,  1.1152  ,
        -1.0234  ],
       ...,
       [ 0.27855 , -0.25163 ,  1.1612  , ...,  0.18109 , -0.025508,
        -0.90374 ],
       [-0.10752 , -0.73378 , -0.15725 , ...,  0.42029 , -0.2823  ,
         0.45759 ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]], dtype=float32)

In [None]:
word_tokenizer.get_word(0)

'!'

In [None]:
embeddings_index['!']

array([ 0.38472  ,  0.49351  ,  0.49096  , -1.5434   , -0.33614  ,
        0.6222   ,  0.32265  ,  0.075331 ,  0.65591  , -0.23517  ,
        1.2114   ,  0.06193  , -0.62004  ,  0.31371  ,  0.38948  ,
       -0.24381  , -0.065643 ,  0.58797  , -0.86382  ,  0.63166  ,
        0.68363  ,  0.39647  , -0.62388  , -0.25094  ,  0.92831  ,
        1.5152   , -0.43917  ,  0.22249  ,  1.3695   , -0.53098  ,
        0.39811  ,  0.77114  ,  0.49043  ,  0.58853  ,  0.2376   ,
        0.3162   , -0.011962 , -0.047074 ,  0.34585  , -1.2944   ,
        0.18597  ,  0.27002  , -0.70602  , -0.20652  , -0.25194  ,
       -0.4868   , -0.71538  , -0.23887  , -0.041612 , -0.55488  ,
       -0.54226  ,  0.21236  ,  0.025341 ,  0.96517  , -0.88183  ,
       -1.8681   ,  0.32657  ,  1.1689   ,  1.1759   , -0.17393  ,
       -0.3371   ,  0.87535  , -1.0114   , -0.6181   ,  1.008    ,
        0.31506  ,  0.24417  ,  0.064393 ,  0.33678  ,  0.33632  ,
        0.45975  ,  0.22813  , -0.37505  , -0.37508  ,  0.0893

## Defining a Model

### Logistic Regression

In [None]:
# NOTE: this value is passed as scikit-learn's `C`, which is the *inverse* of the
# regularization strength -- smaller values mean stronger L2 regularization.
L2_regularization_strength = 1.0

In [None]:
# Train a Logistic Regression https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# `C` is the inverse regularization strength: smaller values regularize more strongly.
log_reg = LogisticRegression(C = L2_regularization_strength, max_iter=1000)
log_reg.fit(BoW_Loader.trainX, BoW_Loader.trainY)

# Score it on the data it was fitted on and on the held-out development split
training_accuracy = log_reg.score(BoW_Loader.trainX, BoW_Loader.trainY)
development_accuracy = log_reg.score(BoW_Loader.devX, BoW_Loader.devY)

# The feature count is read from the matrix itself so this cell does not depend on
# the notebook-level `F` left behind by the inspection cell.
print("Method: %s, Features: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % (BoW_Loader.feature_method.__name__, BoW_Loader.trainX.shape[1], training_accuracy, development_accuracy))

Method: bag_of_words, Features: 21078, Train accuracy: 1.000, Dev accuracy: 0.775


### Multilayer Perceptron

In [None]:
# Create a Torch Dataset and DataLoader from our data

class BoW_Dataset(Dataset):
    """Torch dataset over a sparse feature matrix and a parallel list of labels.

    The sparse matrix is densified once, as float32, when the dataset is created.
    Each item is (feature row, target) where target is a float32 array of shape (1,).
    """

    def __init__(self, X, Y):
        dense_rows = X.toarray()
        self.X = dense_rows.astype(np.float32)
        self.Y = Y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = self.X[idx]

        # specific to our data: 'pos' becomes 1, every other label becomes 0
        target_value = 1 if self.Y[idx] == 'pos' else 0
        target = np.array([target_value]).astype(np.float32)
        return features, target

# Wrap the bag-of-words train/dev splits as torch datasets.
# NOTE: `train_dataset` / `valid_dataset` are notebook-level names that
# run_training_loop reads; the CNN section later rebinds them to tokenized datasets.
train_dataset = BoW_Dataset(BoW_Loader.trainX, BoW_Loader.trainY)
valid_dataset = BoW_Dataset(BoW_Loader.devX, BoW_Loader.devY)

# dataloader = DataLoader(train_dataset, batch_size=2)

In [None]:
# A single linear layer followed by a sigmoid: logistic regression written as a network.
# `F` is the feature count captured when the training matrix was inspected above.
model = nn.Sequential(
    nn.Linear(F, 1),
    nn.Sigmoid()
)
print(model)

Sequential(
  (0): Linear(in_features=21078, out_features=1, bias=True)
  (1): Sigmoid()
)


In [None]:
def run_training_loop(model, batch_size=32, n_epochs=10, lr=1e-3, train_data=None, valid_data=None):
    """Train `model` with Adam and binary cross-entropy, evaluating after every epoch.

    Args:
        model: module whose output is a probability in [0, 1] with the same shape as
            the targets (the loss is `nn.BCELoss`).
        batch_size: minibatch size for both loaders.
        n_epochs: number of passes over the training data.
        lr: Adam learning rate.
        train_data: dataset to train on; defaults to the notebook-level `train_dataset`.
        valid_data: dataset to evaluate on; defaults to the notebook-level `valid_dataset`.

    Returns:
        (model, train_loss_history, valid_loss_history, valid_accuracy_history), where
        each history is a numpy array of shape (n_epochs, 1). Losses are per-sample
        averages and accuracy is a fraction in [0, 1].
    """
    # Fall back to the notebook-level datasets so existing calls keep working.
    if train_data is None:
        train_data = train_dataset
    if valid_data is None:
        valid_data = valid_dataset

    # We could write our training procedure manually and directly index the `Dataset` objects,
    # but the `DataLoader` object conveniently creates an iterable for automatically creating random minibatches:
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    # Evaluation order does not affect the metrics, so the validation data is not shuffled.
    valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=False)
    n_train = len(train_loader.dataset)
    n_valid = len(valid_loader.dataset)

    # Choose Adam as the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Binary cross-entropy on the sigmoid output; by default it returns the batch mean
    loss_fn = nn.BCELoss()

    # store metrics
    train_loss_history = np.zeros([n_epochs, 1])
    valid_accuracy_history = np.zeros([n_epochs, 1])
    valid_loss_history = np.zeros([n_epochs, 1])

    for epoch in range(n_epochs):

        # Some layers, such as Dropout, behave differently during training
        model.train()

        train_loss_sum = 0.0
        for data, target in train_loader:
            # Erase accumulated gradients
            optimizer.zero_grad()

            # Forward pass
            output = model(data)

            # Calculate loss. The loss is a batch mean, so weight it by the batch size:
            # dividing the running sum by the dataset size then gives a true per-sample
            # average, even when the last batch is smaller.
            loss = loss_fn(output, target)
            train_loss_sum += loss.item() * target.size(0)

            # Backward pass
            loss.backward()

            # Weight update
            optimizer.step()

        train_loss_history[epoch, 0] = train_loss_sum / n_train

        # Track loss each epoch. Index with [epoch, 0] so a Python scalar, not a
        # 1-element array, is handed to the % formatter.
        print('Train Epoch: %d  Average loss: %.4f' %
              (epoch + 1, train_loss_history[epoch, 0]))

        # Putting layers like Dropout into evaluation mode
        model.eval()

        valid_loss_sum = 0.0
        correct = 0

        # Turning off automatic differentiation
        with torch.no_grad():
            for data, target in valid_loader:
                output = model(data)
                valid_loss_sum += loss_fn(output, target).item() * target.size(0)
                # Round the predicted probability to the nearest class (0 or 1)
                pred = torch.round(output)
                correct += pred.eq(target.view_as(pred)).sum().item()

        valid_loss_history[epoch, 0] = valid_loss_sum / n_valid
        valid_accuracy_history[epoch, 0] = correct / n_valid

        print('Valid set: Average loss: %.4f, Accuracy: %d/%d (%.4f)\n' %
              (valid_loss_history[epoch, 0], correct, n_valid,
               100. * valid_accuracy_history[epoch, 0]))

    return model, train_loss_history, valid_loss_history, valid_accuracy_history

In [None]:
logistic_nn, train_loss_history, valid_loss_history, valid_accuracy_history = run_training_loop(model, batch_size=32, n_epochs=10, lr=1e-3)

  print('Train Epoch: %d  Average loss: %.4f' %
  print('Valid set: Average loss: %.4f, Accuracy: %d/%d (%.4f)\n' %


Train Epoch: 1  Average loss: 0.0214
Valid set: Average loss: 0.0205, Accuracy: 758/1000 (75.8000)

Train Epoch: 2  Average loss: 0.0173
Valid set: Average loss: 0.0193, Accuracy: 756/1000 (75.6000)

Train Epoch: 3  Average loss: 0.0145
Valid set: Average loss: 0.0185, Accuracy: 797/1000 (79.7000)

Train Epoch: 4  Average loss: 0.0126
Valid set: Average loss: 0.0178, Accuracy: 795/1000 (79.5000)

Train Epoch: 5  Average loss: 0.0110
Valid set: Average loss: 0.0172, Accuracy: 806/1000 (80.6000)

Train Epoch: 6  Average loss: 0.0097
Valid set: Average loss: 0.0169, Accuracy: 808/1000 (80.8000)

Train Epoch: 7  Average loss: 0.0087
Valid set: Average loss: 0.0165, Accuracy: 803/1000 (80.3000)

Train Epoch: 8  Average loss: 0.0078
Valid set: Average loss: 0.0160, Accuracy: 811/1000 (81.1000)

Train Epoch: 9  Average loss: 0.0071
Valid set: Average loss: 0.0159, Accuracy: 811/1000 (81.1000)

Train Epoch: 10  Average loss: 0.0065
Valid set: Average loss: 0.0156, Accuracy: 806/1000 (80.6000)


In [None]:
# Multilayer perceptron on the bag-of-words input: two hidden ReLU layers
# (1024 and 256 units) followed by a single sigmoid output unit.
model = nn.Sequential(
    nn.Linear(F, 1024),
    nn.ReLU(),
    nn.Linear(1024, 256),
    nn.ReLU(),
    nn.Linear(256, 1),
    nn.Sigmoid()
)
print(model)

Sequential(
  (0): Linear(in_features=21078, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=256, bias=True)
  (3): ReLU()
  (4): Linear(in_features=256, out_features=1, bias=True)
  (5): Sigmoid()
)


In [None]:
len(word_tokenizer.get_vocab())
word_tokenizer.vocab_size

10000

In [None]:
mlp, train_loss_history, valid_loss_history, valid_accuracy_history = run_training_loop(model, batch_size=32, n_epochs=10, lr=1e-3)

  print('Train Epoch: %d  Average loss: %.4f' %


Train Epoch: 1  Average loss: 0.0174


  print('Valid set: Average loss: %.4f, Accuracy: %d/%d (%.4f)\n' %


Valid set: Average loss: 0.0150, Accuracy: 809/1000 (80.9000)

Train Epoch: 2  Average loss: 0.0011
Valid set: Average loss: 0.0216, Accuracy: 795/1000 (79.5000)

Train Epoch: 3  Average loss: 0.0001
Valid set: Average loss: 0.0304, Accuracy: 761/1000 (76.1000)

Train Epoch: 4  Average loss: 0.0000
Valid set: Average loss: 0.0262, Accuracy: 790/1000 (79.0000)

Train Epoch: 5  Average loss: 0.0000
Valid set: Average loss: 0.0272, Accuracy: 787/1000 (78.7000)

Train Epoch: 6  Average loss: 0.0000
Valid set: Average loss: 0.0293, Accuracy: 788/1000 (78.8000)

Train Epoch: 7  Average loss: 0.0000
Valid set: Average loss: 0.0323, Accuracy: 787/1000 (78.7000)

Train Epoch: 8  Average loss: 0.0000
Valid set: Average loss: 0.0357, Accuracy: 786/1000 (78.6000)

Train Epoch: 9  Average loss: 0.0000
Valid set: Average loss: 0.0353, Accuracy: 786/1000 (78.6000)

Train Epoch: 10  Average loss: 0.0000
Valid set: Average loss: 0.0367, Accuracy: 787/1000 (78.7000)



### Convolutional Neural Network

In [None]:
# Let's define a dataset using our word tokenizer


class Tokenized_Dataset(Dataset):
    """Torch dataset that tokenizes raw text on access and fixes the sequence length.

    Each item is (token ids, target): the ids are a numpy array of exactly
    `max_seq_len` entries and the target is a float32 array of shape (1,).
    """

    def __init__(self, X, Y, tokenizer, max_seq_len=500):
        self.X, self.Y = X, Y
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        limit = self.max_seq_len

        # use the tokenizer, then cut anything beyond the limit
        token_ids = self.tokenizer(self.X[idx])[:limit]

        # fill short sequences up to the limit with the index one past the vocabulary
        missing = limit - len(token_ids)
        if missing > 0:
            token_ids = token_ids + [self.tokenizer.get_vocab_size()] * missing

        # specific to our data: 'pos' becomes 1, every other label becomes 0
        target_value = 1 if self.Y[idx] == 'pos' else 0
        return np.array(token_ids), np.array([target_value]).astype(np.float32)

# Raw document text (index 2 of each (idd, label, text) tuple) for the train and dev splits.
trainOrigDocuments = [x[2] for x in BoW_Loader.trainOrig]
validOrigDocuments = [x[2] for x in BoW_Loader.devOrig]

# NOTE: this rebinds the notebook-level `train_dataset` / `valid_dataset` that
# run_training_loop reads, replacing the earlier bag-of-words datasets.
train_dataset = Tokenized_Dataset(trainOrigDocuments, BoW_Loader.trainY, word_tokenizer, max_seq_len = 500)
valid_dataset = Tokenized_Dataset(validOrigDocuments, BoW_Loader.devY, word_tokenizer, max_seq_len = 500)

In [None]:
# let's try to use the embeddings we loaded earlier now
model = nn.Sequential(
    nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix)) # by default this is frozen ie will not be updated during training
)
print(model)

Sequential(
  (0): Embedding(10001, 100)
)


In [None]:
# Walk through, by hand, what Tokenized_Dataset does to one document
max_seq_len = 500

# Token ids of the first training document
x = word_tokenizer(BoW_Loader.trainOrig[0][2])
# The index one past the vocabulary doubles as the unknown-word and padding id
padding_token = word_tokenizer.get_vocab_size()

# Truncate to max_seq_len, then pad whatever is still missing
x = x[:max_seq_len] + [padding_token] * max(0, max_seq_len - len(x))
print("padding and unknown token is:", padding_token)
print("Tokenized Sequence is:")
print(np.array(x))

padding and unknown token is: 10000
Tokenized Sequence is:
[   50    94     4    75     1   126   156   130    89   105     8     6
     3   133    19    63     4    52   153    68   130   127    78    16
   110   129    19   113   129   155    55    90   152    88    26    17
     9     7     3   130    84    69    92    51    13   101     4   122
    66   148   155    55   118    16   126   159    42   130    85   140
    11    69    67    25   130    84    69   147   111     5    68    15
    96   141   131     2    77   130    98    64   149   139     3     4
   160    30    61   130   116    57   137   130   108    21   138     5
    64    80   129     5   130    84    40    92    71   160    27   156
    86     4    91    45     8    70    76   160   117   144   150    69
    58    69    10    74    93   119   129    54    15    22   129    69
    49    15   145    46     5   130   120    72    93    22    53    29
   125    73    68    47   154   121     5    92    51    13   10

In [None]:
model(torch.tensor(np.array([x])))

tensor([[[-0.0201,  0.0375,  0.3536,  ...,  0.0626,  0.2839, -0.3163],
         [-0.2946, -0.2074,  0.4911,  ...,  0.5775,  0.5217, -0.0270],
         [-0.1077,  0.1105,  0.5981,  ..., -0.8316,  0.4529,  0.0826],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])

In [None]:
model(torch.tensor(np.array([x]))).shape

torch.Size([1, 500, 100])

In [None]:
# notice that this is currently (1, seq_len, embedding_dim) when we use this in our dataloader we'll have (batch_size, seq_len, embedding_dim) but the convolutional layers we use expect (batch_size, embedding_dim, seq_len)

# Let's look at how we can create a custom layer for use in Pytorch that will change this for us

class Transpose(nn.Module):
    """Layer that swaps two dimensions of its input so it can sit inside nn.Sequential."""

    def __init__(self, dim0, dim1):
        super().__init__()
        self.dim0, self.dim1 = dim0, dim1

    def forward(self, x):
        return x.transpose(self.dim0, self.dim1)

In [None]:
# a utility to help us calculate the dims of the hidden layer following our convolution layer

def calc_linear_dim(max_seq_len, num_filters, kernel_size, stride, max_pool_size):
    """
    Input size of the linear layer that follows one convolution and one max pooling step.

    Assumes the convolution uses no padding; all divisions round down.
    """
    # Number of positions the convolution kernel can occupy along the sequence
    conv_length = (max_seq_len - kernel_size) // stride + 1
    # Number of complete pooling windows that fit into the convolution output
    pooled_length = conv_length // max_pool_size
    # Flattening yields one value per (filter, pooled position) pair
    return pooled_length * num_filters

In [None]:
# Now that the inputs are token sequences, use a CNN on top of the pretrained
# embeddings loaded earlier.

# Hyperparameters; max_seq_len matches the value used to build the tokenized datasets.

max_seq_len = 500
num_filters = 64
kernel_size = 5
stride = 1
max_pool_size = 20

linear_size = calc_linear_dim(max_seq_len, num_filters, kernel_size, stride, max_pool_size)

model = nn.Sequential(
    nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix)), # by default this is frozen ie will not be updated during training
    Transpose(1, 2),
    # Take the input channel count from the embedding matrix instead of hard-coding 100,
    # so the layer stays correct if embeddings of another width are loaded.
    nn.Conv1d(embedding_matrix.shape[1], num_filters, kernel_size),
    nn.ReLU(),
    nn.MaxPool1d(max_pool_size),
    nn.Flatten(),
    nn.Linear(linear_size, 1),
    nn.Sigmoid()
)
print(model)


Sequential(
  (0): Embedding(10001, 100)
  (1): Transpose()
  (2): Conv1d(100, 64, kernel_size=(5,), stride=(1,))
  (3): ReLU()
  (4): MaxPool1d(kernel_size=20, stride=20, padding=0, dilation=1, ceil_mode=False)
  (5): Flatten(start_dim=1, end_dim=-1)
  (6): Linear(in_features=1536, out_features=1, bias=True)
  (7): Sigmoid()
)


In [None]:
cnn, train_loss_history, valid_loss_history, valid_accuracy_history = run_training_loop(model, batch_size=32, n_epochs=15, lr=5e-3)

  print('Train Epoch: %d  Average loss: %.4f' %


Train Epoch: 1  Average loss: 0.0230


  print('Valid set: Average loss: %.4f, Accuracy: %d/%d (%.4f)\n' %


Valid set: Average loss: 0.0222, Accuracy: 504/1000 (50.4000)

Train Epoch: 2  Average loss: 0.0219
Valid set: Average loss: 0.0220, Accuracy: 545/1000 (54.5000)

Train Epoch: 3  Average loss: 0.0208
Valid set: Average loss: 0.0211, Accuracy: 610/1000 (61.0000)

Train Epoch: 4  Average loss: 0.0195
Valid set: Average loss: 0.0201, Accuracy: 663/1000 (66.3000)

Train Epoch: 5  Average loss: 0.0166
Valid set: Average loss: 0.0188, Accuracy: 684/1000 (68.4000)

Train Epoch: 6  Average loss: 0.0134
Valid set: Average loss: 0.0185, Accuracy: 701/1000 (70.1000)

Train Epoch: 7  Average loss: 0.0096
Valid set: Average loss: 0.0195, Accuracy: 715/1000 (71.5000)

Train Epoch: 8  Average loss: 0.0063
Valid set: Average loss: 0.0238, Accuracy: 700/1000 (70.0000)

Train Epoch: 9  Average loss: 0.0032
Valid set: Average loss: 0.0211, Accuracy: 713/1000 (71.3000)

Train Epoch: 10  Average loss: 0.0017
Valid set: Average loss: 0.0222, Accuracy: 719/1000 (71.9000)

Train Epoch: 11  Average loss: 0.000

In [None]:
# Same CNN as before, but with a trainable embedding layer instead of the pretrained one.
# NOTE: max_seq_len is not set here; it is reused from the earlier CNN cell (500).
num_filters = 64
kernel_size = 5
stride = 1
max_pool_size = 20

linear_size = calc_linear_dim(max_seq_len, num_filters, kernel_size, stride, max_pool_size)

model = nn.Sequential(
    nn.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1]), # freshly initialised, trainable embeddings with the same shape as the pretrained matrix
    Transpose(1, 2),
    nn.Conv1d(embedding_matrix.shape[1], num_filters, kernel_size),
    nn.ReLU(),
    nn.MaxPool1d(max_pool_size),
    nn.Flatten(),
    nn.Linear(linear_size, 1),
    nn.Sigmoid()
)
print(model)


Sequential(
  (0): Embedding(10001, 100)
  (1): Transpose()
  (2): Conv1d(100, 64, kernel_size=(5,), stride=(1,))
  (3): ReLU()
  (4): MaxPool1d(kernel_size=20, stride=20, padding=0, dilation=1, ceil_mode=False)
  (5): Flatten(start_dim=1, end_dim=-1)
  (6): Linear(in_features=1536, out_features=1, bias=True)
  (7): Sigmoid()
)


In [None]:
cnn, train_loss_history, valid_loss_history, valid_accuracy_history = run_training_loop(model, batch_size=32, n_epochs=15, lr=5e-3)

  print('Train Epoch: %d  Average loss: %.4f' %


Train Epoch: 1  Average loss: 0.0324


  print('Valid set: Average loss: %.4f, Accuracy: %d/%d (%.4f)\n' %


Valid set: Average loss: 0.0225, Accuracy: 541/1000 (54.1000)

Train Epoch: 2  Average loss: 0.0172
Valid set: Average loss: 0.0232, Accuracy: 532/1000 (53.2000)

Train Epoch: 3  Average loss: 0.0080
Valid set: Average loss: 0.0236, Accuracy: 556/1000 (55.6000)

Train Epoch: 4  Average loss: 0.0018
Valid set: Average loss: 0.0250, Accuracy: 554/1000 (55.4000)

Train Epoch: 5  Average loss: 0.0005
Valid set: Average loss: 0.0258, Accuracy: 548/1000 (54.8000)

Train Epoch: 6  Average loss: 0.0002
Valid set: Average loss: 0.0259, Accuracy: 558/1000 (55.8000)

Train Epoch: 7  Average loss: 0.0001
Valid set: Average loss: 0.0266, Accuracy: 551/1000 (55.1000)

Train Epoch: 8  Average loss: 0.0001
Valid set: Average loss: 0.0272, Accuracy: 554/1000 (55.4000)

Train Epoch: 9  Average loss: 0.0001
Valid set: Average loss: 0.0271, Accuracy: 549/1000 (54.9000)

Train Epoch: 10  Average loss: 0.0001
Valid set: Average loss: 0.0280, Accuracy: 547/1000 (54.7000)

Train Epoch: 11  Average loss: 0.000

In [None]:
# A smaller variant: fewer filters and a narrower, trainable embedding.
# NOTE: max_seq_len is not set here; it is reused from the earlier CNN cell (500).
num_filters = 8
kernel_size = 5
stride = 1
max_pool_size = 20
embedding_dim = 32


linear_size = calc_linear_dim(max_seq_len, num_filters, kernel_size, stride, max_pool_size)

model = nn.Sequential(
    nn.Embedding(embedding_matrix.shape[0], embedding_dim), # trainable embeddings: same number of rows as the pretrained matrix, but our own choice of width
    Transpose(1, 2),
    nn.Conv1d(embedding_dim, num_filters, kernel_size),
    nn.ReLU(),
    nn.MaxPool1d(max_pool_size),
    nn.Flatten(),
    nn.Linear(linear_size, 1),
    nn.Sigmoid()
)
print(model)

Sequential(
  (0): Embedding(10001, 32)
  (1): Transpose()
  (2): Conv1d(32, 8, kernel_size=(5,), stride=(1,))
  (3): ReLU()
  (4): MaxPool1d(kernel_size=20, stride=20, padding=0, dilation=1, ceil_mode=False)
  (5): Flatten(start_dim=1, end_dim=-1)
  (6): Linear(in_features=192, out_features=1, bias=True)
  (7): Sigmoid()
)


In [None]:
cnn, train_loss_history, valid_loss_history, valid_accuracy_history = run_training_loop(model, batch_size=32, n_epochs=15, lr=5e-3)

  print('Train Epoch: %d  Average loss: %.4f' %


Train Epoch: 1  Average loss: 0.0226


  print('Valid set: Average loss: %.4f, Accuracy: %d/%d (%.4f)\n' %


Valid set: Average loss: 0.0222, Accuracy: 519/1000 (51.9000)

Train Epoch: 2  Average loss: 0.0202
Valid set: Average loss: 0.0224, Accuracy: 529/1000 (52.9000)

Train Epoch: 3  Average loss: 0.0167
Valid set: Average loss: 0.0220, Accuracy: 553/1000 (55.3000)

Train Epoch: 4  Average loss: 0.0114
Valid set: Average loss: 0.0233, Accuracy: 568/1000 (56.8000)

Train Epoch: 5  Average loss: 0.0057
Valid set: Average loss: 0.0239, Accuracy: 580/1000 (58.0000)

Train Epoch: 6  Average loss: 0.0022
Valid set: Average loss: 0.0249, Accuracy: 573/1000 (57.3000)

Train Epoch: 7  Average loss: 0.0010
Valid set: Average loss: 0.0260, Accuracy: 579/1000 (57.9000)

Train Epoch: 8  Average loss: 0.0006
Valid set: Average loss: 0.0266, Accuracy: 573/1000 (57.3000)

Train Epoch: 9  Average loss: 0.0004
Valid set: Average loss: 0.0273, Accuracy: 580/1000 (58.0000)

Train Epoch: 10  Average loss: 0.0003
Valid set: Average loss: 0.0281, Accuracy: 576/1000 (57.6000)

Train Epoch: 11  Average loss: 0.000