In [None]:
#to use this notebook, run all cells containing a def statement and then call the various functions in the very last cell.
#sample usages of this notebook are included there.

from numpy.core.fromnumeric import mean
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext import data
import torch.optim as optim
import argparse
import os
import pandas as pd
import matplotlib.pyplot as plt

# Pre-trained GloVe vocabulary requested with name="6B" and dim=100. It is shared
# by the datasets, the model's embedding layer and the nearest-word lookup below.
glove = torchtext.vocab.GloVe(name="6B",dim=100)

#copied data handling and initialization functions from section 3 and 4
class TextDataset(torch.utils.data.Dataset):
    """Sentence/label dataset read from ``<data_path>/<split>.tsv``.

    The first column of the tab-separated file is split on whitespace into
    tokens and the second column is converted to a float label.  Each token is
    replaced by its index in ``vocab.stoi``; tokens missing from the vocabulary
    are mapped to the last row of ``vocab.vectors`` (used as the
    "out-of-vocabulary" token).

    Attributes:
        X: list with one 1-D ``torch.long`` tensor of token indices per row.
        Y: 1-D tensor with the label of every row.
    """

    def __init__(self, vocab, split="train", data_path="data"):
        """Load one split.

        Args:
            vocab: object exposing ``stoi`` (token -> index mapping) and
                ``vectors`` (its length defines the out-of-vocabulary index).
            split: file stem of the TSV to read, e.g. ``"train"``.
            data_path: directory that contains the TSV files.
        """
        df = pd.read_csv(os.path.join(data_path, f"{split}.tsv"), sep="\t")
        oov_index = len(vocab.vectors) - 1  # last word doubles as the OOV token
        texts, labels = df.iloc[:, 0], df.iloc[:, 1]
        # dtype is explicit so that an empty sentence still yields an integer
        # tensor instead of torch's default float tensor.
        self.X = [
            torch.tensor(
                [vocab.stoi.get(word, oov_index) for word in text.split()],
                dtype=torch.long,
            )
            for text in texts
        ]
        self.Y = torch.tensor([float(label) for label in labels])

    def __len__(self):
        """Return the number of sentences in the split."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(token_indices, label)`` pair stored at ``idx``."""
        return self.X[idx], self.Y[idx]

def my_collate_function(batch, device):
    """Right-pad a batch of variable-length index sequences and move it to ``device``.

    Args:
        batch: sequence of ``(x, y)`` pairs where ``x`` is a 1-D tensor of token
            indices and ``y`` is the label of that sentence.
        device: torch device the returned tensors are moved to.

    Returns:
        A tuple ``(batch_x, batch_y)``: ``batch_x`` is an int32 tensor of shape
        ``(len(batch), longest_sentence)`` padded with index 0, and ``batch_y``
        is a 1-D tensor with the labels in batch order.
    """
    max_len = max((len(x) for x, _ in batch), default=0)
    # Fill an integer buffer directly: concatenating with float zeros would
    # round-trip the indices through float32 and corrupt values above 2**24.
    batch_x = torch.zeros((len(batch), max_len), dtype=torch.int32)
    for row, (x, _) in enumerate(batch):
        batch_x[row, :len(x)] = x
    batch_y = [y for _, y in batch]
    return batch_x.to(device), torch.tensor(batch_y).to(device)

# ---- run configuration: batch size, RNG seed and compute device ----
batch_size = 10
torch.manual_seed(2)  # fixed seed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# 3.3.2 -- one dataset per split, built in the order train, val, test, overfit
train_dataset, val_dataset, test_dataset, overfit_dataset = (
    TextDataset(glove, split_name)
    for split_name in ("train", "val", "test", "overfit")
)


# 3.3.3 -- every loader shares the batch size and the device-aware collate step
def _collate_on_device(batch):
    """Pad ``batch`` with my_collate_function and move it to the global ``device``."""
    return my_collate_function(batch, device)


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the notebook-wide batch size."""
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=_collate_on_device,
    )


train_dataloader = _make_loader(train_dataset, shuffle=True)
val_dataloader = _make_loader(val_dataset, shuffle=True)
test_dataloader = _make_loader(test_dataset, shuffle=False)  # the only unshuffled loader
overfit_dataloader = _make_loader(overfit_dataset, shuffle=True)




Using device: cuda


In [None]:
#new CNN class
class CNN_classifier(torch.nn.Module):
  """Two-branch convolutional sentence classifier over trainable word embeddings.

  Each branch convolves the embedded sentence with kernels that span the full
  embedding width (heights ``k1`` and ``k2`` words), applies ReLU and keeps the
  maximum activation over all sentence positions.  The ``n1 + n2`` pooled
  features are mapped to a single logit by a linear layer.

  Args:
    vocab: object whose ``vectors`` attribute is the (vocab_size, embedding_dim)
      tensor used to initialise the embedding layer (fine-tuned during training).
    device: torch device every layer is moved to.
    k1, k2: kernel heights (number of consecutive words) of the two branches.
    n1, n2: number of kernels in the two branches.
  """
  def __init__(self, vocab, device, k1, k2, n1, n2):
    super().__init__()
    embedding_dim = vocab.vectors.shape[1]  # kernel width follows the embeddings instead of a fixed 100
    self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze = False).to(device)
    self.conv1 = nn.Conv2d(1, n1, (k1, embedding_dim), 1, bias = False).to(device)
    self.conv2 = nn.Conv2d(1, n2, (k2, embedding_dim), 1, bias = False).to(device)
    self.relu = nn.ReLU()
    self.linear = nn.Linear(n1+n2,1).to(device)

  def forward(self, x):
    """Return one logit per sentence.

    Args:
      x: integer tensor of token indices with shape (N, L).

    Returns:
      Tensor of shape (N,) holding the raw (pre-sigmoid) scores.
    """
    # A convolution needs at least as many rows as its kernel is tall.  Batches
    # whose sentences are all shorter are right-padded with token index 0, the
    # same index my_collate_function pads with.
    min_len = max(self.conv1.kernel_size[0], self.conv2.kernel_size[0])
    if x.shape[1] < min_len:
      pad = x.new_zeros((x.shape[0], min_len - x.shape[1]))
      x = torch.cat((x, pad), dim = 1)

    s = self.embedding(x).unsqueeze(1) #expect input in the form (N,C,H,W); only one channel here

    # Global max over the position axis.  Halving repeatedly with MaxPool1d(2)
    # silently drops the trailing element of odd-length inputs, so it is not a
    # true maximum over the sentence.
    c1 = self.relu(self.conv1(s).squeeze(3)).max(dim = 2).values  # (N, n1)
    c2 = self.relu(self.conv2(s).squeeze(3)).max(dim = 2).values  # (N, n2)

    maxes = torch.cat((c1, c2), dim = 1)  # (N, n1 + n2)
    # squeeze only the feature axis so a batch of one still yields shape (1,)
    return self.linear(maxes).squeeze(1)

In [None]:
def eval(net, test_dataloader, criterion, sigmoid):
  """Evaluate ``net`` on every batch of ``test_dataloader`` without tracking gradients.

  Args:
    net: callable mapping a batch of sentences to logits.
    test_dataloader: iterable of ``(sentences, labels)`` batches.
    criterion: loss function called as ``criterion(logits, labels)``.
    sigmoid: callable turning logits into probabilities; predictions are the
      rounded probabilities.

  Returns:
    ``(mean_batch_loss, error_rate)``: the unweighted mean of the per-batch
    losses and the fraction of misclassified samples.

  Raises:
    ValueError: if the dataloader yields no samples.
  """
  batch_losses = []
  n_wrong = 0
  n_seen = 0
  # Evaluation never back-propagates, so skip building autograd graphs.
  with torch.no_grad():
    for sentences, labels in test_dataloader:
      logits = net(sentences)
      batch_losses.append(float(criterion(logits, labels)))
      wrong = torch.round(sigmoid(logits)) != labels
      n_seen += wrong.numel()
      n_wrong += int(wrong.sum())
  if n_seen == 0:
    raise ValueError("eval() received a dataloader that yielded no samples")
  return sum(batch_losses) / len(batch_losses), n_wrong / n_seen

In [None]:
def train(k1, k2, n1, n2, epochs, lr, train_dataloader, val_dataloader, test_dataloader): #5.1 and 5.2
  """Train a CNN_classifier with Adam and BCE-with-logits, plot the curves, report test metrics.

  Args:
    k1, k2, n1, n2: forwarded to CNN_classifier together with the global
      ``glove`` vocabulary and ``device``.
    epochs: number of passes over ``train_dataloader``.
    lr: learning rate given to the Adam optimiser.
    train_dataloader: batches used for the optimisation steps.
    val_dataloader: batches scored with ``eval`` after every epoch.
    test_dataloader: batches scored with ``eval`` once after the last epoch.

  Returns:
    The trained network.  As side effects two matplotlib figures are created
    (loss per epoch, error per epoch) and the test loss/accuracy are printed.
  """
  net = CNN_classifier(glove, device, k1, k2, n1, n2)
  opt = optim.Adam(net.parameters(), lr = lr)
  criterion = torch.nn.BCEWithLogitsLoss()
  sigmoid = nn.Sigmoid()

  # per-epoch history
  train_losses, val_losses = [], []
  train_errors, val_errors = [], []
  epoch_axis = []

  for epoch in range(epochs):
    batch_losses = []
    n_wrong = 0.0
    n_seen = 0

    for sentences, labels in train_dataloader:
      opt.zero_grad()
      logits = net(sentences)
      loss = criterion(logits, labels)
      loss.backward()
      opt.step()

      batch_losses.append(float(loss.item()))
      # predictions come from the logits computed before this optimiser step
      wrong = torch.round(sigmoid(logits)) != labels
      n_seen += len(wrong)
      n_wrong += int(wrong.sum())

    train_losses.append(mean(batch_losses))
    train_errors.append(n_wrong/n_seen)

    val_loss, val_error = eval(net, val_dataloader, criterion, sigmoid)
    val_losses.append(val_loss)
    val_errors.append(val_error)
    epoch_axis.append(epoch)

  testloss, testerr = eval(net, test_dataloader, criterion, sigmoid)

  # one figure per metric: (title, y-axis label, ((legend label, values), ...))
  figures = (
    ("Training and Validation Loss", "Loss",
     (("Training Loss", train_losses), ("Validation Loss", val_losses))),
    ("Training and Validation Errors", "Error",
     (("Training Errors", train_errors), ("Validation Errors", val_errors))),
  )
  for title, ylabel, curves in figures:
    fig, ax = plt.subplots()
    for curve_label, values in curves:
      ax.plot(epoch_axis, values, label = curve_label)
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.legend()

  print("Testing loss: "+str(testloss))
  print("Testing accuracy: "+str(1-testerr))

  return net

In [None]:
def get_closest_words(net, vocab = None, top_n = 5): #5.3
  """Print, for every row of every conv1/conv2 kernel, the most similar vocabulary words.

  Each kernel of shape (k, embedding_dim) is treated as k word-sized vectors;
  every such vector is compared with all vocabulary vectors by cosine
  similarity and the ``top_n`` best matches are printed with their score.

  Args:
    net: module exposing parameters named ``conv1.weight`` and ``conv2.weight``
      with shape (n_kernels, 1, k, embedding_dim).
    vocab: object with ``vectors`` and ``itos``; defaults to the global ``glove``.
    top_n: number of words printed per kernel row.

  Returns:
    None. Results are written to standard output.
  """
  if vocab is None:
    vocab = glove
  params = dict(net.named_parameters())
  vectors = vocab.vectors.cpu()
  n_best = min(top_n, vectors.shape[0])

  for layer in ("conv1", "conv2"):
    name = layer + ".weight"
    # Drop only the channel axis; a bare squeeze() would also remove the kernel
    # or row axis whenever n_kernels == 1 or k == 1.
    kernels = params[name].detach().cpu().squeeze(1)
    for k in range(kernels.shape[0]):
      for j in range(kernels.shape[1]):
        print(name + "[" + str(k) + "][" + str(j) + "] similar words: ")
        sims = torch.cosine_similarity(vectors, kernels[k][j].unsqueeze(0))
        # A kernel row is not a vocabulary entry, so the best match is a real
        # neighbour and must be kept (unlike a word-to-word lookup, where the
        # top hit is the query itself).
        best = torch.topk(sims, n_best)
        for score, idx in zip(best.values.tolist(), best.indices.tolist()):
          print(vocab.itos[idx], "\t%5.2f" % score)

  return

In [None]:
# Sample usage. Training comes first so that `net` exists when this cell is run
# on a fresh kernel; the trained weights are then saved and inspected.
net = train(k1 = 2, k2 = 16, n1 = 15, n2 = 15, epochs = 50, lr = 0.0001, train_dataloader = train_dataloader, val_dataloader = val_dataloader, test_dataloader = test_dataloader)
torch.save(net.state_dict(), 'CNN_classifier_model.pt')
get_closest_words(net)