In [None]:
# Install extra packages into the runtime. tqdm and scikit-learn are imported
# further down; wget is presumably needed by the local `data` module — TODO confirm.
!pip install wget tqdm scikit-learn

In [None]:
# Setup the directory
# Mount Google Drive so the assignment folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')
import os

from pathlib import Path
# Project root on the mounted drive; MODELS and RESULTS are derived from it.
ROOT=Path("/content/drive/MyDrive/VU/deep_learning/A3")
# Make ROOT the working directory (presumably so the local `data` module can
# be imported — TODO confirm it lives in ROOT).
os.chdir(ROOT)
# NOTE(review): 'cdw' in the message below looks like a typo for 'cwd'.
print(f'cdw = {os.getcwd()}')

In [None]:
# Output locations under the project root. Each run creates its own
# sub-directory of MODELS for checkpoints, progress CSVs and hparams.json.
MODELS = ROOT / 'models'
# NOTE(review): RESULTS is not referenced by any cell visible in this notebook.
RESULTS = ROOT / 'results'

In [None]:
import time
import logging
from pathlib import Path
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from data import load_imdb
from collections import defaultdict

logging.basicConfig(level='INFO', format='%(asctime)s - %(message)s')


In [None]:
def load_progress(fname):
    """Read a training-progress CSV and make its ``val-loss`` column numeric.

    A ``val-loss`` cell may be a plain number, empty (NaN), or the textual
    repr of a scalar tensor such as ``"tensor(0.6931)"`` or
    ``"tensor(0.6931, device='cuda:0')"``.  The numeric part is extracted with
    a regular expression instead of ``eval`` so that loading a file can never
    execute code embedded in it.

    Args:
        fname: Path of the CSV file; its first column is used as the index.

    Returns:
        pandas.DataFrame whose parsable ``val-loss`` entries are floats.
        Entries that cannot be parsed are left unchanged.
    """
    import re

    df = pd.read_csv(fname, index_col=0)
    tensor_repr = re.compile(r'tensor\(\s*([^,)\s]+)')

    def to_float(value):
        # Non-strings (floats, NaN) are already in their final form.
        if not isinstance(value, str):
            return value
        text = value.strip()
        match = tensor_repr.match(text)
        if match:
            text = match.group(1)
        try:
            return float(text)
        except ValueError:
            return value

    # Element-wise conversion of the one column that needs it.
    df['val-loss'] = df['val-loss'].map(to_float)
    return df

def plots(df):
  """Draw loss curves (left panel) and accuracy/F1 curves (right panel) per epoch.

  Args:
      df: Progress table with an ``epoch`` column plus metric columns. Only
          ``loss``, ``val-loss``, ``accuracy`` and ``f1`` are plotted.
  """
  long_df = df.melt('epoch', var_name='cols', value_name='vals')
  _, (loss_ax, score_ax) = plt.subplots(ncols=2, sharex=True, figsize=(20,5))

  # (axis, metric names shown on it, y-axis label)
  panels = (
      (loss_ax, ['loss', 'val-loss'], 'loss'),
      (score_ax, ['accuracy', 'f1'], 'accuracy/f1'),
  )
  for axis, metrics, _ in panels:
    subset = long_df[long_df['cols'].isin(metrics)]
    sns.lineplot(x="epoch", y="vals", hue='cols', data=subset, markers=True, ax=axis)
  for axis, _, label in panels:
    axis.set_ylabel(label)
  
def prepare_batches(x_train, y_train, i2w, w2i, max_toks):
    """Group token sequences into padded batches bounded by a token budget.

    Every sequence is wrapped in ``.start``/``.end`` tokens. Sequences are
    visited from longest to shortest and right-padded with ``.pad`` to the
    length of the first (longest) sequence of their batch. The open batch is
    closed when its padded token count plus the length of the next sequence
    reaches ``max_toks``.

    Args:
        x_train: Sequences of integer token ids. The inputs are not modified.
        y_train: Labels aligned with ``x_train``.
        i2w: Index-to-word lookup; only used to print a sample sequence.
        w2i: Word-to-index mapping; must contain ``'.pad'``, ``'.start'`` and
            ``'.end'``.
        max_toks: Token budget per batch.

    Returns:
        List of ``(x, y)`` tuples where ``x`` is a ``torch.long`` tensor of
        shape ``(batch, seq_len)`` and ``y`` is a ``torch.float32`` tensor of
        shape ``(batch, 1)``. Every input sequence appears in exactly one batch.
    """
    # Debug output kept from the original notebook. It is guarded so that
    # datasets with fewer than 142 sequences no longer raise IndexError.
    sample_idx = 141
    if len(x_train) > sample_idx:
        print([i2w[w] for w in x_train[sample_idx]])
    print(len(x_train))

    pad_val = w2i['.pad']
    start_val = w2i['.start']
    end_val = w2i['.end']

    # Build wrapped copies instead of inserting into the caller's lists, so
    # calling this function twice on the same data does not add the markers twice.
    by_length = defaultdict(list)
    for x, y in zip(x_train, y_train):
        wrapped = [start_val, *x, end_val]
        by_length[len(wrapped)].append((wrapped, y))

    def as_batch(rows, labels):
        # Rows all have the same length here, so they stack into one tensor.
        x_tensor = torch.as_tensor(rows, dtype=torch.long)
        y_tensor = torch.as_tensor(labels, dtype=torch.float32).view(-1, 1)
        return x_tensor, y_tensor

    batches = []
    batchx, batchy = [], []
    batch_width = 0  # padded length shared by every row of the open batch
    batch_toks = 0   # tokens (padding included) already in the open batch

    for size in sorted(by_length, reverse=True):
        for x, y in by_length[size]:
            # Close the open batch when this sequence would exhaust the budget.
            # The `batchx` check avoids emitting an empty batch when a single
            # sequence alone meets or exceeds the budget.
            if batchx and batch_toks + len(x) >= max_toks:
                batches.append(as_batch(batchx, batchy))
                batchx, batchy = [], []
                batch_toks = 0
            if not batchx:
                # The first sequence of a batch is its longest one.
                batch_width = len(x)

            padded = x + [pad_val] * (batch_width - len(x))
            batchx.append(padded)
            batchy.append(y)
            batch_toks += len(padded)

    # Emit the last, partially filled batch (it used to be silently dropped).
    if batchx:
        batches.append(as_batch(batchx, batchy))
    return batches

# Models

In [None]:
class MLP(nn.Module):
    """Token-wise MLP classifier with global max pooling over the sequence.

    Pipeline: embedding -> linear + ReLU applied to every token -> maximum over
    the sequence axis -> linear output layer.

    Args:
        output_size: Number of output logits.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_size: Width of the per-token hidden representation.
    """

    def __init__(self, output_size, vocab_size, embedding_dim=300, hidden_size=300):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.layer1 = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.layer2 = nn.Linear(embedding_dim, hidden_size)
        self.layer3 = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, output_size)``."""
        per_token = F.relu(self.layer2(self.layer1(input)))
        # max_pool1d pools over the last axis, so move the sequence axis there.
        channels_first = per_token.permute(0, 2, 1)
        seq_len = channels_first.shape[2]
        pooled = F.max_pool1d(channels_first, kernel_size=seq_len).squeeze(-1)
        return self.layer3(pooled)


In [None]:
class HParams:
  """Base class for hyper-parameter sets that are declared as class attributes."""

  @classmethod
  def save(cls, fname):
    """Write the hyper-parameters of ``cls`` to ``fname`` as one JSON object.

    The class must define every attribute named in ``fields``; a missing one
    raises ``AttributeError``.
    """
    fields = ('embedding_dim', 'hidden_dim', 'num_cls', 'lr', 'epochs', 'batch_size')
    with Path(fname).open('w') as fp:
      snapshot = {field: getattr(cls, field) for field in fields}
      json.dump(snapshot, fp)

# Training loop and evaluation

In [None]:
def eval_model_bin(model, eval_batches, device):
    """Evaluate a binary classifier that produces one logit per example.

    The model is switched to evaluation mode for the duration of the call so
    that dropout layers are disabled. Its previous training/eval mode is
    restored before returning.

    Args:
        model: Module mapping a batch ``x`` to logits with the same shape as ``y``.
        eval_batches: Iterable of ``(x, y)`` tensor pairs; ``y`` holds the 0/1
            targets with a trailing singleton dimension.
        device: Device the inputs are moved to for the forward pass.

    Returns:
        Tuple ``(accuracy, f1, mean_loss)``. ``mean_loss`` is the unweighted
        mean of the per-batch BCE-with-logits losses as a Python float.
        Predictions are thresholded at a probability of 0.5.
    """
    criterion = nn.BCEWithLogitsLoss()
    y_trues, y_preds, val_losses = [], [], []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in eval_batches:
                # Loss and metrics are computed on the CPU.
                logits = model(x.to(device)).cpu()
                y = y.cpu()
                val_losses.append(criterion(logits, y).item())
                y_trues.append(y.squeeze(-1))
                y_preds.append(torch.sigmoid(logits).squeeze(-1))
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    mean_val_loss = torch.as_tensor(val_losses, dtype=torch.float32).mean()
    y_true = torch.cat(y_trues).numpy()
    y_pred = np.where(torch.cat(y_preds).numpy() < 0.5, 0, 1)

    return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred), float(mean_val_loss)

def train(model: nn.Module, criterion, optimizer, epochs: int, batches, eval_batches, device: torch.device, models_dir, loss_print_freq: int = 30):
    """Train ``model`` and periodically evaluate it on ``eval_batches``.

    After every epoch the model is also evaluated on the training batches.
    A checkpoint ``ckpt{epoch}.pt`` and the progress collected so far
    (``progress{epoch}.csv``) are then written to ``models_dir``.

    Args:
        model: Network to optimise; it is moved to ``device``.
        criterion: Loss called as ``criterion(model(x), y.float())``.
        optimizer: Optimizer stepping the parameters of ``model``.
        epochs: Number of passes over ``batches``.
        batches: Sized sequence of ``(x, y)`` training tensor pairs.
        eval_batches: Batches handed to ``eval_model_bin`` for evaluation.
        device: Device used for the forward and backward passes.
        models_dir: ``pathlib.Path`` of the output directory; it is not
            created here.
        loss_print_freq: Evaluate and log every this many updates. Values
            below 1 are treated as 1.

    Returns:
        Tuple ``(data, model, final_scores)``. ``data`` is a list with one
        dict per update (keys ``update``, ``epoch``, ``loss``, ``val-loss``,
        ``accuracy``, ``f1``; the last three are ``None`` unless an evaluation
        ran on that update). ``final_scores`` is the ``eval_model_bin`` result
        on ``eval_batches`` after the last epoch.
    """
    # Callers derive the frequency from the loader size (e.g. len(loader) / 10).
    # That truncates to 0 for short loaders and would make the modulo below
    # raise ZeroDivisionError.
    if loss_print_freq < 1:
        loss_print_freq = 1

    def evaluate(on_batches):
        # Evaluate with dropout disabled, then go back to training mode.
        model.eval()
        try:
            return eval_model_bin(model, on_batches, device)
        finally:
            model.train()

    model.to(device)
    model.train()

    # Capture training starting time
    ts_train = time.perf_counter()

    running_loss = 0.0

    # list for training progress capturing
    data = list()
    for epoch in tqdm(range(epochs), desc='epochs'):
        ts = time.perf_counter()
        for i, (x, y) in tqdm(enumerate(batches), total=len(batches), desc=f'Epoch {epoch}'):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y.float())
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            running_loss += batch_loss
            accuracy, f1, val_loss = None, None, None
            # Evaluate and log once every `loss_print_freq` updates.
            if i % loss_print_freq == loss_print_freq - 1:
                accuracy, f1, val_loss = evaluate(eval_batches)
                logging.info('[%d, %5d] loss: %.3f v-loss: %.3f acc: %.2f f1: %.2f' %
                    (epoch +1, i+1, running_loss / loss_print_freq, val_loss, accuracy, f1))
                running_loss = 0.0

            data.append({'update' : i, 'epoch': epoch, 'loss': batch_loss, 'val-loss': val_loss, 'accuracy' : accuracy, 'f1': f1})

        train_accuracy, train_f1, train_loss = evaluate(batches)
        # len(batches) equals the old `i+1` but is also defined for an empty loop.
        logging.info('[%d, %5d] train-loss: %.3f acc: %.2f f1: %.2f' %
          (epoch +1, len(batches), train_loss, train_accuracy, train_f1))
        logging.info(f'Epoch took: {time.perf_counter()-ts:.2f}s')
        ckpt = models_dir / f'ckpt{epoch}.pt'
        logging.info(f'Saving model ckpt to: {ckpt}')
        # The whole module object is pickled (not just its state_dict), so
        # loading needs the model class to be importable.
        torch.save(model, ckpt)

        prog = models_dir / f'progress{epoch}.csv'
        logging.info(f'Saving progress file to: {prog}')

        # Cumulative: every progress file contains all updates so far.
        df = pd.DataFrame(data)
        df.to_csv(prog)
    final_scores = evaluate(eval_batches)

    logging.info(f'Finished training. {epochs} epochs took: {time.perf_counter()-ts_train:.2f}s')
    return data, model, final_scores

# Define dataset

In [None]:
class MLP_HParams(HParams):
  """Default hyper-parameters for the MLP run; the other configurations in this notebook subclass it."""
  embedding_dim = 300
  hidden_dim = 300
  num_cls = 1  # passed to the models as `output_size` (a single logit)
  lr = 0.01  # learning rate handed to torch.optim.Adam
  epochs = 5
  # NOTE(review): batch_size is only written to hparams.json; the batches
  # themselves are sized by the token budget given to prepare_batches.
  batch_size = 128

In [None]:
# Load the data and the vocabulary lookups from the local `data` module.
# NOTE(review): `final=False` presumably selects the train/validation split — confirm in data.py.
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=False)
# 20000 is the per-batch token budget (`max_toks`).
trainloader = prepare_batches(x_train, y_train, i2w, w2i, 20000)
valloader = prepare_batches(x_val, y_val, i2w, w2i, 20000)

# prepare_batches emits batches ordered from longest to shortest sequences;
# shuffle the batch order in place. No seed is set, so the order differs per run.
np.random.shuffle(trainloader)
np.random.shuffle(valloader)

# Use the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 

In [None]:


# Train the baseline MLP with the default hyper-parameters.
model = MLP(
    embedding_dim=MLP_HParams.embedding_dim,
    output_size=MLP_HParams.num_cls,
    hidden_size=MLP_HParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=MLP_HParams.lr)

MODEL_DIR = MODELS / 'new_MLP'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# train() returns (data, model, final_scores); unpacking only two names raised
# "ValueError: too many values to unpack".
# `// 10 or 1`: evaluate about ten times per epoch, never with a frequency of 0.
data, model, final_scores = train(model, criterion,
                                  optimizer,
                                  epochs=MLP_HParams.epochs,
                                  batches=trainloader,
                                  eval_batches=valloader,
                                  models_dir=MODEL_DIR,
                                  device=device,
                                  loss_print_freq=len(trainloader) // 10 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
MLP_HParams.save(MODEL_DIR / 'hparams.json')
plots(df)

# MLP HParams tuning

In [None]:
class MLP_HParamsTune(MLP_HParams):
  """Shared settings of the MLP tuning runs: the defaults with fewer epochs."""
  epochs = 3

# Each variant overrides only the sizes it explores.
class hp1(MLP_HParamsTune):
  embedding_dim = 150
  hidden_dim = 150

class hp2(MLP_HParamsTune):
  embedding_dim = 600
  hidden_dim = 600

class hp3(MLP_HParamsTune):
  embedding_dim = 150

class hp4(MLP_HParamsTune):
  hidden_dim = 150

# Wrap the list (not the enumerate object) so tqdm knows the total.
for i, params in enumerate(tqdm([hp1, hp2, hp3, hp4], desc='hparams')):
  model = MLP(
      embedding_dim=params.embedding_dim,
      output_size=params.num_cls,
      hidden_size=params.hidden_dim,
      vocab_size=len(i2w),
  )
  criterion = nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)

  MODEL_DIR = MODELS / f'new_MLP{i}'
  MODEL_DIR.mkdir(exist_ok=True, parents=True)
  # train() returns three values; unpacking two raised ValueError.
  data, model, final_scores = train(model,
                                    criterion,
                                    optimizer,
                                    epochs=params.epochs,
                                    batches=trainloader,
                                    eval_batches=valloader,
                                    models_dir=MODEL_DIR,
                                    device=device,
                                    loss_print_freq=len(trainloader) // 10 or 1)
  df = pd.DataFrame(data)
  df.to_csv(MODEL_DIR / 'progress.csv')
  params.save(MODEL_DIR / 'hparams.json')

In [None]:
class LstmNetMaxPool(nn.Module):
    """LSTM classifier that max-pools the LSTM outputs over the sequence.

    Pipeline: embedding -> dropout -> LSTM -> maximum over the sequence axis
    -> dropout -> linear output layer.

    Args:
        output_size: Number of output logits.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings, on the pooled
            features and, when ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, output_size, vocab_size, embedding_dim=300, hidden_size=300, num_layers=1, dropout=0.3):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dropout_rate = dropout
        self.layer1 = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout *between* stacked layers. With a single
        # layer a non-zero value does nothing and triggers a UserWarning, so
        # pass 0 in that case.
        self.layer2 = nn.LSTM(input_size=embedding_dim,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              batch_first=True,
                              dropout=dropout if num_layers > 1 else 0.0,
                              )
        self.layer3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, output_size)``."""
        emb = self.dropout(self.layer1(input))
        # Only the per-step outputs are used; the final states are discarded.
        output, _ = self.layer2(emb)
        # max_pool1d pools over the last axis, so move the sequence axis there.
        permuted = output.permute(0, 2, 1)
        pooled = F.max_pool1d(permuted, kernel_size=permuted.shape[2]).squeeze(-1)
        return self.layer3(self.dropout(pooled))

In [None]:
class LSTM_HParams(MLP_HParams):
  """MLP defaults with a smaller learning rate and more epochs for the LSTM."""
  lr = 0.001
  epochs = 10

model = LstmNetMaxPool(
    embedding_dim=LSTM_HParams.embedding_dim,
    output_size=LSTM_HParams.num_cls,
    hidden_size=LSTM_HParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LSTM_HParams.lr)

MODEL_DIR = MODELS / 'LSTM_MaxPool'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# train() returns three values; unpacking two raised ValueError.
data, model, final_scores = train(model, criterion,
                                  optimizer,
                                  epochs=LSTM_HParams.epochs,
                                  batches=trainloader,
                                  eval_batches=valloader,
                                  models_dir=MODEL_DIR,
                                  device=device,
                                  loss_print_freq=len(trainloader) // 5 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
# Save the settings actually used. MLP_HParams.save would record lr=0.01 and
# epochs=5 instead of this run's values.
LSTM_HParams.save(MODEL_DIR / 'hparams.json')
plots(df)

In [None]:
class RnnNetMaxPool(nn.Module):
    """Elman-RNN classifier that max-pools the RNN outputs over the sequence.

    Pipeline: embedding -> dropout -> RNN -> maximum over the sequence axis
    -> dropout -> linear output layer.

    Args:
        output_size: Number of output logits.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_size: Width of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        dropout: Dropout probability used on the embeddings, on the pooled
            features and, when ``num_layers > 1``, between RNN layers.
    """

    def __init__(self, output_size, vocab_size, embedding_dim=300, hidden_size=300, num_layers=1, dropout=0.3):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dropout_rate = dropout
        self.layer1 = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        # nn.RNN only applies dropout *between* stacked layers. With a single
        # layer a non-zero value does nothing and triggers a UserWarning, so
        # pass 0 in that case.
        self.layer2 = nn.RNN(input_size=embedding_dim,
                             hidden_size=hidden_size,
                             num_layers=num_layers,
                             batch_first=True,
                             dropout=dropout if num_layers > 1 else 0.0,
                             )
        self.layer3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, output_size)``."""
        emb = self.dropout(self.layer1(input))
        # Only the per-step outputs are used; the final hidden state is discarded.
        output, _ = self.layer2(emb)
        # max_pool1d pools over the last axis, so move the sequence axis there.
        permuted = output.permute(0, 2, 1)
        pooled = F.max_pool1d(permuted, kernel_size=permuted.shape[2]).squeeze(-1)
        return self.layer3(self.dropout(pooled))

In [None]:
class RNN_HParams(MLP_HParams):
  """MLP defaults with a smaller learning rate and more epochs for the RNN."""
  lr = 0.001
  epochs = 10

model = RnnNetMaxPool(
    embedding_dim=RNN_HParams.embedding_dim,
    output_size=RNN_HParams.num_cls,
    hidden_size=RNN_HParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=RNN_HParams.lr)

MODEL_DIR = MODELS / 'RNN_MaxPool'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# train() returns three values; unpacking two raised ValueError.
data, model, final_scores = train(model, criterion,
                                  optimizer,
                                  epochs=RNN_HParams.epochs,
                                  batches=trainloader,
                                  eval_batches=valloader,
                                  models_dir=MODEL_DIR,
                                  device=device,
                                  loss_print_freq=len(trainloader) // 5 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
# Save the settings actually used. MLP_HParams.save would record lr=0.01 and
# epochs=5 instead of this run's values.
RNN_HParams.save(MODEL_DIR / 'hparams.json')
plots(df)

In [None]:
# Re-declared so this cell can run without the single-run RNN cell above;
# the values are the same as there.
class RNN_HParams(MLP_HParams):
  lr = 0.001
  epochs = 10

# Each variant overrides only the sizes it explores.
class hp1(RNN_HParams):
  embedding_dim = 150
  hidden_dim = 150

class hp2(RNN_HParams):
  embedding_dim = 600
  hidden_dim = 600

class hp3(RNN_HParams):
  embedding_dim = 150

class hp4(RNN_HParams):
  hidden_dim = 150

for i, hp in enumerate([hp1, hp2, hp3, hp4]):
  model = RnnNetMaxPool(
      embedding_dim=hp.embedding_dim,
      output_size=hp.num_cls,
      hidden_size=hp.hidden_dim,
      vocab_size=len(i2w),
  )
  criterion = nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=hp.lr)

  MODEL_DIR = MODELS / f'RNN_MaxPool_{i}'
  MODEL_DIR.mkdir(exist_ok=True, parents=True)
  # train() returns three values; unpacking two raised ValueError.
  data, model, final_scores = train(model, criterion,
                                    optimizer,
                                    epochs=hp.epochs,
                                    batches=trainloader,
                                    eval_batches=valloader,
                                    models_dir=MODEL_DIR,
                                    device=device,
                                    loss_print_freq=len(trainloader) // 5 or 1)
  df = pd.DataFrame(data)
  df.to_csv(MODEL_DIR / 'progress.csv')
  hp.save(MODEL_DIR / 'hparams.json')
  plots(df)

In [None]:
# Best validation accuracy recorded during the single RNN run.
# The path is built from MODELS rather than repeating the absolute Drive path.
rnn_progress = load_progress(MODELS / 'RNN_MaxPool' / 'progress.csv')
# Series.max() skips the NaN rows (updates without an evaluation), which gives
# the same value as the former filter + groupby + max chain.
best_accuracy = rnn_progress['accuracy'].max()
best_accuracy

In [None]:
# Best validation accuracy of each of the four RNN tuning runs.
# The list used to be called `max`, which shadowed the builtin for every cell
# executed afterwards.
best_accuracies = []
for i in range(4):
  run_progress = load_progress(MODELS / f'RNN_MaxPool_{i}' / 'progress.csv')
  # Series.max() skips the NaN rows (updates without an evaluation).
  best_accuracies.append(run_progress['accuracy'].max())
for best in best_accuracies:
  print(f'{best:.6f}')

In [None]:
# Report the status of the NVIDIA GPU attached to this runtime, if any.
!nvidia-smi

In [None]:
# Final RNN run.
# NOTE(review): `final=True` presumably switches load_imdb to the full-train /
# test split, so `valloader` holds test data here — confirm in data.py.
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=True)
trainloader = prepare_batches(x_train, y_train, i2w, w2i, 20000)
valloader = prepare_batches(x_val, y_val, i2w, w2i, 20000)
np.random.shuffle(trainloader)
np.random.shuffle(valloader)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class FinalHParams(MLP_HParams):
  embedding_dim = 300
  hidden_dim = 300
  lr = 0.001
  epochs = 10

model = RnnNetMaxPool(
    embedding_dim=FinalHParams.embedding_dim,
    output_size=FinalHParams.num_cls,
    hidden_size=FinalHParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=FinalHParams.lr)

MODEL_DIR = MODELS / 'RNN_MaxPool_final'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# `// 10 or 1`: evaluate about ten times per epoch, never with a frequency of 0.
data, model, final_scores = train(model, criterion,
                     optimizer,
                     epochs=FinalHParams.epochs,
                     batches=trainloader,
                     eval_batches=valloader,
                     models_dir=MODEL_DIR,
                     device=device,
                     loss_print_freq=len(trainloader) // 10 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
# Save the settings actually used. MLP_HParams.save would record lr=0.01 and
# epochs=5 instead of this run's values.
FinalHParams.save(MODEL_DIR / 'hparams.json')
plots(df)

In [None]:
def plots_test(df):
  """Plot loss and accuracy/F1 curves with the evaluation loss labelled ``test-loss``.

  Same two-panel figure as ``plots``, except that the ``val-loss`` column is
  shown under the name ``test-loss``. The caller's DataFrame is left untouched.

  Args:
      df: Progress table with ``epoch``, ``loss``, ``val-loss``, ``accuracy``
          and ``f1`` columns.

  Raises:
      KeyError: If ``df`` has no ``val-loss`` column.
  """
  # rename() returns a new frame, so no 'test-loss' column leaks into `df`.
  renamed = df.rename(columns={'val-loss': 'test-loss'}, errors='raise')
  melt = renamed.melt('epoch', var_name='cols', value_name='vals')
  _, ax = plt.subplots(ncols=2, sharex=True, figsize=(20,5))

  sns.lineplot(x="epoch", y="vals", hue='cols', data=melt[melt['cols'].isin(['loss', 'test-loss'])], markers=True, ax=ax[0])
  sns.lineplot(x="epoch", y="vals", hue='cols', data=melt[melt['cols'].isin(['accuracy', 'f1'])], markers=True, ax=ax[1])
  ax[0].set_ylabel('loss')
  ax[1].set_ylabel('accuracy/f1')
plots_test(df)

In [None]:
# Display the (accuracy, f1, loss) tuple returned by the most recent train() call that assigned it.
final_scores

In [None]:
# Final LSTM run.
# NOTE(review): `final=True` presumably switches load_imdb to the full-train /
# test split, so `valloader` holds test data here — confirm in data.py.
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=True)
trainloader = prepare_batches(x_train, y_train, i2w, w2i, 20000)
valloader = prepare_batches(x_val, y_val, i2w, w2i, 20000)
np.random.shuffle(trainloader)
np.random.shuffle(valloader)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class FinalHParams(MLP_HParams):
  embedding_dim = 600
  hidden_dim = 600
  lr = 0.001
  epochs = 10

model = LstmNetMaxPool(
    embedding_dim=FinalHParams.embedding_dim,
    output_size=FinalHParams.num_cls,
    hidden_size=FinalHParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=FinalHParams.lr)

MODEL_DIR = MODELS / 'LSTM_MaxPool_final'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# `// 5 or 1`: evaluate about five times per epoch, never with a frequency of 0.
data, model, final_scores = train(model, criterion,
                     optimizer,
                     epochs=FinalHParams.epochs,
                     batches=trainloader,
                     eval_batches=valloader,
                     models_dir=MODEL_DIR,
                     device=device,
                     loss_print_freq=len(trainloader) // 5 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
# Save the settings actually used. MLP_HParams.save would record the MLP
# defaults (300/300, lr=0.01, 5 epochs) instead of this run's values.
FinalHParams.save(MODEL_DIR / 'hparams.json')
plots(df)

In [None]:
# Final MLP run (10 epochs).
# NOTE(review): `final=True` presumably switches load_imdb to the full-train /
# test split, so `valloader` holds test data here — confirm in data.py.
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=True)
trainloader = prepare_batches(x_train, y_train, i2w, w2i, 20000)
valloader = prepare_batches(x_val, y_val, i2w, w2i, 20000)
np.random.shuffle(trainloader)
np.random.shuffle(valloader)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class FinalHParams(MLP_HParams):
  embedding_dim = 150
  hidden_dim = 600
  lr = 0.01
  epochs = 10

model = MLP(
    embedding_dim=FinalHParams.embedding_dim,
    output_size=FinalHParams.num_cls,
    hidden_size=FinalHParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=FinalHParams.lr)

MODEL_DIR = MODELS / 'MLP_final'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# `// 5 or 1`: evaluate about five times per epoch, never with a frequency of 0.
data, model, final_scores = train(model, criterion,
                     optimizer,
                     epochs=FinalHParams.epochs,
                     batches=trainloader,
                     eval_batches=valloader,
                     models_dir=MODEL_DIR,
                     device=device,
                     loss_print_freq=len(trainloader) // 5 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
# Save the settings actually used. MLP_HParams.save would record the defaults
# (300/300, 5 epochs) instead of this run's values.
FinalHParams.save(MODEL_DIR / 'hparams.json')
plots(df)

In [None]:
# Final MLP run with fewer epochs.
# NOTE(review): `final=True` presumably switches load_imdb to the full-train /
# test split, so `valloader` holds test data here — confirm in data.py.
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=True)
trainloader = prepare_batches(x_train, y_train, i2w, w2i, 20000)
valloader = prepare_batches(x_val, y_val, i2w, w2i, 20000)
np.random.shuffle(trainloader)
np.random.shuffle(valloader)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class FinalHParams(MLP_HParams):
  embedding_dim = 150
  hidden_dim = 600
  lr = 0.01
  epochs = 4

model = MLP(
    embedding_dim=FinalHParams.embedding_dim,
    output_size=FinalHParams.num_cls,
    hidden_size=FinalHParams.hidden_dim,
    vocab_size=len(i2w),
)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=FinalHParams.lr)

# NOTE(review): the directory name says '2ep' but epochs is 4 — confirm which
# one is intended. The name is kept so existing artifacts stay where they are.
MODEL_DIR = MODELS / 'MLP_final_2ep'
MODEL_DIR.mkdir(exist_ok=True, parents=True)
# `// 5 or 1`: evaluate about five times per epoch, never with a frequency of 0.
data, model, final_scores = train(model, criterion,
                     optimizer,
                     epochs=FinalHParams.epochs,
                     batches=trainloader,
                     eval_batches=valloader,
                     models_dir=MODEL_DIR,
                     device=device,
                     loss_print_freq=len(trainloader) // 5 or 1)
df = pd.DataFrame(data)
df.to_csv(MODEL_DIR / 'progress.csv')
# Save the settings actually used. MLP_HParams.save would record the defaults
# (300/300, 5 epochs) instead of this run's values.
FinalHParams.save(MODEL_DIR / 'hparams.json')
plots(df)

In [None]:
# Display the (accuracy, f1, loss) tuple returned by the most recent train() call that assigned it.
final_scores