In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import BatchNorm1d, ReLU, CrossEntropyLoss
from torch import optim
from torch.optim.lr_scheduler import StepLR, MultiplicativeLR, CosineAnnealingLR

import numpy as np

## 1 Load Dataset

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [16]:
%cd /content
!pip install --upgrade kaggle
!mkdir .kaggle

/content
Requirement already up-to-date: kaggle in /usr/local/lib/python3.6/dist-packages (1.5.10)


In [17]:
import json
token = {"username":"kexinzhangcmu","key":"91f327ebd62f2d3cfeb09f0344748606"}

with open('/content/.kaggle/kaggle.json', 'w') as file:
      json.dump(token, file)

In [20]:
!chmod 600 /content/.kaggle/kaggle.json
!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!kaggle config set -n path -v /content/
!kaggle competitions download -c 11-785-fall-20-homework-1-part-2

- path is now set to: /content/
Downloading sample.csv.zip to /content/competitions/11-785-fall-20-homework-1-part-2
  0% 0.00/3.36M [00:00<?, ?B/s]
100% 3.36M/3.36M [00:00<00:00, 55.9MB/s]
Downloading train_labels.npy.zip to /content/competitions/11-785-fall-20-homework-1-part-2
 53% 5.00M/9.45M [00:00<00:00, 22.0MB/s]
100% 9.45M/9.45M [00:00<00:00, 31.5MB/s]
Downloading test.npy.zip to /content/competitions/11-785-fall-20-homework-1-part-2
 99% 97.0M/98.1M [00:03<00:00, 22.5MB/s]
100% 98.1M/98.1M [00:03<00:00, 33.0MB/s]
Downloading phones.txt to /content/competitions/11-785-fall-20-homework-1-part-2
  0% 0.00/3.19k [00:00<?, ?B/s]
100% 3.19k/3.19k [00:00<00:00, 3.05MB/s]
Downloading dev_labels.npy.zip to /content/competitions/11-785-fall-20-homework-1-part-2
  0% 0.00/622k [00:00<?, ?B/s]
100% 622k/622k [00:00<00:00, 194MB/s]
Downloading train.npy.zip to /content/competitions/11-785-fall-20-homework-1-part-2
 99% 1.56G/1.57G [00:32<00:00, 38.8MB/s]
100% 1.57G/1.57G [00:32<00:00, 52.5

In [21]:
%cd /content/competitions/11-785-fall-20-homework-1-part-2
!unzip \*.zip
!rm -rf *.zip

/content/competitions/11-785-fall-20-homework-1-part-2
Archive:  train.npy.zip
  inflating: train.npy               

Archive:  dev_labels.npy.zip
  inflating: dev_labels.npy          

Archive:  dev.npy.zip
  inflating: dev.npy                 

Archive:  test.npy.zip
  inflating: test.npy                

Archive:  train_labels.npy.zip
  inflating: train_labels.npy        

Archive:  sample.csv.zip
  inflating: sample.csv              

6 archives were successfully processed.


In [10]:
context_size = 15

In [23]:
DATA_PATH="/content/competitions/11-785-fall-20-homework-1-part-2/"
def load_data(DATA_PATH):
    """load data"""
    train = np.load(DATA_PATH + "train.npy",allow_pickle=True)
    train_labels = np.load(DATA_PATH + "train_labels.npy",allow_pickle=True)
    dev = np.load(DATA_PATH + 'dev.npy', allow_pickle=True)
    dev_labels = np.load(DATA_PATH + 'dev_labels.npy', allow_pickle=True)
    
    return train, train_labels, dev, dev_labels 


train, train_labels, dev, dev_labels = load_data(DATA_PATH)

In [6]:
cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [25]:
class hw1Dataset(Dataset):
    def __init__(self, X, Y, context_size = 12):
        self.context_size = context_size
        self.span = 2 * self.context_size + 1

        features = np.pad(np.concatenate(X), 
                          pad_width = ((context_size, context_size), (0,0)))
        
        labels = np.concatenate(Y)
        
        # assert(len(features) == len(labels))

        print("# of examples:", len(features), "//", len(labels))
        self.X = torch.tensor(features)
        self.Y = torch.tensor(labels)
    
    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, index):
        X = self.X[index : index + self.span].float().reshape(-1)
        Y = self.Y[index].long().reshape(-1)
                
        return X, Y
    

num_workers = 4 if cuda else 0

train_dateset = hw1Dataset(train, train_labels, context_size)
train_loader_args = dict(shuffle = True, batch_size = 512,
                        num_workers = num_workers, pin_memory = True, drop_last = True)\
if cuda else dict(shuffle = True, batch_size = 512)

train_loader = DataLoader(train_dateset, **train_loader_args)

# of examples: 27329567 // 27329537


In [28]:
dev_dataset = hw1Dataset(dev, dev_labels, context_size)
dev_loader_args = dict(shuffle = False, batch_size = 512,
                        num_workers = num_workers, pin_memory = True, drop_last = True)\
if cuda else dict(shuffle = False, batch_szie = 512)

dev_loader = DataLoader(dev_dataset, **dev_loader_args)

# of examples: 1598434 // 1598404


In [None]:
del train
del train_labels
del dev
del dev_labels

## 2 Model Architechture

In [29]:
# create a more customizable network module (equivalent here)
class MyNetwork(torch.nn.Module):
    # you can use the layer sizes as initialization arguments if you want to
    def __init__(self,input_size, output_size=346):
        super().__init__()
        

        layers = [
                  
                  torch.nn.Linear(input_size,2048),
                  ReLU(),
                  BatchNorm1d(2048),

                  torch.nn.Linear(2048, 2048),
                  ReLU(),
                  BatchNorm1d(2048),

                  torch.nn.Linear(2048, 1024),
                  ReLU(),
                  BatchNorm1d(1024),

                  torch.nn.Linear(1024, 1024),
                  ReLU(),
                  BatchNorm1d(1024),

                  torch.nn.Linear(1024, 1024),
                  ReLU(),
                  BatchNorm1d(1024),

                  torch.nn.Linear(1024, 1024),
                  ReLU(),
                  BatchNorm1d(1024),

                  torch.nn.Linear(1024, 1024),
                  ReLU(),
                  BatchNorm1d(1024),

                  torch.nn.Linear(1024, 1024),
                  ReLU(),
                  BatchNorm1d(1024),

                  torch.nn.Linear(1024, 768),
                  ReLU(),
                  BatchNorm1d(768),

                  torch.nn.Linear(768, 512),
                  ReLU(),
                  BatchNorm1d(512),

                  torch.nn.Linear(512, 346)
        ]

        self.layers = torch.nn.Sequential(*layers)

    def forward(self, input_val):
        return self.layers(input_val)

model = MyNetwork(13 * (2 * context_size + 1),346)

## 3 Set Hyperparameters

In [41]:
device = torch.device("cuda" if cuda else "GPU")
model.to(device)
NUM_EPOCHS = 4 # TODO: for test train 1 epoch only
criterion = CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-3, weight_decay = 5e-5)

lmbda = lambda epoch: 0.85
scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)

## 4 Train model

In [34]:
def validate(model, dataloader):
    """
    Validation routine, tests on val data, scores accuracy
    Relevant Args:
        dev dataset loader
    Returns:
        float: Accuracy = correct / total
    """
    #TODO: implement validation based on pseudocode
    model.eval()
    
    total = 0
    num_correct = 0
    with torch.no_grad():
      for idx, (data, labels) in enumerate(dataloader):
        data = data.to(device)
        labels = labels.to(device)
              
        out = model.forward(data)
        _, predicted = torch.max(out.data, 1)
        num_correct += (predicted == labels.squeeze()).sum().item()
        total += labels.size(0)
            
    return num_correct / total

In [53]:
def train_for_one_epoch(model, train_loader, dev_loader, optimizer, model_name:str, epoch:int, scheduler = None, if_save = False):
    """
    train for one epoch
    """
    model.train()
    before = time.time()
    
    for idx, (x, y) in enumerate(train_loader):

        model.train()

        optimizer.zero_grad()
        
        x = x.to(device)
        y = y.to(device)
        
        output = model(x)


        loss = criterion(output, y.squeeze()) # softmax
        
        loss.backward()
        optimizer.step() # update the weights using the computed gradients
        

        if idx % 2000 == 1999:
            model.eval()
            # dev_accuracy = validate(model, dev_loader)
            
            print("Epoch {}/{}\tTraining Loss: {:.3f}\ttakes {:.3f} seconds"\
                .format(epoch+1, idx + 1, loss.item(), int(time.time()-before)))
            before=time.time()
            model.train()


            
    if scheduler:    
        scheduler.step()
    
    model.eval()
    dev_accuracy = validate(model, dev_loader)
    print("Epoch {}/{}\tTraining Loss: {:.3f}\tDev Accuracy: {:.3f}\ttakes {:.3f} seconds"\
          .format(epoch+1, idx + 1, loss.item(), dev_accuracy, int(time.time()-before)))
    before=time.time()

    if if_save:

        torch.save({
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
            }, "/content/gdrive/MyDrive/11685deeplearning/hw1p2" + model_name+"_"+"{:.3f}".format(dev_accuracy))

In [49]:
scheduler.get_last_lr()

[5.2200625000000005e-05]

In [55]:
cp /content/competitions/11-785-fall-20-homework-1-part-2/model* /content/gdrive/MyDrive/11685deeplearning/hw1p2

In [56]:
import time

before = time.time()

model_prefix = "model_"
scheduler.step()
for epoch in range(NUM_EPOCHS):
    model.train()
    
    train_for_one_epoch(model = model, train_loader = train_loader, dev_loader = dev_loader,  
                        optimizer = optimizer, model_name = model_prefix + str(epoch), epoch = epoch,
                        scheduler = scheduler, if_save = True)
print('Finished Training')

Epoch 1/2000	Training Loss: 0.928	takes 35.000 seconds
Epoch 1/4000	Training Loss: 0.904	takes 29.000 seconds
Epoch 1/6000	Training Loss: 0.982	takes 29.000 seconds
Epoch 1/8000	Training Loss: 0.903	takes 30.000 seconds
Epoch 1/10000	Training Loss: 1.012	takes 29.000 seconds
Epoch 1/12000	Training Loss: 1.062	takes 29.000 seconds
Epoch 1/14000	Training Loss: 0.849	takes 29.000 seconds
Epoch 1/16000	Training Loss: 0.802	takes 29.000 seconds
Epoch 1/18000	Training Loss: 0.829	takes 29.000 seconds
Epoch 1/20000	Training Loss: 0.982	takes 29.000 seconds
Epoch 1/22000	Training Loss: 0.958	takes 28.000 seconds
Epoch 1/24000	Training Loss: 0.811	takes 28.000 seconds
Epoch 1/26000	Training Loss: 0.802	takes 28.000 seconds
Epoch 1/28000	Training Loss: 0.881	takes 28.000 seconds
Epoch 1/30000	Training Loss: 0.900	takes 28.000 seconds
Epoch 1/32000	Training Loss: 0.946	takes 29.000 seconds
Epoch 1/34000	Training Loss: 1.017	takes 28.000 seconds
Epoch 1/36000	Training Loss: 0.999	takes 29.000 seco

In [52]:
optimizer = optim.Adam(model.parameters(), lr = 1e-4, weight_decay = 5e-5)
lmbda = lambda epoch: 0.85
scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)


import time

before = time.time()

for epoch in range(8, 8 + NUM_EPOCHS):
    train_for_one_epoch(model = model, train_loader = train_loader, dev_loader = dev_loader,  
                        optimizer = optimizer, model_name = model_prefix + str(epoch), epoch = epoch,
                        scheduler = scheduler, if_save = True)

Epoch 5/2000	Training Loss: 1.006	takes 32.000 seconds
Epoch 5/4000	Training Loss: 0.917	takes 29.000 seconds
Epoch 5/6000	Training Loss: 0.987	takes 29.000 seconds


KeyboardInterrupt: ignored

In [51]:
help(CosineAnnealingLR)

Help on class CosineAnnealingLR in module torch.optim.lr_scheduler:

class CosineAnnealingLR(_LRScheduler)
 |  Set the learning rate of each parameter group using a cosine annealing
 |  schedule, where :math:`\eta_{max}` is set to the initial lr and
 |  :math:`T_{cur}` is the number of epochs since the last restart in SGDR:
 |  
 |  .. math::
 |      \begin{aligned}
 |          \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
 |          + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
 |          & T_{cur} \neq (2k+1)T_{max}; \\
 |          \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
 |          \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
 |          & T_{cur} = (2k+1)T_{max}.
 |      \end{aligned}
 |  
 |  When last_epoch=-1, sets initial lr as lr. Notice that because the schedule
 |  is defined recursively, the learning rate can be simultaneously modified
 |  outside this scheduler by other operators. If the learning rate is set

In [None]:
optimizer = optim.Adam(model.parameters(), lr = 1e-4, weight_decay = 5e-5)
lmbda = lambda epoch: 0.85
scheduler = CosineAnnealingLR(optimizer, eta_min=1e-6, verbose=True)


import time

before = time.time()

for epoch in range(4, 4 + NUM_EPOCHS):
    train_for_one_epoch(model = model, train_loader = train_loader, dev_loader = dev_loader,  
                        optimizer = optimizer, model_name = model_prefix + str(epoch), epoch = epoch,
                        scheduler = scheduler, if_save = True)

## 5 generate submission file

In [None]:
def load_model(model_file, model, optimizer, scheduler):
    temp = torch.load("/content/gdrive/My Drive/11685deeplearning/" + model_file) 
    model.load_state_dict(temp['model_state_dict'])
    optimizer.load_state_dict(temp['optimizer_state_dict'])
    scheduler.load_state_dict(temp['scheduler_state_dict'])

    return model, optimizer, scheduler


def submit(model, optimizer, scheduler, model_name, test_file = "test.npy"):
    # load model
  # model_name = "Model_1_7"

  # model, optimizer, scheduler = 
  temp = torch.load("/content/gdrive/My Drive/11685deeplearning/" + model_name) 
  model.load_state_dict(temp['model_state_dict'])
  optimizer.load_state_dict(temp['optimizer_state_dict'])
  scheduler.load_state_dict(temp['scheduler_state_dict'])

  test = np.load(DATA_PATH + test_file, allow_pickle=True)
  test_labels = np.zeros((np.concatenate(test).shape[0], 1))

  test_dataset = hw1Dataset(test, test_labels, context_size)
  test_loader_args = dict(shuffle = False, batch_size = 1024,
                          num_workers = num_workers, pin_memory = True)\
  if cuda else dict(shuffle = False, batch_szie = 64)

  test_loader = DataLoader(test_dataset, **test_loader_args)


  model.eval()
  predicted_list = []

  with torch.no_grad():
    for idx, (data, labels) in enumerate(test_loader):
      data = data.to(device)
      labels = labels.to(device)
            
      out = model.forward(data)
      _, predicted = torch.max(out.data, 1)

      predicted_list.append(predicted.cpu().data.numpy())

  import pandas as pd
  model.train()

  submit = pd.DataFrame(enumerate(np.concatenate(predicted_list)), columns = ['id', 'label'])
  submit.to_csv(model_name + "submission.csv", index=False)

In [None]:
model_name = "Model_3_6"
submit(model, optimizer, scheduler, model_name)

# of examples: 1593253 // 1593223


# New Section

# New Section

In [None]:
## 6 Tuning