# Libraries

In [1]:
import sqlite3
import numpy as np
from collections import OrderedDict, namedtuple
from distutils.spawn import find_executable
import matplotlib.pyplot as plt

import sys
import subprocess
import pathlib
import tempfile
import importlib
import time
import random
import pickle

import math

from configparser import ConfigParser

# from google.protobuf.json_format import MessageToDict

In [2]:
# DEVICE = 0

import os
import time
import pickle
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

import torch
torch.backends.cudnn.benchmark = True
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils import data

# torch.cuda.set_device(DEVICE)
torch.cuda.empty_cache() 
torch.backends.cudnn.benchmark=True

In [3]:
np.random.seed(0)
torch.manual_seed(0)

import os
os.environ['CUDA_VISIBLE_DEVICES']='0'

# load the dataset

In [4]:
with open('scrimmage5_link_dataset.pickle', 'rb') as file:
    link_dataset = pickle.load(file)

In [5]:
train_data = link_dataset[:400] 
test_data = link_dataset[400:]

In [6]:
train_x = torch.cat(tuple(link[0] for link in train_data),dim=0)
train_y = torch.cat(tuple(link[1] for link in train_data),dim=0)

test_x = torch.cat(tuple(link[0] for link in test_data),dim=0)
test_y = torch.cat(tuple(link[1] for link in test_data),dim=0)

In [7]:
train_x = train_x.view(-1,1,20)
test_x = test_x.view(-1,1,20)

In [8]:
test_x.size()

torch.Size([4062650, 1, 20])

In [9]:
class CNN(nn.Module):

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.norm1 = nn.BatchNorm1d(num_features=16)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm2 = nn.BatchNorm1d(num_features=16)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(in_features=16*20, out_features=2)

    def forward(self, input):
        output = self.conv1(input)
        output = self.norm1(output)
        output = F.relu(output)
        output = self.conv2(output)
        output = self.norm2(output)
        output = F.relu(output)
        
        output = output.view(-1,16*20)
        output = self.dropout(output)
        output = self.linear(output)
        
        return output

# Train Model

In [10]:
NUM_EPOCHS = 10
BATCH_SIZE = 1024

model = CNN()
model.cuda()

loss_function = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0])).cuda()
optimizer = optim.Adam(model.parameters(), 0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

train_dataloader = data.DataLoader(
    dataset=data.TensorDataset(train_x,train_y), 
    batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

model.train()
for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

    progress_training_epoch = tqdm(
        train_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Training',
        miniters=1, ncols=88, position=0,
        leave=True, total=len(train_dataloader), smoothing=.9)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)

    train_loss = 0
    train_size = 0
    for idx, (input, target) in enumerate(progress_training_epoch):
        input = input.cuda()
        target = target.cuda()
        model.zero_grad()
        predict = model(input)
        loss = loss_function(predict, target)
        loss.backward()
        optimizer.step()
        train_loss += loss * target.size()[0]
        train_size += target.size()[0]

    test_loss = 0
    test_size = 0
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            loss = loss_function(batch_predict, batch_target)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
            test_loss += loss * batch_target.size()[0]
            test_size += batch_target.size()[0]
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)

#     print(f'train loss:{train_loss.item()/train_size: .5f}, '
#           f'test loss:{test_loss.item()/test_size: .5f}, '
#           f'overall accuracy:{(predict == target).sum() / target.size}, '
#           f'positive accuracy:{predict[target==1].sum() / target.sum()}, '
#           f'negative accuracy:{(np.logical_not(target).sum() - predict[np.logical_not(target)].sum()) / np.logical_not(target).sum()}')
    
    print(f'train loss:{train_loss.item()/train_size: .5f}, '
          f'test loss:{test_loss.item()/test_size: .5f}')

Epoch 0/10, Training: 100%|████████████████████████| 7429/7429 [00:31<00:00, 239.48it/s]
Epoch 0/10, Validation: 100%|██████████████████████| 3968/3968 [00:39<00:00, 101.69it/s]
Epoch 1/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.49091, test loss: 0.55850


Epoch 1/10, Training: 100%|████████████████████████| 7429/7429 [00:29<00:00, 248.59it/s]
Epoch 1/10, Validation: 100%|██████████████████████| 3968/3968 [00:38<00:00, 103.03it/s]
Epoch 2/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.47679, test loss: 0.57277


Epoch 2/10, Training: 100%|████████████████████████| 7429/7429 [00:30<00:00, 244.57it/s]
Epoch 2/10, Validation: 100%|██████████████████████| 3968/3968 [00:37<00:00, 105.84it/s]
Epoch 3/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.47290, test loss: 0.57310


Epoch 3/10, Training: 100%|████████████████████████| 7429/7429 [00:30<00:00, 241.64it/s]
Epoch 3/10, Validation: 100%|██████████████████████| 3968/3968 [00:39<00:00, 100.09it/s]
Epoch 4/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.47110, test loss: 0.57295


Epoch 4/10, Training: 100%|████████████████████████| 7429/7429 [00:29<00:00, 248.04it/s]
Epoch 4/10, Validation: 100%|██████████████████████| 3968/3968 [00:38<00:00, 102.24it/s]
Epoch 5/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.46975, test loss: 0.57369


Epoch 5/10, Training: 100%|████████████████████████| 7429/7429 [00:30<00:00, 241.58it/s]
Epoch 5/10, Validation: 100%|██████████████████████| 3968/3968 [00:37<00:00, 104.84it/s]
Epoch 6/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.46848, test loss: 0.57554


Epoch 6/10, Training: 100%|████████████████████████| 7429/7429 [00:29<00:00, 251.32it/s]
Epoch 6/10, Validation: 100%|██████████████████████| 3968/3968 [00:36<00:00, 107.30it/s]
Epoch 7/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.46755, test loss: 0.57553


Epoch 7/10, Training: 100%|████████████████████████| 7429/7429 [00:30<00:00, 242.42it/s]
Epoch 7/10, Validation: 100%|██████████████████████| 3968/3968 [00:38<00:00, 102.81it/s]
Epoch 8/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.46676, test loss: 0.56907


Epoch 8/10, Training: 100%|████████████████████████| 7429/7429 [00:30<00:00, 244.57it/s]
Epoch 8/10, Validation: 100%|██████████████████████| 3968/3968 [00:39<00:00, 101.40it/s]
Epoch 9/10, Validation:   0%|                                  | 0/3968 [00:00<?, ?it/s]

train loss: 0.46615, test loss: 0.57526


Epoch 9/10, Training: 100%|████████████████████████| 7429/7429 [00:29<00:00, 252.14it/s]
Epoch 9/10, Validation: 100%|██████████████████████| 3968/3968 [00:37<00:00, 106.68it/s]


train loss: 0.46554, test loss: 0.57607


In [11]:
test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

predict = []
target = []
model.eval()
with torch.no_grad():
    for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
        batch_input = batch_input.cuda()
        batch_target = batch_target.cuda()
        batch_predict = model(batch_input)
        predict.append(batch_predict.argmax(dim=1).cpu().numpy())
        target.append(batch_target.cpu().numpy())        
predict = np.concatenate(predict, axis=0)
target = np.concatenate(target, axis=0)

tp = predict[target==1].sum()
tn = (target==0).sum() - predict[target==0].sum()
fp = predict[target==0].sum()
fn = (target==1).sum() - predict[target==1].sum()

print(f'True Positive:{tp}, '
      f'True Negative:{tn}, '
      f'False Positive:{fp}, '
      f'False Negative:{fn}')

True Positive:2530304, True Negative:415449, False Positive:705395, False Negative:411502


In [12]:
accr = (tp+tn)/(tp+tn+fp+fn)
posaccr = tp/(tp+fp)
negaccr = tn/(tn+fn)
print(f'Accuracy:{accr}, '
      f'Positive Accuracy:{posaccr}, '
      f'Negative Accuracy:{negaccr}, '
      f'Weighted Accuracy:{0.5*(posaccr+negaccr)}')

Accuracy:0.7250816585233776, Positive Accuracy:0.781996100378929, Negative Accuracy:0.5023864775542928, Weighted Accuracy:0.6421912889666108


# Train Model 1 Epoch

In [13]:
NUM_EPOCHS = 1
BATCH_SIZE = 1024

model = CNN()
model.cuda()

loss_function = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0])).cuda()
optimizer = optim.Adam(model.parameters(), 0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

train_dataloader = data.DataLoader(
    dataset=data.TensorDataset(train_x,train_y), 
    batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

model.train()
for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

    progress_training_epoch = tqdm(
        train_dataloader, 
        desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Training',
        miniters=1, ncols=88, position=0,
        leave=True, total=len(train_dataloader), smoothing=.9)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)

    train_loss = 0
    train_size = 0
    for idx, (input, target) in enumerate(progress_training_epoch):
        input = input.cuda()
        target = target.cuda()
        model.zero_grad()
        predict = model(input)
        loss = loss_function(predict, target)
        loss.backward()
        optimizer.step()
        train_loss += loss * target.size()[0]
        train_size += target.size()[0]

    test_loss = 0
    test_size = 0
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            loss = loss_function(batch_predict, batch_target)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
            test_loss += loss * batch_target.size()[0]
            test_size += batch_target.size()[0]
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)

#     print(f'train loss:{train_loss.item()/train_size: .5f}, '
#           f'test loss:{test_loss.item()/test_size: .5f}, '
#           f'overall accuracy:{(predict == target).sum() / target.size}, '
#           f'positive accuracy:{predict[target==1].sum() / target.sum()}, '
#           f'negative accuracy:{(np.logical_not(target).sum() - predict[np.logical_not(target)].sum()) / np.logical_not(target).sum()}')
    
    print(f'train loss:{train_loss.item()/train_size: .5f}, '
          f'test loss:{test_loss.item()/test_size: .5f}')

Epoch 1/1, Training: 100%|█████████████████████████| 7429/7429 [00:29<00:00, 252.09it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 3968/3968 [00:37<00:00, 105.54it/s]

train loss: 0.49158, test loss: 0.54938





# Test

In [14]:
test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

predict = []
target = []
model.eval()
with torch.no_grad():
    for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
        batch_input = batch_input.cuda()
        batch_target = batch_target.cuda()
        batch_predict = model(batch_input)
        predict.append(batch_predict.argmax(dim=1).cpu().numpy())
        target.append(batch_target.cpu().numpy())        
predict = np.concatenate(predict, axis=0)
target = np.concatenate(target, axis=0)

tp = predict[target==1].sum()
tn = (target==0).sum() - predict[target==0].sum()
fp = predict[target==0].sum()
fn = (target==1).sum() - predict[target==1].sum()

print(f'True Positive:{tp}, '
      f'True Negative:{tn}, '
      f'False Positive:{fp}, '
      f'False Negative:{fn}')

True Positive:2644069, True Negative:362610, False Positive:758234, False Negative:297737


In [15]:
accr = (tp+tn)/(tp+tn+fp+fn)
posaccr = tp/(tp+fp)
negaccr = tn/(tn+fn)
print(f'Accuracy:{accr}, '
      f'Positive Accuracy:{posaccr}, '
      f'Negative Accuracy:{negaccr}, '
      f'Weighted Accuracy:{0.5*(posaccr+negaccr)}')

Accuracy:0.7400782740329588, Positive Accuracy:0.77714095423012, Negative Accuracy:0.5491203867057775, Weighted Accuracy:0.6631306704679487


# Bagging 9

In [16]:
with open('scrimmage5_link_dataset.pickle', 'rb') as file:
    link_dataset = pickle.load(file)

In [17]:
NUM_EPOCHS = 1
model_list = []
BATCH_SIZE = 1024

for model_idx in range(9):
    
    train_data = []
    for link_idx in np.random.choice(200,size=200,replace=True):
        train_data.append(link_dataset[link_idx])
    test_data = link_dataset[200:]
    
    train_x = torch.cat(tuple(link[0] for link in train_data),dim=0)
    train_y = torch.cat(tuple(link[1] for link in train_data),dim=0)
    test_x = torch.cat(tuple(link[0] for link in test_data),dim=0)
    test_y = torch.cat(tuple(link[1] for link in test_data),dim=0)
    
    train_x = train_x.view(-1,1,20)
    test_x = test_x.view(-1,1,20)

    model = CNN()
    model.cuda()

    loss_function = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0])).cuda()
    optimizer = optim.Adam(model.parameters(), 0.001)

    train_dataloader = data.DataLoader(
        dataset=data.TensorDataset(train_x,train_y), 
        batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

    test_dataloader = data.DataLoader(
        data.TensorDataset(test_x,test_y), 
        batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

    model.train()
    for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

        progress_training_epoch = tqdm(
            train_dataloader, 
            desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Training {model_idx+1}/9',
            miniters=1, ncols=88, position=0,
            leave=True, total=len(train_dataloader), smoothing=.9)

        progress_validation_epoch = tqdm(
            test_dataloader, 
            desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Validation {model_idx+1}/9',
            miniters=1, ncols=88, position=0, 
            leave=True, total=len(test_dataloader), smoothing=.9)

        train_loss = 0
        train_size = 0
        for idx, (input, target) in enumerate(progress_training_epoch):
            input = input.cuda()
            target = target.cuda()
            model.zero_grad()
            predict = model(input)
            loss = loss_function(predict, target)
            loss.backward()
            optimizer.step()
            train_loss += loss * target.size()[0]
            train_size += target.size()[0]

        test_loss = 0
        test_size = 0
        model.eval()
        with torch.no_grad():
            for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
                batch_input = batch_input.cuda()
                batch_target = batch_target.cuda()
                batch_predict = model(batch_input)
                loss = loss_function(batch_predict, batch_target)
                test_loss += loss * batch_target.size()[0]
                test_size += batch_target.size()[0]

        print(f'train loss:{train_loss.item()/train_size: .5f}, '
              f'test loss:{test_loss.item()/test_size: .5f}')
        
    model_list.append(model) 

Epoch 1/1, Training 1/9: 100%|█████████████████████| 3472/3472 [00:15<00:00, 221.93it/s]
Epoch 1/1, Validation 1/9: 100%|███████████████████| 7810/7810 [00:28<00:00, 276.14it/s]


train loss: 0.44386, test loss: 0.59017


Epoch 1/1, Training 2/9: 100%|█████████████████████| 3482/3482 [00:12<00:00, 271.26it/s]
Epoch 1/1, Validation 2/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 330.68it/s]


train loss: 0.42794, test loss: 0.60837


Epoch 1/1, Training 3/9: 100%|█████████████████████| 3408/3408 [00:11<00:00, 287.53it/s]
Epoch 1/1, Validation 3/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 335.33it/s]


train loss: 0.44121, test loss: 0.59545


Epoch 1/1, Training 4/9: 100%|█████████████████████| 3643/3643 [00:12<00:00, 285.94it/s]
Epoch 1/1, Validation 4/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 331.46it/s]


train loss: 0.45654, test loss: 0.60347


Epoch 1/1, Training 5/9: 100%|█████████████████████| 3495/3495 [00:12<00:00, 270.28it/s]
Epoch 1/1, Validation 5/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 330.64it/s]


train loss: 0.43663, test loss: 0.58770


Epoch 1/1, Training 6/9: 100%|█████████████████████| 3657/3657 [00:13<00:00, 277.34it/s]
Epoch 1/1, Validation 6/9: 100%|███████████████████| 7810/7810 [00:24<00:00, 321.65it/s]


train loss: 0.48513, test loss: 0.60313


Epoch 1/1, Training 7/9: 100%|█████████████████████| 3698/3698 [00:12<00:00, 291.84it/s]
Epoch 1/1, Validation 7/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 335.42it/s]


train loss: 0.43017, test loss: 0.59256


Epoch 1/1, Training 8/9: 100%|█████████████████████| 3623/3623 [00:12<00:00, 285.59it/s]
Epoch 1/1, Validation 8/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 333.16it/s]


train loss: 0.46081, test loss: 0.60942


Epoch 1/1, Training 9/9: 100%|█████████████████████| 3669/3669 [00:13<00:00, 281.28it/s]
Epoch 1/1, Validation 9/9: 100%|███████████████████| 7810/7810 [00:23<00:00, 329.13it/s]

train loss: 0.46036, test loss: 0.61038





# Test

In [18]:
predict_agg = []
target_agg = []

for model in model_list:
    
    test_dataloader = data.DataLoader(
        data.TensorDataset(test_x,test_y), 
        batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)
    
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)
    
    predict_agg.append(predict)
    target_agg.append(target)

predict = (np.array(predict_agg).sum(axis=0) > 4.5)
target = target_agg[0]
tp = predict[target==1].sum()
tn = (target==0).sum() - predict[target==0].sum()
fp = predict[target==0].sum()
fn = (target==1).sum() - predict[target==1].sum()

print(f'True Positive:{tp}, '
      f'True Negative:{tn}, '
      f'False Positive:{fp}, '
      f'False Negative:{fn}')

Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:11<00:00, 661.65it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:10<00:00, 712.70it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:10<00:00, 713.86it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:10<00:00, 727.32it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:11<00:00, 679.15it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:10<00:00, 717.53it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:10<00:00, 717.23it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:11<00:00, 663.92it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 7810/7810 [00:10<00:00, 712.29it/s]


True Positive:5384505, True Negative:511544, False Positive:1791754, False Negative:309135


In [19]:
accr = (tp+tn)/(tp+tn+fp+fn)
posaccr = tp/(tp+fp)
negaccr = tn/(tn+fn)
print(f'Accuracy:{accr}, '
      f'Positive Accuracy:{posaccr}, '
      f'Negative Accuracy:{negaccr}, '
      f'Weighted Accuracy:{0.5*(posaccr+negaccr)}')

Accuracy:0.7372883221052858, Positive Accuracy:0.7503219992477975, Negative Accuracy:0.6233180086245658, Weighted Accuracy:0.6868200039361816
