# Libraries

In [1]:
import sqlite3
import numpy as np
from collections import OrderedDict, namedtuple
from distutils.spawn import find_executable
import matplotlib.pyplot as plt

import sys
import subprocess
import pathlib
import tempfile
import importlib
import time
import random
import pickle

import math

from configparser import ConfigParser

# from google.protobuf.json_format import MessageToDict

In [2]:
# DEVICE = 0

import os
import time
import pickle
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

import torch
torch.backends.cudnn.benchmark = True
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils import data

# torch.cuda.set_device(DEVICE)
torch.cuda.empty_cache() 
torch.backends.cudnn.benchmark=True

In [3]:
np.random.seed(0)
torch.manual_seed(0)

import os
os.environ['CUDA_VISIBLE_DEVICES']='0'

# load the dataset

In [4]:
with open('scrimmage4_link_dataset.pickle', 'rb') as file:
    link_dataset = pickle.load(file)

In [5]:
train_data = link_dataset[:200] 
test_data = link_dataset[200:]

In [6]:
train_x = torch.cat(tuple(link[0] for link in train_data),dim=0)
train_y = torch.cat(tuple(link[1] for link in train_data),dim=0)

test_x = torch.cat(tuple(link[0] for link in test_data),dim=0)
test_y = torch.cat(tuple(link[1] for link in test_data),dim=0)

In [7]:
train_x = train_x.view(-1,1,20)
test_x = test_x.view(-1,1,20)

In [8]:
test_x.size()

torch.Size([2046930, 1, 20])

In [9]:
class CNN(nn.Module):

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.norm1 = nn.BatchNorm1d(num_features=16)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm2 = nn.BatchNorm1d(num_features=16)
        #self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(in_features=16*20, out_features=2)

    def forward(self, input):
        output = self.conv1(input)
        output = self.norm1(output)
        output = F.relu(output)
        output = self.conv2(output)
        output = self.norm2(output)
        output = F.relu(output)
        
        output = output.view(-1,16*20)
        #output = self.dropout(output)
        output = self.linear(output)
        
        return output

# Train Model

In [10]:
NUM_EPOCHS = 10
BATCH_SIZE = 1024

model = CNN()
model.cuda()

loss_function = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0])).cuda()
optimizer = optim.Adam(model.parameters(), 0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

train_dataloader = data.DataLoader(
    dataset=data.TensorDataset(train_x,train_y), 
    batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

model.train()
for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

    progress_training_epoch = tqdm(
        train_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Training',
        miniters=1, ncols=88, position=0,
        leave=True, total=len(train_dataloader), smoothing=.9)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)

    train_loss = 0
    train_size = 0
    for idx, (input, target) in enumerate(progress_training_epoch):
        input = input.cuda()
        target = target.cuda()
        model.zero_grad()
        predict = model(input)
        loss = loss_function(predict, target)
        loss.backward()
        optimizer.step()
        train_loss += loss * target.size()[0]
        train_size += target.size()[0]

    test_loss = 0
    test_size = 0
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            loss = loss_function(batch_predict, batch_target)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
            test_loss += loss * batch_target.size()[0]
            test_size += batch_target.size()[0]
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)

#     print(f'train loss:{train_loss.item()/train_size: .5f}, '
#           f'test loss:{test_loss.item()/test_size: .5f}, '
#           f'overall accuracy:{(predict == target).sum() / target.size}, '
#           f'positive accuracy:{predict[target==1].sum() / target.sum()}, '
#           f'negative accuracy:{(np.logical_not(target).sum() - predict[np.logical_not(target)].sum()) / np.logical_not(target).sum()}')
    
    print(f'train loss:{train_loss.item()/train_size: .5f}, '
          f'test loss:{test_loss.item()/test_size: .5f}')

Epoch 0/10, Training: 100%|████████████████████████| 4371/4371 [00:20<00:00, 215.55it/s]
Epoch 0/10, Validation: 100%|███████████████████████| 1999/1999 [00:25<00:00, 78.70it/s]
Epoch 1/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.50762, test loss: 0.59449


Epoch 1/10, Training: 100%|████████████████████████| 4371/4371 [00:17<00:00, 246.32it/s]
Epoch 1/10, Validation: 100%|███████████████████████| 1999/1999 [00:22<00:00, 87.30it/s]
Epoch 2/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.49104, test loss: 0.61055


Epoch 2/10, Training: 100%|████████████████████████| 4371/4371 [00:19<00:00, 219.09it/s]
Epoch 2/10, Validation: 100%|███████████████████████| 1999/1999 [00:24<00:00, 80.72it/s]
Epoch 3/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48624, test loss: 0.61366


Epoch 3/10, Training: 100%|████████████████████████| 4371/4371 [00:16<00:00, 261.55it/s]
Epoch 3/10, Validation: 100%|███████████████████████| 1999/1999 [00:21<00:00, 91.53it/s]
Epoch 4/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48382, test loss: 0.62279


Epoch 4/10, Training: 100%|████████████████████████| 4371/4371 [00:19<00:00, 227.65it/s]
Epoch 4/10, Validation: 100%|███████████████████████| 1999/1999 [00:23<00:00, 86.32it/s]
Epoch 5/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48211, test loss: 0.62735


Epoch 5/10, Training: 100%|████████████████████████| 4371/4371 [00:19<00:00, 228.56it/s]
Epoch 5/10, Validation: 100%|███████████████████████| 1999/1999 [00:23<00:00, 83.33it/s]
Epoch 6/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48079, test loss: 0.62490


Epoch 6/10, Training: 100%|████████████████████████| 4371/4371 [00:16<00:00, 270.28it/s]
Epoch 6/10, Validation: 100%|███████████████████████| 1999/1999 [00:20<00:00, 95.73it/s]
Epoch 7/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.47983, test loss: 0.62369


Epoch 7/10, Training: 100%|████████████████████████| 4371/4371 [00:19<00:00, 220.87it/s]
Epoch 7/10, Validation: 100%|███████████████████████| 1999/1999 [00:24<00:00, 80.24it/s]
Epoch 8/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.47899, test loss: 0.63472


Epoch 8/10, Training: 100%|████████████████████████| 4371/4371 [00:15<00:00, 277.05it/s]
Epoch 8/10, Validation: 100%|███████████████████████| 1999/1999 [00:20<00:00, 96.14it/s]
Epoch 9/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.47816, test loss: 0.63244


Epoch 9/10, Training: 100%|████████████████████████| 4371/4371 [00:19<00:00, 223.76it/s]
Epoch 9/10, Validation: 100%|███████████████████████| 1999/1999 [00:23<00:00, 84.54it/s]

train loss: 0.47763, test loss: 0.64689





In [11]:
test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

predict = []
target = []
model.eval()
with torch.no_grad():
    for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
        batch_input = batch_input.cuda()
        batch_target = batch_target.cuda()
        batch_predict = model(batch_input)
        predict.append(batch_predict.argmax(dim=1).cpu().numpy())
        target.append(batch_target.cpu().numpy())        
predict = np.concatenate(predict, axis=0)
target = np.concatenate(target, axis=0)

tp = predict[target==1].sum()
tn = (target==0).sum() - predict[target==0].sum()
fp = predict[target==0].sum()
fn = (target==1).sum() - predict[target==1].sum()

print(f'True Positive:{tp}, '
      f'True Negative:{tn}, '
      f'False Positive:{fp}, '
      f'False Negative:{fn}')

True Positive:1022093, True Negative:350471, False Positive:340947, False Negative:333419


In [12]:
accr = (tp+tn)/(tp+tn+fp+fn)
posaccr = tp/(tp+fp)
negaccr = tn/(tn+fn)
print(f'Accuracy:{accr}, '
      f'Positive Accuracy:{posaccr}, '
      f'Negative Accuracy:{negaccr}, '
      f'Weighted Accuracy:{0.5*(posaccr+negaccr)}')

Accuracy:0.6705476005530233, Positive Accuracy:0.7498628066674492, Negative Accuracy:0.5124669171942856, Weighted Accuracy:0.6311648619308674


# Train Model 1 Epoch

In [13]:
NUM_EPOCHS = 1
BATCH_SIZE = 1024

model = CNN()
model.cuda()

loss_function = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0])).cuda()
optimizer = optim.Adam(model.parameters(), 0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

train_dataloader = data.DataLoader(
    dataset=data.TensorDataset(train_x,train_y), 
    batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

model.train()
for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

    progress_training_epoch = tqdm(
        train_dataloader, 
        desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Training',
        miniters=1, ncols=88, position=0,
        leave=True, total=len(train_dataloader), smoothing=.9)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)

    train_loss = 0
    train_size = 0
    for idx, (input, target) in enumerate(progress_training_epoch):
        input = input.cuda()
        target = target.cuda()
        model.zero_grad()
        predict = model(input)
        loss = loss_function(predict, target)
        loss.backward()
        optimizer.step()
        train_loss += loss * target.size()[0]
        train_size += target.size()[0]

    test_loss = 0
    test_size = 0
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            loss = loss_function(batch_predict, batch_target)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
            test_loss += loss * batch_target.size()[0]
            test_size += batch_target.size()[0]
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)

#     print(f'train loss:{train_loss.item()/train_size: .5f}, '
#           f'test loss:{test_loss.item()/test_size: .5f}, '
#           f'overall accuracy:{(predict == target).sum() / target.size}, '
#           f'positive accuracy:{predict[target==1].sum() / target.sum()}, '
#           f'negative accuracy:{(np.logical_not(target).sum() - predict[np.logical_not(target)].sum()) / np.logical_not(target).sum()}')
    
    print(f'train loss:{train_loss.item()/train_size: .5f}, '
          f'test loss:{test_loss.item()/test_size: .5f}')

Epoch 1/1, Training: 100%|█████████████████████████| 4371/4371 [00:19<00:00, 223.55it/s]
Epoch 1/1, Validation: 100%|████████████████████████| 1999/1999 [00:24<00:00, 80.96it/s]

train loss: 0.50861, test loss: 0.59892





# Test

In [14]:
test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

predict = []
target = []
model.eval()
with torch.no_grad():
    for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
        batch_input = batch_input.cuda()
        batch_target = batch_target.cuda()
        batch_predict = model(batch_input)
        predict.append(batch_predict.argmax(dim=1).cpu().numpy())
        target.append(batch_target.cpu().numpy())        
predict = np.concatenate(predict, axis=0)
target = np.concatenate(target, axis=0)

tp = predict[target==1].sum()
tn = (target==0).sum() - predict[target==0].sum()
fp = predict[target==0].sum()
fn = (target==1).sum() - predict[target==1].sum()

print(f'True Positive:{tp}, '
      f'True Negative:{tn}, '
      f'False Positive:{fp}, '
      f'False Negative:{fn}')

True Positive:1166201, True Negative:276413, False Positive:415005, False Negative:189311


In [15]:
accr = (tp+tn)/(tp+tn+fp+fn)
posaccr = tp/(tp+fp)
negaccr = tn/(tn+fn)
print(f'Accuracy:{accr}, '
      f'Positive Accuracy:{posaccr}, '
      f'Negative Accuracy:{negaccr}, '
      f'Weighted Accuracy:{0.5*(posaccr+negaccr)}')

Accuracy:0.7047695817639098, Positive Accuracy:0.7375389417950602, Negative Accuracy:0.5935124666111259, Weighted Accuracy:0.6655257042030931


# Bagging 9

In [16]:
with open('scrimmage4_link_dataset.pickle', 'rb') as file:
    link_dataset = pickle.load(file)

In [17]:
NUM_EPOCHS = 1
model_list = []
BATCH_SIZE = 1024

for model_idx in range(9):
    
    train_data = []
    for link_idx in np.random.choice(200,size=200,replace=True):
        train_data.append(link_dataset[link_idx])
    test_data = link_dataset[200:]
    
    train_x = torch.cat(tuple(link[0] for link in train_data),dim=0)
    train_y = torch.cat(tuple(link[1] for link in train_data),dim=0)
    test_x = torch.cat(tuple(link[0] for link in test_data),dim=0)
    test_y = torch.cat(tuple(link[1] for link in test_data),dim=0)
    
    train_x = train_x.view(-1,1,20)
    test_x = test_x.view(-1,1,20)

    model = CNN()
    model.cuda()

    loss_function = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0])).cuda()
    optimizer = optim.Adam(model.parameters(), 0.001)

    train_dataloader = data.DataLoader(
        dataset=data.TensorDataset(train_x,train_y), 
        batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

    test_dataloader = data.DataLoader(
        data.TensorDataset(test_x,test_y), 
        batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

    model.train()
    for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

        progress_training_epoch = tqdm(
            train_dataloader, 
            desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Training {model_idx+1}/9',
            miniters=1, ncols=88, position=0,
            leave=True, total=len(train_dataloader), smoothing=.9)

        progress_validation_epoch = tqdm(
            test_dataloader, 
            desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Validation {model_idx+1}/9',
            miniters=1, ncols=88, position=0, 
            leave=True, total=len(test_dataloader), smoothing=.9)

        train_loss = 0
        train_size = 0
        for idx, (input, target) in enumerate(progress_training_epoch):
            input = input.cuda()
            target = target.cuda()
            model.zero_grad()
            predict = model(input)
            loss = loss_function(predict, target)
            loss.backward()
            optimizer.step()
            train_loss += loss * target.size()[0]
            train_size += target.size()[0]

        test_loss = 0
        test_size = 0
        model.eval()
        with torch.no_grad():
            for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
                batch_input = batch_input.cuda()
                batch_target = batch_target.cuda()
                batch_predict = model(batch_input)
                loss = loss_function(batch_predict, batch_target)
                test_loss += loss * batch_target.size()[0]
                test_size += batch_target.size()[0]

        print(f'train loss:{train_loss.item()/train_size: .5f}, '
              f'test loss:{test_loss.item()/test_size: .5f}')
        
    model_list.append(model) 

Epoch 1/1, Training 1/9: 100%|█████████████████████| 4473/4473 [00:18<00:00, 236.83it/s]
Epoch 1/1, Validation 1/9: 100%|████████████████████| 1999/1999 [00:23<00:00, 85.71it/s]


train loss: 0.47943, test loss: 0.63614


Epoch 1/1, Training 2/9: 100%|█████████████████████| 4415/4415 [00:18<00:00, 241.22it/s]
Epoch 1/1, Validation 2/9: 100%|████████████████████| 1999/1999 [00:22<00:00, 89.96it/s]


train loss: 0.49478, test loss: 0.64759


Epoch 1/1, Training 3/9: 100%|█████████████████████| 4283/4283 [00:19<00:00, 216.67it/s]
Epoch 1/1, Validation 3/9: 100%|████████████████████| 1999/1999 [00:24<00:00, 82.36it/s]


train loss: 0.49840, test loss: 0.62908


Epoch 1/1, Training 4/9: 100%|█████████████████████| 3929/3929 [00:14<00:00, 273.56it/s]
Epoch 1/1, Validation 4/9: 100%|███████████████████| 1999/1999 [00:19<00:00, 104.63it/s]


train loss: 0.46932, test loss: 0.62165


Epoch 1/1, Training 5/9: 100%|█████████████████████| 4327/4327 [00:19<00:00, 220.94it/s]
Epoch 1/1, Validation 5/9: 100%|████████████████████| 1999/1999 [00:23<00:00, 85.34it/s]


train loss: 0.49449, test loss: 0.63093


Epoch 1/1, Training 6/9: 100%|█████████████████████| 4549/4549 [00:15<00:00, 294.77it/s]
Epoch 1/1, Validation 6/9: 100%|████████████████████| 1999/1999 [00:20<00:00, 99.12it/s]


train loss: 0.47523, test loss: 0.60851


Epoch 1/1, Training 7/9: 100%|█████████████████████| 4049/4049 [00:18<00:00, 216.40it/s]
Epoch 1/1, Validation 7/9: 100%|████████████████████| 1999/1999 [00:23<00:00, 85.97it/s]


train loss: 0.47842, test loss: 0.65057


Epoch 1/1, Training 8/9: 100%|█████████████████████| 4322/4322 [00:15<00:00, 279.28it/s]
Epoch 1/1, Validation 8/9: 100%|███████████████████| 1999/1999 [00:19<00:00, 104.26it/s]


train loss: 0.48891, test loss: 0.66183


Epoch 1/1, Training 9/9: 100%|█████████████████████| 4186/4186 [00:17<00:00, 235.28it/s]
Epoch 1/1, Validation 9/9: 100%|████████████████████| 1999/1999 [00:21<00:00, 92.70it/s]

train loss: 0.48823, test loss: 0.66254





# Test

In [18]:
predict_agg = []
target_agg = []

for model in model_list:
    
    test_dataloader = data.DataLoader(
        data.TensorDataset(test_x,test_y), 
        batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx+1}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)
    
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)
    
    predict_agg.append(predict)
    target_agg.append(target)

predict = (np.array(predict_agg).sum(axis=0) > 4.5)
target = target_agg[0]
tp = predict[target==1].sum()
tn = (target==0).sum() - predict[target==0].sum()
fp = predict[target==0].sum()
fn = (target==1).sum() - predict[target==1].sum()

print(f'True Positive:{tp}, '
      f'True Negative:{tn}, '
      f'False Positive:{fp}, '
      f'False Negative:{fn}')

Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 480.76it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 460.92it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:05<00:00, 395.60it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 422.22it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 527.99it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 551.61it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 543.08it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 422.49it/s]
Epoch 1/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 405.54it/s]


True Positive:1176164, True Negative:285775, False Positive:405643, False Negative:179348


In [19]:
accr = (tp+tn)/(tp+tn+fp+fn)
posaccr = tp/(tp+fp)
negaccr = tn/(tn+fn)
print(f'Accuracy:{accr}, '
      f'Positive Accuracy:{posaccr}, '
      f'Negative Accuracy:{negaccr}, '
      f'Weighted Accuracy:{0.5*(posaccr+negaccr)}')

Accuracy:0.7142105494569917, Positive Accuracy:0.7435572102032675, Negative Accuracy:0.6144073718134773, Weighted Accuracy:0.6789822910083724
