# Libraries

In [1]:
import sqlite3
import numpy as np
from collections import OrderedDict, namedtuple
from distutils.spawn import find_executable
import matplotlib.pyplot as plt

import sys
import subprocess
import pathlib
import tempfile
import importlib
import time
import random
import pickle

import math

from configparser import ConfigParser

# from google.protobuf.json_format import MessageToDict

In [2]:
# DEVICE = 0

import os
import time
import pickle
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

import torch
torch.backends.cudnn.benchmark = True
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils import data

# torch.cuda.set_device(DEVICE)
torch.cuda.empty_cache() 
torch.backends.cudnn.benchmark=True

In [3]:
np.random.seed(0)
torch.manual_seed(0)

import os
os.environ['CUDA_VISIBLE_DEVICES']='1'

# load the dataset

In [4]:
with open('scrimmage4_link_dataset.pickle', 'rb') as file:
    link_dataset = pickle.load(file)

In [5]:
train_data = link_dataset[:200] 
test_data = link_dataset[200:]

In [6]:
train_x = torch.cat(tuple(link[0] for link in train_data),dim=0)
train_y = torch.cat(tuple(link[1] for link in train_data),dim=0)

test_x = torch.cat(tuple(link[0] for link in test_data),dim=0)
test_y = torch.cat(tuple(link[1] for link in test_data),dim=0)

In [7]:
train_x = train_x.view(-1,1,20)
test_x = test_x.view(-1,1,20)

In [8]:
test_x.size()

torch.Size([2046930, 1, 20])

In [9]:
class CNN(nn.Module):

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.norm1 = nn.BatchNorm1d(num_features=16)
        
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm2 = nn.BatchNorm1d(num_features=16)
        
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm3 = nn.BatchNorm1d(num_features=16)
        
        self.conv4 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.norm4 = nn.BatchNorm1d(num_features=16)
        
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(in_features=16*20, out_features=2)

    def forward(self, input):
        output = self.conv1(input)
        output = self.norm1(output)
        output = F.relu(output)
        
        output = self.conv2(output)
        output = self.norm2(output)
        output = F.relu(output)
        
        output = self.conv3(output)
        output = self.norm3(output)
        output = F.relu(output)
        
        output = self.conv4(output)
        output = self.norm4(output)
        output = F.relu(output)
        
        output = output.view(-1,16*20)
        output = self.dropout(output)
        output = self.linear(output)
        
        return output

# Train Model

In [10]:
NUM_EPOCHS = 10
BATCH_SIZE = 1024

model = CNN()
model.cuda()

loss_function = nn.CrossEntropyLoss(weight=torch.tensor([2.0,1.0])).cuda()
optimizer = optim.Adam(model.parameters(), 0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

train_dataloader = data.DataLoader(
    dataset=data.TensorDataset(train_x,train_y), 
    batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

model.train()
for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

    progress_training_epoch = tqdm(
        train_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Training',
        miniters=1, ncols=88, position=0,
        leave=True, total=len(train_dataloader), smoothing=.9)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)

    train_loss = 0
    train_size = 0
    for idx, (input, target) in enumerate(progress_training_epoch):
        input = input.cuda()
        target = target.cuda()
        model.zero_grad()
        predict = model(input)
        loss = loss_function(predict, target)
        loss.backward()
        optimizer.step()
        train_loss += loss * target.size()[0]
        train_size += target.size()[0]

    test_loss = 0
    test_size = 0
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            loss = loss_function(batch_predict, batch_target)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
            test_loss += loss * batch_target.size()[0]
            test_size += batch_target.size()[0]
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)

#     print(f'train loss:{train_loss.item()/train_size: .5f}, '
#           f'test loss:{test_loss.item()/test_size: .5f}, '
#           f'overall accuracy:{(predict == target).sum() / target.size}, '
#           f'positive accuracy:{predict[target==1].sum() / target.sum()}, '
#           f'negative accuracy:{(np.logical_not(target).sum() - predict[np.logical_not(target)].sum()) / np.logical_not(target).sum()}')
    
    print(f'train loss:{train_loss.item()/train_size: .5f}, '
          f'test loss:{test_loss.item()/test_size: .5f}')

Epoch 0/10, Training: 100%|████████████████████████| 4371/4371 [00:28<00:00, 150.88it/s]
Epoch 0/10, Validation: 100%|███████████████████████| 1999/1999 [00:34<00:00, 57.65it/s]
Epoch 1/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.55199, test loss: 0.63967


Epoch 1/10, Training: 100%|████████████████████████| 4371/4371 [00:31<00:00, 140.07it/s]
Epoch 1/10, Validation: 100%|███████████████████████| 1999/1999 [00:35<00:00, 56.48it/s]
Epoch 2/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.50265, test loss: 0.70091


Epoch 2/10, Training: 100%|████████████████████████| 4371/4371 [00:32<00:00, 134.97it/s]
Epoch 2/10, Validation: 100%|███████████████████████| 1999/1999 [00:38<00:00, 52.34it/s]
Epoch 3/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.49136, test loss: 0.74268


Epoch 3/10, Training: 100%|████████████████████████| 4371/4371 [00:27<00:00, 161.76it/s]
Epoch 3/10, Validation: 100%|███████████████████████| 1999/1999 [00:32<00:00, 61.25it/s]
Epoch 4/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48709, test loss: 0.74447


Epoch 4/10, Training: 100%|████████████████████████| 4371/4371 [00:33<00:00, 131.81it/s]
Epoch 4/10, Validation: 100%|███████████████████████| 1999/1999 [00:39<00:00, 50.96it/s]
Epoch 5/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48454, test loss: 0.77220


Epoch 5/10, Training: 100%|████████████████████████| 4371/4371 [00:26<00:00, 166.13it/s]
Epoch 5/10, Validation: 100%|███████████████████████| 1999/1999 [00:32<00:00, 61.51it/s]
Epoch 6/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48264, test loss: 0.76211


Epoch 6/10, Training: 100%|████████████████████████| 4371/4371 [00:33<00:00, 132.44it/s]
Epoch 6/10, Validation: 100%|███████████████████████| 1999/1999 [00:37<00:00, 53.46it/s]
Epoch 7/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48117, test loss: 0.76748


Epoch 7/10, Training: 100%|████████████████████████| 4371/4371 [00:31<00:00, 140.88it/s]
Epoch 7/10, Validation: 100%|███████████████████████| 1999/1999 [00:37<00:00, 53.74it/s]
Epoch 8/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.48011, test loss: 0.76564


Epoch 8/10, Training: 100%|████████████████████████| 4371/4371 [00:28<00:00, 154.59it/s]
Epoch 8/10, Validation: 100%|███████████████████████| 1999/1999 [00:33<00:00, 60.50it/s]
Epoch 9/10, Validation:   0%|                                  | 0/1999 [00:00<?, ?it/s]

train loss: 0.47900, test loss: 0.76441


Epoch 9/10, Training: 100%|████████████████████████| 4371/4371 [00:33<00:00, 131.91it/s]
Epoch 9/10, Validation: 100%|███████████████████████| 1999/1999 [00:39<00:00, 51.08it/s]


train loss: 0.47827, test loss: 0.77150


In [11]:
test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

predict = []
target = []
model.eval()
with torch.no_grad():
    for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
        batch_input = batch_input.cuda()
        batch_target = batch_target.cuda()
        batch_predict = model(batch_input)
        predict.append(batch_predict.argmax(dim=1).cpu().numpy())
        target.append(batch_target.cpu().numpy())        
predict = np.concatenate(predict, axis=0)
target = np.concatenate(target, axis=0)

print(f'True Positive:{predict[target==1].sum()}, '
      f'True Negative:{(target==0).sum() - predict[target==0].sum()}, '
      f'False Positive:{predict[target==0].sum()}, '
      f'False Negative:{(target==1).sum() - predict[target==1].sum()}')


True Positive:726382, True Negative:426084, False Positive:265334, False Negative:629130


# Train Model 1 Epoch

In [12]:
NUM_EPOCHS = 1
BATCH_SIZE = 1024

model = CNN()
model.cuda()

loss_function = nn.CrossEntropyLoss(weight=torch.tensor([2.0,1.0])).cuda()
optimizer = optim.Adam(model.parameters(), 0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

train_dataloader = data.DataLoader(
    dataset=data.TensorDataset(train_x,train_y), 
    batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

model.train()
for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

    progress_training_epoch = tqdm(
        train_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Training',
        miniters=1, ncols=88, position=0,
        leave=True, total=len(train_dataloader), smoothing=.9)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)

    train_loss = 0
    train_size = 0
    for idx, (input, target) in enumerate(progress_training_epoch):
        input = input.cuda()
        target = target.cuda()
        model.zero_grad()
        predict = model(input)
        loss = loss_function(predict, target)
        loss.backward()
        optimizer.step()
        train_loss += loss * target.size()[0]
        train_size += target.size()[0]

    test_loss = 0
    test_size = 0
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            loss = loss_function(batch_predict, batch_target)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
            test_loss += loss * batch_target.size()[0]
            test_size += batch_target.size()[0]
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)

#     print(f'train loss:{train_loss.item()/train_size: .5f}, '
#           f'test loss:{test_loss.item()/test_size: .5f}, '
#           f'overall accuracy:{(predict == target).sum() / target.size}, '
#           f'positive accuracy:{predict[target==1].sum() / target.sum()}, '
#           f'negative accuracy:{(np.logical_not(target).sum() - predict[np.logical_not(target)].sum()) / np.logical_not(target).sum()}')
    
    print(f'train loss:{train_loss.item()/train_size: .5f}, '
          f'test loss:{test_loss.item()/test_size: .5f}')

Epoch 0/1, Training: 100%|█████████████████████████| 4371/4371 [00:28<00:00, 152.27it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:34<00:00, 57.45it/s]

train loss: 0.55466, test loss: 0.64425





# Test

In [13]:
test_dataloader = data.DataLoader(
    data.TensorDataset(test_x,test_y), 
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

predict = []
target = []
model.eval()
with torch.no_grad():
    for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
        batch_input = batch_input.cuda()
        batch_target = batch_target.cuda()
        batch_predict = model(batch_input)
        predict.append(batch_predict.argmax(dim=1).cpu().numpy())
        target.append(batch_target.cpu().numpy())        
predict = np.concatenate(predict, axis=0)
target = np.concatenate(target, axis=0)

print(f'True Positive:{predict[target==1].sum()}, '
      f'True Negative:{(target==0).sum() - predict[target==0].sum()}, '
      f'False Positive:{predict[target==0].sum()}, '
      f'False Negative:{(target==1).sum() - predict[target==1].sum()}')

True Positive:863087, True Negative:444724, False Positive:246694, False Negative:492425


# Bagging 9

In [14]:
with open('scrimmage4_link_dataset.pickle', 'rb') as file:
    link_dataset = pickle.load(file)

In [15]:
NUM_EPOCHS = 1
model_list = []
BATCH_SIZE = 1024

for model_idx in range(9):
    
    train_data = []
    for link_idx in np.random.choice(200,size=200,replace=True):
        train_data.append(link_dataset[link_idx])
    test_data = link_dataset[200:]
    
    train_x = torch.cat(tuple(link[0] for link in train_data),dim=0)
    train_y = torch.cat(tuple(link[1] for link in train_data),dim=0)
    test_x = torch.cat(tuple(link[0] for link in test_data),dim=0)
    test_y = torch.cat(tuple(link[1] for link in test_data),dim=0)
    
    train_x = train_x.view(-1,1,20)
    test_x = test_x.view(-1,1,20)

    model = CNN()
    model.cuda()

    loss_function = nn.CrossEntropyLoss(weight=torch.tensor([2.0,1.0])).cuda()
    optimizer = optim.Adam(model.parameters(), 0.001)

    train_dataloader = data.DataLoader(
        dataset=data.TensorDataset(train_x,train_y), 
        batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

    test_dataloader = data.DataLoader(
        data.TensorDataset(test_x,test_y), 
        batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

    model.train()
    for epoch_idx in range(NUM_EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data

        progress_training_epoch = tqdm(
            train_dataloader, 
            desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Training',
            miniters=1, ncols=88, position=0,
            leave=True, total=len(train_dataloader), smoothing=.9)

        progress_validation_epoch = tqdm(
            test_dataloader, 
            desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Validation',
            miniters=1, ncols=88, position=0, 
            leave=True, total=len(test_dataloader), smoothing=.9)

        train_loss = 0
        train_size = 0
        for idx, (input, target) in enumerate(progress_training_epoch):
            input = input.cuda()
            target = target.cuda()
            model.zero_grad()
            predict = model(input)
            loss = loss_function(predict, target)
            loss.backward()
            optimizer.step()
            train_loss += loss * target.size()[0]
            train_size += target.size()[0]

        test_loss = 0
        test_size = 0
        model.eval()
        with torch.no_grad():
            for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
                batch_input = batch_input.cuda()
                batch_target = batch_target.cuda()
                batch_predict = model(batch_input)
                loss = loss_function(batch_predict, batch_target)
                test_loss += loss * batch_target.size()[0]
                test_size += batch_target.size()[0]

        print(f'train loss:{train_loss.item()/train_size: .5f}, '
              f'test loss:{test_loss.item()/test_size: .5f}')
        
    model_list.append(model) 

Epoch 0/1, Training: 100%|█████████████████████████| 4473/4473 [00:29<00:00, 150.98it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:34<00:00, 57.32it/s]


train loss: 0.52054, test loss: 0.66004


Epoch 0/1, Training: 100%|█████████████████████████| 4415/4415 [00:34<00:00, 129.13it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:39<00:00, 50.07it/s]


train loss: 0.53564, test loss: 0.66484


Epoch 0/1, Training: 100%|█████████████████████████| 4283/4283 [00:27<00:00, 157.01it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:32<00:00, 61.02it/s]


train loss: 0.53275, test loss: 0.65972


Epoch 0/1, Training: 100%|█████████████████████████| 3929/3929 [00:30<00:00, 127.54it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:34<00:00, 57.68it/s]


train loss: 0.51535, test loss: 0.65076


Epoch 0/1, Training: 100%|█████████████████████████| 4327/4327 [00:31<00:00, 138.74it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:36<00:00, 54.73it/s]


train loss: 0.52754, test loss: 0.64791


Epoch 0/1, Training: 100%|█████████████████████████| 4549/4549 [00:31<00:00, 144.46it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:35<00:00, 55.98it/s]


train loss: 0.51904, test loss: 0.64596


Epoch 0/1, Training: 100%|█████████████████████████| 4049/4049 [00:22<00:00, 183.47it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:25<00:00, 77.60it/s]


train loss: 0.52301, test loss: 0.67360


Epoch 0/1, Training: 100%|█████████████████████████| 4322/4322 [00:22<00:00, 192.78it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:25<00:00, 76.98it/s]


train loss: 0.52838, test loss: 0.63679


Epoch 0/1, Training: 100%|█████████████████████████| 4186/4186 [00:21<00:00, 191.38it/s]
Epoch 0/1, Validation: 100%|████████████████████████| 1999/1999 [00:25<00:00, 77.92it/s]

train loss: 0.53608, test loss: 0.65490





# Test

In [16]:
predict_agg = []
target_agg = []

for model in model_list:
    
    test_dataloader = data.DataLoader(
        data.TensorDataset(test_x,test_y), 
        batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

    progress_validation_epoch = tqdm(
        test_dataloader, 
        desc=f'Epoch {epoch_idx}/{NUM_EPOCHS}, Validation',
        miniters=1, ncols=88, position=0, 
        leave=True, total=len(test_dataloader), smoothing=.9)
    
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        for idx, (batch_input, batch_target) in enumerate(progress_validation_epoch):
            batch_input = batch_input.cuda()
            batch_target = batch_target.cuda()
            batch_predict = model(batch_input)
            predict.append(batch_predict.argmax(dim=1).cpu().numpy())
            target.append(batch_target.cpu().numpy())        
    predict = np.concatenate(predict, axis=0)
    target = np.concatenate(target, axis=0)
    
    predict_agg.append(predict)
    target_agg.append(target)

predict = (np.array(predict_agg).sum(axis=0) > 4.5)
target = target_agg[0]
print(f'True Positive:{predict[target==1].sum()}, '
      f'True Negative:{(target==0).sum() - predict[target==0].sum()}, '
      f'False Positive:{predict[target==0].sum()}, '
      f'False Negative:{(target==1).sum() - predict[target==1].sum()}')

Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 485.12it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 548.16it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 475.64it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 528.78it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 533.05it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 482.76it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:04<00:00, 495.78it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 524.96it/s]
Epoch 0/1, Validation: 100%|███████████████████████| 1999/1999 [00:03<00:00, 553.13it/s]


True Positive:916917, True Negative:425223, False Positive:266195, False Negative:438595
