In [73]:
import torch
import torch.nn as nn
import torch.utils.data as data
import os
import numpy as np
import json

In [74]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [75]:
class JsonDataset(data.Dataset):
    def __init__(self, data_path):
        super(JsonDataset, self).__init__()
        
        f = open(data_path, 'r')
        self.data = json.loads(f.read())
        f.close()
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return torch.FloatTensor(self.data[index][0]), \
            torch.LongTensor([float(self.data[index][1])])


In [76]:
train_data = JsonDataset('/kaggle/input/cpsc490/naive2-dist-train.json')
validation_data = JsonDataset('/kaggle/input/cpsc490/naive2-dist-validation.json')
test_data = JsonDataset('/kaggle/input/cpsc490/naive2-dist-test.json')

In [77]:
#x_num = train_data[0][0].shape[0]
x_size = train_data[0][0].shape[1]

batch_size = 256
fc_hidden_size = 200
#fc_hidden_size_1 = 20
#fc_hidden_size_2 = 100
#fc_num_layers_1 = 2
#fc_num_layers_2 = 2

In [78]:
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 1, 'pin_memory': True}
train_loader = data.DataLoader(train_data, **params)
validation_loader = data.DataLoader(validation_data, **params)
test_loader = data.DataLoader(test_data, **params)

In [79]:
class Naive(nn.Module):
    def __init__(self, x_num, x_size, \
                 fc_hidden_size_1, fc_hidden_size_2, \
                 fc_num_layers_1, fc_num_layers_2):
        super(Naive, self).__init__()
        
        d = 0
        
        self.x_num = x_num
        
        seq = []
        seq.append(nn.Linear(x_size, fc_hidden_size_1))
        seq.append(nn.Tanh())
        seq.append(nn.Dropout(d))
        
        for _ in range(fc_num_layers_1 - 1):
            seq.append(nn.Linear(fc_hidden_size_1, fc_hidden_size_1))
            seq.append(nn.Tanh())
            seq.append(nn.Dropout(d))
            
        seq.append(nn.Linear(fc_hidden_size_1, 1))
        self.fc1 = nn.Sequential(*seq)
        
        seq = []
        seq.append(nn.Linear(x_num, fc_hidden_size_2))
        seq.append(nn.Tanh())
        seq.append(nn.Dropout(d))

        for _ in range(fc_num_layers_2 - 1):
            seq.append(nn.Linear(fc_hidden_size_2, fc_hidden_size_2))
            seq.append(nn.Tanh())
            seq.append(nn.Dropout(d))
        
        seq.append(nn.Linear(fc_hidden_size_2, 2))
        self.fc2 = nn.Sequential(*seq)
        
    def forward(self, x):
        layer = None
        for i in range(self.x_num):
            res = self.fc1(x[:, i, :])
            
            if layer == None:
                layer = res
            else:
                layer = torch.cat((layer, res), dim=1)
                
        out = self.fc2(layer)
        return out

In [80]:
def train(model, criterion, optimizer):
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for i, (x, targets) in enumerate(train_loader):
        x = x.to(device)
        targets = torch.flatten(targets).to(device)

        outputs = model(x)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total += targets.size(0)
        train_loss += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        correct += predicted.eq(targets).sum().item()
        
    epoch_train_loss = train_loss / total
    epoch_train_acc = float(100 * correct / total)

    return epoch_train_loss, epoch_train_acc

In [81]:
def validation(model, criterion):
    model.eval()
    validation_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for i, (x, targets) in enumerate(validation_loader):
            x = x.to(device)
            targets = torch.flatten(targets).to(device)

            outputs = model(x)
            loss = criterion(outputs, targets)

        total += targets.size(0)
        validation_loss += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        correct += predicted.eq(targets).sum().item()
        
    epoch_validation_loss = validation_loss / total
    epoch_validation_acc = float(100 * correct / total)

    return epoch_validation_loss, epoch_validation_acc

In [82]:
fraction_zero = len([i for i in range(len(train_data)) if train_data[i][1] == 0]) / len(train_data)
print('fraction_zero: {}'.format(fraction_zero))

model = Naive(x_num, x_size, \
                 fc_hidden_size_1, fc_hidden_size_2, \
                 fc_num_layers_1, fc_num_layers_2).to(device)

criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.5 / fraction_zero, 0.5 / (1 - fraction_zero)]).to(device))

optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0002)

num_epochs = 200

fraction_zero: 0.50953125


In [83]:
best_validation_loss = None

for epoch in range(0, num_epochs):
    epoch_train_loss, epoch_train_acc = train(model, criterion, optimizer)
    epoch_validation_loss, epoch_validation_acc = validation(
                                                    model, criterion)
    
    if best_validation_loss == None or epoch_validation_loss < best_validation_loss:
        torch.save(model.state_dict(), 'best_naive2.pth')
        print('Saved.')
        best_validation_loss = epoch_validation_loss

    #epoch_test_loss, epoch_test_acc = test(net, criterion, vgg['best_acc'], 'vgg_best.pth')
    #vgg['test_loss'].append(epoch_test_loss)
    #vgg['test_acc'].append(epoch_test_acc)
    #if epoch_test_acc > vgg['best_acc']:
    #    vgg['best_acc'] = epoch_test_acc

    print('Epoch {}. Training loss: {} ({}% accuracy). Validation loss: {} ({}% accuracy)'
        .format(epoch + 1, 
                format(epoch_train_loss, '.4f'), format(epoch_train_acc, '.4f'),
                format(epoch_validation_loss, '.4f'), format(epoch_validation_acc, '.4f')))

Saved.
Epoch 1. Training loss: 0.6991 (49.5625% accuracy). Validation loss: 0.6938 (48.4375% accuracy)
Saved.
Epoch 2. Training loss: 0.6941 (50.5938% accuracy). Validation loss: 0.6923 (48.4375% accuracy)
Epoch 3. Training loss: 0.6920 (52.0625% accuracy). Validation loss: 0.7001 (45.3125% accuracy)
Saved.
Epoch 4. Training loss: 0.6935 (50.2188% accuracy). Validation loss: 0.6923 (54.6875% accuracy)
Epoch 5. Training loss: 0.6925 (52.5156% accuracy). Validation loss: 0.6984 (39.0625% accuracy)
Epoch 6. Training loss: 0.6928 (51.1875% accuracy). Validation loss: 0.6969 (45.3125% accuracy)
Saved.
Epoch 7. Training loss: 0.6904 (52.9844% accuracy). Validation loss: 0.6818 (57.8125% accuracy)
Epoch 8. Training loss: 0.6886 (52.2344% accuracy). Validation loss: 0.6926 (51.5625% accuracy)
Epoch 9. Training loss: 0.6914 (52.2969% accuracy). Validation loss: 0.6822 (51.5625% accuracy)
Epoch 10. Training loss: 0.6894 (53.4062% accuracy). Validation loss: 0.6935 (51.5625% accuracy)
Epoch 11. T

KeyboardInterrupt: 

In [None]:
# Test
model.load_state_dict(torch.load('best_naive2.pth'))

with torch.no_grad():
    n_correct = 0
    n_samples = 0
        
    for i, (x, targets) in enumerate(validation_loader):
        x = x.to(device)
        targets = torch.flatten(targets).to(device)
        #targets = targets.reshape(-1, 1).to(device)

        outputs = model(x)
        _, predicted = torch.max(outputs.data, 1)

        n_samples += x.shape[0]
        n_correct += (predicted == targets).sum().item()
    
    acc = float(100 * n_correct / n_samples)
    print('Test accuracy: {}%'.format(acc))

In [None]:
hit = 0
positives = 0
n = 0
for i in range(len(test_data)):
    x_center, real, song, target = test_data[i]
    
    target = (target == 1)
    found = False
    for c in x_center:
        if c.tolist() == song.tolist():
            found = True
    
    #if found:
    #    n += 1
            
    if found == target:
        hit += 1

print(100.0 * hit / len(test_data))
#print(hit)

In [None]:
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt

candidates = {'gmat': [780,750,690,710,780,730,690,720,740,690,610,690,710,680,770,610,580,650,540,590,620,600,550,550,570,670,660,580,650,760,640,620,660,660,680,650,670,580,590,790],
              'gpa': [4,3.9,3.3,3.7,3.9,3.7,2.3,3.3,3.3,1.7,2.7,3.7,3.7,3.3,3.3,3,2.7,3.7,2.7,2.3,3.3,2,2.3,2.7,3,3.3,3.7,2.3,3.7,3.3,3,2.7,4,3.3,3.3,2.3,2.7,3.3,1.7,3.7],
              'work_experience': [3,4,3,5,4,6,1,4,5,1,3,5,6,4,3,1,4,6,2,3,2,1,4,1,2,6,4,2,6,5,1,2,4,6,5,1,2,1,4,5],
              'age': [25,28,24,27,26,31,24,25,28,23,25,27,30,28,26,23,29,31,26,26,25,24,28,23,25,29,28,26,30,30,23,24,27,29,28,22,23,24,28,31],
              'admitted': [2,2,1,2,2,2,0,2,2,0,0,2,2,1,2,0,0,1,0,0,1,0,0,0,0,1,1,0,1,2,0,0,1,1,1,0,0,0,0,2]
              }

for i in len(my_data[0])


df = pd.DataFrame(my_data)

X = df[['gmat', 'gpa','work_experience','age']]
y = df['admitted']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)

print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
plt.show()
"""

In [None]:
my_data = [item[0] + [item[1]] for item in my_data]
pd.DataFrame(my_data)