In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import train_test_split, KFold

import torch
import torchvision
import torch.utils.data as tdata
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

import helpers as h
import models as clsf

# Seed NumPy's global RNG and PyTorch's default generator with the same fixed
# value so that random draws are repeatable when the notebook is run from a
# fresh kernel.
np.random.seed(69)
torch.manual_seed(69);  # trailing ';' keeps the returned Generator out of the cell output

# Data Preprocessing

In [2]:
# Read the labelled training table, then let the project helper turn it into
# a feature matrix (X0) plus two target arrays.
# NOTE(review): Y_c0 appears to be the label later scored with AUROC/accuracy
# and Y0 the target fed to the loss -- confirm against helpers.transform_df.
train_df0 = pd.read_csv('train.csv')
X0, Y_c0, Y0 = h.transform_df(train_df0, train=True)
N, d = X0.shape  # number of rows and number of feature columns in X0

# Model attempt 1

In [3]:
# Hold out a random 10% of the rows for validation.
# random_state pins the partition (same value as the notebook seed), so
# re-running this cell on its own reproduces the same split instead of drawing
# a fresh one from whatever state NumPy's global RNG happens to be in.
X, valX, Y_c, valY_c, Y, valY = train_test_split(
    X0, Y_c0, Y0, test_size=0.1, random_state=69
)

# The preprocessing parameters are computed from the training rows only and
# then applied unchanged to both splits, so the validation rows never
# influence them.
pars = h.get_pars_for_processing(X)
X, valX = h.process_with_pars(X, pars), h.process_with_pars(valX, pars)

In [None]:
batch_size = 32

# Convert each split to a tensor of PyTorch's default tensor type.
X, Y = map(torch.Tensor, (X, Y))
valX, valY = map(torch.Tensor, (valX, valY))

# Mini-batch iterators over (features, target) pairs; both reshuffle on every pass.
train_loader = tdata.DataLoader(
    tdata.TensorDataset(X, Y),
    batch_size=batch_size,
    shuffle=True,
)
val_loader = tdata.DataLoader(
    tdata.TensorDataset(valX, valY),
    batch_size=batch_size,
    shuffle=True,
)

### Notes on sigmoid
Either Option 1:

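1) Omit the nn.Sigmoid() layer, 2) use nn.BCEWithLogitsLoss(), and 3) apply torch.sigmoid() to the model output to
get probabilities at prediction time.
This option is more numerically stable, because the loss combines the sigmoid and the cross-entropy in a single step.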

OR Option 2:

1) Add an nn.Sigmoid() layer as the last layer, 2) use nn.BCELoss(), and 3) skip torch.sigmoid() at the prediction step, since the model already outputs probabilities.

In [None]:
# Feed-forward binary classifier with a single hidden layer of k ReLU units
# and dropout. The final Linear layer emits one raw logit per row, i.e.
# "Option 1" from the sigmoid notes: no Sigmoid layer, the loss works on logits.
k = 128
model = nn.Sequential(
    nn.Linear(X.shape[1], k),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(k, 1),
    # nn.Sigmoid(),  # Option 2
)

# Alternatives kept for reference:
#   criterion = nn.MSELoss()  # do regression
#   criterion = nn.BCELoss()  # Option 2 (requires the Sigmoid layer above)
criterion = nn.BCEWithLogitsLoss()  # Option 1
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Train for a fixed number of passes over train_loader.
# Some layers, such as Dropout, behave differently during training, so the
# model is put into training mode first.
model.train()

for epoch in range(4):
    # Sum of per-sample losses and number of samples seen in this epoch, so the
    # reported figure is the epoch average rather than the last batch's loss.
    running_loss, n_seen = 0.0, 0

    for data, target in train_loader:
        # Erase accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(data)

        # Calculate loss (target reshaped to (B, 1) to match the model output,
        # where B is the batch size)
        loss = criterion(output, target.unsqueeze(1))

        # Backward pass
        loss.backward()

        # Weight update
        optimizer.step()

        # The criterion returns the batch mean, so weight it by the batch size
        # (the final batch may be smaller than the others).
        batch_len = data.size(0)
        running_loss += loss.item() * batch_len
        n_seen += batch_len

    # Track the average loss each epoch (NaN if the loader produced no batches)
    epoch_loss = running_loss / n_seen if n_seen else float('nan')
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1, epoch_loss))

    # Debug view of the raw outputs on the full training set. No autograd graph
    # is needed for printing. The model is still in training mode here, so
    # Dropout is active in these outputs.
    with torch.no_grad():
        print(model(X).numpy())

In [None]:
# Switch the model to evaluation mode so that layers such as Dropout use
# their inference behaviour for the predictions that follow.
model.eval()

def acc(valY_c, output):
    """Return the fraction of rounded predictions that equal the labels.

    Both inputs are flattened to 1-D and compared position by position, so an
    ``(N, 1)`` prediction column is scored correctly against ``(N,)`` labels
    instead of being silently broadcast to an ``(N, N)`` comparison.

    Args:
        valY_c: Array-like of 0/1 class labels.
        output: Array-like of predicted probabilities with the same number of
            elements as ``valY_c``. Values are rounded with ``np.round``
            (halves go to the nearest even integer, so exactly 0.5 becomes 0).

    Returns:
        float: share of positions where the rounded prediction matches the label.

    Raises:
        ValueError: if the inputs are empty or differ in number of elements.
    """
    labels = np.asarray(valY_c).reshape(-1)
    rounded = np.round(np.asarray(output)).reshape(-1)

    if labels.size == 0:
        raise ValueError('acc() needs at least one label')
    if labels.size != rounded.size:
        raise ValueError(
            'acc() got %d labels but %d predictions' % (labels.size, rounded.size)
        )

    # Vectorised mean of the boolean match mask == matches / total.
    return float(np.mean(rounded == labels))

# Score the held-out split with automatic differentiation turned off.
with torch.no_grad():
    val_logits = model(valX)
    # Sigmoid maps the logits to probabilities; squeeze drops the size-1 axes.
    output = torch.sigmoid(val_logits).numpy().squeeze()
    #output = h.scale(output)
    print('Auroc on val set: ', auc(valY_c, output))
    print('Accuracy: ', acc(valY_c, output))


# Get Predictions on Test Set

In [None]:
# Build the submission file: run the test table through the same helper and
# the same preprocessing parameters (`pars`) as the training features, score
# it, and write one id/probability pair per row.
test_df0 = pd.read_csv('test.csv')
tX = h.transform_df(test_df0)
tX = h.process_with_pars(tX, pars)

# Make sure Dropout is disabled even when this cell is run without the
# evaluation cell having been executed first.
model.eval()

with torch.no_grad():
    # reshape(-1) always yields a 1-D vector of probabilities, whereas
    # squeeze() would collapse a single-row test set to a 0-d scalar.
    output = torch.sigmoid(model(torch.Tensor(tX))).numpy().reshape(-1)
    #output = h.scale(output)
    #output = model(torch.Tensor(tX)); #Option 2

output_df = pd.DataFrame({'id': test_df0['id'], 'Predicted': output})
output_df.to_csv('submission.csv', index=False)

# Old Code

In [None]:
# Generate some synthetic data to check that logistic regression works.
import os
import sys

# sys.path entries are used literally: '~' is not expanded by the import
# system, so resolve it to the home directory first. The membership check
# keeps re-runs of this cell from appending duplicate entries.
mlmodels_dir = os.path.expanduser('~/git/MLModels')
if mlmodels_dir not in sys.path:
    sys.path.append(mlmodels_dir)

from MLModels import utils as u
from MLModels import linearModels as lm

# NOTE(review): this rebinds X, Y and N, which earlier cells use for the
# training tensors and row count -- re-run those cells before training again.
f, line = u.genF(zero_one=True)
X, Y = u.genData(f, 10000)
N, D = X.shape