In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from typing import List

In [64]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


In [65]:
class NeuralNetwork(nn.Module):
  def __init__(self):
      super().__init__()
      self.linear_stack = nn.Sequential(
          nn.Linear(15, 7),
          nn.LeakyReLU(),
          nn.Linear(7, 1),
          nn.LeakyReLU()
      )

  def forward(self, x):
    logits = self.linear_stack(x)
    return logits

In [66]:
model = NeuralNetwork().to(device)
print(model)

loss_fn = nn.L1Loss()
optimizer = torch.optim.Adamax(model.parameters(), lr=0.002, weight_decay=0.01)

NeuralNetwork(
  (linear_stack): Sequential(
    (0): Linear(in_features=15, out_features=7, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=7, out_features=1, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
  )
)


In [67]:
def load_labels(filename):
    cols = ["flag"]
    data = pd.read_csv(filename, usecols=cols)
    # data.set_index("id", inplace=True)
    return data

In [68]:
def data_prep(filename):
  desired_columns = ["pre_loans5",
                     "pre_loans530",
                     "pre_loans3060",
                     "pre_loans6090",
                     "pre_loans90",
                     "pre_till_pclose",
                     "pre_till_fclose",
                     "pre_loans_credit_limit",
                     "pre_loans_next_pay_summ",
                     "pre_loans_outstanding",
                     "pre_loans_total_overdue",
                     "pre_loans_max_overdue_sum",
                     "pre_util",
                     "pre_over2limit",
                     "pre_maxover2limit"]
  data = pd.read_csv(filename,usecols = desired_columns)
  # data.set_index("id", inplace=True)
  # data = data.sample(100_000)
  return data

In [69]:
class CustomDataset(Dataset):
  def __init__(self, dataset_file, label_file, transform=None):
    self.df = data_prep(dataset_file)
    self.labels = load_labels(label_file)
    self.transform = transform


  def __len__(self):
    # return self.df.shape[0]
    return 100_000

  def __getitem__(self, index):
    data = torch.tensor(self.df.loc[index]).float()
    data = data.to(device)
    label = torch.tensor(self.labels.loc[index]).float()
    label = label.to(device)

    if self.transform:
      data = self.transform(data)
    return data, label

In [70]:
data_files = [
    "datathon_student/train_data/train_data_0.csv",
    "datathon_student/train_data/train_data_1.csv",
    "datathon_student/train_data/train_data_2.csv",
    "datathon_student/train_data/train_data_3.csv",
    "datathon_student/train_data/train_data_4.csv",
    "datathon_student/train_data/train_data_5.csv",
    "datathon_student/train_data/train_data_6.csv",
    "datathon_student/train_data/train_data_7.csv",
    "datathon_student/train_data/train_data_8.csv",
    "datathon_student/train_data/train_data_9.csv",
    "datathon_student/train_data/train_data_10.csv",
]



In [71]:
def train_one_epoch(epoch_index):
  running_loss = 0.
  last_loss = 0.

  for i, data in enumerate(training_loader):
    # Every data instance is an input + label pair
    inputs, labels = data

    # Zero gradients for every batch
    optimizer.zero_grad()

    # Make predictions for this batch
    outputs = model(inputs)

    # Compute the loss and its gradients
    loss = loss_fn(outputs, labels)
    loss.backward()

    # Adjust learning weights
    optimizer.step()

    # Gather data and report
    running_loss += loss.item()
    if i % 1000 == 999:
      last_loss = running_loss / 1000 # loss per batch
      print('  batch {} loss: {}'.format(i + 1, last_loss))
      running_loss = 0.

  return last_loss

In [73]:
for filename in data_files:
    training_dataset = CustomDataset(filename, "datathon_student/train_target.csv", )
    training_loader = DataLoader(training_dataset, batch_size=64, shuffle=True)

    print(f"Loading file: {filename}")

    epoch_number = 0
    EPOCHS = 3

    average_losses = []

    model.train(True)

    for epoch in range(EPOCHS):
        print('EPOCH {}:'.format(epoch_number + 1))

        avg_loss = train_one_epoch(epoch_number)
        average_losses.append(avg_loss)

        print('LOSS train {}'.format(avg_loss))

        model_path = 'model_{}'.format(epoch_number)
        torch.save(model.state_dict(), model_path)
        epoch_number += 1

Loading file: datathon_student/train_data/train_data_0.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030868761732814165
LOSS train 0.030868761732814165
EPOCH 2:
  batch 1000 loss: 0.030688085252186283
LOSS train 0.030688085252186283
EPOCH 3:
  batch 1000 loss: 0.030052253105417547
LOSS train 0.030052253105417547
Loading file: datathon_student/train_data/train_data_1.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030533131451963527
LOSS train 0.030533131451963527
EPOCH 2:
  batch 1000 loss: 0.030650622462872888
LOSS train 0.030650622462872888
EPOCH 3:
  batch 1000 loss: 0.03110358352443154
LOSS train 0.03110358352443154
Loading file: datathon_student/train_data/train_data_2.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030847782335510145
LOSS train 0.030847782335510145
EPOCH 2:
  batch 1000 loss: 0.03028920191857924
LOSS train 0.03028920191857924
EPOCH 3:
  batch 1000 loss: 0.030802631925108473
LOSS train 0.030802631925108473
Loading file: datathon_student/train_data/train_data_3.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.031170632917944203
LOSS train 0.031170632917944203
EPOCH 2:
  batch 1000 loss: 0.030904704718669563
LOSS train 0.030904704718669563
EPOCH 3:
  batch 1000 loss: 0.030946313185898362
LOSS train 0.030946313185898362
Loading file: datathon_student/train_data/train_data_4.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.03087061386100504
LOSS train 0.03087061386100504
EPOCH 2:
  batch 1000 loss: 0.030806856761050314
LOSS train 0.030806856761050314
EPOCH 3:
  batch 1000 loss: 0.03063975946395476
LOSS train 0.03063975946395476
Loading file: datathon_student/train_data/train_data_5.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030348188916093477
LOSS train 0.030348188916093477
EPOCH 2:
  batch 1000 loss: 0.030256801371281655
LOSS train 0.030256801371281655
EPOCH 3:
  batch 1000 loss: 0.03051843699751453
LOSS train 0.03051843699751453
Loading file: datathon_student/train_data/train_data_6.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030674636182114
LOSS train 0.030674636182114
EPOCH 2:
  batch 1000 loss: 0.02994938337906865
LOSS train 0.02994938337906865
EPOCH 3:
  batch 1000 loss: 0.03112891795231826
LOSS train 0.03112891795231826
Loading file: datathon_student/train_data/train_data_7.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030164942395829713
LOSS train 0.030164942395829713
EPOCH 2:
  batch 1000 loss: 0.03085201465111106
LOSS train 0.03085201465111106
EPOCH 3:
  batch 1000 loss: 0.03064919065166727
LOSS train 0.03064919065166727
Loading file: datathon_student/train_data/train_data_8.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.03035215855394302
LOSS train 0.03035215855394302
EPOCH 2:
  batch 1000 loss: 0.031414460862319735
LOSS train 0.031414460862319735
EPOCH 3:
  batch 1000 loss: 0.030523799685136906
LOSS train 0.030523799685136906
Loading file: datathon_student/train_data/train_data_9.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.03057076785030344
LOSS train 0.03057076785030344
EPOCH 2:
  batch 1000 loss: 0.030398798311398665
LOSS train 0.030398798311398665
EPOCH 3:
  batch 1000 loss: 0.030898880102214178
LOSS train 0.030898880102214178
Loading file: datathon_student/train_data/train_data_10.csv
EPOCH 1:


  data = torch.tensor(self.df.loc[index]).float()
  label = torch.tensor(self.labels.loc[index]).float()


  batch 1000 loss: 0.030274613870493863
LOSS train 0.030274613870493863
EPOCH 2:
  batch 1000 loss: 0.030102155425010622
LOSS train 0.030102155425010622
EPOCH 3:
  batch 1000 loss: 0.030414426884157755
LOSS train 0.030414426884157755
