In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from typing import List

In [32]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [89]:
class NeuralNetwork(nn.Module):
  def __init__(self):
      super().__init__()
      self.linear_stack = nn.Sequential(
          nn.Linear(15, 10),
          nn.LeakyReLU(),
          nn.Linear(10, 5),
          nn.LeakyReLU(),
          nn.Linear(5, 1),
          nn.LeakyReLU()
      )

  def forward(self, x):
    logits = self.linear_stack(x)
    return logits

In [82]:
model = NeuralNetwork().to(device)
print(model)

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

NeuralNetwork(
  (linear_stack): Sequential(
    (0): Linear(in_features=15, out_features=10, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=10, out_features=5, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=5, out_features=1, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
  )
)


In [83]:
def load_labels(filename):
    cols = ["flag"]
    data = pd.read_csv(filename, usecols=cols)
    # data.set_index("id", inplace=True)
    return data

In [84]:
def data_prep(filename):
  desired_columns = ["pre_loans5",
                     "pre_loans530",
                     "pre_loans3060",
                     "pre_loans6090",
                     "pre_loans90",
                     "pre_till_pclose",
                     "pre_till_fclose",
                     "pre_loans_credit_limit",
                     "pre_loans_next_pay_summ",
                     "pre_loans_outstanding",
                     "pre_loans_total_overdue",
                     "pre_loans_max_overdue_sum",
                     "pre_util",
                     "pre_over2limit",
                     "pre_maxover2limit"]
  data = pd.read_csv(filename,usecols = desired_columns)
  # data.set_index("id", inplace=True)
  return data

In [90]:
class CustomDataset(Dataset):
  def __init__(self, dataset_files: List[int], label_file, transform=None):
    self.df = pd.DataFrame()
    self.df = pd.concat([data_prep(file) for file in dataset_files])
    self.labels = load_labels(label_file)
    self.transform = transform


  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):
    data = torch.tensor(self.df.loc[index]).double()
    label = torch.tensor(self.labels.loc[index]).double()

    if self.transform:
      data = self.transform(data)
    return data, label

In [91]:
training_dataset = CustomDataset([
    "datathon_student/train_data/train_data_0.csv",
    "datathon_student/train_data/train_data_1.csv",
    "datathon_student/train_data/train_data_2.csv",
    "datathon_student/train_data/train_data_3.csv",
    "datathon_student/train_data/train_data_4.csv",
    "datathon_student/train_data/train_data_5.csv",
    "datathon_student/train_data/train_data_6.csv",
    "datathon_student/train_data/train_data_7.csv",
    "datathon_student/train_data/train_data_8.csv",
    "datathon_student/train_data/train_data_9.csv",
    "datathon_student/train_data/train_data_10.csv",
], "datathon_student/train_target.csv", )

training_loader = DataLoader(training_dataset, batch_size=64, shuffle=True)

In [92]:
def train_one_epoch(epoch_index):
  running_loss = 0.
  last_loss = 0.

  for i, data in enumerate(training_loader):
    # Every data instance is an input + label pair
    inputs, labels = data

    # Zero gradients for every batch
    optimizer.zero_grad()

    # Make predictions for this batch
    outputs = model(inputs)

    # Compute the loss and its gradients
    loss = loss_fn(outputs, labels)
    loss.backward()

    # Adjust learning weights
    optimizer.step()

    # Gather data and report
    running_loss += loss.item()
    if i % 1000 == 999:
      last_loss = running_loss / 1000 # loss per batch
      print('  batch {} loss: {}'.format(i + 1, last_loss))
      running_loss = 0.

  return last_loss

In [94]:
epoch_number = 0
EPOCHS = 10

average_losses = []

model.train(True)

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    avg_loss = train_one_epoch(epoch_number)
    average_losses.append(avg_loss)

    print('LOSS train {}'.format(avg_loss))

    model_path = 'model_{}'.format(epoch_number)
    torch.save(model.state_dict(), model_path)

EPOCH 1:


  data = torch.tensor(self.df.loc[index]).double()
  label = torch.tensor(self.labels.loc[index]).double()


  batch 1000 loss: 0.038816361217663284
  batch 2000 loss: 0.033767471594472916
  batch 3000 loss: 0.0374334817932053
  batch 4000 loss: 0.0331824362463732
  batch 5000 loss: 0.035198265704711476
  batch 6000 loss: 0.0341236987233373
  batch 7000 loss: 0.031461678994399125
  batch 8000 loss: 0.032821271167891755
  batch 9000 loss: 0.04288211725693564
  batch 10000 loss: 0.038700827594956785
  batch 11000 loss: 0.03743395361887705
  batch 12000 loss: 0.03408593349417668
  batch 13000 loss: 0.03211904559523472
  batch 14000 loss: 0.04204234735509084
  batch 15000 loss: 0.03809481817058177
  batch 16000 loss: 0.03337958837479168
  batch 17000 loss: 0.03794246988660733
  batch 18000 loss: 0.04017003888350857
  batch 19000 loss: 0.031577920654953885
  batch 20000 loss: 0.03166884849493923
  batch 21000 loss: 0.034919025707491856
  batch 22000 loss: 0.04044758032211493
  batch 23000 loss: 0.04213577880358061
  batch 24000 loss: 0.03493864672828816
  batch 25000 loss: 0.03744035429576222
  ba