In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from typing import List

In [32]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [75]:
class NeuralNetwork(nn.Module):
  def __init__(self):
      super().__init__()
      self.linear_stack = nn.Sequential(
          nn.Linear(15, 10),
          nn.LeakyReLU(),
          nn.Linear(10, 5),
          nn.LeakyReLU(),
          nn.Linear(5, 1),
          nn.LeakyReLU()
      )

  def forward(self, x):
    logits = self.linear_stack(x)
    return logits

In [64]:
model = NeuralNetwork().to(device)
print(model)

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

NeuralNetwork(
  (linear_stack): Sequential(
    (0): Linear(in_features=15, out_features=10, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=10, out_features=5, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=5, out_features=1, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
  )
)


In [36]:
def load_labels(filename):
    cols = ["flag"]
    data = pd.read_csv(filename, usecols=cols)
    # data.set_index("id", inplace=True)
    return data

In [65]:
def data_prep(filename):
  desired_columns = ["pre_loans5",
                     "pre_loans530",
                     "pre_loans3060",
                     "pre_loans6090",
                     "pre_loans90",
                     "pre_till_pclose",
                     "pre_till_fclose",
                     "pre_loans_credit_limit",
                     "pre_loans_next_pay_summ",
                     "pre_loans_outstanding",
                     "pre_loans_total_overdue",
                     "pre_loans_max_overdue_sum",
                     "pre_util",
                     "pre_over2limit",
                     "pre_maxover2limit"]
  data = pd.read_csv(filename,usecols = desired_columns)
  # data.set_index("id", inplace=True)
  return data

In [69]:
class CustomDataset(Dataset):
  def __init__(self, dataset_files: List[int], label_file, transform=None):
    self.df = pd.DataFrame()
    self.df = pd.concat([data_prep(file) for file in dataset_files])
    self.labels = load_labels(label_file)
    self.transform = transform


  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):
    data = torch.tensor(self.df.loc[index])
    label = torch.tensor(self.labels.loc[index])

    if self.transform:
      data = self.transform(data)
    return data, label

In [70]:
training_dataset = CustomDataset([
    "datathon_student/train_data/train_data_0.csv",
    # "datathon_student/train_data/train_data_1.csv",
    # "datathon_student/train_data/train_data_2.csv",
    # "datathon_student/train_data/train_data_3.csv",
    # "datathon_student/train_data/train_data_4.csv",
    # "datathon_student/train_data/train_data_5.csv",
    # "datathon_student/train_data/train_data_6.csv",
    # "datathon_student/train_data/train_data_7.csv",
    # "datathon_student/train_data/train_data_8.csv",
    # "datathon_student/train_data/train_data_9.csv",
    # "datathon_student/train_data/train_data_10.csv",
], "datathon_student/train_target.csv", )

training_loader = DataLoader(training_dataset, batch_size=3, shuffle=True)

In [71]:
for data in training_loader:
    print(data)

  data = torch.tensor(self.df.loc[index])
  label = torch.tensor(self.labels.loc[index])


[tensor([[12, 11, 12,  1,  4,  0,  2,  6, 16,  5,  4,  8,  1,  2, 17],
        [12, 11, 12,  6,  4,  0,  2,  6, 16,  5,  4,  8,  0,  2, 17],
        [15, 14, 11,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17]]), tensor([[0],
        [0],
        [0]])]
[tensor([[ 2,  6, 10,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17],
        [15,  7,  7,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17],
        [ 8,  8, 18,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17]]), tensor([[0],
        [0],
        [0]])]
[tensor([[ 5,  3,  7,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17],
        [ 1, 15, 10,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17],
        [ 2,  6,  3,  2,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17]]), tensor([[1],
        [0],
        [0]])]
[tensor([[ 1, 11, 11,  0,  3,  0,  2,  6, 16,  5,  4,  8, 16,  2, 17],
        [12, 11,  6,  1,  4,  0,  2,  6, 16,  5,  4,  8,  6,  2, 17],
        [15,  6, 12,  2,  3,  0,  3,  6, 16,  5,  4,  8, 16,  2,  3]]), tensor([[0],
        [0],

KeyboardInterrupt: 

In [60]:
def train_one_epoch(epoch_index):
  running_loss = 0.
  last_loss = 0.

  for i, data in enumerate(training_loader):
    # Every data instance is an input + label pair
    inputs, labels = data

    # Zero gradients for every batch
    optimizer.zero_grad()

    # Make predictions for this batch
    outputs = model(inputs)

    # Compute the loss and its gradients
    loss = loss_fn(outputs, labels)
    loss.backward()

    # Adjust learning weights
    optimizer.step()

    # Gather data and report
    running_loss += loss.item()
    if i % 1000 == 999:
      last_loss = running_loss / 1000 # loss per batch
      print('  batch {} loss: {}'.format(i + 1, last_loss))
      running_loss = 0.

  return last_loss

In [74]:
epoch_number = 0
EPOCHS = 10

average_losses = []

model.train(True)

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    avg_loss = train_one_epoch(epoch_number)
    average_losses.append(avg_loss)

    print('LOSS train {}'.format(avg_loss))

model_path = 'model_{}'.format(epoch_number)
torch.save(model.state_dict(), model_path)

EPOCH 1:


  data = torch.tensor(self.df.loc[index])
  label = torch.tensor(self.labels.loc[index])


RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float