In [101]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
np.__version__, pd.__version__

('1.26.4', '2.2.3')

In [102]:
import torch
from torch import nn
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [103]:
dataFrame = pd.read_csv('data/loan_data.csv')

# Convert non-numeric columns to numeric
for col in dataFrame.select_dtypes(include=['object']).columns:
    dataFrame[col] = dataFrame[col].astype('category').cat.codes  # Label encode strings
dataFrame.groupby('loan_status').size()

loan_status
0    35000
1    10000
dtype: int64

In [104]:
features = dataFrame.drop(columns=['loan_status'])
targets = dataFrame['loan_status']
scaler = MinMaxScaler()
scaledFeatures = scaler.fit_transform(features)
scaledDataFrame = pd.DataFrame(scaledFeatures, columns=features.columns)
scaledDataFrame['loan_status'] = targets
scaledDataFrame

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,0.016129,0.0,1.00,0.008891,0.000,1.000000,1.000000,0.8,0.727023,0.742424,0.035714,0.371739,0.0,1
1,0.008065,0.0,0.75,0.000595,0.000,0.666667,0.014493,0.2,0.392318,0.121212,0.000000,0.247826,1.0,0
2,0.040323,0.0,0.75,0.000617,0.024,0.000000,0.144928,0.6,0.510974,0.666667,0.035714,0.532609,0.0,1
3,0.024194,0.0,0.25,0.009976,0.000,1.000000,1.000000,0.6,0.672840,0.666667,0.000000,0.619565,0.0,1
4,0.032258,1.0,1.00,0.008082,0.008,1.000000,1.000000,0.6,0.606996,0.803030,0.071429,0.426087,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,0.056452,1.0,0.00,0.005557,0.048,1.000000,0.420290,0.6,0.702332,0.469697,0.035714,0.554348,0.0,1
44996,0.137097,0.0,0.00,0.008036,0.136,1.000000,0.246377,0.4,0.593278,0.212121,0.321429,0.502174,0.0,1
44997,0.104839,1.0,0.00,0.006804,0.056,1.000000,0.065826,0.0,0.315501,0.075758,0.285714,0.604348,0.0,1
44998,0.072581,1.0,0.25,0.003499,0.032,1.000000,0.333333,0.2,0.535665,0.545455,0.142857,0.465217,0.0,1


In [105]:
class LoanDataset(Dataset):
    def __init__(self, current_slice):

        # Convert to numpy
        xy = scaledDataFrame.to_numpy()
        sliced_xy = xy[current_slice]
        self.x = torch.from_numpy(sliced_xy[:, :-1]).float()  # all columns except last
        self.y = torch.from_numpy(sliced_xy[:, [-1]]).float()
        self.n_samples = sliced_xy.shape[0]
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

pct = .8
split = int(pct*45000)
trainData = LoanDataset(slice(None,split))
testData = LoanDataset(slice(split,None))
trainDataLoader = DataLoader(trainData, 45, True)
testDataLoader = DataLoader(testData, 45, True)
trainData[0], trainData[4], testData[0]

((tensor([0.0161, 0.0000, 1.0000, 0.0089, 0.0000, 1.0000, 1.0000, 0.8000, 0.7270,
          0.7424, 0.0357, 0.3717, 0.0000]),
  tensor([1.])),
 (tensor([0.0323, 1.0000, 1.0000, 0.0081, 0.0080, 1.0000, 1.0000, 0.6000, 0.6070,
          0.8030, 0.0714, 0.4261, 0.0000]),
  tensor([1.])),
 (tensor([0.0323, 1.0000, 0.7500, 0.0044, 0.0320, 0.6667, 0.0874, 0.6000, 0.6015,
          0.1364, 0.0357, 0.5130, 1.0000]),
  tensor([0.])))

In [106]:
def accuracy(predictions, actual):
    correct = torch.eq(actual, predictions).sum().item()
    acc = correct / len(actual)
    return acc

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.stack = nn.Sequential(
            nn.Linear(13, 20),
            nn.Linear(20, 20),
            nn.Linear(20, 1)
        )
    
    def forward(self, x):
        return self.stack(x)
    
model = Model()
lossFun = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
epochs = 100
for epoch in range(epochs):
    trainLoss = 0
    for batch, (XTrain, yTrain) in enumerate(trainDataLoader):
        model.train()
        yLogits = model(XTrain)
        loss = lossFun(yLogits, yTrain)
        trainLoss += loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    trainLoss /= len(trainDataLoader)
    
    testLoss = 0
    model.eval()
    with torch.inference_mode():
        for XTest, yTest in testDataLoader:
            testLogits = model(XTest)
            testPred = torch.round(torch.sigmoid(testLogits))
            testLoss += lossFun(testPred, yTest)
        testLoss /= len(trainDataLoader)
        testAcc = accuracy(testPred, yTest)
    if epoch % 10 == 0:
        print(f'Epoch: {epoch}; Training Loss: {trainLoss.item():.3f}; Testing Loss: {testLoss.item():.3f}; Acc: {testAcc*100:.3f}%')

Epoch: 0; Training Loss: 0.417; Testing Loss: 0.169; Acc: 77.778%
Epoch: 10; Training Loss: 0.228; Testing Loss: 0.156; Acc: 91.111%
Epoch: 20; Training Loss: 0.226; Testing Loss: 0.157; Acc: 88.889%
Epoch: 30; Training Loss: 0.226; Testing Loss: 0.157; Acc: 82.222%
Epoch: 40; Training Loss: 0.226; Testing Loss: 0.156; Acc: 86.667%
Epoch: 50; Training Loss: 0.226; Testing Loss: 0.156; Acc: 91.111%
Epoch: 60; Training Loss: 0.226; Testing Loss: 0.157; Acc: 84.444%
Epoch: 70; Training Loss: 0.226; Testing Loss: 0.157; Acc: 88.889%
Epoch: 80; Training Loss: 0.226; Testing Loss: 0.157; Acc: 86.667%
Epoch: 90; Training Loss: 0.226; Testing Loss: 0.157; Acc: 91.111%


In [107]:
# stuff
# used pandas
# added batches
# had problems when splitting data
# had to learn about splices
# wierd stuff with squeezing
# had to add shuffle
# scaled ALL the data
# counted the data
# loss function IS with logits
# accuracy function
# lowered learning rate