In [1]:
import time
import math
import copy
import torch
import pickle
import random
import logging
import warnings
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import torch.nn as nn
import torch.optim as opt
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
import fairlearn.datasets as fd
from tqdm import tqdm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from torch.autograd import grad
from torch.autograd.functional import vhp
from get_datasets import get_diabetes, get_adult, get_law
from torch.utils.data import Subset, DataLoader
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score
from scipy.stats import spearmanr

# Render every matplotlib figure at 300 DPI.
plt.rcParams['figure.dpi'] = 300
# NOTE(review): this installs an "ignore" filter for every warning category for the
# rest of the session, so deprecation and numerical warnings raised by pandas,
# scikit-learn or torch in later cells are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

# Euler's number, exposed as a module-level constant.
E = math.e

In [2]:
def get_law(path='data/lawschs1_1.csv'):
    """Load the law school admissions CSV and return a one-hot encoded frame.

    Processing steps, in order:
      1. Keep only rows whose ``MissingRace`` value is not 1.
      2. Drop the ``Race``, ``MissingRace``, ``college`` and ``Year`` columns.
      3. Drop every row that still contains a missing value.
      4. Bucket ``LSAT`` to ``round(x / 10)`` and ``GPA`` to one decimal place.
      5. One-hot encode the bucketed ``LSAT`` and ``GPA`` columns (all levels kept).

    Note: defining this function rebinds the name ``get_law`` that the import
    cell also pulls in from ``get_datasets``.

    Args:
        path: Location of the CSV file. Defaults to the path the notebook
            originally hard-coded, so existing ``get_law()`` calls are unchanged.

    Returns:
        pandas.DataFrame with the remaining original columns followed by the
        ``LSAT_*`` and ``GPA_*`` indicator columns.

    Raises:
        KeyError: if any of the columns to drop is absent from the file.
    """
    data = pd.read_csv(path)

    # Filter first: ``MissingRace`` is needed for the mask before it is dropped.
    data = data[data.MissingRace != 1]
    data = data.drop(columns=['Race', 'MissingRace', 'college', 'Year'])
    data = data.dropna(how='any', axis=0)

    # Coarsen the scores so that each bucket becomes one indicator column below.
    # Python's built-in ``round`` is used, i.e. ties go to the nearest even value.
    data['LSAT'] = data['LSAT'].apply(lambda x: round(x / 10))
    data['GPA'] = data['GPA'].apply(lambda x: round(x, 1))

    return pd.get_dummies(data, columns=['LSAT', 'GPA'], drop_first=False)

In [3]:
 class CreateData(torch.utils.data.Dataset):
    """Dataset that pairs each entry of ``data`` with the entry of ``targets`` at the same index."""

    def __init__(self, data, targets):
        # Both containers are stored as given; nothing is copied or validated.
        self.targets = targets
        self.data = data

    def __len__(self):
        """Return the number of samples, measured on ``data`` only."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, target)`` pair stored at ``idx``."""
        return self.data[idx], self.targets[idx]

In [4]:
class LogisticRegression(torch.nn.Module):
    """Logistic regression as a single bias-free linear layer emitting one logit per row."""

    def __init__(self, num_features):
        """Create the layer and the loss.

        Args:
            num_features: Width of each input row.
        """
        super().__init__()

        self.fc1 = torch.nn.Linear(num_features, 1, bias=False)
        self.criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

    def forward(self, x):
        """Return raw logits (no sigmoid applied) for the input batch ``x``."""
        return self.fc1(x)

    def loss(self, logits, y):
        """Compute the BCE-with-logits loss plus accuracy, precision and recall.

        A sample is predicted positive when ``sigmoid(logit)`` is strictly
        greater than 0.5.

        Args:
            logits: Model outputs; flattened before being compared with ``y``.
            y: Target tensor with one value per sample.

        Returns:
            Tuple ``(loss, acc, prec, rec)``: ``loss`` is the tensor returned by
            the criterion (still attached to the autograd graph of ``logits``);
            the other three are percentages in ``[0, 100]``.
        """
        loss = self.criterion(logits.ravel(), y)

        # The metrics are for reporting only, so keep them out of the autograd
        # graph and compute them with tensor ops instead of per-element Python
        # loops (which also force one device sync per sample on CUDA).
        with torch.no_grad():
            predicted = torch.sigmoid(logits).ravel() > .5
            num_correct = int((predicted.to(y.dtype) == y).sum().item())

        acc = num_correct / len(y) * 100

        y_true = y.detach().cpu().numpy()
        y_pred = predicted.cpu().numpy().astype(int)
        # zero_division=0 on both scores: a batch without predicted (precision)
        # or actual (recall) positives yields 0 instead of a warning.
        prec = precision_score(y_true, y_pred, zero_division=0) * 100
        rec = recall_score(y_true, y_pred, zero_division=0) * 100
        return loss, acc, prec, rec

In [5]:
def train(model, dataset, lr, bs, eps):
    """Fit ``model`` with plain SGD on a binary cross-entropy-with-logits objective.

    Args:
        model: Module whose forward pass returns logits.
        dataset: Indexable pair ``[inputs, targets]``; both are wrapped in a
            ``CreateData`` dataset and served in shuffled mini-batches.
        lr: SGD learning rate.
        bs: Mini-batch size.
        eps: Number of passes over the data (epochs).

    Returns:
        The same ``model`` object, updated in place and left in training mode.
    """
    model.train()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=0)
    criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

    train_data = CreateData(dataset[0], dataset[1])
    train_dataloader = DataLoader(train_data, batch_size=bs, shuffle=True)

    # No running loss is kept: summing the loss tensors would hold every
    # batch's autograd graph alive for the whole epoch, and nothing reads it.
    for _ in range(eps):
        for x, y in train_dataloader:
            optimizer.zero_grad()
            oupt = model(x)

            # BCEWithLogitsLoss needs input and target of identical shape:
            # use the flattened logits when they match the targets, otherwise
            # pass the logits through unchanged (e.g. column-shaped targets).
            flat = oupt.ravel()
            loss_val = criterion(flat if flat.shape == y.shape else oupt, y)
            loss_val.backward()
            optimizer.step()
    return model

In [6]:
# Load the preprocessed frame at notebook level to inspect how wide it is.
label = 'admit'
data = get_law()

# Every column other than the target counts as a feature.
feature_set = set(data.columns).difference({label})
num_features = len(feature_set)

print(num_features)

41


In [7]:
def Main(lr, bs, eps, device=None):
    """Train a logistic regression on the law data and score it on a held-out split.

    The data comes from ``get_law()``; ``admit`` is the target and every other
    column is a feature. 20% of the rows are held out with a fixed
    ``random_state`` of 42. Only the split is seeded: weight initialisation and
    batch shuffling are not, so repeated calls can give slightly different scores.

    Args:
        lr: SGD learning rate passed to ``train``.
        bs: Mini-batch size passed to ``train``.
        eps: Number of epochs passed to ``train``.
        device: Torch device to run on. When omitted, ``'cuda:5'`` is used if
            that GPU index exists, otherwise the default CUDA device, otherwise
            the CPU.

    Returns:
        Tuple ``(precision, recall, accuracy, loss)`` measured on the test split.
        The first three are percentages; ``loss`` is a 0-dim tensor.
    """
    if device is None:
        if not torch.cuda.is_available():
            device = 'cpu'
        elif torch.cuda.device_count() > 5:
            # Keep the originally hard-coded GPU when the host really has it.
            device = 'cuda:5'
        else:
            device = 'cuda'

    data = get_law()
    label = 'admit'

    # Use an ordered list, not a set: pandas >= 2.0 rejects set indexers, and a
    # set would make the feature/weight order depend on string hashing.
    feature_cols = [col for col in data.columns if col != label]
    num_features = len(feature_cols)

    X = data[feature_cols]
    y = data[label]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    def as_float_tensor(frame):
        # Cast explicitly so boolean indicator columns mixed with numeric ones
        # become one float32 array instead of an object array torch cannot read.
        return torch.as_tensor(frame.to_numpy(dtype='float32'), device=device)

    x_test_input = as_float_tensor(X_test)
    y_test_input = as_float_tensor(y_test)

    x_train_input = as_float_tensor(X_train)
    y_train_input = as_float_tensor(y_train)

    ##############################################
    # Train original model and get original loss #
    ##############################################
    torch_model = LogisticRegression(num_features)

    torch_model.to(device)
    torch_model = train(torch_model, [x_train_input, y_train_input], lr, bs, eps)

    # Evaluation only: no gradients needed for the held-out scores.
    torch_model.eval()
    with torch.no_grad():
        test_loss, acc, prec, rec = torch_model.loss(torch_model(x_test_input), y_test_input)

    return prec, rec, acc, test_loss

In [None]:
# Hyperparameter grid: learning rates, epoch counts and batch sizes.
lr = [.01, .05, .1, .5]
eps = [10, 15, 20, 25, 50]
bs = [1, 8, 16, 32]

# Exhaustive sweep; batch size varies fastest and learning rate slowest.
grid = [(l, e, b) for l in lr for e in eps for b in bs]

for l, e, b in grid:
    print(f'\nLR: {l}, BS: {b}, Eps: {e}')
    prec, rec, acc, loss = Main(l, b, e)
    report = (
        f'Accuracy: {acc:.2f}',
        f'Loss: {loss:.3f}',
        f'Precision: {prec:.2f}',
        f'Recall: {rec:.2f}',
    )
    print('\n'.join(report))
            


LR: 0.01, BS: 1, Eps: 10
Accuracy: 71.50
Loss: 0.549
Precision: 72.00
Recall: 79.91

LR: 0.01, BS: 8, Eps: 10
Accuracy: 71.54
Loss: 0.554
Precision: 71.79
Recall: 80.57

LR: 0.01, BS: 16, Eps: 10
Accuracy: 71.24
Loss: 0.561
Precision: 71.12
Recall: 81.44

LR: 0.01, BS: 32, Eps: 10
Accuracy: 70.51
Loss: 0.574
Precision: 69.35
Recall: 84.34

LR: 0.01, BS: 1, Eps: 15
