In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm import tqdm


In [2]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def mse_loss(y_hat, y):
    return ((y-y_hat)**2).mean()

def log_loss(y_hat, y):
    epsilon = 1e-15 
    y_hat = np.clip(y_hat, epsilon, 1 - epsilon)
    return (-y*np.log(y_hat) - (1-y)*np.log(1-y_hat)).mean()

In [3]:
train_data = pd.read_csv('datasets/diabetes/train.csv') 
test_data = pd.read_csv('datasets/diabetes/test.csv') 

In [4]:
cols = train_data.columns

In [5]:
numeric_cols = []
catagorical_cols = []
for i in cols:
    if train_data[i].dtypes in [float, int]:
        numeric_cols.append(i)
    else:
        catagorical_cols.append(i)

In [6]:
catagorical_maps = dict()
for i in catagorical_cols:
    catagorical_maps[i] = {}
    for num,val in enumerate(train_data[i].unique()):
        catagorical_maps[i][val] = num

In [7]:
for i in catagorical_cols:
    train_data[i] = train_data[i].map(catagorical_maps[i])
    test_data[i] = test_data[i].map(catagorical_maps[i])

In [8]:
x = train_data.drop(columns=['id', 'diagnosed_diabetes']).to_numpy()

mu, sigma = np.mean(x, axis=0), np.std(x, axis=0)+1e-20
x = (x-mu.T)/sigma.T

y = train_data['diagnosed_diabetes'].to_numpy().reshape(-1,1)

In [9]:
test_ids = test_data['id'].to_numpy(dtype=int).reshape(-1,1)
test_data = test_data.drop(columns=['id']).to_numpy()

# test_data = np.column_stack([np.ones((test_data.shape[0],1)), test_data])

test_data = (test_data-mu.T)/sigma.T


In [10]:
# weights = np.random.randn(1,x.shape[1])
# learning_rate = 0.1
# x = np.column_stack([np.ones((x.shape[0],1)), x])


# def hypothesis(weights, dataset):
#     return sigmoid(np.dot(dataset, weights.T))

# for i in range(1500):
#     y_hat = hypothesis(weights, x)
#     error = y_hat - y
    
#     # Correct Gradient: (1/m) * (Prediction - y) * x
#     # We transpose error to match shapes for dot product
#     gradient = np.dot(error.T, x) / x.shape[0]
    
#     weights = weights - learning_rate * gradient
    
#     if i % 100 == 0:
#         print(f"Iteration {i6}, Loss: {log_loss(y_hat, y)}")

In [11]:
x.shape, y.shape, test_data.shape

((700000, 24), (700000, 1), (300000, 24))

In [None]:
import torch

# Detect Apple Metal (MPS) device for Mac GPUs; fallback to CPU
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print('Using device:', device)


Using device: mps


In [18]:
pd.DataFrame(result, columns = ['id', 'diagnosed_diabetes']).to_csv()

',id,diagnosed_diabetes\n0,700000,0.595\n1,700001,0.58\n2,700002,0.69\n3,700003,0.57\n4,700004,0.66\n5,700005,0.555\n6,700006,0.7\n7,700007,0.855\n8,700008,0.635\n9,700009,0.71\n10,700010,0.655\n11,700011,0.55\n12,700012,0.425\n13,700013,0.6\n14,700014,0.78\n15,700015,0.63\n16,700016,0.59\n17,700017,0.89\n18,700018,0.755\n19,700019,0.88\n20,700020,0.72\n21,700021,0.9\n22,700022,0.425\n23,700023,0.505\n24,700024,0.655\n25,700025,0.705\n26,700026,0.73\n27,700027,0.805\n28,700028,0.535\n29,700029,0.535\n30,700030,0.685\n31,700031,0.73\n32,700032,0.615\n33,700033,0.69\n34,700034,0.63\n35,700035,0.595\n36,700036,0.92\n37,700037,0.545\n38,700038,0.925\n39,700039,0.555\n40,700040,0.645\n41,700041,0.47\n42,700042,0.595\n43,700043,0.595\n44,700044,0.69\n45,700045,0.67\n46,700046,0.6\n47,700047,0.825\n48,700048,0.82\n49,700049,0.645\n50,700050,0.7\n51,700051,0.625\n52,700052,0.565\n53,700053,0.735\n54,700054,0.65\n55,700055,0.44\n56,700056,0.9\n57,700057,0.465\n58,700058,0.565\n59,700059,0.565\n