In [37]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [38]:
dataset=pd.read_csv('dataset.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [39]:
dataset.head(5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [40]:
print(dataset['PatientId'].nunique())
print(dataset['Neighbourhood'].nunique())


62299
81


In [41]:
dataset['No-show']=dataset['No-show'].map({'Yes':1,'No':0})
dataset['Gender'] = dataset['Gender'].map({'M': 1, 'F': 0})  

dataset['ScheduledDay'] = pd.to_datetime(dataset['ScheduledDay'])
dataset['AppointmentDay'] = pd.to_datetime(dataset['AppointmentDay'])

dataset['AppointmentWeekday'] = dataset['AppointmentDay'].dt.dayofweek  
dataset['DaysWaited'] = (dataset['AppointmentDay'].dt.normalize() - dataset['ScheduledDay'].dt.normalize()).dt.days

dataset=pd.get_dummies(dataset,columns=['AppointmentWeekday','Neighbourhood'])

dataset=dataset.drop(['PatientId','AppointmentID','AppointmentDay','ScheduledDay'],axis=1)


In [42]:

labels=dataset['No-show']
dataset=dataset.drop('No-show',axis=1)
# scaler = StandardScaler()
# X = scaler.fit_transform(dataset)
train_features,test_features,train_labels,test_labels= train_test_split(dataset,labels,test_size=0.2,random_state=42,stratify=labels)



In [43]:
dataset.head(5)

Unnamed: 0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,DaysWaited,AppointmentWeekday_0,...,Neighbourhood_SANTOS REIS,Neighbourhood_SEGURANÇA DO LAR,Neighbourhood_SOLON BORGES,Neighbourhood_SÃO BENEDITO,Neighbourhood_SÃO CRISTÓVÃO,Neighbourhood_SÃO JOSÉ,Neighbourhood_SÃO PEDRO,Neighbourhood_TABUAZEIRO,Neighbourhood_UNIVERSITÁRIO,Neighbourhood_VILA RUBIM
0,0,62,0,1,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,False
1,1,56,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,False
2,0,62,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,False
3,0,8,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,False
4,0,56,0,1,1,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,False


In [44]:
from torch.utils.data import TensorDataset,DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

X_train_tensor = torch.tensor(train_features.values.astype(np.float32))
X_test_tensor = torch.tensor(test_features.values.astype(np.float32))

y_train_tensor = torch.tensor(train_labels.values.astype(np.float32)).view(-1, 1)
y_test_tensor = torch.tensor(test_labels.values.astype(np.float32)).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


class NN(nn.Module):
    def __init__(self,input_size):
        super(NN,self).__init__()
        self.fc1 = nn.Linear(input_size,64)
        self.fc2 = nn.Linear(64,32)
        self.out = nn.Linear(32,1)

    def forward(self,x):
        x = F.relu(self.fc1(x))
        x=F.relu(self.fc2(x))
        return self.out(x)        

In [45]:
# pos_weight = torch.tensor([num_zeros / num_ones], dtype=torch.float32)
# print(pos_weight)
# print(num_zeros)
# print(num_ones)

torch.manual_seed(42)
model = NN(input_size=train_features.shape[1])
criterion=nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=1)

num_zeros = (train_labels == 0).sum()
num_ones = (train_labels == 1).sum()

# Compute pos_weight

for epoch in range(10):  # or more
    model.train()
    for batch_X, batch_y in train_loader:
        batch_y = batch_y.view(-1, 1)
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")



Epoch 1, Loss: 0.4851
Epoch 2, Loss: 0.5962
Epoch 3, Loss: 0.6501
Epoch 4, Loss: 0.4067
Epoch 5, Loss: 0.5565
Epoch 6, Loss: 0.5573
Epoch 7, Loss: 0.5241
Epoch 8, Loss: 0.5650
Epoch 9, Loss: 0.4582
Epoch 10, Loss: 0.5561


In [46]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        output = model(batch_X)
        probs = torch.sigmoid(output)  # convert logits to probabilities
        preds = (probs > 0.5).int().view(-1)  # binary predictions (0 or 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.view(-1).cpu().numpy())  # flatten labels


# Calculate metrics
f1 = f1_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
accuracy = sum([p == t for p, t in zip(all_preds, all_labels)]) / len(all_labels)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


print(classification_report(all_labels, all_preds))
print(np.bincount(all_preds))  # how many 0s and 1s?




Accuracy:  0.7981
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     17642
         1.0       0.00      0.00      0.00      4464

    accuracy                           0.80     22106
   macro avg       0.40      0.50      0.44     22106
weighted avg       0.64      0.80      0.71     22106

[22106]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
