In [4]:
import pandas as pd
import numpy as np
# Load dataset
df = pd.read_csv("/content/medical-appointments-no-show-V02.csv")
# Check the first few rows
print(df.head())


              specialty appointment_time gender appointment_date no_show  \
0         physiotherapy            14:40      M         8/7/2019      no   
1  occupational therapy            14:20      M         6/6/2019      no   
2         physiotherapy            14:40      M       24/06/2019     yes   
3         physiotherapy            17:20      M       20/05/2019      no   
4         physiotherapy            14:40      M       15/07/2019     yes   

  no_show_reason disability date_of_birth entry_service_date    city  ...  \
0            NaN                      NaN           8/5/2019  ITAJAÍ  ...   
1            NaN                      NaN           8/5/2019  ITAJAÍ  ...   
2            NaN                      NaN           8/5/2019  ITAJAÍ  ...   
3            NaN                      NaN           8/5/2019  ITAJAÍ  ...   
4            NaN                      NaN           8/5/2019  ITAJAÍ  ...   

  over_60_years_old patient_needs_companion  average_temp_day  \
0              

In [None]:
# Check missing values per column
missing_values = df.isnull().sum()

print("Missing values per column:")
print(missing_values)

# Check total missing values
print("\nTotal missing values:", df.isnull().sum().sum())


Missing values per column:
specialty                   7454
appointment_time               0
gender                         0
appointment_date               0
no_show                        0
no_show_reason             47856
disability                  5137
date_of_birth              10321
entry_service_date          5155
city                        5181
icd                        38876
appointment_month              0
appointment_year               0
appointment_shift              0
age                        10350
under_12_years_old             0
over_60_years_old              0
patient_needs_companion        0
average_temp_day            1016
average_rain_day            1016
max_temp_day                1016
max_rain_day                1016
rainy_day_before               0
storm_day_before               0
rain_intensity                 0
heat_intensity                 0
dtype: int64

Total missing values: 134394


In [None]:
# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

print("Numerical columns:\n", numerical_cols)
print("\nCategorical columns:\n", categorical_cols)


Numerical columns:
 Index(['appointment_year', 'age', 'under_12_years_old', 'over_60_years_old',
       'patient_needs_companion', 'average_temp_day', 'average_rain_day',
       'max_temp_day', 'max_rain_day', 'rainy_day_before', 'storm_day_before'],
      dtype='object')

Categorical columns:
 Index(['specialty', 'appointment_time', 'gender', 'appointment_date',
       'no_show', 'no_show_reason', 'disability', 'date_of_birth',
       'entry_service_date', 'city', 'icd', 'appointment_month',
       'appointment_shift', 'rain_intensity', 'heat_intensity'],
      dtype='object')


In [None]:
# Detect outliers using IQR
outlier_summary = {}
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outlier_summary[col] = outliers.shape[0]
# Display outlier counts
outlier_df = pd.DataFrame.from_dict(outlier_summary, orient='index', columns=['Outlier Count'])
print(outlier_df)


                         Outlier Count
appointment_year                     0
age                               6721
under_12_years_old                   0
over_60_years_old                 3560
patient_needs_companion              0
average_temp_day                   318
average_rain_day                  7190
max_temp_day                       241
max_rain_day                      6114
rainy_day_before                   942
storm_day_before                   942


In [None]:
# Unique values in categorical features
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print("Unique values:", df[col].nunique())
    print(df[col].value_counts().head())



Column: specialty
Unique values: 8
specialty
speech therapy          11877
psychotherapy           11675
physiotherapy            9074
occupational therapy     6486
pedagogo                 2064
Name: count, dtype: int64

Column: appointment_time
Unique values: 74
appointment_time
14:40    4130
09:10    2981
13:50    2568
15:30    2387
08:20    2293
Name: count, dtype: int64

Column: gender
Unique values: 3
gender
M    37583
F    12003
I        7
Name: count, dtype: int64

Column: appointment_date
Unique values: 1001
appointment_date
17/04/2017    267
15/03/2017    190
29/05/2017    164
12/04/2017    159
30/10/2018    147
Name: count, dtype: int64

Column: no_show
Unique values: 2
no_show
no     44761
yes     4832
Name: count, dtype: int64

Column: no_show_reason
Unique values: 341
no_show_reason
doente                               197
sem transporte                       191
gripe                                 78
desmarcado                            78
cancelado atendimento sexta

In [None]:
#Drop unecessary columns
columns_to_drop = ['appointment_date', 'date_of_birth', 'entry_service_date', 'no_show_reason', 'icd','appointment_time','max_temp_day', 'max_rain_day','under_12_years_old','over_60_years_old']
df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)

In [None]:
# Separate target column
target = 'no_show'

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target from features lists
if target in numerical_cols:
    numerical_cols.remove(target)

if target in categorical_cols:
    categorical_cols.remove(target)

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

Numerical columns: ['appointment_year', 'age', 'patient_needs_companion', 'average_temp_day', 'average_rain_day', 'rainy_day_before', 'storm_day_before']
Categorical columns: ['specialty', 'gender', 'disability', 'city', 'appointment_month', 'appointment_shift', 'rain_intensity', 'heat_intensity']


In [None]:
# Label encoding for ordinal categories
from sklearn.preprocessing import OrdinalEncoder
ordinal_mappings = {
    'appointment_month': ['jan','feb','mar','april','may','june','july','aug','sept','oct','nov','dec'],
    'appointment_shift': ['morning','afternoon','evening'],
    'rain_intensity': ['no_rain', 'weak', 'moderate', 'heavy'],
    'heat_intensity': ['heavy_cold', 'cold', 'mild', 'warm', 'heavy_warm']
}
for col, order in ordinal_mappings.items():
    encoder = OrdinalEncoder(categories=[order])
    df[[col]] = encoder.fit_transform(df[[col]])

print(df[['appointment_shift', 'rain_intensity', 'heat_intensity', 'appointment_month']])


       appointment_shift  rain_intensity  heat_intensity  appointment_month
0                    1.0             0.0             2.0                8.0
1                    1.0             0.0             2.0                8.0
2                    1.0             0.0             2.0                8.0
3                    1.0             0.0             2.0                8.0
4                    1.0             0.0             2.0                8.0
...                  ...             ...             ...                ...
49588                0.0             0.0             0.0                1.0
49589                0.0             0.0             0.0                1.0
49590                0.0             0.0             0.0                1.0
49591                0.0             0.0             0.0                1.0
49592                0.0             0.0             0.0                1.0

[49593 rows x 4 columns]


In [None]:
# One hot encoding
import pandas as pd
df.columns = df.columns.str.strip()
one_hot_cols = ['specialty', 'gender', 'disability', 'city']
df_encoded = pd.get_dummies(df[one_hot_cols], drop_first=True)
df_encoded = df_encoded.astype(int)
print(df_encoded.head())
print("\nShape of the encoded DataFrame:", df_encoded.shape)


   specialty_enf  specialty_occupational therapy  specialty_pedagogo  \
0              0                               0                   0   
1              0                               0                   0   
2              0                               0                   0   
3              0                               0                   0   
4              0                               0                   0   

   specialty_physiotherapy  specialty_psychotherapy  \
0                        1                        0   
1                        0                        1   
2                        0                        0   
3                        1                        0   
4                        1                        0   

   specialty_sem especialidade  specialty_speech therapy  gender_I  gender_M  \
0                            0                         0         0         1   
1                            0                         0         0         1

In [None]:
# Feature Scaling using the standard scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Check result
print(df[numerical_cols].head())


   appointment_year       age  patient_needs_companion  average_temp_day  \
0          1.580315       NaN                -1.092744          0.292144   
1          1.580315       NaN                -1.092744          0.292144   
2          1.580315       NaN                -1.092744          0.292144   
3          1.580315       NaN                -1.092744          0.292144   
4          1.580315  2.572304                 0.915127          0.292144   

   average_rain_day  rainy_day_before  storm_day_before  
0         -0.398309          0.139149          0.139149  
1         -0.398309          0.139149          0.139149  
2         -0.398309          0.139149          0.139149  
3         -0.398309          0.139149          0.139149  
4         -0.398309          0.139149          0.139149  


In [None]:
from sklearn.impute import KNNImputer
num_cols = numerical_cols
# Initialize KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)
# Apply KNN imputer
df[num_cols] = knn_imputer.fit_transform(df[num_cols])
# Check result
print(df[num_cols].head())


   appointment_year       age  patient_needs_companion  average_temp_day  \
0          1.580315  0.534327                -1.092744          0.292144   
1          1.580315  0.534327                -1.092744          0.292144   
2          1.580315  0.534327                -1.092744          0.292144   
3          1.580315  0.534327                -1.092744          0.292144   
4          1.580315  2.572304                 0.915127          0.292144   

   average_rain_day  rainy_day_before  storm_day_before  
0         -0.398309          0.139149          0.139149  
1         -0.398309          0.139149          0.139149  
2         -0.398309          0.139149          0.139149  
3         -0.398309          0.139149          0.139149  
4         -0.398309          0.139149          0.139149  


In [None]:
from sklearn.decomposition import PCA
import pandas as pd

# Apply PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(df_encoded)

# Convert to DataFrame
df_pca = pd.DataFrame(
    X_pca,
    columns=[f'PC{i+1}' for i in range(X_pca.shape[1])]
)

print(df_pca.head())
print("Shape after PCA:", df_pca.shape)


        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0 -0.449502 -0.495812 -0.101184  0.645145  0.155789  0.454559 -0.224321   
1  0.059394 -0.388111 -0.655716 -0.238112  0.561781 -0.230182 -0.220810   
2 -0.307243 -0.481816  0.750327 -0.749713 -0.178194  0.026860 -0.244858   
3 -0.707981 -0.622026 -0.097187  0.011430 -0.532160  0.651579 -0.242909   
4 -1.038323 -0.468121 -0.118756  0.568065  0.382202  0.266887 -0.106090   

        PC8       PC9      PC10  
0 -0.291701  0.080353 -0.396179  
1 -0.298928  0.140010 -0.348361  
2 -0.324448  0.178794 -0.361174  
3 -0.311796  0.118494 -0.397852  
4 -0.008191 -0.158917 -0.030193  
Shape after PCA: (49593, 10)


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
num_cols = numerical_cols
# Calculate Z-scores
z_scores = np.abs(zscore(df[num_cols], nan_policy='omit'))
# Identify rows with any Z-score > 3
z_outliers = (z_scores > 3).any(axis=1)
print("Number of outlier rows detected by z-score:", z_outliers.sum())
# Remove the outliers
df_z_clean = df[~z_outliers].copy()
print("Shape after removing Z-score outliers:", df_z_clean.shape)


Number of outlier rows detected by z-score: 2281
Shape after removing Z-score outliers: (47312, 16)


In [9]:
import pandas as pd
num_cols = numerical_cols
# IQR method
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
iqr_outliers = ((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
# Count outlier rows
print("Number of outlier rows detected by IQR:", iqr_outliers.sum())
# Remove the outliers
df_iqr_clean = df[~iqr_outliers]
print("Shape after removing IQR outliers:", df_iqr_clean.shape)


Number of outlier rows detected by IQR: 8963
Shape after removing IQR outliers: (28397, 26)


In [10]:
# Preprocessing Pipeline Representation
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

# Separate numerical and categorical columns
numerical_cols = ['appointment_year', 'age', 'patient_needs_companion',
                  'average_temp_day', 'average_rain_day', 'rainy_day_before', 'storm_day_before']
categorical_cols = ['specialty', 'gender', 'disability', 'city', 'appointment_month',
                    'appointment_shift', 'rain_intensity', 'heat_intensity']

# Numerical preprocessing: Impute missing values (KNN) + scale features
num_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

# Categorical preprocessing: One-hot encode
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Combine both pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

# Fit-transform the pipeline on features
X = df_iqr_clean.drop('no_show', axis=1)
y = df_iqr_clean['no_show']
X_processed = preprocessor.fit_transform(X)

print("Preprocessing completed.")
print("Shape after preprocessing:", X_processed.shape)


Preprocessing completed.
Shape after preprocessing: (28397, 49)


In [11]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit preprocessor on training data
X_train = preprocessor.fit_transform(X_train)

# Transform test data
X_test = preprocessor.transform(X_test)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (22717, 49)
Test shape: (5680, 49)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize model
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',   # IMPORTANT for imbalanced data
    random_state=42
)

# Train model
lr.fit(X_train, y_train)

# Predictions
y_pred_lr = lr.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.586443661971831

Classification Report:
               precision    recall  f1-score   support

          no       0.94      0.58      0.72      5188
         yes       0.12      0.62      0.21       492

    accuracy                           0.59      5680
   macro avg       0.53      0.60      0.46      5680
weighted avg       0.87      0.59      0.68      5680


Confusion Matrix:
 [[3024 2164]
 [ 185  307]]


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize Random Forest model
rf = RandomForestClassifier(
    n_estimators=200,          # number of trees
    max_depth=None,            # let trees grow fully
    random_state=42,
    class_weight='balanced',   # handle class imbalance
    n_jobs=-1                  # use all CPU cores
)


In [14]:
rf.fit(X_train, y_train)


In [15]:
y_pred_rf = rf.predict(X_test)


In [16]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.9068661971830986

Classification Report:
               precision    recall  f1-score   support

          no       0.93      0.97      0.95      5188
         yes       0.42      0.20      0.27       492

    accuracy                           0.91      5680
   macro avg       0.67      0.58      0.61      5680
weighted avg       0.88      0.91      0.89      5680


Confusion Matrix:
 [[5055  133]
 [ 396   96]]


In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Store results
model_comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_lr),
        accuracy_score(y_test, y_pred_rf)
    ],
    "Precision (Yes)": [
        precision_score(y_test, y_pred_lr, pos_label='yes'),
        precision_score(y_test, y_pred_rf, pos_label='yes')
    ],
    "Recall (Yes)": [
        recall_score(y_test, y_pred_lr, pos_label='yes'),
        recall_score(y_test, y_pred_rf, pos_label='yes')
    ],
    "F1-Score (Yes)": [
        f1_score(y_test, y_pred_lr, pos_label='yes'),
        f1_score(y_test, y_pred_rf, pos_label='yes')
    ]
})

print(model_comparison)


                 Model  Accuracy  Precision (Yes)  Recall (Yes)  \
0  Logistic Regression  0.586444         0.124241      0.623984   
1        Random Forest  0.906866         0.419214      0.195122   

   F1-Score (Yes)  
0        0.207222  
1        0.266297  
