In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [53]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [40]:
# Read the diabetes-prediction CSV from the Kaggle input directory and display it
dataset_path = '/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv'
df = pd.read_csv(dataset_path)
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [41]:
# Show the gender distribution, then keep only the 'Male' / 'Female' rows
# (the "Other" category has just 18 observations)
gender_counts = df['gender'].value_counts()
print(gender_counts)
keep_rows = df['gender'].isin(['Male', 'Female'])
df = df[keep_rows]
df.columns

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64


Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [42]:
# Separate the predictors from the label, then hold out 20% of the rows for testing
features = [
    'gender', 'age', 'hypertension', 'heart_disease',
    'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level',
]
target = ['diabetes']
X, Y = df[features], df[target]
splits = train_test_split(X, Y, test_size=0.2, random_state=42)
# Discard the shuffled row labels so every split is indexed 0..n-1
X_train, X_test, Y_train, Y_test = (part.reset_index(drop=True) for part in splits)

#### The smoking history column is collapsed into 4 categories (smoker, non_smoker, former_smoker, unknown) and then one-hot encoded so the model can consume it.

In [48]:
# Defining the encoding function
def encode_smoking_history(df, ohe=None, fit=True):
    """
    Collapse `smoking_history` into four categories and one-hot encode it.

    The raw labels are mapped to non_smoker / unknown / smoker / former_smoker,
    the reduced column is one-hot encoded, and the encoded columns (with the
    'smoking_history_' prefix stripped) replace the original column.
    The input DataFrame is NOT modified; a new DataFrame is returned.

    df: DataFrame to encode; must contain a 'smoking_history' column
        (in this notebook X_train and X_test are passed)
    ohe: encoder already fitted on the train set; required when fit=False so the
         test set is transformed with the train-set encoder (prevents data leakage)
    fit: if True, create and fit a new OneHotEncoder on df (any `ohe` passed in is
         ignored); if False, transform df using the provided encoder

    Returns: (df_encoded, ohe) - the encoded DataFrame and the encoder that was used.
    Raises: ValueError if fit=False and no encoder is provided.
    """
    if not fit and ohe is None:
        raise ValueError("A fitted encoder must be passed via `ohe` when fit=False")

    # Reduce categories on a new frame so the caller's DataFrame stays untouched
    reduced = df.assign(
        smoking_history=df['smoking_history'].replace({
            'never': 'non_smoker',
            'ever': 'non_smoker',
            'No Info': 'unknown',
            'current': 'smoker',
            'former': 'former_smoker',
            'not current': 'former_smoker'
        })
    )

    if fit:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
        ohe_df = ohe.fit_transform(reduced[['smoking_history']])
    else:
        ohe_df = ohe.transform(reduced[['smoking_history']])

    # Strip the 'smoking_history_' prefix so the new columns are named after the category
    ohe_df.columns = ohe_df.columns.str.replace('smoking_history_', '', regex=False)
    df_encoded = pd.concat([reduced.drop(columns=['smoking_history']), ohe_df], axis=1)

    return df_encoded, ohe
    
# Fit the encoder on the training split only, then reuse that same encoder on the test split
X_train_encoded, ohe = encode_smoking_history(X_train, fit=True)
X_test_encoded, _ = encode_smoking_history(
    X_test,
    ohe=ohe,
    fit=False,
)

In [50]:
# Encoding the gender column : 1 for Male and 0 for Female
gender_codes = {'Male': 1, 'Female': 0}
# The loop variable is deliberately not called `df`: a `for` target stays bound
# after the loop ends, which would silently replace the loaded dataset `df`.
for encoded_split in (X_train_encoded, X_test_encoded):
    # Skip frames whose gender column is already numeric so that re-running this
    # cell does not turn the 0/1 codes into NaN (map() yields NaN for unmapped values).
    if not pd.api.types.is_numeric_dtype(encoded_split['gender']):
        encoded_split['gender'] = encoded_split['gender'].map(gender_codes)

In [52]:
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
# Inspect the pre-scaling value ranges of the training split.
# The frame is referenced by name rather than through the generic `df`, whose
# meaning depends on which cells were executed last.
for col in numerical_cols:
    min_val = X_train_encoded[col].min()
    max_val = X_train_encoded[col].max()
    print(f"{col}: min = {min_val}, max = {max_val}, range = {max_val - min_val}")

age: min = 0.08, max = 80.0, range = 79.92
bmi: min = 10.14, max = 95.22, range = 85.08
HbA1c_level: min = 3.5, max = 9.0, range = 5.5
blood_glucose_level: min = 80, max = 300, range = 220


In [55]:
# Learn the min/max of each numerical column from the training split only,
# then apply that same scaling to both splits
Scaler = MinMaxScaler()
X_train_encoded[numerical_cols] = Scaler.fit_transform(X_train_encoded[numerical_cols])
X_test_encoded[numerical_cols] = Scaler.transform(X_test_encoded[numerical_cols])

# Sanity check: print the post-scaling min/max of every numerical column per split
for col in numerical_cols:
    train_col, test_col = X_train_encoded[col], X_test_encoded[col]
    print(f"{col} - Train: min={train_col.min():.2f}, max={train_col.max():.2f}")
    print(f"{col} - Test: min={test_col.min():.2f}, max={test_col.max():.2f}\n")

age - Train: min=0.00, max=1.00
age - Test: min=0.00, max=1.00

bmi - Train: min=0.00, max=1.00
bmi - Test: min=0.00, max=0.99

HbA1c_level - Train: min=0.00, max=1.00
HbA1c_level - Test: min=0.00, max=1.00

blood_glucose_level - Train: min=0.00, max=1.00
blood_glucose_level - Test: min=0.00, max=1.00



In [59]:
# Final names for the prepared datasets: keep copies of the pre-encoding frames
# under *_original, then point X_train / X_test at the encoded, scaled versions.
# NOTE(review): the *_original copies capture X_train / X_test as they are at this
# point; if an earlier step modified those frames in place, the copies include it.
# The copies must be taken before X_train / X_test are rebound below.
X_train_original, X_test_original = X_train.copy(), X_test.copy()
X_train, X_test = X_train_encoded.copy(), X_test_encoded.copy()