In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw patient dataset from CSV; the trailing expression displays
# its (rows, columns) shape.
df=pd.read_csv('patient_dataset.csv')
df.shape

(6000, 16)

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['age', 'gender', 'chest_pain_type', 'blood_pressure', 'cholesterol',
       'max_heart_rate', 'exercise_angina', 'plasma_glucose', 'skin_thickness',
       'insulin', 'bmi', 'diabetes_pedigree', 'hypertension', 'heart_disease',
       'residence_type', 'smoking_status'],
      dtype='object')

In [4]:
# Print per-column dtype and non-null count to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                6000 non-null   int64  
 1   gender             5528 non-null   float64
 2   chest_pain_type    6000 non-null   int64  
 3   blood_pressure     6000 non-null   int64  
 4   cholesterol        6000 non-null   int64  
 5   max_heart_rate     6000 non-null   int64  
 6   exercise_angina    6000 non-null   int64  
 7   plasma_glucose     5391 non-null   float64
 8   skin_thickness     5386 non-null   float64
 9   insulin            5432 non-null   float64
 10  bmi                6000 non-null   float64
 11  diabetes_pedigree  6000 non-null   float64
 12  hypertension       6000 non-null   int64  
 13  heart_disease      6000 non-null   int64  
 14  residence_type     5545 non-null   object 
 15  smoking_status     6000 non-null   object 
dtypes: float64(6), int64(8),

In [5]:
# Count missing values per column.
df.isnull().sum()

age                    0
gender               472
chest_pain_type        0
blood_pressure         0
cholesterol            0
max_heart_rate         0
exercise_angina        0
plasma_glucose       609
skin_thickness       614
insulin              568
bmi                    0
diabetes_pedigree      0
hypertension           0
heart_disease          0
residence_type       455
smoking_status         0
dtype: int64

In [6]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,age,gender,chest_pain_type,blood_pressure,cholesterol,max_heart_rate,exercise_angina,plasma_glucose,skin_thickness,insulin,bmi,diabetes_pedigree,hypertension,heart_disease,residence_type,smoking_status
0,24,1.0,4,250,139,212,0,108.0,33.0,109.0,37.999303,0.480277,1,1,Urban,Smoker
1,29,0.0,4,132,187,147,0,202.0,42.0,,25.588346,0.283986,1,1,Urban,Unknown
2,46,0.0,3,271,185,193,0,149.0,43.0,102.0,37.892029,2.472309,1,0,Rural,Non-Smoker
3,73,,2,102,200,125,0,105.0,77.0,165.0,18.660241,1.472052,0,1,Rural,Smoker
4,49,1.0,3,91,163,192,0,162.0,31.0,170.0,12.76798,0.537627,1,1,Rural,Smoker


In [7]:
# Distinct raw values in `gender` (missing entries show up as nan).
df['gender'].unique()    

array([ 1.,  0., nan])

In [8]:
# Distinct raw values in `residence_type` (missing entries show up as nan).
df['residence_type'].unique()

array(['Urban', 'Rural', nan], dtype=object)

In [9]:
# Distinct values in `heart_disease`.
df['heart_disease'].unique()

array([1, 0])

In [10]:
from sklearn.impute import SimpleImputer
import mlflow

In [11]:
# Preprocessing run: label gender, fill missing categoricals, median-impute
# numeric gaps, drop duplicate rows, and log every step/result to MLflow.
# Side effects: mutates/rebinds `df`, writes "patient_data_cleaned.csv" (and
# "duplicate_rows.csv" when duplicates exist) and logs them as artifacts.
mlflow.set_experiment("Patient_Data_Preprocessing")
with mlflow.start_run(run_name="Preprocessing_Run"):
    # Map the numeric codes only while the column is still numeric. Re-running
    # this cell on already-labelled strings would otherwise map every value to
    # NaN and silently turn the whole column into "Unknown".
    if pd.api.types.is_numeric_dtype(df["gender"]):
        df["gender"] = df["gender"].map({0.0: "Female", 1.0: "Male"})
    df["gender"] = df["gender"].fillna("Unknown")
    mlflow.log_param("gender_cleaning", "0->Female, 1->Male, NaN->Unknown")

    # Missing residence types become an explicit "Unknown" category.
    df["residence_type"] = df["residence_type"].fillna("Unknown")
    mlflow.log_param("residence_type_imputation", "Unknown")

    # Median-impute the numeric columns that contain missing values.
    num_cols = ["plasma_glucose", "skin_thickness", "insulin"]
    imputer = SimpleImputer(strategy="median")
    df[num_cols] = imputer.fit_transform(df[num_cols])
    mlflow.log_param("numerical_imputation", "SimpleImputer(median)")
    mlflow.log_param("imputed_columns", ",".join(num_cols))

    # Record the fitted medians so the imputation can be audited later.
    medians = imputer.statistics_
    for col, val in zip(num_cols, medians):
        mlflow.log_metric(f"median_{col}", val)

    # Dataset size and remaining nulls, measured before duplicates are dropped.
    mlflow.log_metric("total_rows", df.shape[0])
    mlflow.log_metric("total_columns", df.shape[1])
    mlflow.log_metric("null_values_after_cleaning", df.isnull().sum().sum())

    # Rows that repeat an earlier row exactly (first occurrence is not flagged).
    duplicate_rows = df[df.duplicated()]
    num_duplicates = duplicate_rows.shape[0]
    mlflow.log_metric("duplicate_rows", num_duplicates)

    # Keep a copy of the removed rows as an artifact, then drop them.
    if num_duplicates > 0:
        duplicates_path = "duplicate_rows.csv"
        duplicate_rows.to_csv(duplicates_path, index=False)
        mlflow.log_artifact(duplicates_path)
        df = df.drop_duplicates()

    # Persist the cleaned frame; later cells reload it from this file.
    cleaned_path = "patient_data_cleaned.csv"
    df.to_csv(cleaned_path, index=False)
    mlflow.log_artifact(cleaned_path)

print("Preprocessing run logged to MLflow")


Preprocessing run logged to MLflow


In [12]:
# Analysis run on the cleaned data: log per-feature skewness, add log1p
# versions of the highly skewed features, log the target class balance, and
# save the augmented frame as an MLflow artifact.
df = pd.read_csv("patient_data_cleaned.csv")
mlflow.set_experiment("Patient_Data_Analysis")
with mlflow.start_run(run_name="Skew_and_Target_Balance"):
    numeric_features = [
        "age",
        "blood_pressure",
        "cholesterol",
        "max_heart_rate",
        "plasma_glucose",
        "skin_thickness",
        "insulin",
        "bmi",
    ]

    # One skewness metric per numeric feature.
    skew_values = df[numeric_features].skew()
    for feature, skewness in skew_values.items():
        mlflow.log_metric(f"skew_{feature}", skewness)

    # A feature counts as highly skewed when |skew| exceeds 1.
    is_highly_skewed = abs(skew_values) > 1
    highly_skewed = list(skew_values[is_highly_skewed].index)
    mlflow.log_param("highly_skewed_columns", ",".join(highly_skewed))

    # Add a log1p-transformed copy of each such feature and log its skewness.
    for feature in highly_skewed:
        transformed_name = f"{feature}_log1p"
        df[transformed_name] = np.log1p(df[feature])
        mlflow.log_metric(f"skew_{transformed_name}", df[transformed_name].skew())

    # Class balance of the target, as absolute counts and as percentages.
    target = "heart_disease"
    target_counts = df[target].value_counts()
    target_percent = 100 * df[target].value_counts(normalize=True)
    for label, n_rows in target_counts.items():
        mlflow.log_metric(f"target_count_{label}", n_rows)
    for label, share in target_percent.items():
        mlflow.log_metric(f"target_percent_{label}", share)

    # Save the frame including the new *_log1p columns.
    trans_csv = "patient_skew_transformed.csv"
    df.to_csv(trans_csv, index=False)
    mlflow.log_artifact(trans_csv)


print("Skewness and target balance run logged to MLflow")

Skewness and target balance run logged to MLflow


In [13]:
# EDA run on the raw dataset: per-feature histograms, a target pie chart and
# per-feature boxplots. Each figure is saved as a PNG in the working directory
# and logged to MLflow as an artifact.
df = pd.read_csv("patient_dataset.csv")

mlflow.set_experiment("Patient_Dataset_Visualization")

with mlflow.start_run(run_name="EDA"):

    numeric_cols = ["age", "blood_pressure", "cholesterol", "max_heart_rate",
                    "plasma_glucose", "skin_thickness", "insulin", "bmi"]

    # Features with |skew| > 1 get flagged on their histogram's x-axis label.
    skew_values = df[numeric_cols].skew()
    highly_skewed = skew_values[abs(skew_values) > 1].index.tolist()

    # Histogram with KDE overlay for every numeric feature.
    for col in numeric_cols:
        plt.figure(figsize=(6,4))
        sns.histplot(df[col], kde=True, color="skyblue")
        plt.title(f"Distribution of {col}")
        if col in highly_skewed:
            plt.xlabel(f"{col} (Highly Skewed)")
        plt.ylabel("Count")
        hist_path = f"hist_{col}.png"
        plt.savefig(hist_path)
        plt.close()  # release the figure so the loop does not accumulate them
        mlflow.log_artifact(hist_path)

    # value_counts() orders by frequency, not by class value, so the wedges
    # could be paired with the wrong label/colour whenever class 1 is the
    # majority. Pin the order to [0, 1] to match the labels below.
    # NOTE(review): assumes 0 = no disease and 1 = disease - confirm against
    # the dataset's documentation.
    counts = df["heart_disease"].value_counts().reindex([0, 1], fill_value=0)
    plt.figure(figsize=(5,5))
    plt.pie(counts, labels=["No Disease", "Disease"], autopct='%1.1f%%',
            colors=["skyblue", "salmon"], startangle=90)
    plt.title("Heart Disease Distribution")
    pie_path = "heart_disease_pie.png"
    plt.savefig(pie_path)
    plt.close()
    mlflow.log_artifact(pie_path)

    # Boxplot per numeric feature, annotating values outside the 1.5*IQR fences.
    for col in numeric_cols:
        plt.figure(figsize=(6,4))
        sns.boxplot(x=df[col], color="lightgreen")
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)][col]
        # Write each outlier's value in red, rotated, at its x position.
        for val in outliers:
            plt.text(val, 0.02, f'{val:.1f}', rotation=90, verticalalignment='bottom', fontsize=8, color='red')
        plt.title(f"Boxplot of {col} (Outliers in red)")
        box_path = f"box_{col}.png"
        plt.savefig(box_path)
        plt.close()
        mlflow.log_artifact(box_path)

print("EDA visualizations logged to MLflow")


EDA visualizations logged to MLflow


In [16]:
# Reload the cleaned CSV and check the distinct `residence_type` values
# it contains.
df = pd.read_csv("patient_data_cleaned.csv")
df['residence_type'].unique()

array(['Urban', 'Rural', 'Unknown'], dtype=object)

In [17]:
# Distinct values in `smoking_status` of the reloaded cleaned data.
df['smoking_status'].unique()

array(['Smoker', 'Unknown', 'Non-Smoker'], dtype=object)

In [18]:
# Distinct values in `gender` of the reloaded cleaned data.
df['gender'].unique()

array(['Male', 'Female', 'Unknown'], dtype=object)