In [1]:
# import important packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from matplotlib import rcParams
import warnings

warnings.filterwarnings("ignore")

# figure size in inches
rcParams["figure.figsize"] = 10, 6
np.random.seed(42)

# Aim:
The aim of this attempt is to predict if the female patients has 0/1 outcomes, which means nondiabetic/diabetic based on their health status, by building a classification model using Decision Tree. This case is entirely based on classification model as the target variable is nondiabetic(0) or diabetic(1). 

In [2]:
df = pd.read_csv('pima_indians_diabetes.csv')
print(df.isnull().sum())
print("total number of Rows and Columns in the dataframe: " , df.shape)
unique_values = pd.Series({col: df[col].unique() for col in df})
unique_values.tolist()

time_pregnant_no              0
plasma_concentration          0
diastolic_blood_pressure      0
triceps_skinfold_thickness    0
serum_insulin                 0
bmi                           0
diabetes_pedigree             0
age                           0
class                         0
dtype: int64
total number of Rows and Columns in the dataframe:  (768, 9)


[array([ 6,  1,  8,  0,  5,  3, 10,  2,  4,  7,  9, 11, 13, 15, 17, 12, 14],
       dtype=int64),
 array([148,  85, 183,  89, 137, 116,  78, 115, 197, 125, 110, 168, 139,
        189, 166, 100, 118, 107, 103, 126,  99, 196, 119, 143, 147,  97,
        145, 117, 109, 158,  88,  92, 122, 138, 102,  90, 111, 180, 133,
        106, 171, 159, 146,  71, 105, 101, 176, 150,  73, 187,  84,  44,
        141, 114,  95, 129,  79,   0,  62, 131, 112, 113,  74,  83, 136,
         80, 123,  81, 134, 142, 144,  93, 163, 151,  96, 155,  76, 160,
        124, 162, 132, 120, 173, 170, 128, 108, 154,  57, 156, 153, 188,
        152, 104,  87,  75, 179, 130, 194, 181, 135, 184, 140, 177, 164,
         91, 165,  86, 193, 191, 161, 167,  77, 182, 157, 178,  61,  98,
        127,  82,  72, 172,  94, 175, 195,  68, 186, 198, 121,  67, 174,
        199,  56, 169, 149,  65, 190], dtype=int64),
 array([ 72,  66,  64,  40,  74,  50,   0,  70,  96,  92,  80,  60,  84,
         30,  88,  90,  94,  76,  82,  75,  58

In [3]:
df

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# PLEASE NOTE: According to science, If there is over insulin in the body, there is a high chance of diabetes in patients. So depending upon some of the data shown below, it is clealy seen that that even though the insulin level is 0, the outcome of the patient is diabetic!!! THAT MEANS THE VALUE, 0, in the data means null values. 
Also, some of the rows in the columns, [bmi, diastolic_blood_pressure, triceps_skinfold_thickness ], show 0 values and that is impossible!!!!

ref: https://medlineplus.gov/lab-tests/insulin-in-blood/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8306487/

In [4]:
print(((df['serum_insulin'] == 0) & (df['class'] == 1)).sum(), ": the total number of O values in the column, Serum Insulin for diabetic patients.")
print("=="*23)
print(((df['bmi'] == 0)).sum(), ": the total number of O values in the column, BMI")
print("=="*23)
print(((df['diastolic_blood_pressure'] == 0)).sum(), ": the total number of O values in the column, Blood pressure")
print("=="*23)
print(((df['triceps_skinfold_thickness'] == 0)).sum(), ": the total number of O values in the column, Skin Thickness")

138 : the total number of O values in the column, Serum Insulin for diabetic patients.
11 : the total number of O values in the column, BMI
35 : the total number of O values in the column, Blood pressure
227 : the total number of O values in the column, Skin Thickness


In [5]:
# just to see the mean and the shape of the serum insulin value for the diabetic patient whose serum insulin is not equal to 0
c= df.loc[((df['serum_insulin'] != 0) & (df['class'] == 1))]
print("Mean: ", c.serum_insulin.mean())
print("Number or rows and columns:", c.shape)

Mean:  206.84615384615384
Number or rows and columns: (130, 9)


In [6]:
df.dtypes

time_pregnant_no                int64
plasma_concentration            int64
diastolic_blood_pressure        int64
triceps_skinfold_thickness      int64
serum_insulin                   int64
bmi                           float64
diabetes_pedigree             float64
age                             int64
class                           int64
dtype: object

In [7]:
df[['bmi','serum_insulin','diastolic_blood_pressure','triceps_skinfold_thickness']] = df[['bmi','serum_insulin','diastolic_blood_pressure','triceps_skinfold_thickness']].replace(0, np.nan)
df.isnull().sum()

time_pregnant_no                0
plasma_concentration            0
diastolic_blood_pressure       35
triceps_skinfold_thickness    227
serum_insulin                 374
bmi                            11
diabetes_pedigree               0
age                             0
class                           0
dtype: int64

In [8]:
df.corr()

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
time_pregnant_no,1.0,0.129459,0.214178,0.100239,0.082171,0.021719,-0.033523,0.544341,0.221898
plasma_concentration,0.129459,1.0,0.221436,0.211354,0.58001,0.21929,0.137337,0.263514,0.466581
diastolic_blood_pressure,0.214178,0.221436,1.0,0.226839,0.098272,0.28923,-0.002805,0.330107,0.170589
triceps_skinfold_thickness,0.100239,0.211354,0.226839,1.0,0.184888,0.648214,0.115016,0.166816,0.259491
serum_insulin,0.082171,0.58001,0.098272,0.184888,1.0,0.22805,0.130395,0.220261,0.303454
bmi,0.021719,0.21929,0.28923,0.648214,0.22805,1.0,0.155382,0.025841,0.31368
diabetes_pedigree,-0.033523,0.137337,-0.002805,0.115016,0.130395,0.155382,1.0,0.033561,0.173844
age,0.544341,0.263514,0.330107,0.166816,0.220261,0.025841,0.033561,1.0,0.238356
class,0.221898,0.466581,0.170589,0.259491,0.303454,0.31368,0.173844,0.238356,1.0


In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# Setting the random_state argument for reproducibility
imputer = IterativeImputer(random_state=42)
imputed = imputer.fit_transform(df)
df = pd.DataFrame(imputed, columns=df.columns)
df


# ref: https://towardsdatascience.com/iterative-imputation-with-scikit-learn-8f3eb22b1a38

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6.0,148.0,72.0,35.000000,217.942496,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.000000,72.670072,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,21.332373,265.993100,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.000000,94.000000,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.000000,168.000000,43.1,2.288,33.0,1.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.000000,180.000000,32.9,0.171,63.0,0.0
764,2.0,122.0,70.0,27.000000,159.132431,36.8,0.340,27.0,0.0
765,5.0,121.0,72.0,23.000000,112.000000,26.2,0.245,30.0,0.0
766,1.0,126.0,60.0,27.954931,173.265905,30.1,0.349,47.0,1.0


In [10]:
from sklearn.compose import ColumnTransformer,make_column_transformer
#%pip install -U scikit-learn
#categorical_columns=[]

numeric_columns=['bmi','serum_insulin','diastolic_blood_pressure','age', 'time_pregnant_no', 'diabetes_pedigree' , 'triceps_skinfold_thickness']     
                 

X = df.drop(columns=["class"])
y = df["class"]

ct = ColumnTransformer(
    [
        ('scaling', StandardScaler(), numeric_columns),
    ]
)

ct = make_column_transformer(
    (StandardScaler(), numeric_columns),
)

transformed = ct.fit_transform(X)



from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test=train_test_split(X,y)
RF = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

print(RF.score(X_train, y_train), 
     "Regression scores for the X-train variables, such as: 'bmi','serum_insulin','diastolic_blood_pressure','age', 'time_pregnant_no', 'diabetes_pedigree', 'triceps_skinfold_thickness' wrt y-variable, diabetis'")

# watched column transformer video for reference: https://youtu.be/to2mukSyvLk?si=QpwbIdoKrznloz-y
# https://towardsdatascience.com/iterative-imputation-with-scikit-learn-8f3eb22b1a38
# https://github.com/npradaschnor/Pima-Indians-Diabetes-Dataset/blob/master/Pima%20Indians%20Diabetes%20Dataset.ipynb

print(y.shape, 'y shape')
print(X.shape, 'X shape')

1.0 Regression scores for the X-train variables, such as: 'bmi','serum_insulin','diastolic_blood_pressure','age', 'time_pregnant_no', 'diabetes_pedigree', 'triceps_skinfold_thickness' wrt y-variable, diabetis'
(768,) y shape
(768, 8) X shape


In [11]:
score_test =RF.score(X_test, y_test)
print("Testing score: ",score_test)

Testing score:  0.71875


# ~~~~~~~~~~~~~~~~~~~~Another way:~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [12]:
# Import the necessary modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Create the pipeline with two steps: preprocessor and classifier
pipeline = Pipeline([
    ('preprocessor', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)
print("Training Score:", pipeline.score(X_train, y_train))
print("Testing Score:", pipeline.score(X_test, y_test))

Training Score: 1.0
Testing Score: 0.7395833333333334
