In [1]:
import pandas as pd 

df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
df.shape

(768, 9)

In [2]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [10]:
import numpy as np 

numerical_cols=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']

# 2. Calculate Z-scores for these columns
z_scores = np.abs((df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std())

# 3. Create a boolean mask to identify rows without outliers
# A row is kept if ALL of its z-scores are less than the threshold (e.g., 3)
no_outliers_mask = (z_scores < 3).all(axis=1)

# 4. Filter the original DataFrame
df_cleaned = df[no_outliers_mask]

print("Shape after removing outliers:", df_cleaned.shape)
print("\nCleaned DataFrame:")
df_cleaned.head()

Shape after removing outliers: (688, 9)

Cleaned DataFrame:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
5,5,116,74,0,0,25.6,0.201,30,0


In [12]:
X = df_cleaned.drop('Outcome',axis = 1)
y = df_cleaned.Outcome

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[ 0.65735547  0.92404008 -0.02811548 ...  0.21028523  0.60651604
   1.47921983]
 [-0.86848989 -1.17708222 -0.51576543 ... -0.84806325 -0.36422044
  -0.18326505]
 [ 1.26769361  2.09133025 -0.67831541 ... -1.34699896  0.76478829
  -0.09576584]
 ...
 [ 0.3521864   0.0235591  -0.02811548 ... -0.90854031 -0.73703952
  -0.27076425]
 [-0.86848989  0.19031483 -1.00341538 ... -0.31888901 -0.37125476
   1.21672222]
 [-0.86848989 -0.91027304 -0.19066546 ... -0.27353122 -0.49083824
  -0.88325868]]


In [13]:
from sklearn.model_selection import train_test_split as tts

In [20]:
X_train,X_test,y_train,y_test = tts(X_scaled,y,test_size=0.2,random_state=42)
len(X_train),len(y_test)

(550, 138)

In [22]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

model_params = {
    'svm': {
        'model' : SVC(gamma='auto'),
        'params' : {
            'C':[1,10,20,30,50],
            'kernel' : ['rbf','linear']
        }
    },
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params': {
            'n_estimators' : [1,5,10,20],
        }
    },
    'logistic_regression' : {
        'model' : LogisticRegression(),
        'params' : {
            'C': [1,5,10,20]
        }
    },
    'decision_tree':{
        'model' : DecisionTreeClassifier(),
        'params' : {
            
        }
        
    },
    'knn' : {
        'model':KNeighborsClassifier(),
        'params' : {
            'n_neighbors' : [1,5,7,9,10,15,20,25,30,40,50]
        }
    }
}

In [24]:
from sklearn.model_selection import GridSearchCV as gsc

scores = []

for model_name,mp in model_params.items():
    clf = gsc(mp['model'],mp['params'],cv = 5, return_train_score= False)
    clf.fit(X_train,y_train)
    scores.append({
        'model':model_name,
        'best_score' : clf.best_score_,
        'best_param' : clf.best_params_
    })

df1 = pd.DataFrame(scores,columns = ['model','best_score','best_param'])
df1

Unnamed: 0,model,best_score,best_param
0,svm,0.787273,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.767273,{'n_estimators': 20}
2,logistic_regression,0.792727,{'C': 1}
3,decision_tree,0.723636,{}
4,knn,0.761818,{'n_neighbors': 25}


In [25]:
scores = []

for model_name,mp in model_params.items():
    clf = gsc(mp['model'],mp['params'],cv = 5, return_train_score= False)
    clf.fit(X_test,y_test)
    scores.append({
        'model':model_name,
        'best_score' : clf.best_score_,
        'best_param' : clf.best_params_
    })

df1 = pd.DataFrame(scores,columns = ['model','best_score','best_param'])
df1

Unnamed: 0,model,best_score,best_param
0,svm,0.73836,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.753175,{'n_estimators': 5}
2,logistic_regression,0.731746,{'C': 1}
3,decision_tree,0.636508,{}
4,knn,0.789153,{'n_neighbors': 15}


In [33]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    estimator = LogisticRegression(C=1),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0
)
bag_model.fit(X_train,y_train)
bag_model.oob_score_

0.7890909090909091

In [34]:
bag_model.score(X_test,y_test)

0.7681159420289855

In [35]:
import joblib

# Assume 'lr_model' is your trained Logistic Regression model
filename = 'diabetes_model.joblib'

# Save the model
joblib.dump(bag_model, filename)

['diabetes_model.joblib']