In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

In [2]:

def load_and_explore_data(filepath):
    """Load a CSV file and print a quick overview of its contents.

    Prints the frame's shape, its first rows and the ``DataFrame.info()``
    summary to stdout.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.
    """
    df = pd.read_csv(filepath)
    print("Dataset Shape:", df.shape)
    print("\nSample Data:")
    print(df.head())
    print("\nData Info:")
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    return df

In [3]:
# Data Preprocessing
def preprocess_data(df):
    """Impute missing measurements and derive the model features.

    Steps, in order:

    1. Zeros in ``Glucose``, ``BloodPressure``, ``SkinThickness``,
       ``Insulin`` and ``BMI`` are treated as missing and replaced by NaN.
    2. Missing values are imputed: column mean for ``Glucose`` and
       ``BloodPressure``, column median for the other three.
    3. ``Insulin`` and ``DiabetesPedigreeFunction`` are ``log1p``-transformed.
    4. ``BMI`` and ``Age`` are binned into categories which are then
       one-hot encoded (the original ``BMI`` and ``Age`` columns are kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named above. It is not
        modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed/transformed columns plus the
        ``BMI_Category_*`` and ``Age_Group_*`` indicator columns.

    Notes
    -----
    The imputation statistics are computed over every row of ``df``. If this
    function is called before a train/test split, those statistics include
    the rows that later end up in the test set.
    """
    # Work on a copy so the caller's frame keeps its raw values and the
    # function can be re-run on the same input with the same result.
    df = df.copy()

    # Replace 0s with NaN for specific columns
    cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    df[cols] = df[cols].replace(0, np.nan)

    # Handle missing values.
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form operates on a temporary Series and does not update
    # the parent frame when pandas Copy-on-Write is active.
    for col in ('Glucose', 'BloodPressure'):
        df[col] = df[col].fillna(df[col].mean())
    for col in ('SkinThickness', 'Insulin', 'BMI'):
        df[col] = df[col].fillna(df[col].median())

    # Feature engineering
    # Log transform highly skewed features
    df['Insulin'] = np.log1p(df['Insulin'])
    df['DiabetesPedigreeFunction'] = np.log1p(df['DiabetesPedigreeFunction'])

    # Create BMI categories
    df['BMI_Category'] = pd.cut(df['BMI'],
                               bins=[0, 18.5, 24.9, 29.9, 100],
                               labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

    # Create age groups.
    # NOTE: pd.cut intervals are right-closed by default, so the first bin is
    # (20, 30]; an Age of exactly 20 gets no group and therefore all-zero
    # Age_Group_* indicators.
    df['Age_Group'] = pd.cut(df['Age'],
                            bins=[20, 30, 40, 50, 60, 100],
                            labels=['20-30', '31-40', '41-50', '51-60', '60+'])

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=['BMI_Category', 'Age_Group'])

    return df



In [4]:
# Split and Balance Data
def prepare_data(df, test_size=0.2, random_state=42):
    """Produce a scaled, class-balanced training set and a scaled test set.

    The ``Outcome`` column is used as the target and every other column as a
    feature. The data is split first; the ``MinMaxScaler`` is fitted on the
    training features only and then applied to the test features; finally
    SMOTE oversampling is applied to the scaled training portion only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Outcome`` column plus the feature columns.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and SMOTE.

    Returns
    -------
    tuple
        ``(X_train_balanced, X_test_scaled, y_train_balanced, y_test, scaler)``
        where ``scaler`` is the fitted ``MinMaxScaler``.
    """
    target = df['Outcome']
    features = df.drop(columns='Outcome')

    # Hold out the test rows before any fitting happens.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training features, then reuse it for the test set.
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(features_train)
    test_scaled = scaler.transform(features_test)

    # Oversample the training portion only; the test set is left as-is.
    oversampler = SMOTE(random_state=random_state)
    balanced_features, balanced_target = oversampler.fit_resample(
        train_scaled, target_train
    )

    return balanced_features, test_scaled, balanced_target, target_test, scaler


In [5]:

# Model Training and Evaluation
def train_and_evaluate_models(X_train, X_test, y_train, y_test, random_state=42):
    """Tune each candidate model with GridSearchCV and score it on the test set.

    For every entry in the internal ``models`` table a 5-fold grid search
    (scored by ROC AUC) is fitted on the training data. The winning estimator
    is then evaluated on the test data; its classification report is printed
    and its ROC AUC and accuracy are collected.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the grid search.
    X_test, y_test
        Held-out features and labels used for the final metrics.
    random_state : int or None, default 42
        Seed for the estimators. Without a seed the random forest, and
        therefore the selected hyper-parameters, the saved model and the
        reported metrics, would differ from run to run.

    Returns
    -------
    tuple
        ``(results, best_models)``: ``results`` is a DataFrame with the
        columns ``Model``, ``Best Parameters``, ``ROC AUC Score`` and
        ``Accuracy``; ``best_models`` maps each model name to its best
        fitted estimator.
    """
    models = {
        'random_forest': {
            'model': RandomForestClassifier(random_state=random_state),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5],
                'class_weight': ['balanced']
            }
        }
    }

    best_models = {}
    results = []

    for name, model_info in models.items():
        print(f"\nTraining {name}...")
        grid_search = GridSearchCV(model_info['model'], model_info['params'],
                                 cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Save best model
        best_models[name] = grid_search.best_estimator_

        # Make predictions; column 1 of predict_proba is used as the
        # score for ROC AUC.
        y_pred = grid_search.predict(X_test)
        auc_score = roc_auc_score(y_test, grid_search.predict_proba(X_test)[:, 1])

        results.append({
            'Model': name,
            'Best Parameters': grid_search.best_params_,
            'ROC AUC Score': auc_score,
            'Accuracy': accuracy_score(y_test, y_pred)
        })

        print(f"\nClassification Report for {name}:")
        print(classification_report(y_test, y_pred))

    return pd.DataFrame(results), best_models


In [6]:
# Save models
def save_models(models, scaler, base_path='../saved_models/'):
    """Pickle every trained model and the scaler into ``base_path``.

    Each model is written to ``<base_path>/<name>.pkl`` and the scaler to
    ``<base_path>/scaler.pkl``. The directory (including missing parents) is
    created if needed.

    Parameters
    ----------
    models : dict
        Mapping of model name to a picklable fitted estimator.
    scaler : object
        Picklable fitted scaler to store next to the models.
    base_path : str, default '../saved_models/'
        Target directory, with or without a trailing path separator.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(base_path, exist_ok=True)

    # os.path.join keeps the files inside base_path even when the caller
    # omits the trailing separator (plain concatenation would produce
    # e.g. '../saved_modelsrandom_forest.pkl').
    for name, model in models.items():
        with open(os.path.join(base_path, f'{name}.pkl'), 'wb') as f:
            pickle.dump(model, f)

    with open(os.path.join(base_path, 'scaler.pkl'), 'wb') as f:
        pickle.dump(scaler, f)

In [7]:
# Prediction function
def predict_diabetes(model, scaler, data):
    """Score new samples with a fitted scaler and model.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    scaler : object
        Fitted transformer exposing ``transform``.
    data : array-like
        Samples to score; passed to ``scaler.transform`` unchanged.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the predicted labels and column 1 of
        ``predict_proba`` for each sample.
    """
    features = scaler.transform(data)

    labels = model.predict(features)
    class_probabilities = model.predict_proba(features)
    positive_probability = class_probabilities[:, 1]

    return labels, positive_probability



In [8]:

# End-to-end run: load -> preprocess -> split/scale/balance -> tune -> save -> report.

# Load the raw CSV (also prints shape, head and info), then impute and
# engineer features.
df = load_and_explore_data("../data/diabetes.csv")
df_processed = preprocess_data(df)

# Prepare data: X_train / y_train come back scaled and SMOTE-balanced,
# X_test is scaled only, and `scaler` is the fitted MinMaxScaler.
X_train, X_test, y_train, y_test, scaler = prepare_data(df_processed)

# Train and evaluate models: `results` is a metrics DataFrame (one row per
# model), `best_models` maps model name -> best estimator from the grid search.
results, best_models = train_and_evaluate_models(X_train, X_test, y_train, y_test)

# Save models together with the scaler (default target: ../saved_models/)
save_models(best_models, scaler)

# Print results, best ROC AUC first
print("\nModel Comparison:")
print(results.sort_values('ROC AUC Score', ascending=False))


Dataset Shape: (768, 9)

Sample Data:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-