# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the diabetes dataset

In [2]:
df = pd.read_csv('/content/diabetes.csv')

# Step 1: Create multiple versions of the dataset

**Original: Keep raw data**

In [3]:
df_original = df.copy()

**Manual Cleaning: Remove biologically implausible values**

In [4]:
df_manual = df.copy()
df_manual = df_manual[df_manual['Pregnancies'] <= 13]  # Max realistic pregnancies
df_manual = df_manual[df_manual['Glucose'] != 0]       # Remove zero glucose
df_manual = df_manual[df_manual['BloodPressure'] >= 40]  # Min realistic blood pressure
df_manual = df_manual[(df_manual['SkinThickness'] != 0) & (df_manual['SkinThickness'] != 99)]  # Remove invalid skin thickness
df_manual = df_manual[(df_manual['Insulin'] != 0) & (df_manual['Insulin'] <= 500)]  # Remove zero or extreme insulin
df_manual = df_manual[(df_manual['BMI'] != 0) & (df_manual['BMI'] < 53.2)]  # Remove zero or extreme BMI
df_manual = df_manual[(df_manual['DiabetesPedigreeFunction'] >= 0.1) & (df_manual['DiabetesPedigreeFunction'] <= 2.0)]  # Realistic pedigree range
print(f"Manual Cleaning: {len(df_manual)} rows remaining (from {len(df)})")

Manual Cleaning: 367 rows remaining (from 768)


**Clipping: Cap outliers using IQR**

In [5]:
df_clip = df.copy()
numeric_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for col in numeric_cols:
    Q1 = df_clip[col].quantile(0.25)
    Q3 = df_clip[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df_clip[col] = df_clip[col].clip(lower=lower, upper=upper)
print(f"Clipping: {len(df_clip)} rows remaining (from {len(df)})")

Clipping: 768 rows remaining (from 768)


**Imputation: Replace zeros with median**

In [6]:
df_impute = df.copy()
cols_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols_to_impute:
    median_val = df_impute[col].median()
    df_impute[col] = df_impute[col].replace(0, median_val)
print(f"Imputation: {len(df_impute)} rows remaining (from {len(df)})")

Imputation: 768 rows remaining (from 768)


# Step 2: Correlation analysis to understand feature importance


In [7]:
datasets = {
    'Original': df_original,
    'Manual Cleaning': df_manual,
    'Clipping': df_clip,
    'Imputation': df_impute
}
for name, data in datasets.items():
    print(f"\nCorrelation with 'Outcome' for {name}:")
    print(data.corr()['Outcome'].sort_values(ascending=False))


Correlation with 'Outcome' for Original:
Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64

Correlation with 'Outcome' for Manual Cleaning:
Outcome                     1.000000
Glucose                     0.515473
Age                         0.349566
Insulin                     0.320706
Pregnancies                 0.265999
BMI                         0.248377
SkinThickness               0.223329
BloodPressure               0.217601
DiabetesPedigreeFunction    0.206793
Name: Outcome, dtype: float64

Correlation with 'Outcome' for Clipping:
Outcome                     1.000000
Glucose                     0.479158
BMI                         0.309739
Age                         0.

# Step 4: Function to evaluate Logistic Regression with tuning


In [8]:
def evaluate_logistic_regression(df, name):
    # Select top features based on correlation (e.g., top 5)
    corr = df.corr()['Outcome'].abs().sort_values(ascending=False)
    top_features = corr[1:6].index.tolist()  # Exclude 'Outcome', take top 5
    X = df[top_features]
    y = df['Outcome']

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)

    # Define parameter grid for tuning
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
        'class_weight': [None, 'balanced']  # Handle class imbalance
    }

    # Train Logistic Regression with GridSearchCV
    model = LogisticRegression(max_iter=1000)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model
    best_model = grid_search.best_estimator_
    print(f"\nBest Parameters for {name}: {grid_search.best_params_}")

    # Predict on test set
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Print results
    print(f"\n📊 Results for {name}:")
    print(f"Accuracy:  {accuracy*100:.2f}%")
    print(f"Precision: {precision*100:.2f}%")
    print(f"Recall:    {recall*100:.2f}%")
    print(f"F1 Score:  {f1*100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return {'Name': name, 'Accuracy': accuracy, 'F1 Score': f1}

# Step 5: Evaluate all datasets


In [9]:
results = []
for name, data in datasets.items():
    result = evaluate_logistic_regression(data, name)
    results.append(result)


Best Parameters for Original: {'C': 1, 'class_weight': None}

📊 Results for Original:
Accuracy:  80.52%
Precision: 79.87%
Recall:    80.52%
F1 Score:  79.79%
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       107
           1       0.73      0.57      0.64        47

    accuracy                           0.81       154
   macro avg       0.78      0.74      0.75       154
weighted avg       0.80      0.81      0.80       154


Best Parameters for Manual Cleaning: {'C': 0.1, 'class_weight': None}

📊 Results for Manual Cleaning:
Accuracy:  81.08%
Precision: 82.00%
Recall:    81.08%
F1 Score:  80.48%
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.93      0.85        43
           1       0.87      0.65      0.74        31

    accuracy                           0.81        74
   macro avg       0.83      0.79      0.80        74
weighted avg       0.

# Step 6: Find best dataset

In [10]:
results_df = pd.DataFrame(results)
best_result = results_df.loc[results_df['F1 Score'].idxmax()]
print("\n✅ Best Dataset based on F1 Score:")
print(best_result)


✅ Best Dataset based on F1 Score:
Name        Manual Cleaning
Accuracy           0.810811
F1 Score           0.804847
Name: 1, dtype: object
