In [2]:

import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Upload the dataset through the Colab file picker. files.upload() returns a
# dict with one entry per uploaded file, keyed by file name.
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as a bare StopIteration
    # from next(); fail with an actionable message instead.
    raise ValueError("No file was uploaded; please select the dataset CSV.")
filename = next(iter(uploaded))  # first uploaded filename


df = pd.read_csv(filename)
print("Dataset shape:", df.shape)
print(df.head())

# Fail early (same exception type as df.drop would raise) when the target
# column is absent, so the error names the actual problem.
if "Outcome" not in df.columns:
    raise KeyError(
        "Expected a target column named 'Outcome'; found columns: "
        + ", ".join(map(str, df.columns))
    )

# Define features (X) and target (y)
X = df.drop("Outcome", axis=1)  # features: every column except the target
y = df["Outcome"]               # target

# Split before preprocessing so the scaler never sees the test rows.
# stratify=y keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Fit the scaler on the training split only, then reuse it for the test split.
# NOTE(review): the scaler is fitted on the whole training split *before*
# GridSearchCV creates its folds, so each validation fold contributes to the
# scaling statistics. Wrapping scaler + estimator in a Pipeline inside
# GridSearchCV would remove that (small) leak.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression with GridSearchCV (5-fold CV, selected by accuracy)
param_grid_log = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],  # 'l1' is not supported by the 'lbfgs' solver
    'solver': ['lbfgs', 'liblinear']
}
grid_log = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_log, cv=5, scoring='accuracy')
grid_log.fit(X_train_scaled, y_train)
best_log = grid_log.best_estimator_  # refit on the full training split with the best params

# SVM with GridSearchCV (5-fold CV, selected by accuracy)
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}
grid_svm = GridSearchCV(SVC(probability=True), param_grid_svm, cv=5, scoring='accuracy')
grid_svm.fit(X_train_scaled, y_train)
best_svm = grid_svm.best_estimator_  # refit on the full training split with the best params

# Predictions on the held-out test split
y_pred_log = best_log.predict(X_test_scaled)
y_pred_svm = best_svm.predict(X_test_scaled)

# Evaluation on the held-out test split
print(" Logistic Regression")
print("Best Params:", grid_log.best_params_)
print(classification_report(y_test, y_pred_log))
print("Accuracy:", accuracy_score(y_test, y_pred_log))

print("SVM ")
print("Best Params:", grid_svm.best_params_)
print(classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))




# Phase 4: Launch Prediction Engine


def predict_patient(model, scaler, patient_data):
    """
    Predicts whether a patient is Diabetic or Non-Diabetic.

    model: trained classifier exposing predict() (Logistic Regression or SVM)
    scaler: fitted StandardScaler (any object exposing transform() works)
    patient_data: sequence of feature values, in the same number/order as the
        features the scaler was fitted on

    Returns "Diabetic" when the model predicts class 1, otherwise
    "Non-Diabetic".

    Raises ValueError when patient_data does not contain the number of
    features the scaler expects.
    """
    values = list(patient_data)

    # Take the column names from the fitted scaler rather than from a
    # notebook-level variable: the function then works regardless of which
    # cells have run, and the names always match what the scaler was fitted on.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    columns = list(fitted_names) if fitted_names is not None else None

    expected = len(columns) if columns is not None else getattr(scaler, "n_features_in_", None)
    if expected is not None and len(values) != expected:
        raise ValueError(
            f"patient_data has {len(values)} values but the scaler expects {expected} features"
        )

    # With no recorded names (scaler fitted on a plain array) the frame is
    # built positionally.
    patient_df = pd.DataFrame([values], columns=columns)
    patient_scaled = scaler.transform(patient_df)
    prediction = model.predict(patient_scaled)[0]
    return "Diabetic" if prediction == 1 else "Non-Diabetic"


# Example patient in dataset column order (Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age); the values are
# the same as row 0 of the df.head() preview.
new_patient = [6, 148, 72, 35, 0, 33.6, 0.627, 50]

print("\n=== Prediction Engine ===")
# Run the same patient through each tuned model and report its verdict.
for banner, fitted_model in (
    ("Logistic Regression says:", best_log),
    ("SVM says:", best_svm),
):
    print(banner, predict_patient(fitted_model, scaler, new_patient))


Saving diabetes.csv to diabetes (1).csv
Dataset shape: (768, 9)
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
 Logistic Regression
Best Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.60      0.50      0.55  