In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # statistical data visualization (used for the confusion-matrix heatmap)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # feature standardization (zero mean, unit variance)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc # classification metrics; the ROC curve plots true-positive rate against false-positive rate

In [None]:
# Location of the dataset archive; pandas infers the compression from the
# ".zip" extension when reading it.
DATA_PATH = '/content/archive (3).zip'
df = pd.read_csv(DATA_PATH)

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns)
df.shape

In [None]:
# Preview the first rows (head() shows 5 rows by default)
df.head()

In [None]:
# Preview the last rows (tail() shows 5 rows by default)
df.tail()

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Preprocessing: in the columns listed below a value of 0 is treated as a
# missing reading, so it is converted to NaN for the later fill step.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[columns_with_zeros] = df[columns_with_zeros].replace(to_replace=0, value=np.nan)

In [None]:
# Fill the remaining NaNs in each numeric column with that column's median
# (the middle value, which is less sensitive to outliers than the mean).
# numeric_only=True keeps this from raising if the frame ever contains a
# non-numeric column; assigning the result avoids inplace mutation.
# NOTE: the medians are computed over every row, i.e. before the train/test
# split performed later in this notebook, so test rows influence the fill values.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Separate the predictors (every column except 'Outcome') from the label column
X, y = df.drop('Outcome', axis=1), df['Outcome']

In [None]:
# Split first, then standardize: the scaler's mean/std must come from the
# training rows only, otherwise test-set statistics leak into training and
# the reported test metrics are optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = scaler.transform(X_test_raw)        # apply the training statistics unchanged

# Full feature matrix under the same train-fitted scaling; kept so that any
# existing reference to X_scaled keeps working.
X_scaled = scaler.transform(X)

In [None]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    """Print test-set metrics and plot the confusion matrix and ROC curve.

    Parameters
    ----------
    model : fitted classifier
        Must implement ``predict`` and either ``predict_proba`` or
        ``decision_function`` (used to obtain the scores for the ROC curve).
    X_test : array-like
        Feature rows the model is evaluated on.
    y_test : array-like
        True binary labels for ``X_test``.

    Returns
    -------
    None
        Accuracy, precision, recall and F1 are printed; the confusion matrix
        and the ROC curve (with its AUC in the legend) are shown as figures.
    """
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # The ROC curve needs a continuous score per row. Prefer the probability of
    # the second class; fall back to decision_function for classifiers that do
    # not expose predict_proba (roc_curve accepts either kind of score).
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc_score = auc(fpr, tpr)

    print(f'Accuracy: {acc:.4f}')
    print(f'Precision: {prec:.4f}')
    print(f'Recall: {rec:.4f}')
    print(f'F1 Score: {f1:.4f}')

    # Plot Confusion Matrix (rows = actual class, columns = predicted class)
    class_labels = ['No Diabetes', 'Diabetes']
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

    # Plot ROC Curve; the dashed diagonal is the chance-level reference line
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f'AUC = {auc_score:.4f}')
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend()
    plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so it can be chained)
log_reg = LogisticRegression().fit(X_train, y_train)

# Report its metrics and plots on the held-out test split
evaluate_model(log_reg, X_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (k = 5) on the training split
# (fit() returns the estimator itself, so it can be chained)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Report its metrics and plots on the held-out test split
evaluate_model(knn, X_test, y_test)

In [None]:
from sklearn.svm import SVC

# Train SVM model
# probability=True makes predict_proba available, which evaluate_model uses
# for the ROC curve. The probability estimates rely on internally shuffled
# data, so random_state is fixed to keep the ROC/AUC reproducible.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)

# Evaluate the model
evaluate_model(svm, X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree model
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are identical on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Evaluate the model
evaluate_model(dt, X_test, y_test)


In [None]:
# Random forest: an ensemble that combines the predictions of multiple decision trees
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest model
# random_state is fixed (same seed as the train/test split) so the forest's
# bootstrap samples and feature draws, and therefore the reported metrics,
# are identical on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
evaluate_model(rf, X_test, y_test)


In [None]:
# Fitted classifiers to compare, keyed by the label used in the summary table
models = {
    "Logistic Regression": log_reg,
    "KNN": knn,
    "SVM": svm,
    "Decision Tree": dt,
    "Random Forest": rf,
}

# One entry per model: metric name -> score on the test split
results = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    # Apply the four scorers in a fixed order and unpack into the same names
    acc, prec, rec, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    results[name] = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1-Score": f1}

# Tabulate with one row per model and one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)
