In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Read the diabetes CSV into the working DataFrame `df`
csv_path = '/content/diabetes.csv'
df = pd.read_csv(csv_path)

In [None]:
# Check the first few rows
print(df.head())

# Check for summary statistics
print(df.describe())

# Check the data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

In [None]:
# Count the null entries in every column
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Columns in which a 0 is treated as a missing reading and imputed.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for column in columns_with_zeros:
    # Take the median over the non-zero readings only: including the zeros
    # that are about to be replaced would drag the fill value toward 0.
    observed_median = df.loc[df[column] != 0, column].median()
    # Re-running this cell is harmless: once no zeros remain, nothing is replaced.
    df[column] = df[column].replace(0, observed_median)

In [None]:
# Standardize the eight predictor columns on a copy of `df`, leaving `df`
# itself untouched, then separate the predictors (X) from the target (y).
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed further down.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df_scaled[feature_columns])

y = df_scaled['Outcome']
X = df_scaled.drop(columns='Outcome')

In [None]:
# Preview the first five rows of the standardized frame
print(df_scaled.head(n=5))

In [None]:
# Correlation Heatmap: annotated pairwise correlations between the columns of df
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Pairplot: pairwise relationships coloured by Outcome, with KDE curves on the diagonal
sns.pairplot(data=df, hue='Outcome', diag_kind='kde')
plt.show()

In [None]:
# Distribution Plot: histogram of Glucose with a KDE curve overlaid
glucose_values = df['Glucose']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(glucose_values, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Glucose Levels')
plt.show()

In [None]:
# Boxplots of Glucose and of BMI, each grouped by Outcome (one figure per feature).
# NOTE(review): newer seaborn releases may warn about `palette` being passed
# without `hue` — verify against the installed version.
boxplot_specs = (
    ('Glucose', 'Glucose Levels by Outcome'),
    ('BMI', 'BMI Levels by Outcome'),
)

for feature, title in boxplot_specs:
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x='Outcome', y=feature, palette='coolwarm')
    plt.title(title)
    plt.show()

In [None]:
# Count Plot: number of rows for each Outcome value
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Outcome', ax=ax)
ax.set_title('Count of Diabetes Outcome (0 = No, 1 = Yes)')
plt.show()

In [None]:
# Split the data into training and testing sets.
#
# The split is taken on the unscaled features in `df` (same columns as X), and
# a scaler is then fit on the training rows only. Fitting the scaler on every
# row before splitting lets test-set statistics influence the transformation
# applied to the training data (data leakage), which makes the reported test
# scores optimistic.
X_unscaled = df[X.columns]
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_unscaled, y, test_size=0.2, random_state=42
)

# Columns that are standardized (the eight predictors).
scaled_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]

# Learn mean/std from the training partition, then apply the same
# transformation to both partitions. Copies keep the raw splits intact and
# keep X_train / X_test as DataFrames with their original index and columns.
split_scaler = StandardScaler()
X_train = X_train_raw.copy()
X_test = X_test_raw.copy()
X_train[scaled_columns] = split_scaler.fit_transform(X_train_raw[scaled_columns])
X_test[scaled_columns] = split_scaler.transform(X_test_raw[scaled_columns])

In [None]:
# Initialize models.
# The tree-based estimators draw random numbers while fitting; seeding them
# (with the same seed used for the train/test split) makes the reported
# metrics reproducible from one run to the next.
log_reg = LogisticRegression()
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
svc = SVC()

In [None]:
# Fit every classifier on the training partition, in the order they were created
classifiers = (log_reg, decision_tree, random_forest, svc)
for classifier in classifiers:
    classifier.fit(X_train, y_train)

In [None]:
# Predict the test partition with each fitted classifier (same order as training)
y_pred_log_reg, y_pred_tree, y_pred_forest, y_pred_svc = (
    fitted_model.predict(X_test)
    for fitted_model in (log_reg, decision_tree, random_forest, svc)
)

In [None]:
# Evaluation helper shared by all models
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy, confusion matrix and classification report for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Name shown in the header line of the printed report.

    Returns:
        None. Everything is written to stdout.
    """
    # Heading text paired with the metric function whose result follows it.
    detailed_sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("Classification Report:", classification_report),
    )

    print(f"--- {model_name} ---")
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    for heading, metric in detailed_sections:
        print(heading)
        print(metric(y_true, y_pred))

In [None]:
# Report test-set metrics for the logistic regression model
evaluate_model(y_true=y_test, y_pred=y_pred_log_reg, model_name="Logistic Regression")

In [None]:
# Report test-set metrics for the decision tree model
evaluate_model(y_true=y_test, y_pred=y_pred_tree, model_name="Decision Tree")

In [None]:
# Report test-set metrics for the random forest model
evaluate_model(y_true=y_test, y_pred=y_pred_forest, model_name="Random Forest")

In [None]:
# Report test-set metrics for the support vector machine model
evaluate_model(y_true=y_test, y_pred=y_pred_svc, model_name="Support Vector Machine")