In [None]:
import pandas as pd
import requests
import io

# URL for the Pima Indians Diabetes dataset.
# Column names are supplied explicitly through names=, so pandas treats every
# line of the downloaded file as data rather than as a header.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
colnames = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigree', 'Age', 'Outcome']

# Download the dataset.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of letting an error page
# be parsed as if it were CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content

# Read the dataset into a pandas DataFrame
df = pd.read_csv(io.StringIO(content.decode('utf-8')), names=colnames)

# Save the dataset to a CSV file (later cells read it back from this path)
df.to_csv('diabetes.csv', index=False)

print('Dataset created successfully!')
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [None]:
# Read the saved CSV back into a DataFrame and preview its first five rows
diabetes_df = pd.read_csv(filepath_or_buffer='diabetes.csv')
diabetes_df.head(n=5)

In [None]:
# Check for missing values
print('Missing values:')
print(diabetes_df.isnull().sum())

# NaN counts alone can understate missingness for this dataset: a reading of 0
# is not physiologically plausible for the measurements below, and the dataset
# is commonly described as using 0 as a placeholder for unrecorded values.
# Report those zeros as well so the gap is visible before modelling.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print('\nZero values (likely missing measurements):')
print(diabetes_df[zero_as_missing_cols].eq(0).sum())

In [None]:
# Print a heading followed by the describe() table, one per line
print('Summary statistics:', diabetes_df.describe(), sep='\n')

In [None]:
# Bar chart of how many rows fall into each Outcome class,
# drawn through an explicit Axes handle rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Outcome', data=diabetes_df, ax=ax)
ax.set_title('Distribution of Diabetes Outcome')
ax.set_xlabel('Outcome (0: No, 1: Yes)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Preprocessing: separate the label column from the predictor columns
y = diabetes_df['Outcome']
X = diabetes_df.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set shape: {X_train.shape}')
print(f'Testing set shape: {X_test.shape}')

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the scaled training data
# (fit() returns the estimator itself, so construction and training chain).
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Score the held-out rows: per-class probabilities and hard class labels
y_pred_proba_lr = log_reg.predict_proba(X_test_scaled)
y_pred_lr = log_reg.predict(X_test_scaled)

In [None]:
# Text report: banner, accuracy, ROC AUC and the per-class metrics
print("=" * 50, "LOGISTIC REGRESSION RESULTS", "=" * 50, sep="\n")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_lr):.4f}")
# Column 1 of predict_proba is passed as the score for the ROC AUC
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba_lr[:, 1]):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))

# Confusion matrix rendered as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred_lr),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title('Logistic Regression Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree, trained on the scaled training data
dt_classifier = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_classifier.fit(X_train_scaled, y_train)

# Predict on the held-out rows and report the results
y_pred_dt = dt_classifier.predict(X_test_scaled)

print("=" * 50, "DECISION TREE RESULTS", "=" * 50, sep="\n")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_dt):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k = 5), trained on the scaled training data
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict on the held-out rows and report accuracy
y_pred_knn = knn.predict(X_test_scaled)

print("=" * 50, "KNN RESULTS", "=" * 50, sep="\n")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_knn):.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, trained on the scaled training data
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train_scaled, y_train)

# Predict on the held-out rows and report the results
y_pred_rf = rf_classifier.predict(X_test_scaled)

print("=" * 50, "RANDOM FOREST RESULTS", "=" * 50, sep="\n")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

In [None]:
from xgboost import XGBClassifier

# Create and train XGBoost model.
# use_label_encoder is intentionally not passed: recent XGBoost releases
# deprecated and then removed it, so supplying it only produces an
# "unused parameter" warning. The Outcome labels are already integers.
xgb_classifier = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_classifier.fit(X_train_scaled, y_train)

# Predict on the held-out rows and report accuracy
y_pred_xgb = xgb_classifier.predict(X_test_scaled)

print("="*50)
print("XGBOOST RESULTS")
print("="*50)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_xgb):.4f}")

In [None]:
# Pair each predictor name with the random forest's importance score and
# order the table from most to least important.
feature_names = X.columns
importances = rf_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance (Random Forest)')
plt.show()

In [None]:
# Compute each model's test-set accuracy from the predictions made in the
# earlier cells, so the reported winner reflects the measured results rather
# than a hard-coded model name.
model_predictions = {
    'Logistic Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'KNN': y_pred_knn,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
}
model_accuracies = {
    model_name: accuracy_score(y_test, predictions)
    for model_name, predictions in model_predictions.items()
}
# On a tie, max() keeps the first model in the dictionary order above.
best_model_name = max(model_accuracies, key=model_accuracies.get)

print("="*60)
print("FINAL MODEL SUMMARY")
print("="*60)
# List every model from highest to lowest accuracy so the comparison is visible
for model_name, model_accuracy in sorted(model_accuracies.items(), key=lambda item: item[1], reverse=True):
    print(f"{model_name}: {model_accuracy:.4f}")
print(f"\nBest Performing Model: {best_model_name} (Accuracy: {model_accuracies[best_model_name]:.4f})")
# The importance table comes from the random forest regardless of which model
# scored best, so the heading says so explicitly.
print("\nTop 3 Important Features (Random Forest):")
print(feature_importance_df.head(3).to_string(index=False))