In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the dataset
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv('diabetes.csv', names=column_names)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


In [3]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# # Data Preprocessing
# # Handle missing values (if any), here replacing 0s with NaN (common practice for some medical datasets)
# df.replace(0, np.nan, inplace=True)
# df.fillna(df.mean(), inplace=True)  # Replace missing values with the mean of the column

In [9]:
# Verify the data types and check for non-numeric columns
# print("Data types:\n", df.dtypes)

# If there are any non-numeric columns, we convert them to numeric (if applicable)
# For example, we can forcefully convert all columns to numeric
df = df.apply(pd.to_numeric, errors='coerce')

In [11]:
# df.head()

In [13]:
 # Check for any missing values after conversion
print("Missing values:\n", df.isnull().sum())

Missing values:
 Pregnancies                 1
Glucose                     1
BloodPressure               1
SkinThickness               1
Insulin                     1
BMI                         1
DiabetesPedigreeFunction    1
Age                         1
Outcome                     1
dtype: int64


In [15]:
# Handle missing values by replacing NaNs with the mean of each column
df.fillna(df.mean(), inplace=True)

In [17]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [21]:
df['Outcome'] = df['Outcome'].astype(int)

In [58]:
# Separate features and target variable
X = df.drop(columns=['Outcome'])  # Features
y = df['Outcome']  # Target (Diabetes Outcome)

In [38]:
# Normalize the data (optional but often beneficial for models like SVM and Logistic Regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



In [40]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [46]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(),
    "Support Vector Machine (SVM)": SVC()
}

In [60]:
# Train and evaluate models
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {model_name}: {accuracy * 100:.2f}%")
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix of {model_name}:\n", conf_matrix)
    
    # Classification Report
    print(f"Classification Report of {model_name}:\n", classification_report(y_test, y_pred))
    
    # Visualize Confusion Matrix
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix of {model_name}')
    plt.show()

    print("-" * 50)

Training Logistic Regression...


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [62]:
print("Unique values in 'Outcome' column:", df['Outcome'].unique())


Unique values in 'Outcome' column: [0 1]
