In [None]:
#Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): the path only exists inside the Kaggle runtime — adjust it when running elsewhere.
DATA_PATH = '/kaggle/input/diabetes-data-set/diabetes.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print the descriptive statistics produced by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Impute placeholder zeros with the column median.
#
# Only measurements for which a reading of 0 cannot be a real value are
# treated as missing. Columns where 0 is a legitimate value (for example a
# count such as 'Pregnancies') and the 'Outcome' target are left untouched.
# NOTE(review): the column list assumes the Pima Indians diabetes schema —
# confirm it against the CSV header.
ZERO_MEANS_MISSING = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for column in [name for name in ZERO_MEANS_MISSING if name in df.columns]:
    # Median of the non-zero readings only, so that the placeholder zeros
    # being replaced do not pull the estimate down.
    nonzero_values = df.loc[df[column] != 0, column]
    if nonzero_values.empty:
        continue  # no real readings to estimate a median from
    df[column] = df[column].replace(0, nonzero_values.median())

In [None]:
# Plot the distribution of each column in its own panel.
#
# Passing the whole frame to a single histplot call overlays every column on
# one shared x-axis, so columns with a small value range are squashed next to
# columns with a large one. One histogram per column keeps each readable.
n_cols = 3
n_rows = max(1, -(-len(df.columns) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, df.columns):
    sns.histplot(data=df, x=column, kde=True, ax=ax)
    ax.set_title(column)

# Hide the panels left over when the column count does not fill the grid
for ax in flat_axes[len(df.columns):]:
    ax.set_visible(False)

fig.suptitle("Feature Distributions")
fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns,
# with the colour scale pinned to the full [-1, 1] range.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Grid of pairwise plots over the columns, coloured by the 'Outcome' value
pairplot_options = dict(hue='Outcome', palette="coolwarm")
pair_grid = sns.pairplot(df, **pairplot_options)
# y is set slightly above 1 so the title is placed above the top row of panels
plt.suptitle("Pair Plot of Features", y=1.02)
plt.show()

In [None]:
# Box plots of every feature, split by Outcome class.
#
# Features are selected by name rather than by position, so the target is
# excluded even if 'Outcome' is not the last column, and the subplot grid is
# sized from the number of features instead of being fixed at 3x3.
feature_columns = df.columns.drop('Outcome')
n_cols = 3
n_rows = max(1, -(-len(feature_columns) // n_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, column in enumerate(feature_columns, 1):
    plt.subplot(n_rows, n_cols, i)
    # NOTE(review): newer seaborn releases may warn when `palette` is given
    # without `hue` — confirm against the installed version.
    sns.boxplot(data=df, x='Outcome', y=column, palette="coolwarm")
    plt.title(f"Box Plot of {column} by Outcome")
plt.tight_layout()
plt.show()

In [None]:
# Target vector: the 'Outcome' column
y = df['Outcome']
# Feature matrix: every column except the target
feature_names = [name for name in df.columns if name != 'Outcome']
X = df[feature_names]

# Standardize the features; the scaler is fitted on all rows of X
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

In [None]:
# Split data into training (70%) and testing sets (30%), stratified on the target.
#
# The split is made on the unscaled features and the scaler is then fitted on
# the training rows only, so that no statistics (mean / standard deviation)
# from the held-out test rows leak into the features used for training and
# hyper-parameter tuning. The same random_state and stratification are used,
# so the same rows end up in each split as when splitting the pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)  # transformed with training statistics

In [None]:
# Exhaustive cross-validated search over C and gamma for an RBF-kernel SVM.
# refit=True retrains the best configuration on the whole training set so that
# grid.best_estimator_ is ready to use afterwards.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

base_svc = svm.SVC()
grid = GridSearchCV(estimator=base_svc, param_grid=param_grid, refit=True, verbose=1)
grid.fit(X_train, y_train)

In [None]:
# Keep the refitted best estimator and report the winning parameter combination
best_model = grid.best_estimator_
best_params = grid.best_params_
print("Best Parameters from GridSearchCV: ", best_params)

In [None]:
# Accuracy of the tuned model on the data it was trained on
train_predictions = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print("Training Set Accuracy:", train_accuracy)

In [None]:
# Accuracy of the tuned model on the held-out test set
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Set Accuracy:", test_accuracy)

In [None]:
# Confusion matrix and classification report for the test set.
# The test-set predictions are computed once and reused for both outputs
# instead of calling predict() again for each of them.
y_test_pred = best_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix (Test Set)")
# confusion_matrix puts the true labels on rows and the predictions on columns
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

print("Classification Report (Test Set):\n", classification_report(y_test, y_test_pred))