<a href="https://colab.research.google.com/github/avnishcodes/DiabetesPredictionSVM/blob/main/DiabetesPredictionSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
#basic libraries
import pandas as pd
import numpy as np
#visualization
import matplotlib.pyplot as plt
import seaborn as sns
#model + Evalution
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [14]:
#load dataset from Github

url = "https://raw.githubusercontent.com/avnishcodes/DiabetesPredictionSVM/main/diabetes_prediction_dataset.csv"
df = pd.read_csv(url)




In [15]:
#Check dataset info
df.info()
#check for missing values
print("\nMissing values:\n", df.isnull().sum())
#check unique values in target column
print("\nTarget  column values:\n", df['diabetes'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB

Missing values:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Target  col

In [17]:

#one-hot encode categorical variables
df_encoded = pd.get_dummies(df, drop_first=True)
#split features and target
X = df_encoded.drop('diabetes', axis=1)
Y = df_encoded['diabetes']
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
#feature scaling (important for SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)




In [18]:
# Train SVM model (default parameters)
svm_model = SVC()
svm_model.fit(X_train, y_train)
#predict
y_pred = svm_model.predict(X_test)
#Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
#confusion matrix and specificity
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print("Specificity:", specificity)

Accuracy: 0.96135

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     18292
           1       0.97      0.56      0.71      1708

    accuracy                           0.96     20000
   macro avg       0.97      0.78      0.85     20000
weighted avg       0.96      0.96      0.96     20000

Specificity: 0.9985786136015744


In [22]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
#define parameter distribution
param_dist = {
    'C' : uniform(0.1, 10),
    'gamma' : uniform(0.01, 1),
    'kernel' :['linear', 'rbf']
}
#create the random search object
random_search = RandomizedSearchCV(
   estimator=SVC(),
   param_distributions=param_dist,
   n_iter=10, cv=3,scoring='accuracy', verbose=1, random_state=42)
#fit on training data
X_sample = X_train[:5000]
y_sample = y_train[:5000]

random_search.fit(X_sample, y_sample)

#best model or parameters
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
#predict and evaluate
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
#specificity
cm_best = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm_best.ravel()
specificity_best = tn / (tn + fp)
print("Specificity:", specificity_best)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'C': np.float64(3.4370861113902182), 'gamma': np.float64(0.1528668179219408), 'kernel': 'linear'}
Accuracy: 0.9589

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.89      0.59      0.71      1708

    accuracy                           0.96     20000
   macro avg       0.93      0.79      0.84     20000
weighted avg       0.96      0.96      0.96     20000

Specificity: 0.9929477367155041
