# Diabetes prediction with SVM¶

In [41]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale,StandardScaler # for standardization
from sklearn.model_selection import train_test_split, GridSearchCV ,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score, mean_squared_error, r2_score,classification_report,roc_auc_score,roc_curve
from sklearn.svm import SVC

In [42]:
# turn off alerts
from warnings import filterwarnings
filterwarnings ('ignore')

# Dataset and Story


amaç: Hastanemizde tutulan veri setinde kişilerin bazı bilgileri bulunmaktadır.Kişinin tahlil sonuçlarına göre şeker hastası olup olmadığına dair bir tahminleme modeli gerçekleştirmemiz isteniyor.

In [43]:
df= pd.read_csv("diabetes.csv")

In [44]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Model ve Tahmin

In [45]:
df["Outcome"].value_counts() # representation numbers of the dependent variable.

0    500
1    268
Name: Outcome, dtype: int64

There is information of 268 people in the data 1, that is, the number of diabetics, and information of 500 people from the data of 0, that is, the data of people who do not have diabetes.

In [46]:
df.describe().T # descriptive statistics

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [47]:
y=df["Outcome"]# get dependent variable
X=df.drop(["Outcome"], axis=1) # bağımsız değişkenleri alınması
X_train,X_test,y_train,y_test = train_test_split(X,# independent variable
                                                y, #the dependent variable
                                                test_size=0.30,# test data
                                                random_state=42) 

In [48]:
svm_model=SVC(kernel="linear").fit(X_train,y_train)# model installed


In [49]:
y_pred = svm_model.predict(X) # predictive acquisition values

In [50]:
accuracy_score(y,y_pred) # success rate

0.76953125

In [51]:
print(classification_report(y,y_pred)) #detailed reporting

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       500
           1       0.70      0.59      0.64       268

    accuracy                           0.77       768
   macro avg       0.75      0.73      0.74       768
weighted avg       0.76      0.77      0.76       768



# Model Tuning

In [52]:
svm_params={"C": np.arange(1,10),
            "kernel": ["linear","rbf"]} #grouping of parameters

In [53]:
svm_model=SVC()# model object

In [55]:
# finding ideal parameter values
svm_cv_model=GridSearchCV(svm_model,svm_params,cv=5,n_jobs=-1,verbose=2).fit(X_train,y_train).fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [56]:
#best model success values
knn_cv_model.best_score_

0.7839044652128765

In [57]:
#the most ideal parameters
svm_cv_model.best_params_

{'C': 2, 'kernel': 'linear'}

In [61]:

svm_tuned= SVC(C=2,kernel="linear").fit(X_train, y_train)

In [59]:
y_pred=svm_tuned.predict(X_test)

In [60]:
accuracy_score(y_test,y_pred)

0.7445887445887446