In [57]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [58]:
data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\Artificial_intelligence\\datasets\\diabetes.csv")

In [59]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [60]:
X = data.drop("Outcome",axis=1)
y = data["Outcome"]

In [61]:
logreg = LogisticRegression()

### Resampling on the basis of train_test_split

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
X_train, X_test, y_train, y_test =  train_test_split(X,y,test_size=0.3,random_state=42)

In [64]:
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
y_pred = logreg.predict(X_test)

In [66]:
print("Accuracy : ",accuracy_score(y_pred,y_test)*100,"%")

Accuracy :  75.32467532467533 %


### Resampling on the basis of k-fold Validation

In [67]:
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score

In [68]:
kfold = KFold(n_splits=10, random_state=42)

In [70]:
accuracy=cross_val_score(logreg, X, y, cv=kfold)

In [None]:
# array([0.7012987 , 0.81818182, 0.74025974, 0.71428571, 0.77922078,
#           0.75324675, 0.85714286, 0.80519481, 0.72368421, 0.80263158])

# this is the accuracy for the 10 folds. 

In [73]:
print("Accurcy : ", accuracy.mean()*100,"%",)
print("Standard Deviation of the accuracies",accuracy.std()*100)

Accurcy :  76.95146958304852 %
Standard Deviation of the accuracies 4.841051924567195


### Resampling on the basis of (Leave One Out ) variation of k-fold

In [74]:
from sklearn.model_selection import LeaveOneOut

In [75]:
loout = LeaveOneOut()

In [79]:
accuracy=cross_val_score(logreg, X, y, cv=loout)

In [80]:
print("Accurcy : ", accuracy.mean()*100,"%",)
print("Standard Deviation of the accuracies",accuracy.std()*100)

Accurcy :  76.953125 %
Standard Deviation of the accuracies 42.11328831538063


In [81]:
#  standard deviation that the score has more variance 
#                                than the k-fold cross validation results described above

### Resampling on the basis of Repeated Random Test-Train Splits


In [84]:
from sklearn.model_selection import ShuffleSplit

In [85]:
shuflsplit = ShuffleSplit(n_splits=10, test_size=0.33, random_state=42)

In [86]:
accuracy=cross_val_score(logreg, X, y, cv=shuflsplit)

In [87]:
print("Accurcy : ", accuracy.mean()*100,"%",)
print("Standard Deviation of the accuracies",accuracy.std()*100)

Accurcy :  77.95275590551182 %
Standard Deviation of the accuracies 2.7276390670376025


## Conclusion
    Repeated Random Test-Train Splits has bee found best resampling method for this dataset with        
    Accuracy : 77.95 %
    
    Standard Deviation : 2.72
    

### Performance Metrics

In [88]:
# Using the shuffel split

shuflsplit = ShuffleSplit(n_splits=10, test_size=0.33, random_state=42)


In [89]:
accuracy=cross_val_score(logreg, X, y, cv=shuflsplit)

In [90]:
print("Accurcy : ", accuracy.mean()*100,"%",)
print("Standard Deviation of the accuracies",accuracy.std()*100)

Accurcy :  77.95275590551182 %
Standard Deviation of the accuracies 2.7276390670376025


###  Classiﬁcation Accuracy

In [94]:
accuracy=cross_val_score(logreg, X, y, cv=shuflsplit, scoring='accuracy') # default value of scoring

In [95]:
print("Accurcy : ", accuracy.mean()*100,"%",)
print("Standard Deviation of the accuracies",accuracy.std()*100)  

Accurcy :  77.95275590551182 %
Standard Deviation of the accuracies 2.7276390670376025


### Lograithmic Loss

In [97]:
accuracy=cross_val_score(logreg, X, y, cv=shuflsplit, scoring='neg_log_loss')

In [98]:
print("Accurcy : ", accuracy.mean()*100,"%",)
print("Standard Deviation of the accuracies",accuracy.std()*100)

Accurcy :  -48.7884495433039 %
Standard Deviation of the accuracies 3.3489384189142983
