# Import Libraries & Data

In [None]:
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

import warnings

# Silence every warning category for the rest of the session so that cell
# output stays uncluttered.
# NOTE(review): this also hides any convergence or deprecation warnings raised
# by the library calls further down — confirm that is intended.
warnings.simplefilter('ignore')

In [None]:
# Load the Pima Indians diabetes CSV straight from GitHub. Column names are
# supplied explicitly, so the first line of the file is read as data rather
# than as a header.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = pd.read_csv(url, header=None, names=names)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Split target variable

In [None]:
# Separate the predictors (every column except 'class') from the label column.
X = df.drop(columns=['class'])
Y = df['class']

In [None]:
# Class balance of the target: number of rows per label.
Y.value_counts()

0    500
1    268
Name: class, dtype: int64

# Select best K

In [None]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Keep the two features that score highest on the chi-squared test against
# the target, then show the shape of the reduced matrix.
# NOTE(review): the modelling cells visible further down are fitted on X, not
# on X_new — confirm whether the selected features are meant to feed the model.
selector = SelectKBest(score_func=chi2, k=2)
X_new = selector.fit_transform(X, Y)
X_new.shape

(768, 2)

In [None]:
# Display the values of the columns retained by the feature selector.
X_new

array([[148.,   0.],
       [ 85.,   0.],
       [183.,   0.],
       ...,
       [121., 112.],
       [126.,   0.],
       [ 93.,   0.]])

# Train Test Split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split
# repeatable. Both constants are reused by the validation cells below.
seed = 123
test_size = 0.33
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Model

In [None]:
# Fit a logistic regression with default settings on the training split and
# record its score (mean accuracy) on the held-out split.
model = LogisticRegression().fit(X_train, Y_train)
result = model.score(X_test, Y_test)

# Evaluation

In [None]:
# Report the hold-out accuracy as a percentage.
print("Accuracy: %.3f%%" % (100.0 * result,))

Accuracy: 79.134%


# Other Metrics

In [None]:
# Predictions on the held-out split: column 1 of predict_proba (the second
# class, i.e. label 1 for this 0/1 target) and the hard class labels.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

## AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve on the held-out split, computed from the
# predicted probabilities and printed as a percentage.
result1 = roc_auc_score(y_true=Y_test, y_score=y_pred_proba)
print("AUC of ROC: %.3f%%" % (100.0 * result1,))

AUC of ROC: 84.864%


## F1 score

In [None]:
from sklearn.metrics import f1_score

# F1 score of the hard class predictions on the held-out split, printed as
# a percentage.
result2 = f1_score(y_true=Y_test, y_pred=y_pred)
print("F1 Score: %.3f%%" % (100.0 * result2,))

F1 Score: 70.056%


## Log-loss

In [None]:
from sklearn.metrics import log_loss

# Logarithmic loss of the predicted probabilities on the held-out split
# (lower is better; not a percentage).
result3 = log_loss(Y_test, y_pred_proba)
print("Log-loss: %.3f" % result3)

Log-loss: 0.467


# Model Validation

## k-fold Validation

### Accuracy

In [None]:
# 10-fold cross-validated accuracy on the full data set: a fresh, unfitted
# model is scored on each shuffled fold; the mean is printed with the
# standard deviation across folds in parentheses.
model = LogisticRegression()
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(
    estimator=model,
    X=X,
    y=Y,
    scoring='accuracy',
    cv=kfold,
)
print("Accuracy: %.3f%% (%.3f%%)" % (100.0 * results.mean(), 100.0 * results.std()))

Accuracy: 78.002% (4.008%)


### AUC

In [None]:
# 10-fold cross-validated ROC AUC on the full data set, using the same
# shuffled fold configuration as the accuracy cell; mean with the standard
# deviation across folds in parentheses.
model = LogisticRegression()
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(
    estimator=model,
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=kfold,
)
print("ROC AUC: %.3f%% (%.3f%%)" % (100.0 * results.mean(), 100.0 * results.std()))

ROC AUC: 82.880% (4.953%)


## Leave one out cross-validation

### Accuracy

In [None]:
# Leave-one-out cross-validation: every row is the test set exactly once, so
# each per-split score is 0 or 1. The mean is the overall accuracy; the
# standard deviation over those 0/1 scores is printed in parentheses.
model = LogisticRegression()
loocv = model_selection.LeaveOneOut()
results = model_selection.cross_val_score(
    estimator=model,
    X=X,
    y=Y,
    scoring='accuracy',
    cv=loocv,
)
print("Accuracy: %.3f%% (%.3f%%)" % (100.0 * results.mean(), 100.0 * results.std()))

Accuracy: 77.865% (41.516%)


## Repeated Train-Test Split

### Accuracy

In [None]:
# Repeated random train/test split: ten independent shuffles, each holding
# out `test_size` of the rows; accuracy mean with the standard deviation
# across repeats in parentheses.
model = LogisticRegression()
repeatedcv = model_selection.ShuffleSplit(
    n_splits=10,
    test_size=test_size,
    random_state=seed,
)
results = model_selection.cross_val_score(
    estimator=model,
    X=X,
    y=Y,
    scoring='accuracy',
    cv=repeatedcv,
)
print("Accuracy: %.3f%% (%.3f%%)" % (100.0 * results.mean(), 100.0 * results.std()))

Accuracy: 77.638% (2.104%)


### AUC

In [None]:
# Repeated random train/test split scored by ROC AUC: ten independent
# shuffles, each holding out `test_size` of the rows. A fresh, unfitted model
# is evaluated on every repeat; the mean is printed with the standard
# deviation across repeats in parentheses.
repeatedcv = model_selection.ShuffleSplit(n_splits=10, test_size=test_size, random_state=seed)
model = LogisticRegression()
results = model_selection.cross_val_score(model, X, Y, cv=repeatedcv, scoring='roc_auc')
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such
# rather than as accuracy.
print("ROC AUC: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 82.640% (2.676%)


# Grid Search CV

In [None]:
# Candidate values for the `C` parameter explored by the grid search.
params = {
    'C': [
        0.01, 0.05, 0.1, 0.5,
        0.7, 1, 2, 3,
    ],
}

In [None]:
# Grid search over `params`, ranked by ROC AUC on five shuffled, stratified
# folds. `model` is whatever estimator the preceding cell left bound to that
# name.
kfold = model_selection.StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)
grid = model_selection.GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
)

In [None]:
# Run the search on the training split only; the held-out split is not used.
grid.fit(X=X_train, y=Y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.05, 0.1, 0.5, 0.7, 1, 2, 3]},
             scoring='roc_auc')

In [None]:
# Best cross-validated score found by the search (ROC AUC, per the `scoring`
# set on the grid).
grid.best_score_

0.8096289410905028

In [None]:
# Parameter setting that achieved the best score.
grid.best_params_

{'C': 0.1}