In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Kyphosis dataset from the working directory and preview the
# first rows to confirm the columns were parsed as expected.
df = pd.read_csv(filepath_or_buffer='Kyphosis.csv')
df.head(n=5)

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [3]:
# Separate the dependent (target) and independent (feature) variables.
y = df['Kyphosis']

# NOTE(review): the df.head() preview lists an 'Unnamed: 0' column that looks
# like a serialized row index rather than a measurement -- confirm against the
# CSV. It is dropped when present so the model does not train on row position;
# errors='ignore' makes this a no-op if the column does not exist (a missing
# 'Kyphosis' column has already raised on the line above).
X = df.drop(columns=['Kyphosis', 'Unnamed: 0'], errors='ignore')

In [4]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# partition identical across re-runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

# Candidate base learners. Only the decision tree is wired into the bagging
# ensemble in this cell; the logistic regression is created here for later use.
lr = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=24)

# Bagging ensemble of 25 trees, built on all available cores, with the
# out-of-bag score computed during fit.
bagg = BaggingClassifier(
    estimator=dtc,
    n_estimators=25,
    random_state=24,
    n_jobs=-1,
    oob_score=True,
)

In [6]:
# Fit the bagging ensemble on the training split, then report the
# out-of-bag score computed during fitting.
bagg.fit(X_train, y_train)
oob_accuracy = bagg.oob_score_
print("OOB score:", oob_accuracy)

# Predict class labels for the held-out test rows and show them.
y_pred = bagg.predict(X_test)
print(f"the predicted values are : {y_pred}")

OOB score: 0.8214285714285714
the predicted values are : ['present' 'absent' 'absent' 'absent' 'absent' 'absent' 'present' 'absent'
 'absent' 'absent' 'absent' 'absent' 'absent' 'absent' 'absent' 'absent'
 'present' 'absent' 'absent' 'present' 'absent' 'present' 'absent'
 'absent' 'absent']


In [7]:
# Model Evaluation: accuracy on hard predictions, log loss on probabilities.
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
print(f'The Accuracy score is : {accuracy_score(y_test,y_pred)}')

# The columns of predict_proba follow bagg.classes_. Passing those classes as
# `labels` pins the column-to-class mapping explicitly instead of letting
# log_loss infer it from y_test, which fails when the (small, imbalanced)
# test split happens to contain only one class.
y_pred_proba = bagg.predict_proba(X_test)
print(f'Log loss : {log_loss(y_test,y_pred_proba, labels=bagg.classes_)}')

The Accuracy score is : 0.8
Log loss : 0.5112901769293141


In [8]:
# List every tunable parameter of the ensemble, including the nested
# `estimator__*` parameters of the base learner (deep=True is the default).
bagg_params = bagg.get_params(deep=True)
print(bagg_params)

{'bootstrap': True, 'bootstrap_features': False, 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': None, 'estimator__max_leaf_nodes': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__monotonic_cst': None, 'estimator__random_state': 24, 'estimator__splitter': 'best', 'estimator': DecisionTreeClassifier(random_state=24), 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 25, 'n_jobs': -1, 'oob_score': True, 'random_state': 24, 'verbose': 0, 'warm_start': False}


In [11]:
# Hyperparameter tuning: compare base learners for the bagging ensemble.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Search space: which base estimator the bagging ensemble wraps.
param = {
    'estimator': [lr,dtc]
}

# Stratified 5-fold CV keeps the class ratio in every fold; the fixed seed
# makes the fold assignment reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Score by negative log loss (higher is better, so values closer to 0 win).
grid = GridSearchCV(estimator=bagg, param_grid=param, cv=kfold, scoring='neg_log_loss')

# Fit once: a second identical fit would only repeat the full search and
# overwrite the results with the same values, since every seed is fixed.
grid.fit(X_train, y_train)

print(f'Best parameters : {grid.best_params_}')
print(f'Best Score : {grid.best_score_}')

Best parameters : {'estimator': LogisticRegression()}
Best Score : -0.41711742503855725
