Creating a Five-Fold cross validation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
_kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
_headers = ['buying', 'maint','doors','persons','lug_boot', 'saftey', 'car']

df = pd.read_csv('../datasets/car.data',names=_headers, index_col=None)

In [5]:
indices = _kf.split(df)

In [6]:
print(type(indices))

<class 'generator'>


In [7]:
# first set
train_indices, val_indices = next(indices)

In [8]:
train_df = df.drop(val_indices)

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1382 non-null   object
 1   maint     1382 non-null   object
 2   doors     1382 non-null   object
 3   persons   1382 non-null   object
 4   lug_boot  1382 non-null   object
 5   saftey    1382 non-null   object
 6   car       1382 non-null   object
dtypes: object(7)
memory usage: 86.4+ KB


In [10]:
val_df = df.drop(train_indices)
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 15 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    346 non-null    object
 1   maint     346 non-null    object
 2   doors     346 non-null    object
 3   persons   346 non-null    object
 4   lug_boot  346 non-null    object
 5   saftey    346 non-null    object
 6   car       346 non-null    object
dtypes: object(7)
memory usage: 21.6+ KB


### The five-fold cross validation

In [11]:
_t, _v = [],[]

In [12]:
# define number of splits
n_splits = 5

In [13]:
_kf = KFold(n_splits=n_splits)

In [14]:
_indices = _kf.split(df)

In [15]:
for i in range(n_splits):
    train_idx, val_idx = next(_indices)
    _train_df = df.drop(val_idx)
    _t.append(_train_df)
    _val_df = df.drop(train_idx)
    _v.append(_val_df)

In [16]:
for d in _t:
    print(d.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 346 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1382 non-null   object
 1   maint     1382 non-null   object
 2   doors     1382 non-null   object
 3   persons   1382 non-null   object
 4   lug_boot  1382 non-null   object
 5   saftey    1382 non-null   object
 6   car       1382 non-null   object
dtypes: object(7)
memory usage: 86.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1382 non-null   object
 1   maint     1382 non-null   object
 2   doors     1382 non-null   object
 3   persons   1382 non-null   object
 4   lug_boot  1382 non-null   object
 5   saftey    1382 non-null   object
 6   car       1382 non-null   object
dtypes: object(7)
memory usage: 86.4+ KB
None
<class 'pa

In [17]:
for d in _v:
    print(d.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 0 to 345
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    346 non-null    object
 1   maint     346 non-null    object
 2   doors     346 non-null    object
 3   persons   346 non-null    object
 4   lug_boot  346 non-null    object
 5   saftey    346 non-null    object
 6   car       346 non-null    object
dtypes: object(7)
memory usage: 21.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 346 to 691
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    346 non-null    object
 1   maint     346 non-null    object
 2   doors     346 non-null    object
 3   persons   346 non-null    object
 4   lug_boot  346 non-null    object
 5   saftey    346 non-null    object
 6   car       346 non-null    object
dtypes: object(7)
memory usage: 21.6+ KB
None
<class 'pandas

Cross validation score

 1. Creating cross validation datasets
 2. Training models by fitting them to the training data
 3. Evaluting the models on the validation data
 4. Returning a list of the R2 score of each model that is trained

In [18]:
_df = pd.get_dummies(df, columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'saftey'])

_df.head()

Unnamed: 0,car,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,saftey_high,saftey_low,saftey_med
0,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
1,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,1
2,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,1,0,0
3,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
4,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [19]:
features = _df.drop(['car'], axis=1).values
labels = _df[['car']].values

In [20]:
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(
    max_iter=2000,
    multi_class='auto',
    cv=5
)

In [21]:
model.fit(features, labels.ravel())

LogisticRegressionCV(cv=5, max_iter=2000)

In [22]:
score = model.score(features, labels.ravel())
print(score)

0.9456018518518519


## Hyperparameter tuning using Gridsearch CV

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [24]:
clf = DecisionTreeClassifier()


In [25]:
params = {'max_depth': np.arange(1,8)}

In [26]:
clf_cv = GridSearchCV(clf, param_grid=params, cv=5)

In [27]:
clf_cv.fit(features, labels)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7])})

In [28]:
clf_cv.best_params_

{'max_depth': 2}

In [29]:
clf_cv.best_score_

0.7778822149618833

In [30]:
model = clf_cv.best_estimator_
model

DecisionTreeClassifier(max_depth=2)

In [31]:
model.fit(features, labels)

DecisionTreeClassifier(max_depth=2)

### RandomizeSearch CV

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [33]:
clf = RandomForestClassifier()

In [34]:
params = {
    'n_estimators': [500, 1000, 2000],
    'max_depth': np.arange(1,8)
}

In [35]:
clf_cv = RandomizedSearchCV(
    clf,
    param_distributions=params,
    cv=5
    )

In [36]:
clf_cv.fit(features, labels.ravel())

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': array([1, 2, 3, 4, 5, 6, 7]),
                                        'n_estimators': [500, 1000, 2000]})

In [37]:
clf_cv.best_estimator_

RandomForestClassifier(max_depth=6, n_estimators=2000)

In [38]:
clf_cv.best_params_

{'n_estimators': 2000, 'max_depth': 6}

In [39]:
model = clf_cv.best_estimator_
model

RandomForestClassifier(max_depth=6, n_estimators=2000)