In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [2]:
# Importing Data
df = load_iris()
df_iris = pd.DataFrame(np.append(df.data, df.target.reshape(-1, 1), axis=1), columns=df.feature_names+['target'])

In [3]:
# Data
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
# Data Info
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 6.0 KB


In [5]:
# Data Shape
df_iris.shape

(150, 5)

In [6]:
# Features and Class
X = df.data
y = df.target

In [7]:
# Splitting into Training Set and Testing Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [8]:
# Stacking
def stacking(estimator, feature_set, class_set, test_features, n_fold):
    '''
    Stacking Initial Level Process Implementation
    Args:
        estimator::[estimator object]
            Initial Classifier Object needed to classify the data on first level of stacking
        feature_set::[numpy array of dimension (k, m)]
            Feature Set required to train model
        class_set::[numpy array of dimension (k, )]
            Associated Class Set of Feature Set required to train model
        test_features::[numpy array of dimension ((n - k), m)]
            Test Set ot evaluate classification model
        n_fold::[int]
            Number of Folds/Pairs of dataset needed to generate for Training and Testing
    Returns:
        train_pred::[numpy array of dimensions ((k * n_fold), 1)]
            Returns the array of prediction for n_fold folds of size (k, ) each
        test_pred::[numpy array of dimensions ((n - k * n_fold), 1)]
            Returns the array of prediction for test_features of dimensions ((n - k), )
    '''
    fold = StratifiedKFold(n_splits=n_fold, shuffle=False)
    test_pred = np.empty((0, 1), float)
    train_pred = np.empty((0, 1), float)
    for train_indices, test_indices in fold.split(feature_set, class_set):
        train_X, test_X = feature_set[train_indices], feature_set[test_indices]
        train_y, test_y = class_set[train_indices], class_set[test_indices]
        estimator.fit(train_X, train_y)
        train_pred = np.append(train_pred, estimator.predict(test_X))
    test_pred = np.append(test_pred, estimator.predict(test_features))
    return train_pred, test_pred

In [9]:
# Classification Model
dec_tr_clf = DecisionTreeClassifier(criterion='gini')
k_nrst_nbr_clf = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

In [10]:
# Initial Classifiers
train_pred_dec_tree, test_pred_dec_tree = stacking(estimator=dec_tr_clf, feature_set=X_train, class_set=y_train, test_features=X_test, n_fold=10)
train_pred_k_nrst_nbr, test_pred_k_nrst_nbr = stacking(estimator=k_nrst_nbr_clf, feature_set=X_train, class_set=y_train, test_features=X_test, n_fold=10)

In [11]:
# Generating Prediction Data
meta_df_train = pd.concat([pd.Series(train_pred_dec_tree, name='Decision Tree Classifier'), pd.Series(train_pred_k_nrst_nbr, name='K-Nearest Neighbor Classifier')], axis=1)
meta_df_test = pd.concat([pd.Series(test_pred_dec_tree, name='Decision Tree Classifier'), pd.Series(test_pred_k_nrst_nbr, name='K-Nearest Neighbor Classifier')], axis=1)

In [12]:
# Meta Classifier
log_reg_clf = LogisticRegression(penalty='l2')
log_reg_clf.fit(meta_df_train, y_train)
log_reg_clf.score(meta_df_test, y_test)

0.9333333333333333