# Learn-Together: Stacking Classifiers 2

*Fitting the stacked classifier using the entire dataset, rather splitting dataset into train/val.*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import accuracy_score

seed = 42

In [2]:
def generate_submission(model, test_data, test_ids, file_name):
    predictions = model.predict(test_data)
    output = pd.DataFrame({"Id": test_ids, "Cover_Type": predictions})
    output.to_csv("submissions/"+ file_name +".csv", index=False)
    print("Submission generated.")

In [3]:
submission_ex = pd.read_csv("assets/learn-together/sample_submission.csv")
train_df = pd.read_csv("assets/learn-together/train.csv")
test_df = pd.read_csv("assets/learn-together/test.csv")

for X in [train_df, test_df]:
    X['Hydro_Elevation_diff'] = (X['Elevation'] - 
                                 X['Vertical_Distance_To_Hydrology'])

    X['Hydro_Fire_sum'] = (X['Horizontal_Distance_To_Hydrology'] + 
                           X['Horizontal_Distance_To_Fire_Points'])

    X['Hydro_Fire_diff'] = (X['Horizontal_Distance_To_Hydrology'] - 
                            X['Horizontal_Distance_To_Fire_Points']).abs()

    X['Hydro_Road_sum'] = (X['Horizontal_Distance_To_Hydrology'] +
                           X['Horizontal_Distance_To_Roadways'])

    X['Hydro_Road_diff'] = (X['Horizontal_Distance_To_Hydrology'] -
                            X['Horizontal_Distance_To_Roadways']).abs()

    X['Road_Fire_sum'] = (X['Horizontal_Distance_To_Roadways'] + 
                          X['Horizontal_Distance_To_Fire_Points'])

    X['Road_Fire_diff'] = (X['Horizontal_Distance_To_Roadways'] - 
                           X['Horizontal_Distance_To_Fire_Points']).abs()
    
target = ["Cover_Type"]
cols_to_drop = ["Id", "Soil_Type7", "Soil_Type15", "Cover_Type"]

train = train_df.copy()
test = test_df.copy()

y = train[target]
train.drop(columns=cols_to_drop, inplace=True)
test_ids = test["Id"]
test.drop(columns=["Id", "Soil_Type7", "Soil_Type15"], inplace=True)

# X_train, X_val, y_train, y_val = train_test_split(train,
#                                                   y, 
#                                                   test_size=0.2, 
#                                                   random_state=seed)

# X_train.shape, y_train.shape, X_val.shape, y_val.shape

# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_val = sc.transform(X_val)
# test = sc.transform(test)

In [4]:
ab = AdaBoostClassifier(n_estimators=200,
                            base_estimator=DecisionTreeClassifier(
                                min_samples_leaf=2,
                                random_state=seed),
                            random_state=seed)

et = ExtraTreesClassifier(max_depth=400, 
                           n_estimators=450, 
                           n_jobs=-1,
                           oob_score=False,
                           random_state=seed, 
                           warm_start=True)

lg = LGBMClassifier(n_estimators=370,
                           metric='multi_logloss',
                           num_leaves=100,
                           verbosity=0,
                           random_state=seed,
                           n_jobs=-1)

best_rf_params = {'bootstrap': False,
 'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

rf = RandomForestClassifier(n_estimators=best_rf_params["n_estimators"],
                            max_depth=best_rf_params["max_depth"],
                            min_samples_leaf=best_rf_params["min_samples_leaf"],
                            min_samples_split=best_rf_params["min_samples_split"],
                            bootstrap=False,
                            random_state=seed,
                            n_jobs=-1)

In [5]:
models = {"adaboost": ab,
          "extratrees": et,
          "lgbm": lg,
          "randomforest": rf}

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

def cross_val(models, X=train, y=y):
    r = dict()
    for name, model in models.items():
        cv_results = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
        r[name] = cv_results
        print(name, 'Accuracy Mean {0:.4f}, Std {1:.4f}'.format(cv_results.mean(), cv_results.std()))
    return r

def choose_best(results):
    errors = dict()
    
    for name, arr in results.items():
        errors[name] = arr.mean()
    
    best_model = [m for m, e in errors.items() if e == max(errors.values())][0]
    return best_model

In [6]:
results = cross_val(models)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


adaboost Accuracy Mean 0.8876, Std 0.0096


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


extratrees Accuracy Mean 0.8994, Std 0.0056


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


lgbm Accuracy Mean 0.8964, Std 0.0077


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


randomforest Accuracy Mean 0.9001, Std 0.0060


In [7]:
best_model = choose_best(results)
print(best_model)

meta_model = models[best_model]

randomforest


In [9]:
estimators = [m for m in models.values()]
stack = StackingCVClassifier(classifiers=estimators,
                             meta_classifier=meta_model,
                             cv=cv,
                             stratify=True,
                             shuffle=True,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=1,
                             random_state=seed,
                             n_jobs=-1)

stack = stack.fit(train, y)

print("Fit completed.")

Fitting 4 classifiers...
Fitting classifier1: adaboostclassifier (1/4)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier2: extratreesclassifier (2/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier3: lgbmclassifier (3/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier4: randomforestclassifier (4/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.1s finished


Fit completed.


In [10]:
generate_submission(stack, test, test_ids, "6_stacked_classifiers_entire_dataset")

Submission generated.
