# Learn-Together: Classifier Ensemble Revisions

*Further tweaks to classifier ensemble*

* https://www.kaggle.com/jakelj/basic-ensemble-model

Attempted to add more features (below) but this resulted in reduced performance of the model.

Prior bests:

* adaboost Accuracy Mean 0.8876, Std 0.0096
* extratrees Accuracy Mean 0.8994, Std 0.0056
* lgbm Accuracy Mean 0.8964, Std 0.0077
* randomforest Accuracy Mean 0.9001, Std 0.0060

```python
X['Avg_Shade'] = (X['Hillshade_9am'] + X['Hillshade_Noon'] + X['Hillshade_3pm']) / 3
X['Morn_Noon_Int'] = (X['Hillshade_9am'] + X['Hillshade_Noon']) / 2
X['Noon_Eve_Int'] = (X['Hillshade_3pm'] + X['Hillshade_Noon']) / 2
X['Slope2'] = np.sqrt(X['Horizontal_Distance_To_Hydrology']**2 + X['Vertical_Distance_To_Hydrology']**2)
```

Fit a KNeighborsClassifier. Accuracy was poor from the getgo; unlikely that hyperparameter optimization would boost this model to the level of Ada/ET/RF/LGBM.

Optimized LGBM parameters.

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import accuracy_score

seed = 42

In [11]:
def generate_submission(model, test_data, test_ids, file_name):
    predictions = model.predict(test_data)
    output = pd.DataFrame({"Id": test_ids, "Cover_Type": predictions})
    output.to_csv("submissions/"+ file_name +".csv", index=False)
    print("Submission generated.")

In [28]:
submission_ex = pd.read_csv("assets/learn-together/sample_submission.csv")
train_df = pd.read_csv("assets/learn-together/train.csv")
test_df = pd.read_csv("assets/learn-together/test.csv")

for X in [train_df, test_df]:
    X['Hydro_Elevation_diff'] = (X['Elevation'] - 
                                 X['Vertical_Distance_To_Hydrology'])

    X['Hydro_Fire_sum'] = (X['Horizontal_Distance_To_Hydrology'] + 
                           X['Horizontal_Distance_To_Fire_Points'])

    X['Hydro_Fire_diff'] = (X['Horizontal_Distance_To_Hydrology'] - 
                            X['Horizontal_Distance_To_Fire_Points']).abs()

    X['Hydro_Road_sum'] = (X['Horizontal_Distance_To_Hydrology'] +
                           X['Horizontal_Distance_To_Roadways'])

    X['Hydro_Road_diff'] = (X['Horizontal_Distance_To_Hydrology'] -
                            X['Horizontal_Distance_To_Roadways']).abs()

    X['Road_Fire_sum'] = (X['Horizontal_Distance_To_Roadways'] + 
                          X['Horizontal_Distance_To_Fire_Points'])

    X['Road_Fire_diff'] = (X['Horizontal_Distance_To_Roadways'] - 
                           X['Horizontal_Distance_To_Fire_Points']).abs()
    
target = ["Cover_Type"]
cols_to_drop = ["Id", "Soil_Type7", "Soil_Type15", "Cover_Type"]

train = train_df.copy()
test = test_df.copy()

y = train[target]
train.drop(columns=cols_to_drop, inplace=True)
test_ids = test["Id"]
test.drop(columns=["Id", "Soil_Type7", "Soil_Type15"], inplace=True)

X_train, X_val, y_train, y_val = train_test_split(train,
                                                  y, 
                                                  test_size=0.2, 
                                                  random_state=seed)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
test = sc.transform(test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [29]:
# NEW
# gm = GaussianMixture(n_components=5)
# gm.fit(X_train)

In [30]:
# gm.predict(X_train)

In [32]:
ab = AdaBoostClassifier(n_estimators=200,
                            base_estimator=DecisionTreeClassifier(
                                min_samples_leaf=2,
                                random_state=seed),
                            random_state=seed)

et = ExtraTreesClassifier(max_depth=400, 
                           n_estimators=450, 
                           n_jobs=-1,
                           oob_score=False,
                           random_state=seed, 
                           warm_start=True)

lg = LGBMClassifier(n_estimators=370,
                           metric='multi_logloss',
                           num_leaves=100,
                           verbosity=0,
                           random_state=seed,
                           n_jobs=-1)

best_rf_params = {'bootstrap': False,
 'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

rf = RandomForestClassifier(n_estimators=best_rf_params["n_estimators"],
                            max_depth=best_rf_params["max_depth"],
                            min_samples_leaf=best_rf_params["min_samples_leaf"],
                            min_samples_split=best_rf_params["min_samples_split"],
                            bootstrap=False,
                            random_state=seed,
                            n_jobs=-1)


kn = KNeighborsClassifier(n_jobs=-1, n_neighbors=1)

In [33]:
models = {"adaboost": ab,
          "extratrees": et,
          "lgbm": lg,
          "randomforest": rf,
          "kneighbors": kn}

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

def cross_val(models, X=train, y=y):
    r = dict()
    for name, model in models.items():
        cv_results = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
        r[name] = cv_results
        print(name, 'Accuracy Mean {0:.4f}, Std {1:.4f}'.format(cv_results.mean(), cv_results.std()))
    return r

def choose_best(results):
    errors = dict()
    
    for name, arr in results.items():
        errors[name] = arr.mean()
    
    best_model = [m for m, e in errors.items() if e == max(errors.values())][0]
    return best_model

In [34]:
results = cross_val(models)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


adaboost Accuracy Mean 0.8876, Std 0.0096


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


extratrees Accuracy Mean 0.8994, Std 0.0056


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


lgbm Accuracy Mean 0.8964, Std 0.0077


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


randomforest Accuracy Mean 0.9001, Std 0.0060


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


kneighbors Accuracy Mean 0.8138, Std 0.0099


In [21]:
best_model = choose_best(results)
print(best_model)

meta_model = models[best_model]

randomforest


In [22]:
estimators = [m for m in models.values()]
stack = StackingCVClassifier(classifiers=estimators,
                             meta_classifier=meta_model,
                             cv=cv,
                             stratify=True,
                             shuffle=True,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=1,
                             random_state=seed,
                             n_jobs=-1)

stack = stack.fit(X_train, y_train)

print("Fit completed.")

Fitting 4 classifiers...
Fitting classifier1: adaboostclassifier (1/4)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier2: extratreesclassifier (2/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier3: lgbmclassifier (3/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   26.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier4: randomforestclassifier (4/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished


Fit completed.


In [23]:
predictions = stack.predict(X_val)

In [24]:
accuracy = accuracy_score(y_val, predictions)
accuracy

0.9087301587301587

In [36]:
generate_submission(stack, test, test_ids, "6_classifier_ensemble_experimenting")

Submission generated.


#### Optimize LGBM Hyperparameters

In [38]:
lg.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 370,
 'n_jobs': -1,
 'num_leaves': 100,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'multi_logloss',
 'verbosity': 0}

In [40]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

param_test_2 ={'n_estimators': [200, 300, 400, 500, 600, 700],
             'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

param_test ={'n_estimators': [200, 300, 400, 500, 600, 700],
             'num_leaves': sp_randint(6, 50)}

In [44]:
lg_test = LGBMClassifier()
lg_random = RandomizedSearchCV(estimator=lg_test, 
                               param_distributions=param_test, 
                               n_iter=100, 
                               cv=3, 
                               verbose=2, 
                               random_state=seed,
                               scoring="accuracy",
                               n_jobs=-1)

In [45]:
lg_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.1min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 300, 400, 500, 600, 700], 'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000023E00444C50>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=2)

In [46]:
lg_random.best_params_

{'n_estimators': 700, 'num_leaves': 47}

{'n_estimators': 700, 'num_leaves': 47}