# Learn-Together: Classifiers

*Experimented with different classifiers; also scaled training/test data*

* https://www.kaggle.com/kwabenantim/forest-cover-stacking-multiple-classifiers

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import accuracy_score

seed = 42

In [18]:
def generate_submission(model, test_data, test_ids, file_name):
    predictions = model.predict(test_data)
    output = pd.DataFrame({"Id": test_ids, "Cover_Type": predictions})
    output.to_csv("submissions/"+ file_name +".csv", index=False)
    print("Submission generated.")

In [19]:
submission_ex = pd.read_csv("assets/learn-together/sample_submission.csv")
train_df = pd.read_csv("assets/learn-together/train.csv")
test_df = pd.read_csv("assets/learn-together/test.csv")

for X in [train_df, test_df]:
    X['Hydro_Elevation_diff'] = (X['Elevation'] - 
                                 X['Vertical_Distance_To_Hydrology'])

    X['Hydro_Fire_sum'] = (X['Horizontal_Distance_To_Hydrology'] + 
                           X['Horizontal_Distance_To_Fire_Points'])

    X['Hydro_Fire_diff'] = (X['Horizontal_Distance_To_Hydrology'] - 
                            X['Horizontal_Distance_To_Fire_Points']).abs()

    X['Hydro_Road_sum'] = (X['Horizontal_Distance_To_Hydrology'] +
                           X['Horizontal_Distance_To_Roadways'])

    X['Hydro_Road_diff'] = (X['Horizontal_Distance_To_Hydrology'] -
                            X['Horizontal_Distance_To_Roadways']).abs()

    X['Road_Fire_sum'] = (X['Horizontal_Distance_To_Roadways'] + 
                          X['Horizontal_Distance_To_Fire_Points'])

    X['Road_Fire_diff'] = (X['Horizontal_Distance_To_Roadways'] - 
                           X['Horizontal_Distance_To_Fire_Points']).abs()
    
target = ["Cover_Type"]
cols_to_drop = ["Id", "Soil_Type7", "Soil_Type15", "Cover_Type"]

train = train_df.copy()
test = test_df.copy()

y = train[target]
train.drop(columns=cols_to_drop, inplace=True)
test_ids = test["Id"]
test.drop(columns=["Id", "Soil_Type7", "Soil_Type15"], inplace=True)

X_train, X_val, y_train, y_val = train_test_split(train,
                                                  y, 
                                                  test_size=0.2, 
                                                  random_state=seed)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

# NEW
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
test = sc.transform(test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [20]:
ab = AdaBoostClassifier(n_estimators=200,
                            base_estimator=DecisionTreeClassifier(
                                min_samples_leaf=2,
                                random_state=seed),
                            random_state=seed)

et = ExtraTreesClassifier(n_estimators=300,
                              min_samples_leaf=2,
                              min_samples_split=2,
                              max_depth=50,
                              random_state=seed,
                              n_jobs=-1)

lg = LGBMClassifier(n_estimators=300,
                        num_leaves=128,
                        verbosity=-1,
                        random_state=seed,
                        n_jobs=1)

best_rf_params = {'bootstrap': False,
 'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

rf = RandomForestClassifier(n_estimators=best_rf_params["n_estimators"],
                            max_depth=best_rf_params["max_depth"],
                            min_samples_leaf=best_rf_params["min_samples_leaf"],
                            min_samples_split=best_rf_params["min_samples_split"],
                            bootstrap=False,
                            random_state=seed,
                            n_jobs=-1)

models = [ab, et, lg, rf]

stack = StackingCVClassifier(classifiers=models,
                             meta_classifier=rf,
                             cv=5,
                             stratify=True,
                             shuffle=True,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=1,
                             random_state=seed,
                             n_jobs=-1)

stack = stack.fit(X_train, y_train)

print("Fit completed.")

Fitting 4 classifiers...
Fitting classifier1: adaboostclassifier (1/4)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier2: extratreesclassifier (2/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier3: lgbmclassifier (3/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting classifier4: randomforestclassifier (4/4)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.1s finished


Fit completed.


In [21]:
predictions = stack.predict(X_val)

In [22]:
accuracy = accuracy_score(y_val, predictions)
accuracy

0.9110449735449735

In [23]:
generate_submission(stack, test, test_ids, "4_stacked_classifiers_fixed")

Submission generated.
