# Learn-Together: Feature Engineering

**References:**

* https://rstudio-pubs-static.s3.amazonaws.com/160297_f7bcb8d140b74bd19b758eb328344908.html
* https://www.kaggle.com/kwabenantim/forest-cover-stacking-multiple-classifiers

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Fixed seed reused as `random_state` for the train/validation split and the
# random forest below, so both are driven by the same value.
seed = 42

In [32]:
def generate_submission(model, test_data, test_ids, file_name):
    """Predict on the test data and write the result as a submission CSV.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        test_data: Features handed unchanged to ``model.predict``.
        test_ids: Values for the ``Id`` column; must line up row-for-row
            with ``test_data``.
        file_name: Base name of the output file, without the ``.csv``
            extension.

    Returns:
        None. The predictions are written to ``submissions/<file_name>.csv``
        with columns ``Id`` and ``Cover_Type`` and no index column.
    """
    from pathlib import Path

    predictions = model.predict(test_data)
    output = pd.DataFrame({"Id": test_ids, "Cover_Type": predictions})

    out_path = Path("submissions") / (file_name + ".csv")
    # to_csv does not create missing directories, so make sure the target
    # folder exists instead of failing on a fresh checkout.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    output.to_csv(out_path, index=False)

In [33]:
# Read the sample submission, training table and test table (in that order)
# from the assets/learn-together/ folder.
submission_ex, train_df, test_df = map(
    pd.read_csv,
    (
        "assets/learn-together/sample_submission.csv",
        "assets/learn-together/train.csv",
        "assets/learn-together/test.csv",
    ),
)

In [34]:
def _add_distance_features(frame):
    """Add the combined elevation/distance columns to ``frame`` in place.

    One elevation-minus-vertical-hydrology column is added, followed by a
    sum and an absolute difference for each pair of the three horizontal
    distance columns (hydrology, fire points, roadways).
    """
    # Look the source columns up once; they are reused several times below.
    hydro = frame['Horizontal_Distance_To_Hydrology']
    fire = frame['Horizontal_Distance_To_Fire_Points']
    road = frame['Horizontal_Distance_To_Roadways']

    frame['Hydro_Elevation_diff'] = (
        frame['Elevation'] - frame['Vertical_Distance_To_Hydrology']
    )

    frame['Hydro_Fire_sum'] = hydro + fire
    frame['Hydro_Fire_diff'] = (hydro - fire).abs()

    frame['Hydro_Road_sum'] = hydro + road
    frame['Hydro_Road_diff'] = (hydro - road).abs()

    frame['Road_Fire_sum'] = road + fire
    frame['Road_Fire_diff'] = (road - fire).abs()


# Apply identical feature engineering to both tables so they keep the same
# set and order of columns.
for dataset in (train_df, test_df):
    _add_distance_features(dataset)

In [35]:
# The label column is kept as a one-element list, so selecting with it
# yields a single-column DataFrame rather than a Series.
target = ["Cover_Type"]
cols_to_drop = ["Id", "Soil_Type7", "Soil_Type15", "Cover_Type"]

# Pull out the labels and the test ids before those columns are removed.
y = train_df[target]
test_ids = test_df["Id"]

# Non-mutating drops return new frames, leaving train_df/test_df untouched.
# The test table has no label column, so its drop list omits "Cover_Type".
train = train_df.drop(columns=cols_to_drop)
test = test_df.drop(columns=["Id", "Soil_Type7", "Soil_Type15"])

In [36]:
# Hold out 20% of the training rows for validation; the shared seed fixes
# the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    train, y, test_size=0.2, random_state=seed
)

# Show the shapes of the four pieces as a sanity check on the split.
tuple(part.shape for part in (X_train, y_train, X_val, y_val))

((12096, 59), (12096, 1), (3024, 59), (3024, 1))

In [37]:
# Hyperparameters for the random forest.
# NOTE(review): presumably the result of an earlier hyperparameter search
# that is not part of this notebook — confirm the source of these values.
best_params = {
    'bootstrap': False,
    'max_depth': 50,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300,
}

# Unpack the whole dict so every entry, including 'bootstrap', reaches the
# estimator; the dict is the single source of truth for the configuration.
rf = RandomForestClassifier(random_state=seed, **best_params)

In [38]:
# y_train is a single-column frame; flatten it to a 1-D label array before
# handing it to fit().
train_labels = y_train.values.ravel()
rf.fit(X_train, train_labels)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [39]:
# Score the fitted forest on the held-out validation rows.
rf_predictions = rf.predict(X_val)
rf_accuracy = accuracy_score(y_true=y_val, y_pred=rf_predictions)

# Bare expression so the notebook displays the accuracy.
rf_accuracy

0.9077380952380952

In [40]:
# Predict on the engineered test features and write the submission file.
generate_submission(
    model=rf,
    test_data=test,
    test_ids=test_ids,
    file_name="3_random_forest_feat_eng",
)