## This notebook creates the latest / most individually optimal version of each model and has them "vote" to predict each test data point. It outputs a CSV file capable of being scored on Kaggle. 

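## The ensemble process of having each model vote with equal weight is known as majority (hard) voting. (Bagging, by contrast, trains copies of a single model on bootstrap resamples of the training data and aggregates their predictions.)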

### Import Packages:

In [16]:
import sklearn as sk
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier

from scipy.stats import mode

# Seed NumPy's global random number generator so that code drawing from it
# produces the same numbers on every run. The Extra Trees and neural network
# models further down are additionally given explicit random_state values.
np.random.seed(0)

### Import and Prep Data:

In [17]:
# load the full training table (path is relative to this notebook)
data = pd.read_csv("../../data/train.csv")

# features: every column that is neither the target nor the row identifier
train_df = data.loc[:, ~data.columns.isin(["Cover_Type", "Id"])]

# target: the cover type of each training row
train_labels_df = data["Cover_Type"]

# preview the first rows of the feature table
train_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# load the full test table (path is relative to this notebook)
test_data = pd.read_csv("../../data/test.csv")

# features: everything except the row identifier, which is kept in test_data
# so the predictions can be indexed by it later
test_df = test_data.loc[:, ~test_data.columns.isin(["Id"])]

# preview the first rows of the feature table
test_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2680,354,14,0,0,2684,196,214,156,6645,...,0,0,0,0,0,0,0,0,0,0
1,2683,0,13,0,0,2654,201,216,152,6675,...,0,0,0,0,0,0,0,0,0,0
2,2713,16,15,0,0,2980,206,208,137,6344,...,0,0,0,0,0,0,0,0,0,0
3,2709,24,17,0,0,2950,208,201,125,6374,...,0,0,0,0,0,0,0,0,0,0
4,2706,29,19,0,0,2920,210,195,115,6404,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# load the bootstrapped training table (path is relative to this notebook)
# NOTE(review): none of the models in this notebook appear to be fitted on the
# bootstrapped data -- confirm whether this cell is still needed.
boot_data = pd.read_csv("../../data/Bootstrapped Data/bootstrapped_data.csv")

# features: every column that is neither the target nor the row identifier
train_boot_data = boot_data.loc[:, ~boot_data.columns.isin(["Cover_Type", "Id"])]

# target: the cover type of each bootstrapped row
train_labels_boot_data = boot_data["Cover_Type"]

In [27]:
# training features prepared for the neural network
nn_train_data = pd.read_csv("../Neural Network/nn_train_data.csv")

# matching training labels, flattened into a 1-D array
nn_train_labels = pd.read_csv("../Neural Network/nn_train_labels.csv").to_numpy().ravel()

# test features prepared for the neural network
nn_test_data = pd.read_csv("../Neural Network/nn_test_data.csv")

# preview the first rows of the test features
nn_test_data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Total_Distance_to_Hydrology,Average_Hillshade
0,2680,354,14,0,0,2684,196,214,156,6645,...,0,0,0,0,0,0,0,0,0.0,188.666667
1,2683,0,13,0,0,2654,201,216,152,6675,...,0,0,0,0,0,0,0,0,0.0,189.666667
2,2713,16,15,0,0,2980,206,208,137,6344,...,0,0,0,0,0,0,0,0,0.0,183.666667
3,2709,24,17,0,0,2950,208,201,125,6374,...,0,0,0,0,0,0,0,0,0.0,178.0
4,2706,29,19,0,0,2920,210,195,115,6404,...,0,0,0,0,0,0,0,0,0.0,173.333333


In [28]:
# training features for the tree-based model, as a plain array
# NOTE(review): "Id" is dropped from the test file below but not from this
# one -- presumably the training file has no Id column; verify.
dt_train_data = pd.read_csv("../PCA/total_distance_convert_train.csv").to_numpy()

# matching training labels, flattened into a 1-D array
dt_train_labels = pd.read_csv("../PCA/train_labels.csv").to_numpy().ravel()

# test features for the tree-based model, without the row identifier
dt_test_data = (
    pd.read_csv("../PCA/total_distance_convert_test.csv")
    .drop(columns=["Id"])
    .to_numpy()
)

### Create KNN Model and Predictions:

In [22]:
# 1-nearest-neighbour classifier with Euclidean distance, fitted on the training set
knn_model = KNeighborsClassifier(metric="euclidean", n_neighbors=1).fit(
    train_df, train_labels_df
)

# predicted cover type for every test row
knn_predictions = knn_model.predict(test_df)

# one-column frame of predictions, indexed by the test Ids
knn_predictions_df = pd.DataFrame(
    knn_predictions,
    index=test_data["Id"],
    columns=["Cover_Type"],
)

knn_predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,2
15123,1
15124,1
15125,1
...,...
581008,3
581009,3
581010,3
581011,3


### Create Naive Bayes Model and Predictions:

In [23]:
# Gaussian naive Bayes classifier with default settings, fitted on the training set
nb_model = GaussianNB().fit(train_df, train_labels_df)

# predicted cover type for every test row
nb_predictions = nb_model.predict(test_df)

# one-column frame of predictions, indexed by the test Ids
nb_predictions_df = pd.DataFrame(
    nb_predictions,
    index=test_data["Id"],
    columns=["Cover_Type"],
)

nb_predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,2
15123,1
15124,2
15125,2
...,...
581008,3
581009,3
581010,3
581011,3


### Create Decision Tree Ensemble (Extra Trees) Model and Predictions:

In [29]:
# extra-trees ensemble of 1000 trees with a fixed seed, fitted on the
# tree-specific training arrays
dt_model = ExtraTreesClassifier(n_estimators=1000, random_state=0).fit(
    dt_train_data, dt_train_labels
)

# predicted cover type for every test row
dt_predictions = dt_model.predict(dt_test_data)

# one-column frame of predictions, indexed by the test Ids
dt_predictions_df = pd.DataFrame(
    dt_predictions,
    index=test_data["Id"],
    columns=["Cover_Type"],
)

dt_predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,1
15123,1
15124,1
15125,1
...,...
581008,3
581009,3
581010,3
581011,3


### Create Logistic Regression and Predictions:

In [25]:
# L1-penalised logistic regression (liblinear solver, C=10), fitted on the training set
log_model = LogisticRegression(
    penalty="l1",
    C=10,
    solver="liblinear",
    multi_class="auto",
    max_iter=2000,
).fit(train_df, train_labels_df)

# predicted cover type for every test row
log_predictions = log_model.predict(test_df)

# one-column frame of predictions, indexed by the test Ids
log_predictions_df = pd.DataFrame(
    log_predictions,
    index=test_data["Id"],
    columns=["Cover_Type"],
)

log_predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,2
15123,2
15124,2
15125,2
...,...
581008,3
581009,3
581010,3
581011,3


### Create Neural Network and Predictions:

In [30]:
# ten hidden layers of 100 units each
hidden_layer_sizes = (100,) * 10

# ReLU multi-layer perceptron with early stopping and a fixed seed,
# fitted on the neural-network training data
nn_model = MLPClassifier(
    hidden_layer_sizes=hidden_layer_sizes,
    activation="relu",
    alpha=0.0001,
    early_stopping=True,
    random_state=1,
).fit(nn_train_data, nn_train_labels)

# predicted cover type for every test row
nn_predictions = nn_model.predict(nn_test_data)

# one-column frame of predictions, indexed by the test Ids
nn_predictions_df = pd.DataFrame(
    nn_predictions,
    index=test_data["Id"],
    columns=["Cover_Type"],
)

nn_predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,2
15123,1
15124,1
15125,1
...,...
581008,3
581009,3
581010,3
581011,3


### Get predictions from each model and determine final test labels:

In [37]:
# Prediction frames that take part in the vote; each contributes one vote per
# test point.
# NOTE(review): dt_predictions_df is listed twice, so the Extra Trees model
# casts two of the four votes, while nb_predictions_df and log_predictions_df
# cast none. Kept as-is so the output is unchanged -- confirm this weighting
# is intended.
model_predictions_list = [knn_predictions_df, dt_predictions_df,
                          dt_predictions_df, nn_predictions_df]

# Stack the first (only) column of every frame side by side:
# one row per test point, one column per vote.
votes = np.column_stack(
    [preds.iloc[:, 0].to_numpy() for preds in model_predictions_list]
)

# Most common label in each row, computed for all test points in one call
# instead of a Python loop over every row. Ties are resolved by
# scipy.stats.mode's own rule. np.ravel flattens the result whether SciPy
# returns it with shape (n, 1) or (n,), which differs between versions.
best_predictions = np.ravel(mode(votes, axis=1)[0])

# Copy an existing prediction frame to reuse its Id index and column name,
# then overwrite the whole column in a single assignment. The previous
# element-wise `df.iloc[i][0] = value` was chained assignment, which does not
# write back to the frame when pandas Copy-on-Write is active.
final_predictions_df = nn_predictions_df.copy()
final_predictions_df.iloc[:, 0] = best_predictions


In [38]:
# Write the final predictions to a CSV file in the current working directory.
# The frame's index (the test Ids) is written as the first column, followed by
# the Cover_Type column.
final_predictions_df.to_csv("ensemble_predictions.csv")

In [39]:
# display the final ensemble predictions as the cell output
final_predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,1
15123,1
15124,1
15125,1
...,...
581008,3
581009,3
581010,3
581011,3
