# A Study of Feature Importance in the Forest Cover Type Prediction Dataset


Data source: https://www.kaggle.com/c/forest-cover-type-prediction

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from pandas.tools.plotting import scatter_matrix

from IPython.core.display import display, HTML
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif 

%matplotlib inline

# Silence library warnings so they do not clutter the cell output.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Load Data

In [None]:
# Read the training CSV; its first column becomes the DataFrame index.
forest = pd.read_csv("../data/train.csv", index_col=0) 
# Preview the first rows (shown as the cell output).
forest.head()

## Create Functions for Feature Engineering

In [None]:
def labelSoilType(row):
    """
    Return the label of the first soil-type indicator that is set.

    Parameters
    ----------
    row : sequence or pandas.Series
        Soil-type indicator values in column order.

    Returns
    -------
    str or None
        'Soil_Type<k>' where k is the 0-based position of the first value
        equal to 1, or None when no value equals 1.
    """
    # Iterate over the values instead of indexing with row[i]: integer keys
    # on a string-labelled Series depend on a deprecated positional fallback.
    # NOTE(review): k is 0-based, so the first indicator column produces
    # 'Soil_Type0' -- confirm this matches the keys used in soil_types.csv.
    for position, flag in enumerate(row):
        if flag == 1:
            return 'Soil_Type'+str(position)
    return None
        
def azimuth_to_abs(x):
    """
    Fold an azimuth so that only the angular distance from 0 is kept.

    Values greater than 180 are mirrored to 360 - x; every other value is
    returned unchanged.
    """
    return 360-x if x>180 else x

## Feature Engineering
#### Much of the inspiration for these features can be found in the feature exploration notebooks

In [None]:
# Create Soil Type Buckets
# Lookup table read from soil_types.csv, keyed by its 'Soil Type' column.
soil_types = pd.read_csv('soil_types.csv').set_index('Soil Type')

# The forty soil indicator columns, Soil_Type1 .. Soil_Type40, in order.
soil_indicator_columns = ['Soil_Type'+str(k) for k in range(1, 41)]

# Collapse each row's indicators into a single string key via labelSoilType.
forest['Soil Type'] = forest[soil_indicator_columns].apply(labelSoilType, axis=1)

# Left-join the lookup columns onto every row using that key.
forest = pd.merge(forest, soil_types, how='left', left_on='Soil Type', right_index=True)

# Delete the string key column now that the join is done.
del forest['Soil Type']

# Create a feature that folds the azimuth onto the angular distance from 0.
# The integer cast is applied to the stored column; previously the result of
# astype(int) was computed on a separate line and thrown away.
forest['Aspect2'] = forest.Aspect.map(azimuth_to_abs).astype(int)

# Flag rows whose vertical distance to hydrology is positive.
# NOTE(review): the column is called 'Above_Sealevel' but the test is against
# Vertical_Distance_To_Hydrology, not sea level -- name kept for compatibility.
forest['Above_Sealevel'] = (forest.Vertical_Distance_To_Hydrology>0).astype(int)

# Bin the Elevation feature: check the feature exploration notebook for motivation
bins = [0, 2600, 3100, 8000]
group_names = [1, 2, 3]
forest['Elevation_Bucket'] = pd.cut(forest['Elevation'], bins, labels=group_names)
# np.where already yields integer 0/1 columns, so the discarded
# `.astype(int)` statements that used to follow were no-ops and are removed.
forest['Elevation_0_2600'] = np.where(forest['Elevation_Bucket']== 1, 1, 0)
forest['Elevation_2600_3100'] = np.where(forest['Elevation_Bucket']== 2, 1, 0)
forest['Elevation_3100_8000'] = np.where(forest['Elevation_Bucket']== 3, 1, 0)
del forest['Elevation_Bucket']  # temporary helper column

# Create a feature for no hillshade at 3pm
forest['3PM_0_Hillshade'] = (forest.Hillshade_3pm == 0).astype(int)

# Direct (Euclidean) distance to hydrology from its vertical and horizontal
# components, rounded to two decimals.
forest['Direct_Distance_To_Hydrology'] = np.sqrt(
    (forest.Vertical_Distance_To_Hydrology**2)
    + (forest.Horizontal_Distance_To_Hydrology**2)
).astype(float).round(2)

# Soil indicator columns to keep (Soil_Type7 and Soil_Type15 are not listed),
# followed by the target column name.
omitted_soil_ids = (7, 15)
soil_types = ['Soil_Type'+str(k) for k in range(1, 41) if k not in omitted_soil_ids]
soil_types.append('Cover_Type')

# Engineered columns that are moved to positions 10..16 of the column order.
engineered_columns = ['Direct_Distance_To_Hydrology',
                      'Elevation_0_2600',
                      'Elevation_2600_3100',
                      'Elevation_3100_8000',
                      'Aspect2',
                      'Above_Sealevel',
                      '3PM_0_Hillshade']

# Start from every column that is not a soil indicator, splice the engineered
# names in at position 10, then put the kept soil indicators at the end.
column_list = [c for c in forest.columns.tolist() if not c.startswith('Soil_Type')]
column_list[10:10] = engineered_columns
column_list.extend(soil_types)

# Keep the first occurrence of each name and force the target to be last.
columns = []
already_listed = set()
for col in column_list:
    if col == 'Cover_Type' or col in already_listed:
        continue
    already_listed.add(col)
    columns.append(col)
columns.append('Cover_Type')

# Replace nans with 0 for our soil type bins
forest = forest[columns].fillna(0)

## Remove Base Features with no Modeling Value

In [None]:
# Features to drop: every column whose standard deviation is exactly zero
# (a single repeated value).
to_remove = [c for c in forest.columns.tolist() if forest[c].std() == 0]
# Pass the axis by keyword: the positional form drop(labels, 1) is no longer
# accepted by current pandas releases.
forest = forest.drop(to_remove, axis=1)

## Add Feature Interactions

In [None]:
# Pairwise product features: every feature column (all but the last column)
# is multiplied with each of the first `n_partner_columns` feature columns,
# skipping a column paired with itself. New columns are named "<left>_<right>".
#
# Column names are snapshotted once and the products are attached with a
# single concat, instead of re-reading forest.columns and inserting a column
# for every pair.
# NOTE(review): both "<a>_<b>" and "<b>_<a>" are produced for pairs inside the
# first 54 columns and hold identical values -- kept to preserve the existing
# feature set. Generated names are assumed not to collide with existing ones.
n_partner_columns = 54
feature_columns = forest.columns.tolist()[:-1]
partner_columns = feature_columns[:n_partner_columns]

interaction_columns = []
for i, left in enumerate(feature_columns):
    for j, right in enumerate(partner_columns):
        if i != j:
            product = forest[left]*forest[right]
            product.name = left+"_"+right
            interaction_columns.append(product)

if interaction_columns:
    forest = pd.concat([forest] + interaction_columns, axis=1)

## Remove Columns That Have No Value

In [None]:
# Features to drop: every column (including the new interaction terms) whose
# standard deviation is exactly zero.
to_remove = [c for c in forest.columns.tolist() if forest[c].std() == 0]
# Pass the axis by keyword: the positional form drop(labels, 1) is no longer
# accepted by current pandas releases.
forest = forest.drop(to_remove, axis=1)


### Transform the continuous features
###### We will try Normalization, Standardized Scaling, and MinMax Scaling
###### Note: there is no need to impute any data points as this is a pretty clean data set

In [None]:
# Validation configuration
chunk_size = 0.1  # fraction of rows held out for validation
seed = 0          # shared random seed so every split uses the same rows

# Bookkeeping for each of the four column chunks. Per chunk:
#   ranksN      importance rank of all features
#   X_allN      data arrays, one entry per transformation
#   X_all_addN  one entry per (transformation, model) combination
#   remN        columns to be dropped
#   i_remN      indexes of columns to be dropped
#   trans_listN transformation names
#   combN       combination labels, seeded with "All+1.0"
ranks1, X_all1, X_all_add1, rem1, i_rem1, trans_list1 = [], [], [], [], [], []
comb1 = ["All+1.0"]

ranks2, X_all2, X_all_add2, rem2, i_rem2, trans_list2 = [], [], [], [], [], []
comb2 = ["All+1.0"]

ranks3, X_all3, X_all_add3, rem3, i_rem3, trans_list3 = [], [], [], [], [], []
comb3 = ["All+1.0"]

ranks4, X_all4, X_all_add4, rem4, i_rem4, trans_list4 = [], [], [], [], [], []
comb4 = ["All+1.0"]

ratio_list = [0.75,0.50,0.25]  # select top 75%, 50%, 25% of features
features = []        # feature selection models
model_features = []  # names of feature selection models

# Reorder the data so that continuous variables come first. A column with more
# than 4 distinct values is treated as continuous, anything else as categorical.
continuous = []
categorical = []
for col in forest.columns.tolist():
    if col in to_remove or col == 'Cover_Type':
        continue
    if forest[col].nunique() > 4:
        continuous.append(col)
    else:
        categorical.append(col)

# Final layout: continuous, then categorical, then the target.
final_columns = continuous + categorical + ['Cover_Type']
forest = forest[final_columns]
num_rows, num_cols = forest.shape
cols = forest.columns
size = len(continuous)  # number of continuous columns


In [None]:
# Positional indexes of every feature column (the last column is the target).
n_features = num_cols-1
i_cols = list(range(0,n_features))

# Boundaries that cut the feature columns into four consecutive chunks.
bound_q1 = int(n_features/4)
bound_q2 = int(n_features/2)
bound_q3 = int(3*n_features/4)

i_cols1 = i_cols[0:bound_q1]
i_cols2 = i_cols[bound_q1:bound_q2]
i_cols3 = i_cols[bound_q2:bound_q3]
i_cols4 = i_cols[bound_q3:n_features]

cols1 = forest.columns[0:bound_q1]
cols2 = forest.columns[bound_q1:bound_q2]
cols3 = forest.columns[bound_q2:bound_q3]
cols4 = forest.columns[bound_q3:n_features]

# Create the data arrays for model building. Every chunk is split with the
# same test_size and random_state, and always against the last column.
val_array = forest.values

X1 = val_array[:,0:bound_q1]
y1 = val_array[:,n_features]
X_train1, X_val1, y_train1, y_val1 = train_test_split(X1, y1, test_size=chunk_size, random_state=seed)
X_all1.append(['Orig','X1', X_train1,X_val1,1.0,cols1,rem1,ranks1,i_cols1,i_rem1])

X2 = val_array[:,bound_q1:bound_q2]
y2 = val_array[:,n_features]
X_train2, X_val2, y_train2, y_val2 = train_test_split(X2, y2, test_size=chunk_size, random_state=seed)
X_all2.append(['Orig','X2', X_train2,X_val2,1.0,cols2,rem2,ranks2,i_cols2,i_rem2])

X3 = val_array[:,bound_q2:bound_q3]
y3 = val_array[:,n_features]
X_train3, X_val3, y_train3, y_val3 = train_test_split(X3, y3, test_size=chunk_size, random_state=seed)
X_all3.append(['Orig','X3', X_train3,X_val3,1.0,cols3,rem3,ranks3,i_cols3,i_rem3])

X4 = val_array[:,bound_q3:n_features]
y4 = val_array[:,n_features]
X_train4, X_val4, y_train4, y_val4 = train_test_split(X4, y4, test_size=chunk_size, random_state=seed)
X_all4.append(['Orig','X4', X_train4,X_val4,1.0,cols4,rem4,ranks4,i_cols4,i_rem4])

In [None]:
# Standardize, MinMax-scale and Normalize chunk 1 (every column is transformed).
#
# Each transformer is fitted on the training split only and the fitted object
# is reused for the validation split. Fitting a second transformer on the
# validation rows, as was done before, scales the two splits with different
# statistics. (Normalizer works row by row, so its result is unaffected.)
for label, scaler in (('StdSca', StandardScaler()),
                      ('MinMax', MinMaxScaler()),
                      ('Norm', Normalizer())):
    scaler.fit(X_train1)
    X_temp1 = scaler.transform(X_train1)
    X_val_temp1 = scaler.transform(X_val1)
    X_all1.append([label, 'All', X_temp1,X_val_temp1,1.0,cols1,rem1,ranks1,i_cols1,i_rem1])

# Add transformation to the list. Only the label of each entry is read, so the
# data arrays X1 / X_val1 are no longer overwritten by loop targets.
trans_list1.extend(entry[0] for entry in X_all1)
trans_list1

# Standardize, MinMax-scale and Normalize chunk 2 (every column is transformed).
#
# Each transformer is fitted on the training split only and the fitted object
# is reused for the validation split. Fitting a second transformer on the
# validation rows, as was done before, scales the two splits with different
# statistics. (Normalizer works row by row, so its result is unaffected.)
for label, scaler in (('StdSca', StandardScaler()),
                      ('MinMax', MinMaxScaler()),
                      ('Norm', Normalizer())):
    scaler.fit(X_train2)
    X_temp2 = scaler.transform(X_train2)
    X_val_temp2 = scaler.transform(X_val2)
    X_all2.append([label, 'All', X_temp2,X_val_temp2,1.0,cols2,rem2,ranks2,i_cols2,i_rem2])

# Add transformation to the list. Only the label of each entry is read, so the
# data arrays X2 / X_val2 are no longer overwritten by loop targets.
trans_list2.extend(entry[0] for entry in X_all2)
trans_list2

# Standardize, MinMax-scale and Normalize the continuous part of chunk 3.
#
# Chunk 3 starts at column int((num_cols-1)/2) and continuous columns come
# first in the frame, so the leading `n_cont3` columns of this chunk are the
# continuous ones. The count is clamped to [0, chunk width]: an unclamped
# negative value would slice from the end of the array, and an oversized one
# hides the fact that the whole chunk is continuous.
chunk3_start = int((num_cols-1)/2)
n_cont3 = min(max(size-chunk3_start, 0), X_train3.shape[1])

# Each transformer is fitted on the training split only and reused for the
# validation split, so both splits are scaled with the same statistics.
for label, scaler in (('StdSca', StandardScaler()),
                      ('MinMax', MinMaxScaler()),
                      ('Norm', Normalizer())):
    if n_cont3 > 0:
        scaler.fit(X_train3[:,0:n_cont3])
        X_temp3 = scaler.transform(X_train3[:,0:n_cont3])
        X_val_temp3 = scaler.transform(X_val3[:,0:n_cont3])
    else:
        # Nothing to scale; keep empty slices so the concatenation still works.
        X_temp3 = X_train3[:,0:0]
        X_val_temp3 = X_val3[:,0:0]

    # Recombine the transformed columns with the untouched remainder.
    X_con3 = np.concatenate((X_temp3,X_train3[:,n_cont3:]),axis=1)
    X_val_con3 = np.concatenate((X_val_temp3,X_val3[:,n_cont3:]),axis=1)

    X_all3.append([label, 'All', X_con3,X_val_con3,1.0,cols3,rem3,ranks3,i_cols3,i_rem3])

# Add transformation to the list. Only the label of each entry is read, so the
# data arrays X3 / X_val3 are no longer overwritten by loop targets.
trans_list3.extend(entry[0] for entry in X_all3)
trans_list3

# Chunk 4 is registered under each transformation label with its ORIGINAL,
# untransformed arrays -- no scaler is applied here.
# NOTE(review): presumably because this chunk only holds categorical columns
# (continuous columns are ordered first) -- confirm.
for label in ('StdSca', 'MinMax', 'Norm'):
    X_all4.append([label, 'All', X_train4,X_val4,1.0,cols4,rem4,ranks4,i_cols4,i_rem4])

# Add transformation to the list. Only the label of each entry is read, so the
# data arrays X4 / X_val4 are no longer overwritten by loop targets.
trans_list4.extend(entry[0] for entry in X_all4)
trans_list4

### Create the classifiers for measuring importance
- Extra Trees Classifier
- Gradient Boosting Classifier
- Random Forest Classifier
- XGBoost Classifier
- Recursive Feature Elimination (RFE)
- Select Percentile

In [None]:
# Register three models per classifier family, one for each ratio in
# ratio_list. Every `features` entry is [family name, ratio, unfitted model]
# and the matching "<name>+<ratio>" label is appended to comb1.
# The number of estimators equals the number of columns in chunk 1.
model_builders = [
    ('Extra Trees Classifier',
     lambda ratio: ExtraTreesClassifier(n_estimators=len(cols1),max_features=ratio,n_jobs=-1,random_state=seed)),
    ('Random Forest Classifier',
     lambda ratio: RandomForestClassifier(n_estimators=len(cols1),max_features=ratio,n_jobs=-1,random_state=seed)),
    # The ratio is not passed to XGBoost; it is only stored next to the model.
    ('XGBoost Classifier',
     lambda ratio: XGBClassifier(n_estimators=len(cols1),seed=seed)),
]

for n, build_model in model_builders:
    model_features.append(n)
    for val in ratio_list:
        comb1.append("%s+%s" % (n,val))
        features.append([n,val,build_model(val)])

### Run All the Models

In [None]:
# Determine feature importance for each model and transformation combination

for trans, s, X1, X_val1, d, cols1, rem1, ra1, i_cols1, i_rem1 in X_all1:
    for name, v, model in features:
        print ("Training ", name, "with", v*100, "% features and a", trans, "transformation")
        # Train the model against y
        model.fit(X1,y_train1)

        # Per-column score exposed by this kind of model.
        # NOTE(review): no model named "Random Feature Elimination" or
        # "Select Percentile" is registered in this file; for ranking_ a lower
        # value is better, so the descending sort below would need inverting.
        if name == "Random Feature Elimination":
            scores = list(model.ranking_)
        elif name == "Select Percentile":
            scores = list(model.scores_)
        else:
            scores = list(model.feature_importances_)

        # Column positions from highest to lowest score (ties keep column order).
        order = sorted(range(len(scores)), key=lambda pos: -scores[pos])
        rem_start = int(v*(len(cols1))) # Starting point of the columns to be dropped

        # Selected columns come first in `order`, dropped columns after.
        i_cols_list = order[:rem_start]
        i_rem_list = order[rem_start:]
        cols_list = [cols1[pos] for pos in i_cols_list]
        rem_list = [cols1[pos] for pos in i_rem_list]

        # rank_by_column[pos] is the rank (0 = most important) of column `pos`.
        rank_by_column = [0]*len(order)
        for rank, pos in enumerate(order):
            rank_by_column[pos] = rank

        # Append model name, array, columns selected and columns to be removed to the additional list
        X_all_add1.append([trans,name,X1,X_val1,v,cols_list,rem_list,rank_by_column,i_cols_list,i_rem_list])


# Median rank of every column across all combinations, best (lowest) first.
# Series.sort() no longer exists in pandas; sort_values() returns the sorted copy.
rank_df = pd.DataFrame(data=[x[7] for x in X_all_add1],columns=cols1)
med = rank_df.median().sort_values()
top50_set1=med[:50]

In [None]:
ratio_list = [0.75,0.50,0.25]  # select top 75%, 50%, 25% of features
features = []        # feature selection models
model_features = []  # names of feature selection models

# Register three models per classifier family, one for each ratio in
# ratio_list. Every `features` entry is [family name, ratio, unfitted model]
# and the matching "<name>+<ratio>" label is appended to comb2.
# The number of estimators equals the number of columns in chunk 2.
model_builders = [
    ('Extra Trees Classifier',
     lambda ratio: ExtraTreesClassifier(n_estimators=len(cols2),max_features=ratio,n_jobs=-1,random_state=seed)),
    ('Random Forest Classifier',
     lambda ratio: RandomForestClassifier(n_estimators=len(cols2),max_features=ratio,n_jobs=-1,random_state=seed)),
    # The ratio is not passed to XGBoost; it is only stored next to the model.
    ('XGBoost Classifier',
     lambda ratio: XGBClassifier(n_estimators=len(cols2),seed=seed)),
]

for n, build_model in model_builders:
    model_features.append(n)
    for val in ratio_list:
        comb2.append("%s+%s" % (n,val))
        features.append([n,val,build_model(val)])

In [None]:
# Determine feature importance for each model and transformation combination

for trans, s, X2, X_val2, d, cols2, rem2, ra2, i_cols2, i_rem2 in X_all2:
    for name, v, model in features:
        print ("Training ", name, "with", v*100, "% features and a", trans, "transformation")
        # Train the model against y
        model.fit(X2,y_train2)

        # Per-column score exposed by this kind of model.
        # NOTE(review): no model named "Random Feature Elimination" or
        # "Select Percentile" is registered in this file; for ranking_ a lower
        # value is better, so the descending sort below would need inverting.
        if name == "Random Feature Elimination":
            scores = list(model.ranking_)
        elif name == "Select Percentile":
            scores = list(model.scores_)
        else:
            scores = list(model.feature_importances_)

        # Column positions from highest to lowest score (ties keep column order).
        order = sorted(range(len(scores)), key=lambda pos: -scores[pos])
        rem_start = int(v*(len(cols2))) # Starting point of the columns to be dropped

        # Selected columns come first in `order`, dropped columns after.
        i_cols_list = order[:rem_start]
        i_rem_list = order[rem_start:]
        cols_list = [cols2[pos] for pos in i_cols_list]
        rem_list = [cols2[pos] for pos in i_rem_list]

        # rank_by_column[pos] is the rank (0 = most important) of column `pos`.
        rank_by_column = [0]*len(order)
        for rank, pos in enumerate(order):
            rank_by_column[pos] = rank

        # Append model name, array, columns selected and columns to be removed to the additional list
        X_all_add2.append([trans,name,X2,X_val2,v,cols_list,rem_list,rank_by_column,i_cols_list,i_rem_list])


# Median rank of every column across all combinations, best (lowest) first.
# Series.sort() no longer exists in pandas; sort_values() returns the sorted copy.
rank_df = pd.DataFrame(data=[x[7] for x in X_all_add2],columns=cols2)
med = rank_df.median().sort_values()
top50_set2=med[:50]

In [None]:
ratio_list = [0.75,0.50,0.25]  # select top 75%, 50%, 25% of features
features = []        # feature selection models
model_features = []  # names of feature selection models

# Register three models per classifier family, one for each ratio in
# ratio_list. Every `features` entry is [family name, ratio, unfitted model]
# and the matching "<name>+<ratio>" label is appended to comb3.
# The number of estimators equals the number of columns in chunk 3.
model_builders = [
    ('Extra Trees Classifier',
     lambda ratio: ExtraTreesClassifier(n_estimators=len(cols3),max_features=ratio,n_jobs=-1,random_state=seed)),
    ('Random Forest Classifier',
     lambda ratio: RandomForestClassifier(n_estimators=len(cols3),max_features=ratio,n_jobs=-1,random_state=seed)),
    # The ratio is not passed to XGBoost; it is only stored next to the model.
    ('XGBoost Classifier',
     lambda ratio: XGBClassifier(n_estimators=len(cols3),seed=seed)),
]

for n, build_model in model_builders:
    model_features.append(n)
    for val in ratio_list:
        comb3.append("%s+%s" % (n,val))
        features.append([n,val,build_model(val)])

In [None]:
# Determine feature importance for each model and transformation combination

for trans, s, X3, X_val3, d, cols3, rem3, ra3, i_cols3, i_rem3 in X_all3:
    for name, v, model in features:
        print ("Training ", name, "with", v*100, "% features and a", trans, "transformation")
        # Train the model against y
        model.fit(X3,y_train3)

        # Per-column score exposed by this kind of model.
        # NOTE(review): no model named "Random Feature Elimination" or
        # "Select Percentile" is registered in this file; for ranking_ a lower
        # value is better, so the descending sort below would need inverting.
        if name == "Random Feature Elimination":
            scores = list(model.ranking_)
        elif name == "Select Percentile":
            scores = list(model.scores_)
        else:
            scores = list(model.feature_importances_)

        # Column positions from highest to lowest score (ties keep column order).
        order = sorted(range(len(scores)), key=lambda pos: -scores[pos])
        rem_start = int(v*(len(cols3))) # Starting point of the columns to be dropped

        # Selected columns come first in `order`, dropped columns after.
        i_cols_list = order[:rem_start]
        i_rem_list = order[rem_start:]
        cols_list = [cols3[pos] for pos in i_cols_list]
        rem_list = [cols3[pos] for pos in i_rem_list]

        # rank_by_column[pos] is the rank (0 = most important) of column `pos`.
        rank_by_column = [0]*len(order)
        for rank, pos in enumerate(order):
            rank_by_column[pos] = rank

        # Append model name, array, columns selected and columns to be removed to the additional list
        X_all_add3.append([trans,name,X3,X_val3,v,cols_list,rem_list,rank_by_column,i_cols_list,i_rem_list])


# Median rank of every column across all combinations, best (lowest) first.
# Series.sort() no longer exists in pandas; sort_values() returns the sorted copy.
rank_df = pd.DataFrame(data=[x[7] for x in X_all_add3],columns=cols3)
med = rank_df.median().sort_values()
top50_set3=med[:50]

In [None]:
ratio_list = [0.75,0.50,0.25]  # select top 75%, 50%, 25% of features
features = []        # feature selection models
model_features = []  # names of feature selection models

# Register three models per classifier family, one for each ratio in
# ratio_list. Every `features` entry is [family name, ratio, unfitted model]
# and the matching "<name>+<ratio>" label is appended to comb4.
# The number of estimators equals the number of columns in chunk 4.
model_builders = [
    ('Extra Trees Classifier',
     lambda ratio: ExtraTreesClassifier(n_estimators=len(cols4),max_features=ratio,n_jobs=-1,random_state=seed)),
    ('Random Forest Classifier',
     lambda ratio: RandomForestClassifier(n_estimators=len(cols4),max_features=ratio,n_jobs=-1,random_state=seed)),
    # The ratio is not passed to XGBoost; it is only stored next to the model.
    ('XGBoost Classifier',
     lambda ratio: XGBClassifier(n_estimators=len(cols4),seed=seed)),
]

for n, build_model in model_builders:
    model_features.append(n)
    for val in ratio_list:
        comb4.append("%s+%s" % (n,val))
        features.append([n,val,build_model(val)])

In [None]:
# Determine feature importance for each model and transformation combination

for trans, s, X4, X_val4, d, cols4, rem4, ra4, i_cols4, i_rem4 in X_all4:
    for name, v, model in features:
        print ("Training ", name, "with", v*100, "% features and a", trans, "transformation")
        # Train the model against y
        model.fit(X4,y_train4)

        # Per-column score exposed by this kind of model.
        # NOTE(review): no model named "Random Feature Elimination" or
        # "Select Percentile" is registered in this file; for ranking_ a lower
        # value is better, so the descending sort below would need inverting.
        if name == "Random Feature Elimination":
            scores = list(model.ranking_)
        elif name == "Select Percentile":
            scores = list(model.scores_)
        else:
            scores = list(model.feature_importances_)

        # Column positions from highest to lowest score (ties keep column order).
        order = sorted(range(len(scores)), key=lambda pos: -scores[pos])
        rem_start = int(v*(len(cols4))) # Starting point of the columns to be dropped

        # Selected columns come first in `order`, dropped columns after.
        i_cols_list = order[:rem_start]
        i_rem_list = order[rem_start:]
        cols_list = [cols4[pos] for pos in i_cols_list]
        rem_list = [cols4[pos] for pos in i_rem_list]

        # rank_by_column[pos] is the rank (0 = most important) of column `pos`.
        rank_by_column = [0]*len(order)
        for rank, pos in enumerate(order):
            rank_by_column[pos] = rank

        # Append model name, array, columns selected and columns to be removed to the additional list
        X_all_add4.append([trans,name,X4,X_val4,v,cols_list,rem_list,rank_by_column,i_cols_list,i_rem_list])


# Median rank of every column across all combinations, best (lowest) first.
# Series.sort() no longer exists in pandas; sort_values() returns the sorted copy.
rank_df = pd.DataFrame(data=[x[7] for x in X_all_add4],columns=cols4)
med = rank_df.median().sort_values()
top50_set4=med[:50]

# Persist the best-ranked features of each chunk (feature name -> median rank).
# NOTE(review): the file names say "top_100" although each series holds at
# most 50 entries; names are kept so existing consumers keep working.
top50_set1.to_csv("top_100_set1.csv")
top50_set2.to_csv("top_100_set2.csv")
top50_set3.to_csv("top_100_set3.csv")
top50_set4.to_csv("top_100_set4.csv")

# Combine the selected feature NAMES of all four chunks. The names live in the
# index of each series; Series.tolist() returns the median-rank values, which
# cannot be used to look up columns of `forest` afterwards.
top200 = top50_set4.index.tolist()
top200.extend(top50_set3.index.tolist())
top200.extend(top50_set2.index.tolist())
top200.extend(top50_set1.index.tolist())

In [None]:
# Validation configuration
chunk_size = 0.1  # fraction of rows held out for validation
seed = 0          # shared random seed so every split uses the same rows

# Bookkeeping for the final run over the combined top features
ranks = []       # importance rank of all features
X_all = []       # data arrays, one entry per transformation
X_all_add = []   # one entry per (transformation, model) combination
rem = []         # columns to be dropped
i_rem = []       # indexes of columns to be dropped
trans_list = []  # transformation names
comb = ["All+1.0"]  # combination labels

ratio_list = [0.75,0.50,0.25]  # select top 75%, 50%, 25% of features
features = []        # feature selection models
model_features = []  # names of feature selection models

# Reorder the selected features so that continuous variables come first.
# A column with more than 4 distinct values is treated as continuous.
continuous = []
categorical = []
for col in top200:
    if col in to_remove:
        continue
    if forest[col].nunique() > 4:
        continuous.append(col)
    else:
        categorical.append(col)

# Final layout: continuous, then categorical, then the target.
final_columns = continuous + categorical + ['Cover_Type']
forest = forest[final_columns]
num_rows, num_cols = forest.shape
cols = forest.columns
size = len(continuous)  # number of continuous columns

In [None]:
# Positional indexes of every feature column (the last column is the target).
n_features = num_cols-1
i_cols = list(range(0,n_features))

# Create the data arrays for model building: all feature columns against the
# last column, split with the shared test_size and random_state.
val_array = forest.values
X = val_array[:,0:n_features]
y = val_array[:,n_features]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=chunk_size, random_state=seed)
X_all.append(['Orig','All', X_train,X_val,1.0,cols[:num_cols-1],rem,ranks,i_cols,i_rem])

In [None]:
# Standardize the data
# Only the leading `size` (continuous) columns are scaled. The scaler is
# fitted on the training split and reused for validation so that both splits
# share the same mean and variance; fitting a second scaler on the validation
# rows would scale them differently.
std_scaler = StandardScaler().fit(X_train[:,0:size])
X_temp = std_scaler.transform(X_train[:,0:size])
X_val_temp = std_scaler.transform(X_val[:,0:size])

# Recombine data
X_con = np.concatenate((X_temp,X_train[:,size:]),axis=1)
X_val_con = np.concatenate((X_val_temp,X_val[:,size:]),axis=1)

# Column labels exclude the target, matching the 'Orig' entry and the width of
# the feature arrays.
X_all.append(['StdSca','All', X_con,X_val_con,1.0,cols[:num_cols-1],rem,ranks,i_cols,i_rem])

In [None]:
# MinMax Scale the data
# Only the leading `size` (continuous) columns are scaled. The scaler is
# fitted on the training split and reused for validation so that both splits
# share the same min/max; fitting a second scaler on the validation rows would
# scale them differently.
minmax_scaler = MinMaxScaler().fit(X_train[:,0:size])
X_temp = minmax_scaler.transform(X_train[:,0:size])
X_val_temp = minmax_scaler.transform(X_val[:,0:size])

# Recombine data
X_con = np.concatenate((X_temp,X_train[:,size:]),axis=1)
X_val_con = np.concatenate((X_val_temp,X_val[:,size:]),axis=1)

# Column labels exclude the target, matching the 'Orig' entry and the width of
# the feature arrays.
X_all.append(['MinMax', 'All', X_con,X_val_con,1.0,cols[:num_cols-1],rem,ranks,i_cols,i_rem])

In [None]:
# Normalize the data
# Only the leading `size` (continuous) columns are normalized. Normalizer
# rescales each row independently, so one fitted instance serves both splits.
normalizer = Normalizer().fit(X_train[:,0:size])
X_temp = normalizer.transform(X_train[:,0:size])
X_val_temp = normalizer.transform(X_val[:,0:size])

# Recombine data
X_con = np.concatenate((X_temp,X_train[:,size:]),axis=1)
X_val_con = np.concatenate((X_val_temp,X_val[:,size:]),axis=1)

# Column labels exclude the target, matching the 'Orig' entry and the width of
# the feature arrays (passing `cols` included 'Cover_Type' as an extra label).
X_all.append(['Norm', 'All', X_con,X_val_con,1.0,cols[:num_cols-1],rem,ranks,i_cols,i_rem])

In [None]:
# Add transformation to the list. Only the label (first item) of each entry is
# read; unpacking every field into loop targets used to overwrite the data
# arrays X and X_val with the contents of the last entry.
trans_list.extend(entry[0] for entry in X_all)
trans_list

In [None]:
# Register three models per classifier family, one for each ratio in
# ratio_list. Every `features` entry is [family name, ratio, unfitted model]
# and the matching "<name>+<ratio>" label is appended to comb.
# The number of estimators equals the number of feature columns (num_cols-1).
model_builders = [
    ('Extra Trees Classifier',
     lambda ratio: ExtraTreesClassifier(n_estimators=num_cols-1,max_features=ratio,n_jobs=-1,random_state=seed)),
    ('Random Forest Classifier',
     lambda ratio: RandomForestClassifier(n_estimators=num_cols-1,max_features=ratio,n_jobs=-1,random_state=seed)),
    # The ratio is not passed to XGBoost; it is only stored next to the model.
    ('XGBoost Classifier',
     lambda ratio: XGBClassifier(n_estimators=num_cols-1,seed=seed)),
]

for n, build_model in model_builders:
    model_features.append(n)
    for val in ratio_list:
        comb.append("%s+%s" % (n,val))
        features.append([n,val,build_model(val)])

In [None]:
# Fit every registered model on every transformation of the data and record,
# per combination, which columns are kept, which are dropped and how every
# column ranks.
for trans, s, X, X_val, d, cols, rem, ra, i_cols, i_rem in X_all:
    for name, v, model in features:
        print ("Training ", name, "with", v*100, "% features and a", trans, "transformation")
        # Train the model against y
        model.fit(X,y_train)

        # Per-column score exposed by this kind of model.
        if name == "Random Feature Elimination":
            scores = list(model.ranking_)
        elif name == "Select Percentile":
            scores = list(model.scores_)
        else:
            scores = list(model.feature_importances_)

        # Look up the column name of every scored position.
        names_by_pos = [cols[pos] for pos in range(len(scores))]

        # Column positions from highest to lowest score (ties keep column order).
        order = sorted(range(len(scores)), key=lambda pos: -scores[pos])
        rem_start = int((v*(num_cols-1))) # Starting point of the columns to be dropped

        # Selected columns come first in `order`, dropped columns after.
        i_cols_list = order[:rem_start]
        i_rem_list = order[rem_start:]
        cols_list = [names_by_pos[pos] for pos in i_cols_list]
        rem_list = [names_by_pos[pos] for pos in i_rem_list]

        # rank_by_column[pos] is the rank (0 = highest score) of column `pos`.
        rank_by_column = [0]*len(order)
        for rank, pos in enumerate(order):
            rank_by_column[pos] = rank

        # Append model name, array, columns selected and columns to be removed to the additional list
        X_all_add.append([trans,name,X,X_val,v,cols_list,rem_list,rank_by_column,i_cols_list,i_rem_list])

In [None]:
# Median rank of every feature across all (transformation, model)
# combinations, ordered best (lowest median rank) first.
# Series.sort() no longer exists in pandas; sort_values() returns the sorted copy.
rank_df = pd.DataFrame(data=[x[7] for x in X_all_add],columns=cols[:num_cols-1])
med = rank_df.median().sort_values()
med[:15].to_csv("top_15.csv")
med[:100].to_csv("top_100.csv")

## PCA for Dimension Reduction

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 3-component PCA on every column except the last one and show the
# projected data. `.ix` has been removed from pandas; `.iloc` performs the
# same positional slicing.
pca = PCA(n_components=3)
pca.fit_transform(forest.iloc[:,:-1])

In [None]:
# Show the explained-variance ratio of each fitted component.
print(pca.explained_variance_ratio_) 

In [None]:
# Shape of the fitted component matrix (displayed as the cell output).
pca.components_.shape

In [None]:
# Row with index label 10 as a Series. `.ix` has been removed from pandas; on
# an integer index it looked rows up by label, which is what `.loc` does.
# NOTE(review): assumes the frame still carries the integer index read from
# the CSV -- confirm.
test_series=pd.Series(data=forest.loc[10])

In [None]:
# Row with index label 6 as a Series. `.ix` has been removed from pandas; on
# an integer index it looked rows up by label, which is what `.loc` does.
# NOTE(review): assumes the frame still carries the integer index read from
# the CSV -- confirm.
test_series2=pd.Series(data=forest.loc[6])

In [None]:
# Turn both rows into plain Python lists and join them end to end.
test_series = test_series.tolist() + test_series2.tolist()

In [None]:
# Print every collected value. The loop variable is the value itself; using
# it as a list index (test_series[i]) looks up the wrong element or fails for
# non-integer values.
for value in test_series:
    print(value)