In [None]:
# Class distribution of the target variable.
# Cover-type names in label order; the dict keys them by label 1..7.
target_names_list = [
    "Spruce/Fir", "Lodgepole Pine", "Ponderosa Pine",
    "Cottonwood/Willow", "Aspen", "Douglas-fir", "Krummholz",
]
target_names_dict = dict(enumerate(target_names_list, 1))

# Number of examples per class label, and the overall total.
counts = target.value_counts(sort=False)
total = sum(counts)

print("Class Distribution in Target Var: ")
print("\tClass                    |  Number of Examples")
row_template = "\t%i, %-17s     |   %i (%0.2f%%)"
for label in np.unique(target):
    n_examples = counts[label]
    share = (n_examples / float(total)) * 100  # percentage of all examples
    print(row_template % (label, target_names_dict[label], n_examples, share))


### Correlation matrix on continuous variables

In [None]:
# Pairwise correlations between the continuous features.
corr = train_df[continuous].corr()

# Mask the upper triangle (diagonal included) so every pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Flatten the matrix into one row per unordered feature pair. Each column is
# paired only with the columns that follow it, so "a_vs_b" appears once and
# "b_vs_a" / "a_vs_a" never do. This is done by slicing rather than by
# removing entries from a shared list while iterating.
corr_columns = list(corr.columns)
corr_comparison_list = []
corr_value_list = []
for position, column in enumerate(corr_columns):
    for index in corr_columns[position + 1:]:
        corr_comparison_list.append(str(column) + '_vs_' + str(index))
        corr_value_list.append(corr.loc[index, column])

corr_df = pd.DataFrame({
    "corr_comparison": corr_comparison_list,
    "corr_value": corr_value_list
})

# Order by absolute correlation, strongest first, and renumber rows from 0.
strongest_first = corr_df['corr_value'].abs().argsort().values[::-1]
corr_df = corr_df.iloc[strongest_first].reset_index(drop=True)

# head(10) gives exactly ten rows; the label slice .loc[:10] is inclusive and
# printed eleven.
print(corr_df.head(10))

Looking at the above correlation matrix, we see combinations of variables that may cause collinearity issues in the analysis.

For example Hillshade_3pm is correlated heavily with Hillshade_9am (corr ~ -0.77).

The dataframe output above lists the top 10 correlations (by absolute value) between continuous variables. We will need to be careful about keeping both features of any pair in this top-10 list in the analysis: the duplicated information may overfit the data and reduce the generalizability of the final analysis.

### Visualize continuous variables mapped to 2D plane

In [None]:
# One scatter plot per unordered pair of continuous features, coloured by the
# target class.
fig = plt.figure(figsize=(20, 120))
gs = gridspec.GridSpec(23, 2)  # 46 panels available
df_column_list = list(train_df[continuous].columns)

# One colour per class. The legend labels the colours 1..7 in this order,
# which matches the colormap only if `target` spans exactly 1..7 -- TODO confirm.
class_colors = ['black', 'red', 'green', 'blue', 'cyan', 'violet', 'yellow']
cmap = mpl.colors.ListedColormap(class_colors)
legend_patches = [mpatches.Patch(color=color, label=str(class_label))
                  for class_label, color in enumerate(class_colors, 1)]

# Iterate the feature list itself (rather than `corr` from another cell) and
# pair each feature only with the ones after it.
itr = 0
for position, feature in enumerate(df_column_list):
    for sub_feature in df_column_list[position + 1:]:
        ax = plt.subplot(gs[itr])
        ax.scatter(train_df.loc[:, feature], train_df.loc[:, sub_feature],
                   c=target, alpha=0.3, cmap=cmap)
        ax.set_xlabel(feature)
        ax.set_ylabel(sub_feature)
        # fontsize belongs to set_title; passed to str.format it was ignored.
        ax.set_title('{} vs {}'.format(feature, sub_feature), fontsize=12)
        ax.legend(handles=legend_patches)
        itr += 1

### Replace missing/bad data in Hillshade 3pm

In [None]:
# Grid of forest sizes to search over.
RF_parameter_grid = {'n_estimators': [300, 500, 600, 700, 800]}

# Fixed seed so the split, the search and the dev-set error statistics
# reported below are reproducible across runs.
seed_3pm = 0

# Every continuous feature except Hillshade_3pm itself.
hillshade_3pm_predictors = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Horizontal_Distance_To_Fire_Points',
]

# Train only on rows whose Hillshade_3pm is non-zero; the zero rows are the
# ones to be imputed later.
rf_3pm_fix_df = train_df[continuous]
rf_3pm_fix_df = rf_3pm_fix_df.loc[rf_3pm_fix_df['Hillshade_3pm'] != 0.0]
rf_3pm_fix_df_target = rf_3pm_fix_df['Hillshade_3pm']
rf_3pm_fix_df = rf_3pm_fix_df[hillshade_3pm_predictors]

param_searcher = GridSearchCV(RandomForestRegressor(random_state=seed_3pm),
                              RF_parameter_grid, cv=5)
X_train_3pm, X_dev_3pm, y_train_3pm, y_dev_3pm = train_test_split(
    rf_3pm_fix_df, rf_3pm_fix_df_target, random_state=seed_3pm)
param_searcher.fit(X_train_3pm, y_train_3pm)

# GridSearchCV refits the best parameter setting on the full training split
# by default, so that estimator is reused instead of training a second,
# equally expensive forest.
model_best_3pm = param_searcher.best_estimator_
pred_3pm = model_best_3pm.predict(X_dev_3pm)

In [None]:
# Signed percentage error of each dev-set prediction relative to the actual
# Hillshade_3pm value, followed by its summary statistics.
prediction_ratio = pred_3pm / y_dev_3pm
difference_3pm_percentage = (prediction_ratio - 1) * 100
difference_3pm_percentage.describe()

Very good fit: the average difference is 0.04%, and the middle half of the data (between the 1st and 3rd quartiles) falls within roughly +/- 0.35% difference between the predicted and the actual Hillshade at 3pm. We will move forward and replace the zero values using this model.

In [None]:
# Replace zero Hillshade_3pm readings with the regressor's predictions.
# Feature order matches the columns the model was trained on.
hillshade_3pm_features = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
    'Horizontal_Distance_To_Fire_Points',
]

# One vectorised prediction over all affected rows. This avoids a predict()
# call per row, the removed Series.reshape, and the assumption that the index
# is 0..n-1.
zero_mask = train_df['Hillshade_3pm'] == 0
if zero_mask.any():  # predict() rejects an empty input, e.g. on a re-run
    # Predictions are floats; cast first so the assignment is not a silent
    # dtype change.
    train_df['Hillshade_3pm'] = train_df['Hillshade_3pm'].astype(float)
    train_df.loc[zero_mask, 'Hillshade_3pm'] = model_best_3pm.predict(
        train_df.loc[zero_mask, hillshade_3pm_features])

print(train_df.Hillshade_3pm.describe())

Hillshade_3pm no longer contains any zero values.

In [None]:
# Ensemble sizes tried for the gradient-boosting classifier.
GB_parameter_grid = {'n_estimators': [300, 400, 500, 600, 700]}

# Standard data. Called with a trailing True, the result is unpacked into
# four names, the last bound to GB_model.
(GB_model_score,
 GB_df_param,
 GB_df_confusion,
 GB_model) = model_optimize(
    GradientBoostingClassifier,
    X_train, y_train, X_dev, y_dev,
    GB_parameter_grid, True)

# Custom data; three values are returned.
(GB_model_score_custom,
 GB_df_param_custom,
 GB_df_confusion_custom) = model_optimize(
    GradientBoostingClassifier,
    X_train_cust, y_train_cust, X_dev_cust, y_dev_cust,
    GB_parameter_grid)

# Custom filtered data; three values are returned.
(GB_model_score_custom_filt,
 GB_df_param_custom_filt,
 GB_df_confusion_custom_filt) = model_optimize(
    GradientBoostingClassifier,
    X_train_cust_filt, y_train_cust_filt, X_dev_cust_filt, y_dev_cust_filt,
    GB_parameter_grid)

In [None]:
# Gradient-boosting scores: standard, custom, custom filtered data.
for gb_score in (GB_model_score, GB_model_score_custom, GB_model_score_custom_filt):
    print(gb_score)

###  Support Vector Classification

In [None]:
# RBF-kernel SVC grid: three penalty values crossed with three gamma values.
param_grid_svm = {
    'C': [1500, 2000, 2500],
    'gamma': [0.5, 1.0, 1.5],
    'kernel': ['rbf'],
}

# Standard data.
(SVM_model_score,
 SVM_df_param,
 SVM_df_confusion) = model_optimize(
    SVC,
    X_train, y_train, X_dev, y_dev,
    param_grid_svm)

# Custom data.
(SVM_model_score_custom,
 SVM_df_param_custom,
 SVM_df_confusion_custom) = model_optimize(
    SVC,
    X_train_cust, y_train_cust, X_dev_cust, y_dev_cust,
    param_grid_svm)

# Custom filtered data.
(SVM_model_score_custom_filt,
 SVM_df_param_custom_filt,
 SVM_df_confusion_custom_filt) = model_optimize(
    SVC,
    X_train_cust_filt, y_train_cust_filt, X_dev_cust_filt, y_dev_cust_filt,
    param_grid_svm)

In [None]:
# SVC scores: standard, custom, custom filtered data.
for svm_score in (SVM_model_score, SVM_model_score_custom, SVM_model_score_custom_filt):
    print(svm_score)

In [None]:
# Search grid for the ExtraTreesClassifier: 1 x 5 x 4 x 6 = 120 combinations.
# (np.linspace-based ranges for min_samples_split / min_samples_leaf were
# considered earlier and replaced by the explicit lists below.)
XRF_parameter_grid = {'criterion': ['entropy'],
                      'n_estimators': [300, 500, 600, 700, 800],
                      'min_samples_split': [2, 10, 20, 50],
                      'min_samples_leaf': [1, 10, 20, 30, 40, 50]}

# Each call now searches XRF_parameter_grid. Previously RF_parameter_grid (the
# grid from the Hillshade_3pm regressor cell) was passed instead, so the grid
# defined above was never used.
# NOTE(review): this makes the search far more expensive than the 5-point
# grid -- confirm it is the intended search space.

# Standard data.
(XRF_model_score,
 XRF_df_param,
 XRF_df_confusion,
 XRF_model) = model_optimize(
    ExtraTreesClassifier,
    X_train, y_train, X_dev, y_dev,
    XRF_parameter_grid, True)

# Custom data.
(XRF_model_score_custom,
 XRF_df_param_custom,
 XRF_df_confusion_custom,
 XRF_custom_model) = model_optimize(
    ExtraTreesClassifier,
    X_train_cust, y_train_cust, X_dev_cust, y_dev_cust,
    XRF_parameter_grid, True)

# Custom filtered data.
(XRF_model_score_custom_filt,
 XRF_df_param_custom_filt,
 XRF_df_confusion_custom_filt,
 XRF_custom_filt_model) = model_optimize(
    ExtraTreesClassifier,
    X_train_cust_filt, y_train_cust_filt, X_dev_cust_filt, y_dev_cust_filt,
    XRF_parameter_grid, True)


In [None]:
# min_impurity_decrease, min_samples_leaf, min_samples_split

In [None]:
# ExtraTrees scores: standard, custom, custom filtered data.
for xrf_score in (XRF_model_score, XRF_model_score_custom, XRF_model_score_custom_filt):
    print(xrf_score)

In [None]:
# Summary bar plot: one group of three bars per model family, comparing the
# score obtained on each data preparation.
group_names = ('kNN', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'ADA Boost',
               'Gradient Boost', 'Support Vector', 'ExtraTreesClassifier')  # Model families
n_groups = len(group_names)

# Scores in the same order as group_names.
standard_df_scores = (kNN_model_score, NB_model_score, DT_model_score, RF_model_score,
                      AB_model_score, GB_model_score, SVM_model_score, XRF_model_score)
filtered_df_scores = (kNN_model_score_custom, NB_model_score_custom, DT_model_score_custom, RF_model_score_custom,
                      AB_model_score_custom, GB_model_score_custom, SVM_model_score_custom, XRF_model_score_custom)
custom_filtered_df_scores = (kNN_model_score_custom_filt, NB_model_score_custom_filt, DT_model_score_custom_filt, RF_model_score_custom_filt,
                             AB_model_score_custom_filt, GB_model_score_custom_filt, SVM_model_score_custom_filt, XRF_model_score_custom_filt)

fig, ax = plt.subplots()

index = np.arange(n_groups)
bar_width = 0.20
opacity = 0.7

rects1 = ax.bar(index, standard_df_scores, bar_width, align='center',
                alpha=opacity, color='b', label='Standard')

rects2 = ax.bar(index + bar_width, filtered_df_scores, bar_width, align='center',
                alpha=opacity, color='r', label='Custom')

# The third series must use the custom-filtered scores; it previously
# re-plotted filtered_df_scores, duplicating the 'Custom' bars.
rects3 = ax.bar(index + bar_width * 2, custom_filtered_df_scores, bar_width, align='center',
                alpha=opacity, color='g', label='Custom Filtered')

ax.set_xlabel('Model Type')
ax.set_ylabel('Model Scores')
ax.set_title('Scores by Model Type and Data Preparation')
# With centre-aligned bars at index, index + w and index + 2w, the middle of
# each group is index + w.
ax.set_xticks(index + bar_width)
ax.set_xticklabels(group_names, rotation=30)
ax.set_ylim(0, 1)
ax.legend()

fig.tight_layout()
plt.show()


## Export Model Predictions on Test Set

In [None]:
# Choose Model to Run
# Predict the test set with the selected fitted model.
test_preds = ADA_custom_filt_model.predict(custom_transformation_test_filtered)

# Pair every test Id with its predicted cover type and write the result to
# <file_path>/submission.csv without the row index.
submission_columns = {'Id': test_df['Id'], 'Cover_Type': test_preds}
submission_df = pd.DataFrame(data=submission_columns)
submission_path = file_path + '/submission.csv'
submission_df.to_csv(submission_path, index=False)


## PCA Analysis

### Boxplots of continuous variables by Cover_Type

In [None]:
# Boxplot of each continuous feature, grouped by cover type.
# Work on an explicit copy: adding a column to the slice train_df[:] triggers
# pandas' SettingWithCopy warning.
boxplot_df_cont = train_df.copy()
boxplot_df_cont['Cover_Type'] = target

n_box_rows, n_box_cols = 4, 3
fig, axes = plt.subplots(nrows=n_box_rows, ncols=n_box_cols, figsize=(20, 20))

boxplot_features = list(continuous)
used_panels = set()
for position, feature in enumerate(boxplot_features):
    row, col = divmod(position, n_box_cols)
    # A single leftover plot in a later row goes in the middle column, which
    # is where the tenth plot was placed on the 4x3 grid.
    is_last = position == len(boxplot_features) - 1
    if is_last and col == 0 and row > 0:
        col = n_box_cols // 2
    boxplot_df_cont.boxplot(column=feature, by='Cover_Type', ax=axes[row, col])
    used_panels.add((row, col))

# Hide the panels that received no plot so no empty axes are drawn.
for row in range(n_box_rows):
    for col in range(n_box_cols):
        if (row, col) not in used_panels:
            axes[row, col].set_visible(False)
    


Looking at the above boxplots by forest cover type, elevation shows very distinct values per category. Elevation looks like a very important feature for determining the target value of the problem.

### Binary variable exploration

In [None]:
# Per-class value counts of every binary feature: one bar chart per feature,
# plus a combined table in master_df.
# Copy before adding the target so the assignment does not write into a
# selection of train_df (SettingWithCopy).
df_binary = train_df[binary].copy()
df_binary['Cover_Type'] = target

# Every column except the target (same substring test as before).
df_binary_columns = [feature for feature in df_binary if 'Cover_Type' not in feature]

# Two charts per row. Keep the 21-row layout, growing it only if there are
# more features than 42 panels can hold.
n_plot_cols = 2
rows_needed = (len(df_binary_columns) + n_plot_cols - 1) // n_plot_cols
fig, axes = plt.subplots(nrows=max(21, rows_needed), ncols=n_plot_cols,
                         figsize=(20, 60), squeeze=False)

class_colors = ['black', 'red', 'green', 'blue', 'cyan', 'violet', 'yellow']
count_frames = []
for plot_position, feature in enumerate(df_binary_columns):
    pic_itr1, pic_itr2 = divmod(plot_position, n_plot_cols)

    # One value_counts column per cover type, named type_1, type_2, ... in
    # groupby order.
    per_class_counts = []
    for itr, (name, values) in enumerate(df_binary.groupby('Cover_Type')[feature], 1):
        class_counts = values.value_counts()
        class_counts.name = 'type_{}'.format(itr)
        per_class_counts.append(class_counts)
    df_out = pd.concat(per_class_counts, axis=1)

    # The Axes repr is no longer printed; it only added noise.
    df_out.plot(kind='bar', title=feature, color=class_colors,
                ax=axes[pic_itr1, pic_itr2])
    count_frames.append(df_out.rename(index={0: feature + '_0', 1: feature + '_1'}))

# Build the combined table once. Appending inside the loop is quadratic and
# DataFrame.append is gone from current pandas.
master_df = pd.concat(count_frames) if count_frames else pd.DataFrame()

It appears that many of the soil type variables are very similar to one another and add little information to the analysis. This, along with the findings on the continuous variables, will help guide feature-elimination decisions during feature engineering.

### Regularization study on variables

In [None]:
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV

# All four linear models are fit on the same predictor columns and target.
regression_X = train_df[predictors]
# Alpha candidates 2**-10 ... 2**9, used by both Lasso and ElasticNet.
power_of_two_alphas = 2. ** np.arange(-10, 10)

# Ordinary least squares baseline.
linreg = LinearRegression()
linreg.fit(regression_X, target)

# Ridge with alpha cross-validated over 100 values between 5 and 50.
ridge = RidgeCV(alphas=np.linspace(5, 50, 100))
ridge.fit(regression_X, target)

# Lasso with cross-validated alpha.
lasso = LassoCV(alphas=power_of_two_alphas)
lasso.fit(regression_X, target)

# Elastic net with cross-validated alpha and L1 ratio (20 values, 0.05-0.95).
en = ElasticNetCV(l1_ratio=np.linspace(.05, .95, 20), alphas=power_of_two_alphas)
en.fit(regression_X, target)

In [None]:
# Training-set R-squared of each fitted linear model, to three significant digits.
score_reports = (
    ('Train R-squared: {:.3}', linreg),
    ('Ridge Train R-squared: {:.3}', ridge),
    ('Lasso Train R-squared: {:.3}', lasso),
    ('EN Train R-squared: {:.3}', en),
)
for report_template, fitted_model in score_reports:
    print(report_template.format(fitted_model.score(train_df[predictors], target)))

In [None]:
# Side-by-side coefficients of the four linear models, one row per predictor.
coefficient_columns = {
    'variable': predictors,
    'OLS': linreg.coef_,
    'Ridge': ridge.coef_,
    'Lasso': lasso.coef_,
    'ElasticNet': en.coef_,
}
coeffs = pd.DataFrame(coefficient_columns)

coeffs

Looking at the above regression fits, we see that they do not predict the target well. However, we also see that some variables add little information for prediction:

Soil_Type1
Soil_Type8
Soil_Type9
Soil_Type11
Soil_Type25
Soil_Type27
Soil_Type28
Soil_Type34

### Split train/dev sets

In [None]:
# AdaBoost score on the custom filtered data.
print(ADA_model_score_custom_filt)

### Gradient Boost