# Baseline Model Testing

Data source: https://www.kaggle.com/c/forest-cover-type-prediction

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import cross_validation
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
from IPython.core.display import display, HTML
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from feature_eng_function import feature_eng_forest, forest_interactions
from confusion_matrix_score_function import confusion_matrix_scoring 
%matplotlib inline
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
display(HTML("<style>.container { width:100% !important; }</style>"))



## Load Data

In [2]:
forest_train = pd.read_csv('../ml_project_data/train.csv')
original_cols = list(forest_train.columns)
original_cols.remove('Cover_Type')
forest_train.shape

(15120, 56)

In [3]:
forest_test = pd.read_csv('../ml_project_data/test.csv')
output_df = forest_test[['Id']]
forest_test.shape

(565892, 55)

In [4]:
forest_train_eng = pd.read_csv('../ml_project_data/train_eng.csv')
forest_train_eng_cols = list(forest_train_eng.columns)
forest_train_eng_cols.remove('Cover_Type')
forest_train_eng.shape

(15120, 97)

In [5]:
forest_test_eng = pd.read_csv('../ml_project_data/test_eng.csv')
forest_test_eng.shape

(565892, 96)

In [6]:
# forest_train_interactions = pd.read_csv('../ml_project_data/train_interactions.csv')
# all_cols = list(forest_train_interactions.columns)
# all_cols.remove('Cover_Type')
# forest_train_interactions.shape

In [7]:
# forest_test_interactions = pd.read_csv('../ml_project_data/test_interactions.csv')
# forest_test_interactions.shape

In [8]:
forest_train_100 = pd.read_csv('../ml_project_data/train_100.csv')
top_100_cols = list(forest_train_100.columns)
top_100_cols.remove('Cover_Type')
forest_train_100.shape

(15120, 102)

In [9]:
forest_test_100 = pd.read_csv('../ml_project_data/test_100.csv')
forest_test_100.shape

(565892, 101)

### Transform the continuous features
###### We will try Normalization, Standardized Scaling, and MinMax Scaling
###### Note: there is no need to impute any data points as this is a pretty clean data set

In [10]:
chunk_size = 0.1 #Validation chunk size
seed = 0 # Use the same random seed to ensure consistent validation chunk usage

In [11]:
def transformTrainData(data):
    #Reorder the data to have continuous variables come first
    continuous = []
    categorical = []
    final_columns = []
    for col in data.columns.tolist():
        if col == 'Cover_Type':
            pass
        elif data[col].nunique() > 4:
            continuous.append(col)
        else:
            categorical.append(col)
    final_columns.extend(continuous)
    final_columns.extend(categorical)
    final_columns.append('Cover_Type')
    data = data[final_columns]
    num_row, num_cols = data.shape
    cols = data.columns
    size = len(continuous) # Number of continuous columns
    #Create the data arrays for model building
    val_array = data.values
    X = val_array[:,0:(num_cols-1)]
    y = val_array[:,(num_cols-1)]
    X_train, X_val, y_train, y_val = cross_validation.train_test_split(X, y, test_size=chunk_size, random_state=seed)
    return [X_train,X_val,y_train, y_val, cols[:num_cols-1]]

In [12]:
def transformMinMaxTrainData(data):
    #Reorder the data to have continuous variables come first
    continuous = []
    categorical = []
    final_columns = []
    for col in data.columns.tolist():
        if col == 'Cover_Type':
            pass
        elif data[col].nunique() > 4:
            continuous.append(col)
        else:
            categorical.append(col)
    final_columns.extend(continuous)
    final_columns.extend(categorical)
    final_columns.append('Cover_Type')
    data = data[final_columns]
    num_row, num_cols = data.shape
    cols = data.columns
    size = len(continuous) # Number of continuous columns
    #Create the data arrays for model building
    val_array = data.values
    X = val_array[:,0:(num_cols-1)]
    y = val_array[:,(num_cols-1)]
    X_train, X_val, y_train, y_val = cross_validation.train_test_split(X, y, test_size=chunk_size, random_state=seed)
    # MinMax Scale the data
    X_temp = MinMaxScaler().fit_transform(X_train[:,0:size])
    X_val_temp = MinMaxScaler().fit_transform(X_val[:,0:size])
    # Recombine data
    X_con = np.concatenate((X_temp,X_train[:,size:]),axis=1)
    X_val_con = np.concatenate((X_val_temp,X_val[:,size:]),axis=1)
    return [X_con,X_val_con,y_train, y_val,cols[:num_cols-1]]

In [13]:
def transformTestData(data):
    #Reorder the data to have continuous variables come first
    continuous = []
    categorical = []
    final_columns = []
    for col in data.columns.tolist():
        if data[col].nunique() > 4:
            continuous.append(col)
        else:
            categorical.append(col)
    final_columns.extend(continuous)
    final_columns.extend(categorical)
    data = data[final_columns]
    num_row, num_cols = data.shape
    cols = data.columns
    size = len(continuous) # Number of continuous columns
    #Create the data arrays for model building
    X = data.values
    return [X, cols]

In [14]:
def transformMinMaxTestData(data):
    #Reorder the data to have continuous variables come first
    continuous = []
    categorical = []
    final_columns = []
    for col in data.columns.tolist():
        if data[col].nunique() > 4:
            continuous.append(col)
        else:
            categorical.append(col)
    final_columns.extend(continuous)
    final_columns.extend(categorical)
    data = data[final_columns]
    num_row, num_cols = data.shape
    cols = data.columns
    size = len(continuous) # Number of continuous columns
    #Create the data arrays for model building
    X = data.values
    # MinMax Scale the data
    X_temp = MinMaxScaler().fit_transform(X[:,0:size])
    # Recombine data
    X_con = np.concatenate((X_temp,X[:,size:]),axis=1)
    return [X_con, cols]

In [15]:
datasets = [
    {
        'data name':'original',
        'train':transformTrainData(forest_train),
        'test':transformTestData(forest_test)
    },
    {
        'data name':'original scaled',
        'train':transformMinMaxTrainData(forest_train),
        'test':transformMinMaxTestData(forest_test)
    },
    {
        'data name':'engineered',
        'train':transformTrainData(forest_train_eng),
        'test':transformTestData(forest_test_eng)
    },
    {
        'data name':'engineered scaled',
        'train':transformMinMaxTrainData(forest_train_eng),
        'test':transformMinMaxTestData(forest_test_eng)
    },
    {
        'data name':'top 100',
        'train':transformTrainData(forest_train_100),
        'test':transformTestData(forest_test_100)
    },
    {
        'data name':'top 100 scaled',
        'train':transformMinMaxTrainData(forest_train_100),
        'test':transformMinMaxTestData(forest_test_100)
    }
    
]

# Create classifiers and Grid Search
- Logistic Regression
- SVM

In [16]:
features = [
    {
        'name':'Logistic Regression',
        'model':LogisticRegression(random_state=seed)
    },
    {
        'name':'SVM',
        'model':LinearSVC(random_state=seed)
    }
]

# Run models on selected features

In [17]:
# Determine feature importance for each model and transformation combination
with open('model_testing.txt', 'w+') as file:
    for model in features:
        print ('')
        print (model['name'])
        print (model['model'])
        for d in datasets:
            print (d['data name'])
            X_train,X_val,y_train, y_val, cols = d['train']
            X_test, cols_test = d['test']
            model['model'].fit(X_train, y_train)
            print (model['model'].score(X_val, y_val))
            print (confusion_matrix_scoring(model['model'].predict(X_val), y_val))
            output_df['Cover_Type'] = model['model'].predict(X_test)
            print (output_df.groupby(['Cover_Type']).count())
            output_df.to_csv('%s_%s_prediction.csv'%(model['name'], d['data name']))


Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
original
0.643518518519
6392
                Id
Cover_Type        
1            39809
2            65833
3            17175
5               99
6           441779
7             1197
original scaled
0.647486772487
4784
                Id
Cover_Type        
1           170893
2           191787
3            33089
4            11698
5            94380
6            21497
7            42548
engineered
0.650793650794
5578
                Id
Cover_Type        
1.0         475351
2.0          37562
3.0          49555
4.0            147
5.0            583
6.0           2058
7.0            636
engineered scaled
0.668650793651
4864
                Id
Cover_Type        
1.0         174647
2.0         178264
3.0          