In [3]:
#Determine what platform and the path to data
import platform
import numpy as np
import pandas as pd

#Predetermine the column types, saves on any transformtions later on. 
#Currently pd.Categorical is not a valid dtype so they come in as objects. Ned to convert
colTypes = {'Elevation':np.float64,'Aspect':np.float64,'Slope':np.float64,'HD.Hydro':np.float64,'VD.Hyrdro': np.float64,
            'HD.Road': np.float64, 'HS.9am':np.float64,'HS.noon':np.float64,'HS.3pm':np.float64,
            'wildernessArea':pd.Categorical,'soilType':pd.Categorical,'climaticZone':pd.Categorical,
            'geologicZone':pd.Categorical,'coverType':pd.Categorical} 




if platform.system()  == "Windows":
     pathToReadData =  "C:\\Users\\IBM_ADMIN\\Desktop\\OneDrive\\PublicData\\AM\\v2\\"
     pathToWriteData = "C:\\Users\\IBM_ADMIN\\Desktop\\OneDrive\\PublicData\\AM\\v3\\"
elif platform.system() == "Darwin":
     pathToReadData = "/Users/briancarter1/Desktop/OneDrive/PublicData/AM/v2/"
     pathToWriteData = "/Users/briancarter1/Desktop/OneDrive/PublicData/AM/v3/"

In [4]:
#Read in the data without binary encoding, 15 original columns 
import pandas as pd
coverData = pd.read_csv(pathToReadData+"covtype15.csv",dtype=colTypes )
coverData.head()


Unnamed: 0,Elevation,Aspect,Slope,HD.Hydro,VD.Hydro,HD.Road,HD.Fire,HS.9am,HS.noon,HS.3pm,wildernessArea,soilType,climaticZone,geologicZone,coverType
0,3391,209,12,558,174,3306,211,251,173,552,ComanchePeak,Bross,alpine,igneous and metamorphic,1.SpruceFir
1,3374,199,17,499,168,3276,212,252,170,576,ComanchePeak,Bross,alpine,igneous and metamorphic,1.SpruceFir
2,3366,197,17,484,160,3252,213,252,168,600,ComanchePeak,Bross,alpine,igneous and metamorphic,1.SpruceFir
3,3357,196,16,469,151,3228,214,252,167,624,ComanchePeak,Bross,alpine,igneous and metamorphic,1.SpruceFir
4,3346,206,17,426,140,3223,207,253,177,633,ComanchePeak,Bross,alpine,igneous and metamorphic,1.SpruceFir


In [5]:
cat_columns = ['wildernessArea','soilType','climaticZone','geologicZone']
wilderness = list(coverData['wildernessArea'].unique())
soil = list(coverData['soilType'].unique())
climaticZone = list(coverData['climaticZone'].unique())
geo = list(coverData['geologicZone'].unique())
cover = list(coverData['coverType'].unique())

coverData['wildernessArea'] = pd.Categorical(coverData['wildernessArea'], categories=wilderness)
coverData['soilType'] = pd.Categorical(coverData['soilType'], categories=soil)
coverData['climaticZone'] = pd.Categorical(coverData['climaticZone'], categories=climaticZone)
coverData['geologicZone'] = pd.Categorical(coverData['geologicZone'], categories=geo)
coverData['coverType'] = pd.Categorical(coverData['coverType'], categories=cover)

In [6]:

import sklearn.cross_validation as cv

#X = coverData.ix[:, coverData.columns != 'coverType']
# .ix makes no data.frame categorical variables are reset to objects as not valid dtype


#Used later
targetLabels=['1.Spruce','2.Lodge','3.Ponder','4.Cotton','5.Aspen','6.Douglas','7.Krymmholz']
targetWeights={0 : 0.365, 1 : 0.498, 2 :0.062, 3 : 0.005, 4 : 0.016, 5 : 0.03, 6: 0.035}

X = coverData.drop('coverType', axis=1)


from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(coverData['coverType'])
y=le.transform(coverData['coverType'])

In [7]:
import sklearn.cross_validation as cv
X_train, X_test, y_train, y_test = cv.train_test_split(X,y,test_size=0.9,random_state=79)

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


from sklearn import tree

#from sklearn.svm import SVC
#from sklearn import neighbors

from sklearn.pipeline import make_pipeline


#from sklearn.svm import SVC
#from sklearn import neighbors

CLASSIFIERS = [make_pipeline(CategoricalTransformer(), StandardScaler(),
                             tree.DecisionTreeClassifier(criterion='gini',max_depth=10,min_samples_split=20)),
               
               make_pipeline(CategoricalTransformer(), StandardScaler(),
                             tree.DecisionTreeClassifier(criterion='gini',max_depth=10,min_samples_split=20,
                                                         class_weight=targetWeights)),
               
               make_pipeline(CategoricalTransformer(), StandardScaler(),
                             tree.DecisionTreeClassifier(criterion='gini',max_depth=10,min_samples_split=20)),
              ]

MODELNAMES =  ['Tree1','WeightTarget','Tree3']

In [11]:

targetWeights={0 : 0, 1 : 0, 2 : 2, 3 : 30 , 4 :  8, 5 : 4 , 6: 4}

In [12]:
#n_samples / (n_classes * np.bincount(y))
len(y_test) / (len(targetLabels)*np.bincount(y_test))

array([ 0,  0,  2, 30,  8,  4,  4])

In [13]:
from sklearn.metrics import confusion_matrix,classification_report


#Lists to hold results
RESULTS = []
CONFUSIONMATRIX = []
CLASSIFICATIONREPORT = []
scores = ['precision_macro','recall_macro','f1_macro']

for pipe,model_name in zip(CLASSIFIERS, MODELNAMES):
    
    #Fit the model 
    pipe.fit(X_train, y_train)
    
    #Cross validation 5 k-fold get accurate estatimate of the training scores. 
    train_scores = {score: cv.cross_val_score(pipe,X_train,y_train,cv=5,scoring=score) for score in scores}
    
    #Save the train scores in RESULTS
    results_dict = {'Model': model_name ,           
                    'precision_macro' : round(train_scores['precision_macro'].mean(),3),
                    'recall_macro' :    round(train_scores['recall_macro'].mean(),3),
                    'f1-score' :  round(train_scores['f1_macro'].mean(),3)
                   }
    
    RESULTS.append(dict(results_dict))
    
    
    #Create Train prediction for calculating Decision Report and Confusion Matrix. 
    y_train_pred = pipe.predict(X_train)
    #y_test_pred = pipe.predict(X_test)
    
    # Create measures based on confusion matrix
    cm = confusion_matrix(y_train, y_train_pred)
    CONFUSIONMATRIX.append(cm)
    
    classreport = classification_report(y_train, y_train_pred,target_names=targetLabels)
    CLASSIFICATIONREPORT.append(classreport)


print(pd.DataFrame(RESULTS))
multiConfusion(CONFUSIONMATRIX,MODELNAMES)

NameError: name 'n_samples' is not defined

In [9]:
from itertools import chain
from sklearn.pipeline import TransformerMixin

class CategoricalTransformer(TransformerMixin):

    def fit(self, X, y=None, *args, **kwargs):
        self.columns_ = X.columns
        self.cat_columns_ = X.select_dtypes(include=['category']).columns
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)

        self.cat_map_ = {col: X[col].cat.categories
                         for col in self.cat_columns_}
        self.ordered_ = {col: X[col].cat.ordered
                         for col in self.cat_columns_}

        self.dummy_columns_ = {col: ["_".join([col, v])
                                     for v in self.cat_map_[col]]
                               for col in self.cat_columns_}
        self.transformed_columns_ = pd.Index(
            self.non_cat_columns_.tolist() +
            list(chain.from_iterable(self.dummy_columns_[k]
                                     for k in self.cat_columns_))
        )
        
        return self

    def transform(self, X, y=None, *args, **kwargs):
        return (pd.get_dummies(X)
                  .reindex(columns=self.transformed_columns_)
                  .fillna(0))

    def inverse_transform(self, X):
        X = np.asarray(X)
        series = []
        non_cat_cols = (self.transformed_columns_
                            .get_indexer(self.non_cat_columns_))
        non_cat = pd.DataFrame(X[:, non_cat_cols],
                               columns=self.non_cat_columns_)
        for col, cat_cols in self.dummy_columns_.items():
            locs = self.transformed_columns_.get_indexer(cat_cols)
            codes = X[:, locs].argmax(1)
            cats = pd.Categorical.from_codes(codes, self.cat_map_[col],
                                             ordered=self.ordered_[col])
            series.append(pd.Series(cats, name=col))
        # concats sorts, we want the original order
        df = (pd.concat([non_cat] + series, axis=1)
                .reindex(columns=self.columns_))
        return df

self = CategoricalTransformer()  # for testing later

In [8]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

def multiConfusion(CM,MN,cmap=plt.cm.Blues):
    numberPlots = len(CM)
    fig, axs = plt.subplots(1,numberPlots, figsize=(18,4), facecolor='w', edgecolor='k')
    fig.suptitle("Confusion Matrices \n Normalized by Class Size", fontsize=14,fontweight='bold') 
    
    axs = axs.ravel()
    
    for c,m,i in zip(CM,MN,range(len(MN))) :
        cm_normalized = c.astype('float') / c.sum(axis=1)[:, np.newaxis]
        axs[i].imshow(cm_normalized, interpolation='nearest', cmap=cmap)
        axs[i].set_title(m)
        tick_marks = np.arange(len(targetLabels)+1)
        axs[i].set_xticklabels(tick_marks, rotation=90)
        axs[i].set_yticklabels(tick_marks)
        axs[i].set_ylabel('True label')
        axs[i].set_xlabel('Predicted label')

