In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# sklearn.cross_validation is the legacy module (deprecated in scikit-learn 0.18,
# removed in 0.20); sklearn.model_selection is its replacement.
from sklearn import cross_validation
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
%matplotlib inline
# Seed NumPy's global RNG once, up front. scikit-learn splitters and estimators
# that are not given an explicit random_state draw from this global generator,
# so a fresh Restart & Run All reproduces the same numbers.
SEED = 9
np.random.seed(SEED)

In [None]:
# Load the CSV file named by `fname` (path relative to the working directory)
# into the DataFrame `df` used by the cells below.
fname = 'train.csv'
df = pd.read_csv(filepath_or_buffer=fname)

In [None]:
# Quick exploratory look at `df`: pairwise correlations, column dtypes and
# class balance.

# Heat-map of the pairwise correlation matrix with the colour scale pinned to
# [-1, 1]. (Author's observation: the majority of variables are not
# significantly correlated.)
corr = df.corr()
fig, ax = plt.subplots()
cax = ax.matshow(corr, interpolation='none', vmax=1, vmin=-1)
fig.colorbar(cax)
fig.suptitle('Correlation matrix')
plt.show()

# Column dtypes. (Author's observation: all integers.)
print(df.dtypes)

# Number of rows per Cover_Type class. (Author's observation: 2160 of each.)
print(df.groupby('Cover_Type').size())

In [None]:
# Split data for testing and validation.
#
# Features and target are picked by column name instead of by position, so that
# no feature column is skipped and the target can never leak into X. (A
# positional slice of columns 0..53 with the target at index 55 leaves column 54
# unused and treats column 0 as a feature.)
# NOTE(review): column 0 is presumably a row identifier named 'Id'; it is
# excluded only if a column with that exact name exists - confirm against
# train.csv.
target_col = 'Cover_Type'
feature_cols = [col for col in df.columns if col not in ('Id', target_col)]

array = df.values  # whole table as an ndarray; kept for any later cell that uses it
X = df[feature_cols].values
Y = df[target_col].values
assert X.shape[0] == Y.shape[0], "features and target must have the same number of rows"

# Hold out 30% of the rows for validation. The explicit random_state makes the
# split identical every time this cell is re-run, independent of how much of
# the global NumPy RNG stream has already been consumed.
Xtrn, Xval, Ytrn, Yval = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=9
)

In [None]:
# Spot-checking ensemble methods: score each (name, estimator) pair in
# `ensembles` with 10-fold cross-validation on the training split and collect
# the per-fold scores. Uncomment entries in `ensembles` to compare more models.
scoring = 'accuracy'
names = []
results = []
estimators = []
ensembles = [
    #('Adaboost', AdaBoostClassifier()),
    ('Gradboost', GradientBoostingClassifier()),
    #('Randfor', RandomForestClassifier()),
    #('Extratree', ExtraTreesClassifier())
]

# The fold definition does not depend on the model, so build it once.
# model_selection.KFold takes only the number of splits; the sample count is
# taken from the data handed to cross_val_score.
kfold = model_selection.KFold(n_splits=10)
for name, model in ensembles:
    cvres = model_selection.cross_val_score(model, Xtrn, Ytrn, cv=kfold, scoring=scoring)
    results.append(cvres)
    names.append(name)
    estimators.append(model)
    # name: mean score across folds, standard deviation across folds
    print("%s: %f, %f" % (name, cvres.mean(), cvres.std()))