In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('datasets/covtype_train.csv')
test = pd.read_csv('datasets/covtype_test.csv')

### Punto 1

In [41]:
from sklearn.cross_validation import cross_val_score

#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

X_train = train.drop('Cover_Type', axis = 1)
X_test = test.drop('Cover_Type', axis = 1)
Y_train = train['Cover_Type']
Y_test = test['Cover_Type']

models = []
models.append(('DT', DecisionTreeClassifier()))
models.append(('BNB', BernoulliNB()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('GNB', GaussianNB()))

results = []
names = []

for name, model in models:
    model.fit(X_train, Y_train)
    scores = model.score(X_test, Y_test)
    names.append(name)
    results.append(results)
    msg = "{}: {:0.3f}% ({:0.3f}%)".format(name, scores.mean(), scores.std())
    print(msg)

DT: 0.838% (0.000%)
BNB: 0.451% (0.000%)
KNN: 0.713% (0.000%)
GNB: 0.505% (0.000%)


In [42]:
# Con cross-validation
dataset = train.append(test)
X = dataset.drop('Cover_Type', axis = 1)
Y = dataset['Cover_Type']

num_folds = 10
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    scores = cross_val_score(model, X, Y, cv = num_folds, scoring = scoring)
    names.append(name)
    results.append(results)
    msg = "{}: {:0.3f}% ({:0.3f}%)".format(name, scores.mean(), scores.std())
    print(msg)

DT: 0.880% (0.003%)
BNB: 0.618% (0.005%)
KNN: 0.833% (0.005%)
GNB: 0.623% (0.005%)


### Punto 2

In [43]:
"""
Reduce the column to be used by the estimator (with some ratio) and perform an
evaluation
"""
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_new['Vertical_Distance_To_Hydrology'] = scaler.fit_transform(X_new[['Vertical_Distance_To_Hydrology']])

selector = SelectKBest(score_func = chi2, k = 10)
selector.fit(X_new, Y)
X = selector.transform(X_new)

num_folds = 10
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    scores = cross_val_score(model, X_new, Y, cv = num_folds, scoring = scoring)
    names.append(name)
    results.append(results)
    msg = "{}: {:0.3f}% ({:0.3f}%)".format(name, scores.mean(), scores.std())
    print(msg)

DT: 0.881% (0.004%)
BNB: 0.622% (0.004%)
KNN: 0.832% (0.005%)
GNB: 0.627% (0.006%)


### Punto 3

In [51]:
"""
Try to improve the accuracy by discretizing some meaningful attributes (which one?)
"""
num_chunks = 10
name_chunks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
X['Elevation'] = pd.qcut(X['Elevation'], num_chunks, name_chunks)
X['Horizontal_Distance_To_Roadways'] = pd.qcut(X['Horizontal_Distance_To_Roadways'], num_chunks, name_chunks)
X['Aspect'] = pd.qcut(X['Aspect'], num_chunks, name_chunks)

num_folds = 10
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    scores = cross_val_score(model, X, Y, cv = num_folds, scoring = scoring)
    names.append(name)
    results.append(results)
    msg = "{}: {:0.3f}% ({:0.3f}%)".format(name, scores.mean(), scores.std())
    print(msg)

DT: 0.874% (0.003%)
BNB: 0.617% (0.005%)
KNN: 0.728% (0.005%)
GNB: 0.533% (0.004%)


In [50]:
dataset = train.append(test)
X = dataset.drop('Cover_Type', axis = 1)
Y = dataset['Cover_Type']

In [52]:
X

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,22023,2,6,6,390,-2,4,230,237,140,...,0,0,0,0,0,0,0,0,0,0
1,128690,5,6,10,324,54,8,236,236,130,...,0,0,0,0,0,0,0,0,0,0
2,440995,8,1,17,234,113,6,205,201,128,...,0,0,1,0,0,0,0,0,0,0
3,557114,7,6,15,218,28,1,243,230,112,...,0,0,0,0,0,0,0,0,0,0
4,180502,7,5,8,30,-2,9,234,235,133,...,0,0,0,0,0,0,0,0,0,0
5,326331,10,3,5,510,111,6,222,229,143,...,0,0,0,0,0,0,0,1,0,0
6,282108,6,9,21,268,-68,3,158,231,212,...,0,0,0,0,0,0,0,0,0,0
7,234752,5,9,2,306,26,5,215,237,160,...,0,1,0,0,0,0,0,0,0,0
8,553343,5,6,16,331,47,6,242,234,116,...,0,0,0,0,0,0,0,0,0,0
9,444670,6,1,18,437,173,4,197,202,138,...,0,0,0,0,0,0,0,0,0,0
