In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.neighbors
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test-full.csv')
train_df.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [3]:
data = train_df.T.to_dict()

In [4]:
train_df_input = train_df.drop(['Id', 'Cover_Type'], axis=1)
train_df_input.values.sum(0)

array([41559587,  2356218,   250338,  3453053,   775833, 25975823,
        3220996,  3309250,  2033294, 23093650,     3568,      569,
           6302,     4681,      339,      627,     1006,      839,
            181,      679,        1,        2,        4,     2096,
            376,      260,      513,      173,        0,      106,
            640,       44,       53,      132,       10,      332,
            742,      265,        6,       48,        8,        7,
           1308,      736,      304,      663,      619,       18,
            103,       14,       32,      744,      634,      456],
      dtype=int64)

In [5]:
test_df_input = test_df.drop(['Id'], axis=1)
test_df_input.values.sum(0)

array([1719426752,   90438473,    8194421,  156541027,   26969912,
       1365463383,  123259400,  129750854,   82810631, 1150572966,
           260796,      29884,     253364,      36968,       3031,
             7525,       4823,      12396,       1597,       6575,
              105,        179,       1147,      32634,      12410,
            29971,      17431,        599,          3,       2845,
             3422,       1899,       4021,       9259,        838,
            33373,      57752,      21278,        474,       2589,
             1086,        946,     115247,      30170,      25666,
            52519,      45154,       1611,       1891,        119,
              298,      15573,      13806,       8750], dtype=int64)

In [6]:
train_df_input = train_df_input.drop(train_df_input.columns[28], axis=1)
test_df_input = test_df_input.drop(train_df_input.columns[28], axis=1)

In [7]:
full_train_X = (train_df_input.values - train_df_input.values.mean(0)) / train_df_input.values.std(0)

pca = sklearn.decomposition.PCA(38)
pca.fit(full_train_X)

print(pca.explained_variance_ratio_.sum())

0.9001537314219382


In [8]:
def kfold_cv(X: pd.DataFrame,
             y: pd.DataFrame,
             n_splits: int = 10,
             apply_pca: bool = False,
             method: str = 'knn',
             n_neighbors: int = 1):
    kf = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=123456)
    all_splits = [i for i in kf.split(X)]

    train_accuracy = []
    val_accuracy = []

    if method == 'knn':
        model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    elif method == 'ncc':
        model = sklearn.neighbors.NearestCentroid()
    elif method == 'log':
        model = sklearn.linear_model.LogisticRegression(max_iter=500)

    for k in range(n_splits):
        train_indexes, val_indexes = all_splits[k]

        train_X = X.iloc[train_indexes].values
        train_X = (train_X - X.values.mean(0)) / X.values.std(0)
        train_y = y.iloc[train_indexes].values - 1

        val_X = X.iloc[val_indexes].values
        val_X = (val_X - X.values.mean(0)) / X.values.std(0)
        val_y = y.iloc[val_indexes].values - 1

        if apply_pca:
            train_X = pca.transform(train_X)
            val_X = pca.transform(val_X)

        model.fit(train_X, train_y)

        train_predictions = model.predict(train_X)
        val_predictions = model.predict(val_X)

        train_accuracy.append((train_predictions == train_y).mean())
        val_accuracy.append((val_predictions == val_y).mean())

    return (sum(train_accuracy) / len(train_accuracy), sum(val_accuracy) / len(val_accuracy))

In [9]:
methods = ['knn', 'ncc', 'log']

for method in methods:
    train_acc, val_acc = kfold_cv(X=train_df_input,
                                  y=train_df['Cover_Type'],
                                  n_splits=10,
                                  apply_pca=False,
                                  method=method,
                                  n_neighbors=1)
    print(method + ':', '{:.4f}'.format(train_acc), '{:.4f}'.format(val_acc))

knn: 1.0000 0.8095
ncc: 0.6036 0.5998
log: 0.7099 0.7046
