In [1]:
import os

import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import sklearn
import sklearn.ensemble
import sklearn.model_selection
import sklearn.neighbors
import sklearn.tree
import torch
import xgboost

from models.embed_mlp import EmbedMLPModel

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the training split and the full test split from the data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test-full.csv')
)

In [3]:
# Training features: everything except the row identifier and the target column.
train_df_input = train_df.drop(columns=['Id', 'Cover_Type'])
# Per-column totals over all training rows (displayed as the cell output).
train_df_input.to_numpy().sum(axis=0)

array([41559587,  2356218,   250338,  3453053,   775833, 25975823,
        3220996,  3309250,  2033294, 23093650,     3568,      569,
           6302,     4681,      339,      627,     1006,      839,
            181,      679,        1,        2,        4,     2096,
            376,      260,      513,      173,        0,      106,
            640,       44,       53,      132,       10,      332,
            742,      265,        6,       48,        8,        7,
           1308,      736,      304,      663,      619,       18,
            103,       14,       32,      744,      634,      456],
      dtype=int64)

In [4]:
# Test features: everything except the row identifier (the test split has no target).
test_df_input = test_df.drop(columns=['Id'])
# Per-column totals over all test rows (displayed as the cell output).
test_df_input.to_numpy().sum(axis=0)

array([1719426752,   90438473,    8194421,  156541027,   26969912,
       1365463383,  123259400,  129750854,   82810631, 1150572966,
           260796,      29884,     253364,      36968,       3031,
             7525,       4823,      12396,       1597,       6575,
              105,        179,       1147,      32634,      12410,
            29971,      17431,        599,          3,       2845,
             3422,       1899,       4021,       9259,        838,
            33373,      57752,      21278,        474,       2589,
             1086,        946,     115247,      30170,      25666,
            52519,      45154,       1611,       1891,        119,
              298,      15573,      13806,       8750], dtype=int64)

In [5]:
# Feature column 28 sums to zero over the training rows (see the column sums
# printed above), so it is removed from both splits.
#
# The column label is resolved ONCE, from the untouched `train_df`. Looking it
# up on `train_df_input` again after the first drop would return the label of
# the *next* column, so train and test would lose different features and end up
# misaligned.
constant_column = train_df.columns.drop(['Id', 'Cover_Type'])[28]

# errors='ignore' keeps the cell idempotent: re-running it is a no-op instead
# of dropping yet another column.
train_df_input = train_df_input.drop(columns=[constant_column], errors='ignore')
test_df_input = test_df_input.drop(columns=[constant_column], errors='ignore')

# Later cells slice features by position, so both splits must expose the same
# columns in the same order.
assert list(train_df_input.columns) == list(test_df_input.columns), \
    'train and test feature columns differ'

In [17]:
# Standardise every feature with statistics taken from the training split only;
# the very same mean / std are then applied to the test split.
train_values = train_df_input.values
train_mean = train_values.mean(0)
train_std = train_values.std(0)

full_train_X = (train_values - train_mean) / train_std
# Targets are shifted down by one here; predictions get +1 added back before
# they are written to the submission.
full_train_y = train_df['Cover_Type'].values - 1

full_test_X = (test_df_input.values - train_mean) / train_std

label_encoding = False

if label_encoding:
    def _collapse_indicator_groups(features):
        """Return a 12-column copy of `features` with two column groups collapsed.

        The first 10 columns are copied as-is. Columns [-43:-39] and [-39:] are
        each replaced by the position of their row-wise maximum.
        NOTE(review): presumably these are the one-hot wilderness-area and
        soil-type groups - confirm against the CSV header.
        """
        encoded = np.zeros((features.shape[0], 12))
        encoded[:, :-2] = features[:, :10]
        encoded[:, -2] = np.argmax(features[:, -43:-39], axis=1)
        encoded[:, -1] = np.argmax(features[:, -39:], axis=1)
        return encoded

    new_train_X = _collapse_indicator_groups(full_train_X)
    new_test_X = _collapse_indicator_groups(full_test_X)

    full_test_X = new_test_X
    full_train_X = new_train_X

# Neighbor-based methods

In [7]:
# Disabled baseline: 1-nearest-neighbour classifier scored with 10-fold cross-validation
# (accuracy, train scores included). Uncomment the line below to run it.
# scores = sklearn.model_selection.cross_validate(sklearn.neighbors.KNeighborsClassifier(n_neighbors=1), full_train_X, train_df['Cover_Type'], cv=10, scoring='accuracy', return_train_score=True, verbose=2)

# Trees, Boosting

### Random Forest

In [8]:
perform_grid_search = False

if perform_grid_search:
    # Search space: forest size, tree depth, per-split feature budget and split criterion.
    rf_parameters = dict(
        n_estimators=[80, 100, 150, 300],
        max_depth=[6, 10, 20, None],
        max_features=['sqrt', 'log2', 0.5, None],
        criterion=['gini', 'entropy', 'log_loss'],
    )

    # 5-fold cross-validated exhaustive search, ranked by accuracy.
    grid_search_random_forest = sklearn.model_selection.GridSearchCV(
        estimator=sklearn.ensemble.RandomForestClassifier(),
        param_grid=rf_parameters,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )
    grid_search_random_forest.fit(full_train_X, full_train_y)

In [9]:
perform_cv = False

if perform_cv:
    # 5-fold accuracy of a random forest with library-default hyper-parameters.
    scores_random_forest = sklearn.model_selection.cross_validate(
        sklearn.ensemble.RandomForestClassifier(),
        full_train_X,
        full_train_y,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )

    mean_test_accuracy = scores_random_forest['test_score'].mean()
    print(mean_test_accuracy)

### AdaBoost

In [10]:
perform_cv = False

if perform_cv:
    # AdaBoost over depth-15 decision trees: 100 boosting rounds, learning rate 2.0.
    adaboost_classifier = sklearn.ensemble.AdaBoostClassifier(
        sklearn.tree.DecisionTreeClassifier(max_depth=15),
        n_estimators=100,
        learning_rate=2.0,
    )

    scores_adaboost = sklearn.model_selection.cross_validate(
        adaboost_classifier,
        full_train_X,
        full_train_y,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )

    mean_test_accuracy = scores_adaboost['test_score'].mean()
    print(mean_test_accuracy)

### XGBoost

In [11]:
perform_grid_search = False

if perform_grid_search:
    # Narrow search over tree depth and learning rate.
    xgb_parameters = dict(
        max_depth=[12, 13, 14],
        learning_rate=[0.22, 0.25, 0.27],
    )

    # 5-fold cross-validated exhaustive search, ranked by accuracy.
    grid_search_xgb = sklearn.model_selection.GridSearchCV(
        estimator=xgboost.XGBClassifier(verbosity=0, use_label_encoder=False),
        param_grid=xgb_parameters,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )
    grid_search_xgb.fit(full_train_X, full_train_y)

Best found: ``{'learning_rate': 0.25, 'max_depth': 14}``

In [12]:
perform_cv = False

if perform_cv:
    # XGBoost with max_depth=14 and learning_rate=0.25, scored with 5-fold accuracy.
    xgb_settings = dict(verbosity=0, use_label_encoder=False, max_depth=14, learning_rate=0.25)

    scores_xgboost = sklearn.model_selection.cross_validate(
        xgboost.XGBClassifier(**xgb_settings),
        full_train_X,
        full_train_y,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )

    mean_test_accuracy = scores_xgboost['test_score'].mean()
    print(mean_test_accuracy)

### LightGBM

In [13]:
perform_grid_search = False

if perform_grid_search:
    # Candidate LightGBM settings; n_estimators is pinned to a single value.
    lightgbm_parameters = dict(
        max_depth=[15, 20, 25, 30, -1],
        colsample_bytree=[0.8, 1.0],
        learning_rate=[0.25, 0.3, 0.35, 0.4],
        num_leaves=[25, 31, 40, 50, 62],
        reg_lambda=[0.0, 1.0],
        min_child_samples=[10, 20, 30],
        n_estimators=[200],
    )

    # 5-fold cross-validated exhaustive search, ranked by accuracy.
    grid_search_lightgbm = sklearn.model_selection.GridSearchCV(
        estimator=lightgbm.sklearn.LGBMClassifier(),
        param_grid=lightgbm_parameters,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )
    grid_search_lightgbm.fit(full_train_X, full_train_y)

In [14]:
perform_cv = False

if perform_cv:
    # Fixed LightGBM configuration scored with 5-fold accuracy.
    lightgbm_settings = dict(
        max_depth=25,
        learning_rate=0.25,
        n_estimators=200,
        num_leaves=62,
        colsample_bytree=1.0,
        reg_lambda=0.0,
        min_child_samples=10,
    )

    scores_lightgbm = sklearn.model_selection.cross_validate(
        lightgbm.sklearn.LGBMClassifier(**lightgbm_settings),
        full_train_X,
        full_train_y,
        scoring='accuracy',
        cv=5,
        verbose=2,
    )

    mean_test_accuracy = scores_lightgbm['test_score'].mean()
    print(mean_test_accuracy)

# Make submission

In [15]:
use_dl = False

if use_dl:
    # Folder holding one checkpoint per cross-validation run of the EmbedMLP model.
    ckpt_folder = 'checkpoints/EmbedMLP_53_5_256_LR1E-3expGamma0.985_embed10_dropout0.5_AdamW_CV/checkpoints'

    # sorted() makes the checkpoint order deterministic across file systems.
    ckpt_names = sorted(os.listdir(ckpt_folder))
    if not ckpt_names:
        raise FileNotFoundError(f'no checkpoints found in {ckpt_folder!r}')

    # The test inputs are identical for every checkpoint, so build the tensor once.
    test_inputs = torch.tensor(full_test_X)

    dl_results = []
    for ckpt_name in ckpt_names:
        ckpt_path = os.path.join(ckpt_folder, ckpt_name)
        # load_from_checkpoint is invoked on the class rather than on a throw-away
        # instance. NOTE(review): assumes EmbedMLPModel is a Lightning module, whose
        # load_from_checkpoint is a classmethod - confirm.
        model = EmbedMLPModel.load_from_checkpoint(ckpt_path)
        model.eval()

        # Inference only: skip autograd bookkeeping to save memory on the full test set.
        with torch.no_grad():
            test_results = model((test_inputs, -1))

        # Class indices are 0-based in the model; +1 restores the original labels.
        predictions = torch.argmax(test_results, dim=1).numpy() + 1
        dl_results.append(predictions)

    # Majority vote per sample across checkpoints. scipy.stats.mode returns shape
    # (1, n) on older SciPy and (n,) once keepdims defaults to False; np.ravel
    # yields a flat (n,) vector in both cases, whereas `.mode[0]` would collapse
    # to a single scalar on newer SciPy.
    dl_predictions = np.ravel(scipy.stats.mode(np.array(dl_results), axis=0).mode)

In [16]:
def label_encode_categoricals(features):
    """Return a 12-column copy of `features` with two column groups collapsed.

    The first 10 columns are copied unchanged. Columns [-43:-39] and [-39:] are
    each replaced by the position of their row-wise maximum.
    NOTE(review): presumably these are the one-hot wilderness-area and soil-type
    groups - confirm against the CSV header.
    """
    encoded = np.zeros((features.shape[0], 12))
    encoded[:, :-2] = features[:, :10]
    encoded[:, -2] = np.argmax(features[:, -43:-39], axis=1)
    encoded[:, -1] = np.argmax(features[:, -39:], axis=1)
    return encoded


def majority_vote(predictions):
    """Return the most frequent label per sample across several prediction vectors.

    `predictions` is a sequence of equally long 1-D label arrays. Ties resolve
    to the smallest label (scipy.stats.mode behaviour).
    """
    stacked = np.array(predictions)
    # scipy.stats.mode returns shape (1, n) on older SciPy and (n,) once keepdims
    # defaults to False; np.ravel gives a flat (n,) vector either way, whereas
    # `.mode[0]` would collapse to a single scalar on newer SciPy.
    return np.ravel(scipy.stats.mode(stacked, axis=0).mode)


make_submission = False

if make_submission:
    # The tree models are trained on the label-encoded (12-column) representation.
    # Note: this overwrites the global feature matrices, as the original cell did.
    if full_train_X.shape[1] > 12:
        full_train_X = label_encode_categoricals(full_train_X)
        full_test_X = label_encode_categoricals(full_test_X)

    results = []

    model = lightgbm.sklearn.LGBMClassifier(max_depth=25, learning_rate=0.25, n_estimators=200, num_leaves=62, colsample_bytree=1.0, reg_lambda=0.0, min_child_samples=10)
    model.fit(full_train_X, full_train_y)
    # Models are trained on 0-based targets; +1 restores the original labels.
    results.append(model.predict(full_test_X) + 1)

    model = xgboost.XGBClassifier(verbosity=0, use_label_encoder=False, max_depth=14, learning_rate=0.25)
    model.fit(full_train_X, full_train_y)
    results.append(model.predict(full_test_X) + 1)

    # dl_predictions only exists when the deep-learning cell ran with use_dl=True;
    # appending it unconditionally raised a NameError otherwise.
    if use_dl:
        results.append(dl_predictions)

    test_ids = test_df['Id']
    final_predictions = majority_vote(results)
    submission_df = pd.DataFrame(data={'Cover_Type': final_predictions})
    submission_df = pd.concat([test_ids, submission_df], axis=1)
    submission_df.to_csv('data/submission.csv', index=False)