In [20]:
from xgboost import XGBClassifier

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
! ls ../input

In [76]:
raw_train = pd.read_csv('../input/train.csv', index_col='Id')
raw_test = pd.read_csv('../input/test.csv', index_col='Id')

In [114]:
X_train = raw_train.copy().drop(['Cover_Type'], axis=1)
y_train = raw_train.copy()['Cover_Type']
X_test = raw_test.copy()

In [115]:
X_train.nunique().sort_values()[:3]

Soil_Type7     1
Soil_Type15    1
Soil_Type13    2
dtype: int64

In [116]:
X_train = X_train.drop(['Soil_Type7', 'Soil_Type15'], axis=1)
X_test = X_test.drop(['Soil_Type7', 'Soil_Type15'], axis=1)

# Adding new features

In [117]:
# Sin and cos for Aspect angle
X_train['Aspect_sin'] = np.sin(np.radians(X_train['Aspect']))
X_train['Aspect_cos'] = np.cos(np.radians(X_train['Aspect']))
X_test['Aspect_sin'] = np.sin(np.radians(X_test['Aspect']))
X_test['Aspect_cos'] = np.cos(np.radians(X_test['Aspect']))

In [118]:
# Categories for Aspect (see next cell)
# for df in [X_train, X_test]:
#     df['Aspect_North'] = (df['Aspect'] <= 45) | (df['Aspect'] >= 315)
#     df['Aspect_East'] = (df['Aspect'] >= 45) & (df['Aspect'] <= 135)
#     df['Aspect_South'] = (df['Aspect'] >= 135) & (df['Aspect'] <= 225)
#     df['Aspect_West'] = (df['Aspect'] >= 225) & (df['Aspect'] <= 315)

In [119]:
# Categories for Aspect
for df in [X_train, X_test]:
    df['Aspect_N'] = (df['Aspect'] <= 22.5) | (df['Aspect'] >= 337.5)
    df['Aspect_NE'] = (df['Aspect'] >= 22.5) & (df['Aspect'] <= 67.5)
    df['Aspect_E'] = (df['Aspect'] >= 67.5) & (df['Aspect'] <= 112.5)
    df['Aspect_SE'] = (df['Aspect'] >= 112.5) & (df['Aspect'] <= 157.5)
    df['Aspect_S'] = (df['Aspect'] >= 157.5) & (df['Aspect'] <= 202.5)
    df['Aspect_SW'] = (df['Aspect'] >= 202.5) & (df['Aspect'] <= 247.5)
    df['Aspect_W'] = (df['Aspect'] >= 247.5) & (df['Aspect'] <= 292.5)
    df['Aspect_NW'] = (df['Aspect'] >= 292.5) & (df['Aspect'] <= 337.5)

In [120]:
# Distance to hydrology
# for df in [X_train, X_test]:
#     df['Distance_To_Hydrology'] = np.sqrt(df['Horizontal_Distance_To_Hydrology']**2 
#                                           + df['Vertical_Distance_To_Hydrology']**2)

In [121]:
# Stony soils
stony_soils_indices = [1, 2, 6, 9, 12, 18, 24, 25, 26, 27, 28, 
                       29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40]
for df in [X_train, X_test]:
    df['Stony_Soil'] = pd.Series(np.zeros(len(df)), dtype='bool')
    for index in stony_soils_indices:
        df['Stony_Soil'] = df['Stony_Soil'] | df['Soil_Type' + str(index)]

In [107]:
# Rubbly soils
# rubbly_soils_indices = [3, 4, 5, 10, 11, 13]
# for df in [X_train, X_test]:
#     df['Rubbly_Soil'] = pd.Series(np.zeros(len(df)), dtype='bool')
#     for index in rubbly_soils_indices:
#         df['Rubbly_Soil'] = df['Rubbly_Soil'] | df['Soil_Type' + str(index)]

In [108]:
# Rock outcrop
# rock_outcrop_indices = [1, 3, 4, 5, 6, 10, 27, 28, 32, 33, 35, 37]
# for df in [X_train, X_test]:
#     df['Rock_Outcrop'] = pd.Series(np.zeros(len(df)), dtype='bool')
#     for index in rock_outcrop_indices:
#         df['Rock_Outcrop'] = df['Rock_Outcrop'] | df['Soil_Type' + str(index)]

In [109]:
# Catamount
# catamount_indices = [10, 11, 13, 26, 31, 32, 33]
# for df in [X_train, X_test]:
#     df['Catamount'] = pd.Series(np.zeros(len(df)), dtype='bool')
#     for index in catamount_indices:
#         df['Catamount'] = df['Catamount'] | df['Soil_Type' + str(index)]

In [110]:
# RockLand
# rockland_indices = [11, 12, 13, 34, 36, 40]
# for df in [X_train, X_test]:
#     df['RockLand'] = pd.Series(np.zeros(len(df)), dtype='bool')
#     for index in rockland_indices:
#         df['RockLand'] = df['RockLand'] | df['Soil_Type' + str(index)]

In [122]:
# Leighcan soils
leighcan_indices = [21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 38, 39]
for df in [X_train, X_test]:
    df['Leighcan_Soil'] = pd.Series(np.zeros(len(df)), dtype='bool')
    for index in leighcan_indices:
        df['Leighcan_Soil'] = df['Leighcan_Soil'] | df['Soil_Type' + str(index)]

# Model fitting

In [123]:
model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=11, tree_method='gpu_hist')

In [124]:
import time

start = time.time()
cv_folds = 5
scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring='accuracy')
print('Mean accuracy: {}'.format(scores.mean()))
print('Standard error: {}'.format(scores.std() / cv_folds**0.5))
end = time.time()
print('Time: {}'.format(end - start))

Mean accuracy: 0.793915343915344
Standard error: 0.014419324242856017
Time: 98.40036249160767


In [None]:
scores

In [125]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

In [126]:
importances = pd.DataFrame({'feature':X_train.columns,
                            'importance':np.round(model.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.head()

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
Soil_Type3,0.069
Soil_Type12,0.068
Elevation,0.064
Wilderness_Area4,0.064
Soil_Type10,0.059


In [127]:
predictions = model.predict(X_test)
preds_df = pd.DataFrame({'Id': X_test.index, 'Cover_Type': predictions})
preds_df.to_csv('submission5.csv', index=False)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
