In [86]:
import pandas as pd
import numpy as np

import matplotlib
%matplotlib widget

import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from scipy.stats import mode

import seaborn as sns; 
sns.set_theme()

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

In [87]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [115]:

train_dataframe = pd.read_csv('data/train.csv')
test_dataframe = pd.read_csv('data/test.csv')

train_dataframe.describe()


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
count,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,...,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0
mean,2000000.0,2980.192,151.5857,15.09754,271.3154,51.66262,1766.642,211.8375,221.0614,140.8109,...,0.037462,0.03782075,0.011995,0.0160535,0.01071275,0.0122075,0.0407515,0.03923925,0.0316185,1.771335
std,1154701.0,289.0482,109.9611,8.546731,226.5497,68.21597,1315.61,30.75996,22.23134,43.69864,...,0.189891,0.1907625,0.1088629,0.1256813,0.1029465,0.1098111,0.197714,0.1941637,0.1749822,0.893806
min,0.0,1773.0,-33.0,-3.0,-92.0,-317.0,-287.0,-4.0,49.0,-53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,999999.8,2760.0,60.0,9.0,110.0,4.0,822.0,198.0,210.0,115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2000000.0,2966.0,123.0,14.0,213.0,31.0,1436.0,218.0,224.0,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,2999999.0,3217.0,247.0,20.0,361.0,78.0,2365.0,233.0,237.0,169.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,3999999.0,4383.0,407.0,64.0,1602.0,647.0,7666.0,301.0,279.0,272.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


In [116]:
# Remove the only Cover_Type 5 sample

train_dataframe = train_dataframe[train_dataframe.Cover_Type!=5]
# idx = train_dataframe[train_dataframe["Cover_Type"] == 5].index
# train_dataframe.drop(idx, axis=0, inplace=True)

# Droping same values columns
cols = ["Soil_Type7", "Soil_Type15"]

train_dataframe.drop(cols, axis=1, inplace=True)
test_dataframe.drop(cols, axis=1, inplace=True)

In [117]:
encoder = LabelEncoder()
train_dataframe["Cover_Type"] = encoder.fit_transform(train_dataframe["Cover_Type"])

In [118]:
train_dataframe["Aspect"][train_dataframe["Aspect"] < 0] += 360
train_dataframe["Aspect"][train_dataframe["Aspect"] > 359] -= 360

test_dataframe["Aspect"][test_dataframe["Aspect"] < 0] += 360
test_dataframe["Aspect"][test_dataframe["Aspect"] > 359] -= 360

train_dataframe["Aspect"] *= 2*np.pi/360
test_dataframe["Aspect"] *= 2*np.pi/360

In [119]:
# Manhhattan Distance
train_dataframe["Manhhattan_Distance"] = np.abs(train_dataframe["Horizontal_Distance_To_Hydrology"])+np.abs(train_dataframe["Vertical_Distance_To_Hydrology"])
test_dataframe["Manhhattan_Distance"] = np.abs(test_dataframe["Horizontal_Distance_To_Hydrology"])+np.abs(test_dataframe["Vertical_Distance_To_Hydrology"])

# Euclidian Distance
train_dataframe["Euclidian_Distance"] = (train_dataframe["Horizontal_Distance_To_Hydrology"]**2 + train_dataframe["Vertical_Distance_To_Hydrology"]**2)**0.5
test_dataframe["Euclidian_Distance"] = (test_dataframe["Horizontal_Distance_To_Hydrology"]**2 + test_dataframe["Vertical_Distance_To_Hydrology"]**2)**0.5


In [120]:
soil_features = [x for x in train_dataframe.columns if x.startswith("Soil_Type")]
train_dataframe["Soil_Type_Count"] = train_dataframe[soil_features].sum(axis=1)
test_dataframe["Soil_Type_Count"] = test_dataframe[soil_features].sum(axis=1)

wilderness_features = [x for x in train_dataframe.columns if x.startswith("Wilderness_Area")]
train_dataframe["Wilderness_Area_Count"] = train_dataframe[wilderness_features].sum(axis=1)
test_dataframe["Wilderness_Area_Count"] = test_dataframe[wilderness_features].sum(axis=1)

In [121]:
train_dataframe.loc[train_dataframe["Hillshade_9am"] < 0, "Hillshade_9am"] = 0
test_dataframe.loc[test_dataframe["Hillshade_9am"] < 0, "Hillshade_9am"] = 0

train_dataframe.loc[train_dataframe["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0
test_dataframe.loc[test_dataframe["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0

train_dataframe.loc[train_dataframe["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0
test_dataframe.loc[test_dataframe["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0

train_dataframe.loc[train_dataframe["Hillshade_9am"] > 255, "Hillshade_9am"] = 255
test_dataframe.loc[test_dataframe["Hillshade_9am"] > 255, "Hillshade_9am"] = 255

train_dataframe.loc[train_dataframe["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255
test_dataframe.loc[test_dataframe["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255

train_dataframe.loc[train_dataframe["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255
test_dataframe.loc[test_dataframe["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255

In [122]:

cols = [
    "Elevation",
    "Aspect",
    "Manhhattan_Distance",
    "Euclidian_Distance",
    "Soil_Type_Count",
    "Wilderness_Area_Count",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

scaler = RobustScaler()

train_dataframe[cols] = scaler.fit_transform(train_dataframe[cols])
test_dataframe[cols] = scaler.transform(test_dataframe[cols])

In [123]:
train_dataframe = reduce_mem_usage(train_dataframe)
test_dataframe = reduce_mem_usage(test_dataframe)

Memory usage after optimization is: 445.62 MB
Decreased by 76.9%
Memory usage after optimization is: 70.57 MB
Decreased by 83.8%


In [124]:
# train_dataframe = train_dataframe.dropna()

train_dataframe.head()
train_dataframe.describe()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Manhhattan_Distance,Euclidian_Distance,Soil_Type_Count,Wilderness_Area_Count
count,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,...,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0
mean,1999999.0,,,,,,,,,,...,0.01071275,0.0122075,0.04075151,0.03923926,0.03161851,0.7529129,,,,
std,1154701.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1029466,0.1098111,0.197714,0.1941637,0.1749822,0.7937505,0.0,0.0,0.0,0.0
min,0.0,-2.611328,-0.6557617,-1.545898,-1.214844,-4.703125,-1.116211,-6.230469,-6.480469,-2.628906,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.014648,-0.9697266,-1.0,-1.0
25%,999999.5,-0.4506836,-0.333252,-0.4545898,-0.4104004,-0.3647461,-0.3979492,-0.5712891,-0.5185547,-0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.4216309,-0.4130859,-1.0,0.0
50%,1999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,2999998.0,0.5493164,0.6665039,0.5454102,0.5898438,0.6352539,0.6020508,0.4284668,0.4814453,0.5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.578125,0.5869141,0.0,0.0
max,3999999.0,3.101562,1.305664,4.546875,5.535156,8.328125,4.039062,1.057617,1.148438,2.091797,...,1.0,1.0,1.0,1.0,1.0,5.0,6.125,5.71875,6.0,2.0


In [125]:
# train_dataframe = train_dataframe[train_dataframe["Cover_Type"] != 3]

X = train_dataframe.drop("Cover_Type", axis=1).values
y = train_dataframe["Cover_Type"].values

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

classes_number = len(train_dataframe["Cover_Type"].unique())

In [126]:

print(classes_number, train_dataframe["Cover_Type"].unique())

print(0,len(train_dataframe[train_dataframe["Cover_Type"] == 0]))
print(1,len(train_dataframe[train_dataframe["Cover_Type"] == 1]))
print(2,len(train_dataframe[train_dataframe["Cover_Type"] == 2]))
print(3,len(train_dataframe[train_dataframe["Cover_Type"] == 3]))
print(4,len(train_dataframe[train_dataframe["Cover_Type"] == 4]))
print(5,len(train_dataframe[train_dataframe["Cover_Type"] == 5]))

print(np.unique(y_train))

6 [0 1 2 4 5 3]
0 1468136
1 2262087
2 195712
3 377
4 11426
5 62261
[0 1 2 3 4 5]


In [127]:

params = {
    'learning_rate': 0.02, 
    'reg_lambda': 17.1, 
    'reg_alpha': 1.2e-04, 
    'subsample': 0.25, 
    'colsample_bytree': 0.92, 
    'max_depth': 12,
    'grow_policy': 'depthwise'
}

model = XGBClassifier(
    
    random_state = 69,
    tree_method='gpu_hist',
    objective = 'multi:softmax',
    sampling_method = 'gradient_based',
    n_estimators=10000,
    n_jobs=-1,
    num_class = classes_number,
    use_label_encoder=False,
    eval_metric = 'mlogloss',
    **params
)

In [128]:
model.fit(x_train, y_train, early_stopping_rounds=10, eval_set=[(x_val, y_val)], verbose=10)

[0]	validation_0-mlogloss:1.73888
[10]	validation_0-mlogloss:1.35302
[20]	validation_0-mlogloss:1.07940
[30]	validation_0-mlogloss:0.88651
[40]	validation_0-mlogloss:0.73224
[50]	validation_0-mlogloss:0.61449
[60]	validation_0-mlogloss:0.51985
[70]	validation_0-mlogloss:0.44328
[80]	validation_0-mlogloss:0.38275
[90]	validation_0-mlogloss:0.33110
[100]	validation_0-mlogloss:0.28882
[110]	validation_0-mlogloss:0.25595
[120]	validation_0-mlogloss:0.22719
[130]	validation_0-mlogloss:0.20361
[140]	validation_0-mlogloss:0.18439
[150]	validation_0-mlogloss:0.16915
[160]	validation_0-mlogloss:0.15604
[170]	validation_0-mlogloss:0.14496
[180]	validation_0-mlogloss:0.13577
[190]	validation_0-mlogloss:0.12774
[200]	validation_0-mlogloss:0.12100
[210]	validation_0-mlogloss:0.11539
[220]	validation_0-mlogloss:0.11077
[230]	validation_0-mlogloss:0.10676
[240]	validation_0-mlogloss:0.10344
[250]	validation_0-mlogloss:0.10059
[260]	validation_0-mlogloss:0.09800
[270]	validation_0-mlogloss:0.09574
[28

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.92,
              enable_categorical=False, eval_metric='mlogloss', gamma=0,
              gpu_id=0, grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=12, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10000, n_jobs=-1,
              num_class=6, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=69, reg_alpha=0.00012,
              reg_lambda=17.1, sampling_method='gradient_based',
              scale_pos_weight=None, subsample=0.25, ...)

In [129]:
model.save_model("xgboost.json")

In [130]:
val_predictions = encoder.inverse_transform(model.predict(x_val))
test_predictions = encoder.inverse_transform(model.predict(test_dataframe))

print(accuracy_score(y_val, val_predictions))

0.014965


In [131]:

submission_df = pd.read_csv("data/sample_submission.csv")

test_ids = test_dataframe.Id.values.tolist()
print(len(test_predictions))
print(len(test_ids))
submission_df = pd.DataFrame(list(zip(test_ids, test_predictions)), columns=["Id", "Cover_Type"])

submission_df.columns = ["Id", "Cover_Type"]
submission_df.to_csv("submission.csv", index=False)
submission_df.head()

1000000
1000000


Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2
