### Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import time

### Load Data

In [2]:
DATA_PATH = "../../data/Forest-Cover/"

train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

cat_columns = [col for col in train.columns if train[col].dtype == object]
print("Categorical columns:")
print(" --- ".join(cat_columns))

### Numerical columns
num_columns = [col for col in train.columns if train[col].dtype != object]
print("Numerical columns:")
print(" --- ".join(num_columns))
print()
print("Shape of train:", train.shape)
print("Shape of test:",  test.shape)

Categorical columns:

Numerical columns:
Id --- Elevation --- Aspect --- Slope --- Horizontal_Distance_To_Hydrology --- Vertical_Distance_To_Hydrology --- Horizontal_Distance_To_Roadways --- Hillshade_9am --- Hillshade_Noon --- Hillshade_3pm --- Horizontal_Distance_To_Fire_Points --- Wilderness_Area1 --- Wilderness_Area2 --- Wilderness_Area3 --- Wilderness_Area4 --- Soil_Type1 --- Soil_Type2 --- Soil_Type3 --- Soil_Type4 --- Soil_Type5 --- Soil_Type6 --- Soil_Type7 --- Soil_Type8 --- Soil_Type9 --- Soil_Type10 --- Soil_Type11 --- Soil_Type12 --- Soil_Type13 --- Soil_Type14 --- Soil_Type15 --- Soil_Type16 --- Soil_Type17 --- Soil_Type18 --- Soil_Type19 --- Soil_Type20 --- Soil_Type21 --- Soil_Type22 --- Soil_Type23 --- Soil_Type24 --- Soil_Type25 --- Soil_Type26 --- Soil_Type27 --- Soil_Type28 --- Soil_Type29 --- Soil_Type30 --- Soil_Type31 --- Soil_Type32 --- Soil_Type33 --- Soil_Type34 --- Soil_Type35 --- Soil_Type36 --- Soil_Type37 --- Soil_Type38 --- Soil_Type39 --- Soil_Type40 --- 

### Split into id, target, and predictors

In [3]:
train_y = train["Cover_Type"]
train_id = train["Id"]
train_x = train.drop(["Cover_Type", "Id"], axis=1)

test_id = test["Id"]
test_x  = test.drop("Id", axis=1)

full    = pd.concat([train_x, test_x])
train_N = len(train_x)

### Create Features

In [4]:
full['HF1'] = abs(full['Horizontal_Distance_To_Hydrology']   + full['Horizontal_Distance_To_Fire_Points'])
full['HF2'] = abs(full['Horizontal_Distance_To_Hydrology']   - full['Horizontal_Distance_To_Fire_Points'])
full['HR1'] = abs(full['Horizontal_Distance_To_Hydrology']   + full['Horizontal_Distance_To_Roadways'])
full['HR2'] = abs(full['Horizontal_Distance_To_Hydrology']   - full['Horizontal_Distance_To_Roadways'])
full['FR1'] = abs(full['Horizontal_Distance_To_Fire_Points'] + full['Horizontal_Distance_To_Roadways'])
full['FR2'] = abs(full['Horizontal_Distance_To_Fire_Points'] - full['Horizontal_Distance_To_Roadways'])
full['ele_vert'] = full.Elevation-full.Vertical_Distance_To_Hydrology

full['slope_hyd'] = (full['Horizontal_Distance_To_Hydrology']**2 + full['Vertical_Distance_To_Hydrology']**2)**0.5
full["slope_hyd"] = full.slope_hyd.map(lambda x: 0 if np.isinf(x) else x)

full['Mean_Amenities'] = (full.Horizontal_Distance_To_Fire_Points +
                          full.Horizontal_Distance_To_Hydrology +
                          full.Horizontal_Distance_To_Roadways) / 3 
full['Mean_Fire_Hyd']  = (full.Horizontal_Distance_To_Fire_Points +
                          full.Horizontal_Distance_To_Hydrology) / 2 

### Split back into train and test

In [5]:
train_x = full[:train_N]
test_x  = full[train_N:]

### Machine Learning Imports

In [6]:
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
                              ExtraTreesClassifier, VotingClassifier)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

TAKE_CV = False
first_layer_train = pd.DataFrame()
first_layer_preds = pd.DataFrame()

### Extra Trees 1

This works really well, so I'm going to create a second model with different params and random_state

In [9]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

etc_model_1 = ExtraTreesClassifier(
    n_estimators=25,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # old: max_features='auto'  -> use 'sqrt' (old "auto" behavior) or omit to take default
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,   # <- replaces min_impurity_split
    bootstrap=False,
    random_state=17,
    n_jobs=-1
)

if TAKE_CV:
    scores = cross_val_score(etc_model_1, train_x, train_y, cv=5, verbose=1)
    score_mean = round(np.mean(scores), 4)
    score_std  = round(np.std(scores), 3)
    print(f"Score is {score_mean} +/- {score_std}")

etc_model_1.fit(train_x, train_y)
first_layer_train["ETC1"] = etc_model_1.predict(train_x)
first_layer_preds["ETC1"] = etc_model_1.predict(test_x)


### Extra Trees 2

In [11]:
etc_model_2 = ExtraTreesClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # old: 'auto' -> use 'sqrt' (equivalent for classifiers) or omit
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=False,
    random_state=71,
    n_jobs=-1
)

if TAKE_CV:
    scores = cross_val_score(etc_model_2, train_x, train_y, cv=5, verbose=1)
    score_mean = round(np.mean(scores), 4)
    score_std  = round(np.std(scores), 3)
    print(f"Score is {score_mean} +/- {score_std}")

etc_model_2.fit(train_x, train_y)

first_layer_train["ETC2"] = etc_model_2.predict(train_x)
first_layer_preds["ETC2"] = etc_model_2.predict(test_x)


### XGB Model 1

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import numpy as np

# 1) Encode labels ONCE so they’re consistent across CV and final fit
le = LabelEncoder()
y_all = le.fit_transform(train_y)   # e.g., maps {1..7} -> {0..6}

# 2) Define a base model (note: learning_rate=1 is very high; consider 0.1)
base_xgb = XGBClassifier(
    max_depth=4,
    learning_rate=0.1,          # <-- more typical than 1.0
    n_estimators=400,           # <-- more trees when lowering learning_rate
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=0.6,
    colsample_bylevel=1,
    reg_alpha=3,
    reg_lambda=3,
    base_score=0.5,
    random_state=17,
    n_jobs=-1
)

# 3) Cross-validate (safe: cross_val_score clones internally)
if TAKE_CV:
    scores = cross_val_score(base_xgb, train_x, y_all, cv=5, verbose=1)
    print(f"Score is {scores.mean():.4f} +/- {scores.std():.3f}")

# 4) Fresh instance for the actual fit
xgb_model_1 = XGBClassifier(**base_xgb.get_params())
xgb_model_1.fit(train_x, y_all)

# 5) Predictions
# If you need original label space, inverse-transform:
first_layer_train["XGB1"] = le.inverse_transform(xgb_model_1.predict(train_x))
first_layer_preds["XGB1"] = le.inverse_transform(xgb_model_1.predict(test_x))


### XGB Model 2

This model is taken from [Siddharth Yadav](https://www.kaggle.com/thebrownviking20) and his excellent [kernel](https://www.kaggle.com/thebrownviking20/voting-classifier-for-victory)

In [21]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import numpy as np

# 1. Encode labels consistently
le = LabelEncoder()
y_all = le.fit_transform(train_y)   # maps [1..7] -> [0..6]

# 2. Define model
xgb_model_2 = XGBClassifier(
    max_depth=20,
    n_estimators=1000,
    random_state=71,
    n_jobs=-1
)

# 3. Cross-validation
if TAKE_CV:
    scores = cross_val_score(xgb_model_2, train_x, y_all, cv=5, verbose=1)
    print(f"Score is {scores.mean():.4f} +/- {scores.std():.3f}")

# 4. Final fit
xgb_model_2.fit(train_x, y_all)

# 5. Predictions (inverse transform to restore original labels if needed)
first_layer_train["XGB2"] = le.inverse_transform(xgb_model_2.predict(train_x))
first_layer_preds["XGB2"] = le.inverse_transform(xgb_model_2.predict(test_x))


### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

rfc_model = RandomForestClassifier(
    n_estimators=25,
    criterion='gini',
    max_depth=None,
    min_samples_split=6,
    min_samples_leaf=3,
    min_weight_fraction_leaf=0.0,
    # old: 'auto' → use 'sqrt' (equivalent for classifiers) or omit for default
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    random_state=17,
    n_jobs=-1
)

if TAKE_CV:
    scores = cross_val_score(rfc_model, train_x, train_y, cv=5, verbose=1)
    score_mean = round(np.mean(scores), 4)
    score_std  = round(np.std(scores), 3)
    print(f"Score is {score_mean} +/- {score_std}")

rfc_model.fit(train_x, train_y)

first_layer_train["RFC"] = rfc_model.predict(train_x)
first_layer_preds["RFC"] = rfc_model.predict(test_x)


### Light Gradient Boosting 1

In [24]:
lgb_model_1  = LGBMClassifier(num_leaves=45, max_depth=7,
                              learning_rate=0.3,
                              reg_lambda=0.5, reg_alpha=0.5,
                              min_split_gain=0.1, min_child_weight=0.5,
                              min_data_in_leaf=5,
                              feature_fraction=0.5,
                              random_state=17)

if TAKE_CV:
    scores = cross_val_score(lgb_model_1, train_x, train_y, cv=5, verbose=1)
    score_mean = round(np.mean(scores), 4)
    score_std  = round(np.std(scores), 3)
    print(f"Score is {score_mean} +/- {score_std}")

lgb_model_1.fit(train_x, train_y)

first_layer_train["LGB1"] = lgb_model_1.predict(train_x)
first_layer_preds["LGB1"] = lgb_model_1.predict(test_x);

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4728
[LightGBM] [Info] Number of data points in the train set: 15120, number of used features: 60
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910


### Light Gradient Boosting 2

In [30]:
from lightgbm import LGBMClassifier

lgb_model_2 = LGBMClassifier(
    num_leaves=70, max_depth=8,
    learning_rate=0.1,
    reg_lambda=1, reg_alpha=1,
    min_split_gain=0.1, min_child_weight=0.5,
    min_data_in_leaf=5,
    feature_fraction=0.3,
    random_state=71,
    verbosity=-1   # silence LightGBM logs
)

lgb_model_2.fit(train_x, train_y)


0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,8
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.1
,min_child_weight,0.5


### Light Gradient Boosting 3

This model is taken from [Siddharth Yadav](https://www.kaggle.com/thebrownviking20) and his excellent [kernel](https://www.kaggle.com/thebrownviking20/voting-classifier-for-victory)

In [31]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

lgb_model_3 = LGBMClassifier(
    n_estimators=2000,
    max_depth=12,          # was 15; shallower → faster
    num_leaves=256,        # cap leaves to match depth
    learning_rate=0.05,    # enable effective early stopping
    subsample=0.8,         # speed + regularization
    colsample_bytree=0.8,  # speed + regularization
    n_jobs=-1,
    verbosity=-1,          # silence all LightGBM logs
    random_state=171
)

# use a tiny validation split just to enable early stopping
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train_x, train_y, test_size=0.1, stratify=train_y, random_state=171)

lgb_model_3.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)]  # no logs
)

first_layer_train["LGB3"] = lgb_model_3.predict(train_x)
first_layer_preds["LGB3"]  = lgb_model_3.predict(test_x)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[190]	valid_0's multi_logloss: 0.263009


### Ada Boost 1

This model is taken from [Siddharth Yadav](https://www.kaggle.com/thebrownviking20) and his excellent [kernel](https://www.kaggle.com/thebrownviking20/voting-classifier-for-victory)

In [33]:
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier

ada_model_1 = AdaBoostClassifier(
    estimator=ExtraTreesClassifier(n_estimators=500),
    n_estimators=250,
    learning_rate=0.01,
    random_state=17
)


if TAKE_CV:
    scores = cross_val_score(ada_model_1, train_x, train_y, cv=5, verbose=1)
    score_mean = round(np.mean(scores), 4)
    score_std  = round(np.std(scores), 3)
    print(f"Score is {score_mean} +/- {score_std}")

ada_model_1.fit(train_x, train_y)

first_layer_train["ADA1"] = ada_model_1.predict(train_x)
first_layer_preds["ADA1"] = ada_model_1.predict(test_x);

### Ada Boost 2

This model is taken from [Siddharth Yadav](https://www.kaggle.com/thebrownviking20) and his excellent [kernel](https://www.kaggle.com/thebrownviking20/voting-classifier-for-victory)

In [34]:
ada_model_2 = AdaBoostClassifier(
    estimator=GradientBoostingClassifier(n_estimators=1000, max_depth=10),
    n_estimators=1000,
    learning_rate=0.01,
    random_state=17
)


if TAKE_CV:
    scores = cross_val_score(ada_model_2, train_x, train_y, cv=5, verbose=1)
    score_mean = round(np.mean(scores), 4)
    score_std  = round(np.std(scores), 3)
    print(f"Score is {score_mean} +/- {score_std}")

ada_model_2.fit(train_x, train_y)

first_layer_train["ADA2"] = ada_model_2.predict(train_x)
first_layer_preds["ADA2"] = ada_model_2.predict(test_x);

KeyboardInterrupt: 

### Ensemble Classifer

In [36]:
# enforce identical columns + order
cols = list(first_layer_train.columns)
X_tr = first_layer_train[cols]
X_te = first_layer_preds.reindex(columns=cols)  # drops extras, inserts missing as NaN

# (optional) if any NaNs appear because of missing test cols:
X_te = X_te.fillna(0)

voting_model = SVC()
if TAKE_CV:
    scores = cross_val_score(voting_model, X_tr, train_y, cv=5, verbose=1)
    print(f"Score is {scores.mean():.4f} +/- {scores.std():.3f}")

voting_model.fit(X_tr, train_y)
predictions = voting_model.predict(X_te)


### Save Predictions

In [41]:
import numpy as np
import pandas as pd

# 1) Ensure predictions are 1-D labels (not (n,1) or probs)
cover = np.asarray(predictions).squeeze()
if cover.ndim == 2:                      # e.g., probs -> take argmax
    idx = cover.argmax(axis=1)
    cover = voting_model.classes_[idx]

# 2) Build the submission (Id first, then Cover_Type)
sub = pd.DataFrame({"Id": test_id, "Cover_Type": cover})

# 3) Hard checks
assert len(sub) == 581_012, f"Expected 581012 rows, got {len(sub)}"
assert set(sub.columns) == {"Id", "Cover_Type"}
assert not sub.isna().any().any(), "NaNs found in submission"
# If `Id` must be unique:
assert sub["Id"].is_unique, "Duplicate Ids detected"

# 4) Save plain CSV (with header, no index)
sub.to_csv("submission.csv", index=False)

# 5) (Optional) ZIP or GZ archive for upload
# ZIP:
sub.to_csv("submission.zip", index=False, compression={"method":"zip","archive_name":"submission.csv"})
# GZ:
sub.to_csv("submission.csv.gz", index=False, compression="gzip")
