In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

from scipy.stats import mode

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import class_weight

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

from xgboost import XGBClassifier, DMatrix, train

import lightgbm as lgb

import matplotlib.pyplot as plt

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is int16/int32/int64 are converted to the smallest of
    int8/int16/int32 whose range contains the column's min and max; columns
    whose dtype is float16/float32/float64 are converted to float16 or float32
    on the same range test, falling back to float64. All other columns are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory footprint after the conversion and the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-site convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # Bounds are inclusive: a column topping out at exactly 127 still
            # fits int8. Columns that need the full int64 range stay as-is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: only the value *range* is checked. float16 keeps roughly
            # three significant decimal digits, so this trades precision for
            # memory. NaN/inf extremes fail every comparison and fall through
            # to float64.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte starting footprint cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:

# Read the competition's train and test CSVs; the first path feeds
# train_dataframe, the second test_dataframe.
train_dataframe, test_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-dec-2021/train.csv',
        '../input/tabular-playground-series-dec-2021/test.csv',
    )
)

# Summary statistics of the raw training columns (rendered as the cell output).
train_dataframe.describe()


In [None]:
def feature_engineer(dataframe):
    """Return a copy of ``dataframe`` with cleaned and derived features.

    The input frame is not modified. Steps, in order:

    * drop ``Soil_Type7`` and ``Soil_Type15``;
    * wrap ``Aspect`` into the range [0, 360);
    * add Manhattan and Euclidean distance to hydrology;
    * add per-row counts of the remaining ``Soil_Type*`` and
      ``Wilderness_Area*`` columns;
    * clip the three hillshade columns to [0, 255] and add their sum;
    * add ``Hydro_Elevation`` (elevation minus vertical distance to hydrology)
      and ``Binned_Elevation`` (elevation in 50-unit bins);
    * clip negative roadway / fire-point distances to 0 and add ``log(x + 1)``
      versions of them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame; derived columns are appended in the order listed above.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # drop() without inplace returns a new frame, so the caller's data is left
    # untouched. Every write below is a whole-column assignment: the previous
    # chained form ``df[col][mask] = ...`` is a no-op under pandas
    # Copy-on-Write and warns (SettingWithCopyWarning) otherwise.
    frame = dataframe.drop(columns=["Soil_Type7", "Soil_Type15"])

    # Wrap the compass angle into [0, 360); e.g. -10 -> 350, 370 -> 10, 360 -> 0.
    frame["Aspect"] = frame["Aspect"] % 360

    horizontal = frame["Horizontal_Distance_To_Hydrology"]
    vertical = frame["Vertical_Distance_To_Hydrology"]

    # Manhattan distance (the column name keeps its original spelling because
    # other cells refer to it).
    frame["Manhhattan_Distance"] = horizontal.abs() + vertical.abs()

    # Euclidean distance
    frame["Euclidian_Distance"] = (horizontal**2 + vertical**2) ** 0.5

    # Combine soil features (collected before the count column exists).
    soil_features = [name for name in frame.columns if name.startswith("Soil_Type")]
    frame["Soil_Type_Count"] = frame[soil_features].sum(axis=1)

    # Combine wilderness features
    wilderness_features = [name for name in frame.columns if name.startswith("Wilderness_Area")]
    frame["Wilderness_Area_Count"] = frame[wilderness_features].sum(axis=1)

    # Force hillshade readings into the 0..255 range before summing them.
    hillshade_features = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
    frame[hillshade_features] = frame[hillshade_features].clip(lower=0, upper=255)
    frame["Hillshade"] = frame[hillshade_features].sum(axis=1)

    frame["Hydro_Elevation"] = frame["Elevation"] - vertical

    frame["Binned_Elevation"] = np.floor(frame["Elevation"] / 50.0)

    # Negative distances are clipped to 0 so the log transform stays defined.
    for distance in ("Horizontal_Distance_To_Roadways", "Horizontal_Distance_To_Fire_Points"):
        frame[distance] = frame[distance].clip(lower=0)
        frame[distance + "_Log"] = np.log1p(frame[distance])

    return frame

In [None]:
# Run the same feature pipeline over the training frame first, then the test
# frame, rebinding both names to the results.
train_dataframe, test_dataframe = map(
    feature_engineer, (train_dataframe, test_dataframe)
)

In [None]:

# Columns that get rescaled; every other column is left as it is.
cols = [
    "Elevation",
    "Aspect",
    "Manhhattan_Distance",
    "Euclidian_Distance",
    "Soil_Type_Count",
    "Wilderness_Area_Count",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Roadways_Log",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Hillshade",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Fire_Points_Log",
    "Binned_Elevation",
    "Hydro_Elevation"
]

# Learn the scaling statistics from the training rows only, then apply the
# very same transform to both frames so the test set never influences the fit.
scaler = RobustScaler().fit(train_dataframe[cols])

train_dataframe[cols] = scaler.transform(train_dataframe[cols])
test_dataframe[cols] = scaler.transform(test_dataframe[cols])

In [None]:
# Fit the label encoder on the target column, then replace that column with
# its integer codes. `encoder` is kept so predictions can be mapped back later.
encoder = LabelEncoder().fit(train_dataframe["Cover_Type"])
train_dataframe["Cover_Type"] = encoder.transform(train_dataframe["Cover_Type"])

In [None]:
# Downcast numeric columns of the training frame, then of the test frame.
train_dataframe, test_dataframe = map(
    reduce_mem_usage, (train_dataframe, test_dataframe)
)

In [None]:
from IPython.display import display

# Only a cell's final expression is rendered automatically, so the row preview
# must be displayed explicitly; as a bare statement its output was discarded.
display(train_dataframe.head())
train_dataframe.describe()

In [None]:
# Target vector: the Cover_Type column as a NumPy array.
y = train_dataframe["Cover_Type"].values

# Feature matrix: every remaining column, as a NumPy array.
# NOTE(review): only Cover_Type is removed here, so an Id column (if train.csv
# has one, as test.csv does) is fed to the model as a feature - confirm that
# this is intended.
X = train_dataframe.drop(columns="Cover_Type").values

# Number of distinct target classes; sizes the network's output layer.
classes_number = np.unique(y).size

In [None]:
from keras import layers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras import Sequential

# Stop training once validation accuracy has failed to improve for 10 epochs,
# and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_acc',
    mode='max',
    min_delta=0,
    patience=10,
    baseline=None,
    restore_best_weights=True,
    verbose=2,
)

# Halve the learning rate whenever validation loss has stalled for 5 epochs.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.5,
    patience=5,
    verbose=2,
)

# Write the model to disk at the end of an epoch, but only when the monitored
# training loss is the best seen so far.
checkpoint = ModelCheckpoint(
    "snn_model.hdf5",
    monitor='loss',
    mode='auto',
    save_best_only=True,
    save_freq='epoch',
    verbose=1,
)

callbacks = [early_stopping, learning_rate_reduction, checkpoint]

# Hyper-parameters shared by every hidden layer of the network.
ACTIVATION = "swish"
DROPOUT = 0.1

def build_model():
    """Create and compile a fresh feed-forward classifier.

    The network starts with a batch-normalisation layer sized to the feature
    count of the global ``X``, followed by four hidden blocks of
    Dense -> Dropout -> BatchNormalization (300, 200, 100 and 50 units), and
    ends with a softmax layer of ``classes_number`` units. It is compiled with
    the Adam optimizer, sparse categorical cross-entropy loss and the ``acc``
    metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_widths = (300, 200, 100, 50)

    stack = [layers.BatchNormalization(input_shape = [X.shape[-1]], name='input')]
    for width in hidden_widths:
        stack.append(layers.Dense(width, kernel_initializer="lecun_normal", activation=ACTIVATION))
        stack.append(layers.Dropout(rate = DROPOUT))
        stack.append(layers.BatchNormalization())
    stack.append(layers.Dense(classes_number, activation = 'softmax'))

    network = Sequential(stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['acc'],
    )
    return network

In [None]:
import copy

EPOCHS = 200
BATCH_SIZE = 4096
FOLDS = 20

# Convert the test frame to an array once instead of on every fold; the
# training matrix X is built the same way (`.values`).
x_test = test_dataframe.values

# Running sum of each fold's class probabilities for the test rows. Only the
# arg-max is used downstream, so the sum is not divided by FOLDS.
test_predictions = np.zeros((x_test.shape[0], classes_number))
scores = []

kfolds = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kfolds.split(X, y)):
    x_train, x_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Callback objects keep state on the instance (for example the best
    # monitored value held by ModelCheckpoint). Give each fold its own copies
    # of the never-fitted templates so nothing carries over between folds.
    fold_callbacks = [copy.copy(callback) for callback in callbacks]

    model = build_model()
    model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=fold_callbacks,
        verbose=1
    )

    # Predict in large batches: the default batch size makes inference over
    # the full test set needlessly slow.
    y_pred = np.argmax(model.predict(x_val, batch_size=BATCH_SIZE), axis=1)
    score = accuracy_score(y_val, y_pred)
    scores.append(score)

    test_predictions += model.predict(x_test, batch_size=BATCH_SIZE)
    print(f"Fold n°{fold} -> Accuracy: {score}")

print()
print(f"Mean Accuracy: {np.mean(scores)}")

In [None]:

# Persist the per-class prediction scores accumulated over the folds so that
# the following cell can reload them from disk.
pd.DataFrame(test_predictions).to_csv("test_predictions.csv", index=False)

In [None]:

# Reload the saved per-class scores, pick the highest-scoring class for each
# row, and map the class indices back to the original Cover_Type labels.
saved_class_scores = pd.read_csv("test_predictions.csv").to_numpy()
best_class_index = np.argmax(saved_class_scores, axis=1)
test_predictions = encoder.inverse_transform(best_class_index)


In [None]:

# Pair every test Id with its predicted cover type. The frame is built
# column-wise so that a length mismatch between the two raises an error
# instead of being silently truncated, as zip() would do. The sample
# submission file is no longer read: its contents were overwritten before
# ever being used.
test_ids = test_dataframe["Id"].to_numpy()

submission_df = pd.DataFrame({"Id": test_ids, "Cover_Type": test_predictions})

submission_df.to_csv("submission.csv", index=False)
submission_df.head(20)

In [None]:
# Leave the summary as the cell's final expression so the notebook renders it
# as a table rather than as printed plain text.
submission_df.describe()