Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

Submission File
For each id in the test set, you must predict a probability for the target variable. The file should contain a header and have the following format:

https://www.kaggle.com/c/tabular-playground-series-nov-2021/overview

In [1]:
!conda info


     active environment : kaggle-pgdec21
    active env location : C:\ProgramData\Anaconda3\envs\kaggle-pgdec21
            shell level : 1
       user config file : C:\Users\globetrekker\.condarc
 populated config files : C:\Users\globetrekker\.condarc
          conda version : 4.10.3
    conda-build version : 3.21.4
         python version : 3.8.8.final.0
       virtual packages : __win=0=0
                          __archspec=1=x86_64
       base environment : C:\ProgramData\Anaconda3  (writable)
      conda av data dir : C:\ProgramData\Anaconda3\etc\conda
  conda av metadata url : None
           channel URLs : https://repo.anaconda.com/pkgs/main/win-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/win-64
                          https://repo.anaconda.com/pkgs/r/noarch
                          https://repo.anaconda.com/pkgs/msys2/win-64
                          https://repo.anaconda.com/pkgs/msys2

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [23]:
import time, gc, copy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


In [4]:
import optuna
import utility as ut

pd.options.mode.chained_assignment = None  # default='warn'
%config Completer.use_jedi = False
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
def get_datasets(path):
    df_train = pd.read_csv(path + 'train.csv')
    df_test = pd.read_csv(path + 'test.csv')

    # drop this because only 1 row
    df_train.drop(list(df_train[df_train['Cover_Type']==5].index), axis=0, inplace=True)

    # drop these because only 1 value
    print('Singular columns')
    for col in df_train.columns:
        if df_train[col].nunique() == 1:
            print(col)
            df_train.drop(col, axis=1, inplace=True)
            df_test.drop(col, axis=1, inplace=True)

    # drop these because no additional info
    ids = df_test['Id']
    df_train.drop(['Id'], axis=1, inplace=True)
    df_test.drop(['Id'], axis=1, inplace=True)
    
    memory_usage = df_train.memory_usage(deep=True) / 1024 ** 2
    print('start mem %.2f MB' % (memory_usage.sum()))

    for col in df_train.columns:
        if 'Soil_Type' in col:
            df_train[col] = df_train[col].astype('category')
            df_test[col] = df_test[col].astype('category')
            continue
        if 'Wilderness' in col:
            df_train[col] = df_train[col].astype('category')
            df_test[col] = df_test[col].astype('category')
            continue
        if 'Cover_Type' in col:
            df_train[col] = df_train[col].astype('uint8')
            continue

        if df_train[col].min() >= 0 and df_train[col].max() <= 255:
            df_train[col] = df_train[col].astype('uint8')
            df_test[col] = df_test[col].astype('uint8')
        elif df_train[col].min() >= 0 and df_train[col].max() <= 65535:
            df_train[col] = df_train[col].astype('uint16')
            df_test[col] = df_test[col].astype('uint16')
        elif df_train[col].min() >= -128 and df_train[col].max() <= 127:
            df_train[col] = df_train[col].astype('int8')
            df_test[col] = df_test[col].astype('int8')
        elif df_train[col].min() >= -32768 and df_train[col].max() <= 32767:
            df_train[col] = df_train[col].astype('int16')
            df_test[col] = df_test[col].astype('int16')

    memory_usage = df_train.memory_usage(deep=True) / 1024 ** 2
    print('end mem %.2f MB' % (memory_usage.sum()))
    
    return df_train, df_test, ids

In [15]:
def get_scaled_df(df_train, df_test, target):
    
    df_train_in = copy.deepcopy(df_train)
    df_test_in = copy.deepcopy(df_test)
    
    df_target = df_train_in[target]
    df_train_in.drop(target, axis=1, inplace=True)
    
    cols_train = df_train_in.columns
    cols_test = df_test_in.columns
    
    scale_features = []
    for col in df_train_in.columns:
        if df_train_in[col].dtypes == 'category':
            continue
        else:
            scale_features.append(col)
    
    # define transformer
    transformer = ColumnTransformer(
        transformers=[('scale', StandardScaler(), scale_features)], 
        remainder='passthrough')
    
    # transform training data
    df_train_in = pd.DataFrame(transformer.fit_transform(df_train_in))
    df_train_in.columns = cols_train
    df_train_in[target] = df_target
    df_train_in.loc[df_train_in[target].isnull(), target] = 1
    df_train_in[target] = df_train_in[target].astype('uint8')
    
    # transform test data
    df_test_in = pd.DataFrame(transformer.transform(df_test_in))
    df_test_in.columns = cols_test
    
    # reset data types
    for col in df_train_in.columns:
        if df_train[col].dtypes == 'category':
            df_train_in[col] = df_train_in[col].astype('category') 
        if col in scale_features:
            df_train_in[col] = df_train_in[col].astype('float32') 
              
    return df_train_in, df_test_in

## Main block

In [16]:
# get data
df_train, df_test, ids = get_datasets('../input/tabular-playground-series-dec-2021/')
df_train_sc, df_test_sc = get_scaled_df(df_train, df_test, 'Cover_Type')

# get models
models = ut.get_models()

# split data
X_train, X_val, y_train, y_val, original_features = ut.split_dataset(df_train, df_test, 'Cover_Type')
X_train_sc, X_val_sc, y_train_sc, y_val_sc, original_features = ut.split_dataset(df_train_sc, df_test_sc, 'Cover_Type')

Singular columns
Soil_Type7
Soil_Type15
start mem 1647.95 MB
end mem 267.03 MB


In [None]:
%%time

# get baseline score - rf
model = RandomForestClassifier(max_depth=8, random_state=5)
val_score, cv_score = ut.get_baseline_score(model, X_train, X_val, y_train, y_val)

# get baseline score - lgbm
model = LGBMClassifier(random_state=5)
val_score, cv_score = ut.get_baseline_score(model, X_train, X_val, y_train, y_val)

In [None]:
# get correlation matrix
df_temp = copy.deepcopy(df_train)
for col in df_temp.columns:
    if df_temp[col].dtypes == 'category':
        df_temp[col] = df_temp[col].astype('uint8')

target = df_temp['Cover_Type']
features = df_temp.drop('Cover_Type', axis=1)
df_temp = features.iloc[:, :27]
df_temp = pd.concat([df_temp, target], axis=1)

corrmat = df_temp.corr()
top_corr_features = corrmat.index

plt.figure(figsize=(40,40))
g=sns.heatmap(df_temp[top_corr_features].corr(),annot=True,cmap="RdYlGn")

In [32]:
# get feature importances
featureScores = ut.get_feature_importances(X_train, y_train, 5)
featureScores.head()

val score 0.903
cross val score 0.905
Wall time: 44min 41s


In [33]:
%%time

# get drop scores
model = LGBMClassifier(random_state=5)
df_drops = ut.get_drop_scores(model, X_train, X_val, y_train, y_val, False)
val_cols = list(df_drops.sort_values(by='val_score', ascending=False).head(15)['col'])
cv_cols = list(df_drops.sort_values(by='cv_score', ascending=False).head(15)['col'])
common_cols = [c for c in val_cols if c in cv_cols]
common_cols

X_train_new = copy.deepcopy(X_train)
X_val_new = copy.deepcopy(X_val)
X_train_new.drop(common_cols, axis=1, inplace=True)
X_val_new.drop(common_cols, axis=1, inplace=True)

val_score, cv_score = ut.get_baseline_score(model, X_train, X_val, y_train, y_val)

val score 0.952
cross val score 0.952
Wall time: 8min 42s


In [68]:
# get kmeans scores
ut.get_kmeans_scores(featureScores, X_train, X_val, df_test, models)

# get kmeans scores on validation set
ut.get_kmeans_scores_validation(featureScores, X_train, X_val, df_test, models)

# get stacking scores
stack_model = LogisticRegression(solver='sag', C=1.6213309780417264, max_iter=1800, random_state=10)
ut.get_stacking_scores(featureScores, X_train, y_train, X_val, df_test, models, stack_model)

# get binning scores
baseline_score = 0.749
model = LinearSVC(dual=False, C=0.012249147757314706, max_iter=10000)
improved_cols = ut.get_binning_scores(baseline_score, X_train, y_train, model, 0.01)

# get binning scores on validation set
baseline_score = 0.749
improvement = 0.5
model = LinearSVC(dual=False, C=0.012249147757314706, max_iter=10000)
improved_cols = ut.get_binning_scores_val(baseline_score, X_train, y_train, X_val, y_val, model, improvement)

# optimize model
ut.optimize_linear_svc(X_train, y_train, X_val, y_val)

# final prediction - single model - logistic regression
clf = LogisticRegression(solver='newton-cg', C=0.023672809391721117, max_iter=200)
ut.make_final_pred_single(clf, X_train, y_train)

In [None]:
preds = model.predict(df_test)

output = pd.DataFrame({'Id': ids, 'Cover_Type': preds})
output.to_csv('submission.csv', index=False)