In [None]:
# Version tag for this run; it is interpolated into the submission file name.
ver = "v6"

In [None]:
import gc

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from tqdm import *
from tqdm.notebook import tqdm as notebook_tqdm

In [None]:
# Load the train, test and sample-submission CSVs from the current working
# directory (all three paths are relative).
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'sample_submission.csv')
)

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so that the feature
# engineering below is applied to both sets in one pass.
df = pd.concat([train, test], ignore_index = True)

In [None]:
# Number of distinct non-null values in each column of the combined frame.
df.nunique()

Area_Code        30900
Locality_Code       22
Region_Code       2185
Height             110
Diameter           151
Class                8
Species            263
dtype: int64

In [None]:
# Pairwise interaction features between the two raw numeric columns:
# their element-wise product and their element-wise ratio.
df['height*diameter'] = df['Height'].mul(df['Diameter'])
df['height/diameter'] = df['Height'].div(df['Diameter'])

In [None]:
# Show the column labels of the combined frame at this point in the notebook.
df.columns

Index(['Area_Code', 'Locality_Code', 'Region_Code', 'Height', 'Diameter',
       'Class', 'Species', 'height*diameter', 'height/diameter'],
      dtype='object')

In [None]:
# Group-level statistics: for every categorical key, attach the mean / std /
# min / max of each numeric feature, computed over all rows of `df` that share
# the same key value. New columns are named
# '<numeric>_grpd_by_<key>_<stat>'.
cat_fts = ['Area_Code', 'Locality_Code', 'Region_Code', 'Species']
num_fts = ['Height', 'Diameter','height*diameter', 'height/diameter']
# Order matters: it fixes the order in which the new columns are appended.
agg_stats = ['mean', 'std', 'min', 'max']

# `notebook_tqdm` is tqdm.notebook.tqdm, the replacement for the deprecated
# `tqdm_notebook` alias.
for g in notebook_tqdm(cat_fts):
  for stat in agg_stats:
    # One row per distinct key value, one column per numeric feature.
    grp = df.groupby(g)[num_fts].agg(stat)
    grp.columns = [f'{c}_grpd_by_{g}_{stat}' for c in grp.columns]
    # Left-merge on the key broadcasts the group value back to every row.
    df = pd.merge(df, grp, on = g, how = 'left')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [None]:
# Cardinality features: for each key column, count how many distinct values
# every *other* column in `fts` takes within each key group, and attach the
# counts as '<other>_grpd_by_<key>_nunique'.
fts = ['Area_Code','Locality_Code','Region_Code','Height','Diameter', 'Species']

# `notebook_tqdm` is tqdm.notebook.tqdm, the replacement for the deprecated
# `tqdm_notebook` alias.
for g in notebook_tqdm(fts):
  rf = [c for c in fts if c != g]  # every column except the grouping key
  grp = df.groupby(g)[rf].nunique().add_suffix(f'_grpd_by_{g}_nunique')
  # Left-merge on the key broadcasts the per-group counts back to every row.
  df = pd.merge(df, grp, on = g, how = 'left')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [None]:
# Frequency encoding: replace each value by the share of rows in `df` that
# carry that value, stored in a new '<column>_freq' column.
for col in fts:
  rel_freq = df[col].value_counts(normalize = True)
  df[f'{col}_freq'] = df[col].map(rel_freq)

In [None]:
# Undo the earlier concat: the first len(train) rows of `df` are the training
# rows, the remainder are the test rows. Both get a fresh 0..n-1 index.
n_train = train.shape[0]
train = df.iloc[:n_train].reset_index(drop = True)
test = df.iloc[n_train:].reset_index(drop = True)

In [None]:
# Model inputs: every column except the target and the raw Area_Code.
# Columns derived from Area_Code have different names and are therefore kept.
excluded_cols = {'Class', 'Area_Code'}
features = [col for col in train.columns if col not in excluded_cols]

In [None]:
# Stratified K-fold training: one LightGBM classifier is fitted per fold.
#   oofs  - for every training row, the class probabilities predicted by the
#           model of the fold in which that row was held out.
#   preds - test-set class probabilities averaged over all folds.
N_SPLITS = 15
# Derive the number of classes from the labels instead of hard-coding 8.
n_classes = train['Class'].nunique()
oofs = np.zeros((len(train), n_classes))
preds = np.zeros((len(test), n_classes))
folds = StratifiedKFold(N_SPLITS, shuffle = True, random_state = 77)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['Class'])):

  # The original banner used '\\n', which printed a literal backslash-n.
  print(f'\n\n Fold {fold_} \n\n')

  # split() yields positional indices, so rows are selected with .iloc for
  # both the features and the labels.
  X_trn, y_trn = train[features].iloc[trn_idx], train['Class'].iloc[trn_idx]
  X_val, y_val = train[features].iloc[val_idx], train['Class'].iloc[val_idx]

  clf = LGBMClassifier(n_estimators = 2000, learning_rate = 0.03, colsample_bytree = 0.3, reg_alpha = 0.5, reg_lambda = 2, random_state = 2)

  # NOTE(review): `early_stopping_rounds` and `verbose` are accepted by fit()
  # only in LightGBM < 4.0. On 4.x pass
  # callbacks=[lightgbm.early_stopping(200), lightgbm.log_evaluation(50)]
  # instead. Kept as-is to match the environment this notebook was run in.
  clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 200, verbose = 50, eval_metric = 'logloss')

  oofs[val_idx] = clf.predict_proba(X_val)
  # Each fold contributes an equal share to the averaged test prediction.
  preds += clf.predict_proba(test[features]) / N_SPLITS




 Fold 0 \n

Training until validation scores don't improve for 200 rounds.
[50]	valid_0's multi_logloss: 1.00786	valid_0's multi_logloss: 1.00786
[100]	valid_0's multi_logloss: 0.866884	valid_0's multi_logloss: 0.866884
[150]	valid_0's multi_logloss: 0.81391	valid_0's multi_logloss: 0.81391
[200]	valid_0's multi_logloss: 0.787964	valid_0's multi_logloss: 0.787964
[250]	valid_0's multi_logloss: 0.774951	valid_0's multi_logloss: 0.774951
[300]	valid_0's multi_logloss: 0.766229	valid_0's multi_logloss: 0.766229
[350]	valid_0's multi_logloss: 0.760503	valid_0's multi_logloss: 0.760503
[400]	valid_0's multi_logloss: 0.757054	valid_0's multi_logloss: 0.757054
[450]	valid_0's multi_logloss: 0.754423	valid_0's multi_logloss: 0.754423
[500]	valid_0's multi_logloss: 0.752673	valid_0's multi_logloss: 0.752673
[550]	valid_0's multi_logloss: 0.751944	valid_0's multi_logloss: 0.751944
[600]	valid_0's multi_logloss: 0.752293	valid_0's multi_logloss: 0.752293
[650]	valid_0's multi_logloss: 0.752031	

In [None]:
# Overall validation score: log loss of the out-of-fold probabilities in
# `oofs` against the true training labels.
oof_val_score = log_loss(train['Class'], oofs)
print(f'Final Log loss: {oof_val_score}')

Final Log loss: 0.7216102438734435


In [None]:
# Overwrite every column of the sample submission with the averaged test
# probabilities.
# NOTE(review): assumes `ss` has exactly one column per class, in the same
# order as the columns returned by predict_proba - confirm against the file.
submission_cols = list(ss.columns)
ss[submission_cols] = preds

In [None]:
# Write the submission to 'mft_<ver>.csv' in the working directory, without
# the DataFrame index column.
SUB_FILE_NAME = f'mft_{ver}.csv'
ss.to_csv(SUB_FILE_NAME, index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>