In [1]:
import os

import numpy as np
import pandas as pd

from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

# Directory holding the competition CSVs, relative to the notebook location.
path = "../data/mlhackfest-haribon1//"

# Read both splits with a single expression; the order matches the unpacking.
train, test = (pd.read_csv(path + file_name) for file_name in ("train.csv", "test.csv"))

# Report (rows, columns) for the train split first, then the test split.
for frame in (train, test):
    print(frame.shape)

(241, 21)
(80, 20)


In [2]:
# Normalise the headers of both frames to lower case so that later cells can
# refer to columns with one consistent spelling.
for frame in (train, test):
    frame.columns = list(map(str.lower, frame.columns))

In [3]:
# Preview the first rows of the training frame (column names are lower-cased at this point).
train.head()

Unnamed: 0,indexid,scientific name,annual mean temperature,mean diurnal range,isothermality,temperature seasonality,max temperature of warmest month,min temperature of coldest month,temperature annual range,mean temperature of wettest quarter,...,mean temperature of warmest quarter,mean temperature of coldest quarter,annual precipitation,precipitation of wettest month,precipitation of driest month,precipitation seasonality,precipitation of wettest quarter,precipitation of driest quarter,precipitation of warmest quarter,precipitation of coldest quarter
0,1,Cacatua haematuropygia,,,,,,,,,...,,,,,,,,,,
1,4,Anas luzonica,,,,,,,,,...,,,,,,,,,,
2,7,Cacatua haematuropygia,63.09,19.902,2432.0,25.639,117.493,420.0,7.148,25.357,...,178.0,896.0,11.33,26.975,52.334,31.232,53.0,305.0,26.028,23.993
3,8,Pithecophaga jeffeyri,66.983,20.849,2044.0,26.936,112.042,166.0,8.192,25.87,...,71.0,971.0,12.23,28.357,73.092,33.079,15.0,340.0,27.275,25.442
4,10,Cacatua haematuropygia,76.739,21.074,2113.0,26.943,63.116,569.0,9.091,27.528,...,429.0,663.0,11.847,27.757,26.627,32.921,135.0,298.0,27.095,26.139


# Missing values

In [4]:
# Missing-value summary for the training frame; the rendered table lists a
# Total and a Percent figure per column.
missing_data(train)

Unnamed: 0,Total,Percent
precipitation of coldest quarter,54,22.406639
precipitation of warmest quarter,54,22.406639
annual mean temperature,54,22.406639
mean diurnal range,54,22.406639
isothermality,54,22.406639
temperature seasonality,54,22.406639
max temperature of warmest month,54,22.406639
min temperature of coldest month,54,22.406639
temperature annual range,54,22.406639
mean temperature of wettest quarter,54,22.406639


In [5]:
# Same missing-value summary for the test frame, to check whether it needs
# the same clean-up as the training frame.
missing_data(test)

Unnamed: 0,Total,Percent
precipitation of coldest quarter,0,0.0
precipitation of warmest quarter,0,0.0
annual mean temperature,0,0.0
mean diurnal range,0,0.0
isothermality,0,0.0
temperature seasonality,0,0.0
max temperature of warmest month,0,0.0
min temperature of coldest month,0,0.0
temperature annual range,0,0.0
mean temperature of wettest quarter,0,0.0


In [6]:
# Keep only the training rows that have no missing value in any column.
# The index is rebuilt as 0..n-1 afterwards: dropna() leaves gaps in the index
# labels, while the StratifiedKFold splitter used later yields *positional*
# row numbers, so a contiguous index keeps label- and position-based lookups
# in agreement.
train = train.dropna().reset_index(drop=True)
train.shape

(187, 21)

In [7]:
# Run the project's memory-reduction helper on both frames (train first, then
# test) and rebind the names to whatever the helper returns.
train, test = (reduce_mem_usage(frame, verbose=1) for frame in (train, test))

Memory usage of properties dataframe is : 0.0313873291015625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.014088630676269531  MB
This is  44.88636363636363 % of the initial size
Memory usage of properties dataframe is : 0.0123291015625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0049285888671875  MB
This is  39.975247524752476 % of the initial size


In [8]:
# Identifier column and prediction target.
index = 'indexid'
label = 'scientific name'

# Columns that must never be used as model inputs.
non_features = (index, label)

# get_cats_nums returns the categorical and numerical column lists (in that
# order); strip the id/target from each of them.
cats, nums = (
    [column for column in group if column not in non_features]
    for group in get_cats_nums(train)
)
display(cats, nums)

[]

['annual mean temperature',
 'mean diurnal range',
 'isothermality',
 'temperature seasonality',
 'max temperature of warmest month',
 'min temperature of coldest month',
 'temperature annual range',
 'mean temperature of wettest quarter',
 'mean temperature of driest quarter',
 'mean temperature of warmest quarter',
 'mean temperature of coldest quarter',
 'annual precipitation',
 'precipitation of wettest month',
 'precipitation of driest month',
 'precipitation seasonality',
 'precipitation of wettest quarter',
 'precipitation of driest quarter',
 'precipitation of warmest quarter',
 'precipitation of coldest quarter']

In [9]:
# Class balance of the target among the training rows kept after dropna().
train[label].value_counts()

Pithecophaga jeffeyri     69
Cacatua haematuropygia    51
Anas luzonica             30
Egretta eulophotes        20
Alcedo argentata          17
Name: scientific name, dtype: int64

# Preprocessing

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# Learn the class -> integer mapping from the training labels, then replace the
# label column with the encoded integers. `enc` is kept around so predictions
# can be decoded back to names later.
enc = LabelEncoder().fit(train[label])
train[label] = enc.transform(train[label])

## Feature engineering

In [12]:
# Base feature list: categorical columns first, then numerical ones, with the
# id and target columns filtered out.
excluded = {index, label}
feats = [name for name in [*cats, *nums] if name not in excluded]
print(len(feats))
print(feats)

19
['annual mean temperature', 'mean diurnal range', 'isothermality', 'temperature seasonality', 'max temperature of warmest month', 'min temperature of coldest month', 'temperature annual range', 'mean temperature of wettest quarter', 'mean temperature of driest quarter', 'mean temperature of warmest quarter', 'mean temperature of coldest quarter', 'annual precipitation', 'precipitation of wettest month', 'precipitation of driest month', 'precipitation seasonality', 'precipitation of wettest quarter', 'precipitation of driest quarter', 'precipitation of warmest quarter', 'precipitation of coldest quarter']


In [13]:
# First family of derived features: "<feature>_0" holds the natural log of the
# base feature, added to both frames.
# Note: np.log yields -inf for zeros and NaN for negative inputs.
for frame in (train, test):
    for feat in feats:
        frame[feat + '_0'] = frame[feat].transform(np.log)

In [14]:
# Second family of derived features: "<feature>_1".
# A square-root transform is used here so these columns differ from the
# log-transformed "_0" family; applying np.log again would only create exact
# duplicates of those columns and add no information.
# NOTE(review): confirm sqrt is the intended second transform.
# np.sqrt yields NaN for negative inputs.
for feat in feats:
    col_name = feat + '_1'
    train[col_name] = train[feat].transform(np.sqrt)
    test[col_name] = test[feat].transform(np.sqrt)

# Model

In [15]:
from sklearn.model_selection import KFold, StratifiedKFold
from utils.models import CatBoostCV

In [16]:
# Final model inputs: every column now present in `train` (base and derived)
# except the id and the target.
id_and_target = (index, label)
feats = [column for column in train.columns if column not in id_and_target]
# All inputs are handed to the model as numerical features; `nums` is the same
# list object as `feats`, not a copy.
nums = feats
print(len(feats))

57


In [17]:
# CatBoost hyper-parameters, forwarded as keyword arguments to CatBoostCV.
params = {
    'eval_metric': 'TotalF1',
    'loss_function': 'MultiClass',
    # 'task_type': 'GPU',  # uncomment to train on GPU
    'iterations': 1000,
    'learning_rate': 0.005,
    'early_stopping_rounds': 50,
    'cat_features': cats,
    # 'one_hot_max_size': 5,  # helpful in high cardinality features
    'bootstrap_type': 'Bayesian',  # Bernoulli is a good alternative
    'depth': 8,  # depth is useful in the range 4~12
    'l2_leaf_reg': 0.8,  # reg_lambda is an alias of l2_leaf_reg
    # 'random_strength': 1,  # amount of randomness added when scoring splits
    # 'subsample': 0.8,  # row-sampling rate (like bagging_fraction in LightGBM);
    #                    # not available with bootstrap_type='Bayesian'
}

# Stratified folds keep the class proportions of the target in every split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
# split() returns a one-shot generator. Materialise it so the same folds can be
# iterated more than once (e.g. if fit() is called again or the wrapper walks
# the folds a second time) instead of silently yielding nothing.
skf_splitted = list(skf.split(train[feats], train[label]))

cb_model = CatBoostCV(
    cv=skf_splitted,
    cats=cats,
    nums=nums,
    random_state=21,
    obj='multiclass',
    **params
)

# The whole training frame is passed in; presumably the wrapper selects the
# `cats` + `nums` columns itself -- TODO confirm in utils.models.
# NOTE(review): in the logged run every fold stops within roughly 20
# iterations; with learning_rate=0.005 that is very little training, so the
# early-stopping patience / learning rate may deserve another look.
cb_model = cb_model.fit(
    train, train[label],
    verbose_eval=1000,
    use_best_model=True
)

0:	learn: 0.6189877	test: 0.4709402	best: 0.4709402 (0)	total: 167ms	remaining: 2m 46s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.611033411
bestIteration = 8

Shrink model to first 9 iterations.
0:	learn: 0.6922038	test: 0.5052544	best: 0.5052544 (0)	total: 86.8ms	remaining: 1m 26s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.560416192
bestIteration = 2

Shrink model to first 3 iterations.
0:	learn: 0.6299286	test: 0.5250965	best: 0.5250965 (0)	total: 43.5ms	remaining: 43.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6563223302
bestIteration = 20

Shrink model to first 21 iterations.
0:	learn: 0.6565630	test: 0.4487237	best: 0.4487237 (0)	total: 65.8ms	remaining: 1m 5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6348609479
bestIteration = 16

Shrink model to first 17 iterations.
0:	learn: 0.5992770	test: 0.4332883	best: 0.4332883 (0)	total: 36.7ms	remaining: 36.7s
Stopped by overfitting det

In [18]:
# Headline cross-validation metric: arithmetic mean of the per-fold scores.
# The value is reused later when naming the submission file.
fold_scores = np.asarray(cb_model.model_scores_)
cv_score = fold_scores.mean()
cv_score

0.6114120734525006

In [19]:
# Individual fold scores behind the mean above (five values in the rendered
# output, matching the five StratifiedKFold splits).
cb_model.model_scores_

[0.611033411033411,
 0.5604161919951394,
 0.6563223302353738,
 0.6348609479044262,
 0.5944274860941527]

In [20]:
# Spread of the fold scores; np.std uses the population formula (ddof=0) by default.
np.std(cb_model.model_scores_)

0.03303325716891991

In [21]:
# Feature-importance table exposed by the fitted wrapper; the rendered output
# shows one importance_<k> column per fold model.
cb_model.feature_importances_

Unnamed: 0,feature_names,importance_0,importance_1,importance_2,importance_3,importance_4
0,annual mean temperature,1.145308,0.0,0.715027,0.63959,0.0
1,mean diurnal range,0.0,0.0,0.503927,1.295676,0.0
2,isothermality,0.544274,0.0,0.554554,2.513167,0.0
3,temperature seasonality,0.0,3.796166,2.872705,2.955095,0.0
4,max temperature of warmest month,1.504321,0.0,1.362648,1.005522,0.0
5,min temperature of coldest month,0.0,4.502436,0.094303,1.169108,0.0
6,temperature annual range,5.345721,11.144868,1.841698,2.361023,5.205798
7,mean temperature of wettest quarter,8.543338,0.0,2.809692,0.948169,0.0
8,mean temperature of driest quarter,1.677871,3.202488,1.158084,0.568552,0.0
9,mean temperature of warmest quarter,0.0,2.520983,0.0,0.0,8.850312


In [22]:
# for i, model in enumerate(cb_model.models_):
#     model.save_model(fname='models/cb_0.584/'+str(i))

# Test set

In [23]:
# Predict a class for every test row from the same feature columns used in
# training; the result is stored in the label column and is still
# integer-encoded at this point (decoded back to names in a later cell).
test[label] = cb_model.predict(test[feats])

In [24]:
# Distribution of the predicted (still integer-encoded) classes on the test set.
test[label].value_counts()

4    40
2    28
3     4
1     4
0     4
Name: scientific name, dtype: int64

In [25]:
# Decode the integer predictions back to the original class names using the
# encoder fitted on the training labels.
# NOTE(review): this overwrites the encoded column, so the cell is meant to run
# once per prediction pass -- re-running it would feed names, not integer
# codes, to inverse_transform.
test[label] = enc.inverse_transform(test[label])

In [26]:
# Lookup table from encoded integer to class name, in the encoder's class order.
mapping = dict(enumerate(enc.classes_))
mapping

{0: 'Alcedo argentata',
 1: 'Anas luzonica',
 2: 'Cacatua haematuropygia',
 3: 'Egretta eulophotes',
 4: 'Pithecophaga jeffeyri'}

In [27]:
# Rename the id and prediction columns to the headers that are written to the
# submission file. A new frame is assigned instead of mutating with
# inplace=True, so the transformation stays explicit and chainable.
test = test.rename(columns={
    'indexid': 'indexId',
    label: 'Predicted'
})

In [28]:
sub_path = "submission/haribon_submission/"

# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail on the very last cell.
os.makedirs(sub_path, exist_ok=True)

# The CV score is embedded in the file name (three decimals) to tell runs apart.
sub_file = os.path.join(
    sub_path,
    "cb_haribon_{0:.3f}_meanvoting.csv".format(cv_score)
)
test[['indexId', 'Predicted']].to_csv(sub_file, index=False)