In [1]:
import os

import numpy as np
import pandas as pd

from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

# Both competition splits sit in the same data folder next to the notebook.
path = "../data/mlhackfest-haribon1//"
train, test = [pd.read_csv(path + file_name) for file_name in ("train.csv", "test.csv")]

# Report (rows, columns) of each split, train first.
for split in (train, test):
    print(split.shape)

(241, 21)
(80, 20)


In [2]:
# Normalise header casing on both splits so later cells can refer to
# columns by their lowercase names.
for split in (train, test):
    split.columns = [name.lower() for name in split.columns]

In [3]:
# Preview the first rows of the training split; useful for spotting records
# whose feature columns are entirely empty.
train.head()

Unnamed: 0,indexid,scientific name,annual mean temperature,mean diurnal range,isothermality,temperature seasonality,max temperature of warmest month,min temperature of coldest month,temperature annual range,mean temperature of wettest quarter,...,mean temperature of warmest quarter,mean temperature of coldest quarter,annual precipitation,precipitation of wettest month,precipitation of driest month,precipitation seasonality,precipitation of wettest quarter,precipitation of driest quarter,precipitation of warmest quarter,precipitation of coldest quarter
0,1,Cacatua haematuropygia,,,,,,,,,...,,,,,,,,,,
1,4,Anas luzonica,,,,,,,,,...,,,,,,,,,,
2,7,Cacatua haematuropygia,63.09,19.902,2432.0,25.639,117.493,420.0,7.148,25.357,...,178.0,896.0,11.33,26.975,52.334,31.232,53.0,305.0,26.028,23.993
3,8,Pithecophaga jeffeyri,66.983,20.849,2044.0,26.936,112.042,166.0,8.192,25.87,...,71.0,971.0,12.23,28.357,73.092,33.079,15.0,340.0,27.275,25.442
4,10,Cacatua haematuropygia,76.739,21.074,2113.0,26.943,63.116,569.0,9.091,27.528,...,429.0,663.0,11.847,27.757,26.627,32.921,135.0,298.0,27.095,26.139


# Missing values

In [4]:
# Per-column missing-value summary for the training split (helper from
# utils.eda; the rendered table reports a Total and a Percent per column).
missing_data(train)

Unnamed: 0,Total,Percent
precipitation of coldest quarter,54,22.406639
precipitation of warmest quarter,54,22.406639
annual mean temperature,54,22.406639
mean diurnal range,54,22.406639
isothermality,54,22.406639
temperature seasonality,54,22.406639
max temperature of warmest month,54,22.406639
min temperature of coldest month,54,22.406639
temperature annual range,54,22.406639
mean temperature of wettest quarter,54,22.406639


In [5]:
# Same missing-value summary for the test split, to confirm whether it needs
# any handling of empty fields before prediction.
missing_data(test)

Unnamed: 0,Total,Percent
precipitation of coldest quarter,0,0.0
precipitation of warmest quarter,0,0.0
annual mean temperature,0,0.0
mean diurnal range,0,0.0
isothermality,0,0.0
temperature seasonality,0,0.0
max temperature of warmest month,0,0.0
min temperature of coldest month,0,0.0
temperature annual range,0,0.0
mean temperature of wettest quarter,0,0.0


In [6]:
# Discard every training row that has at least one empty field (no imputation),
# then show the remaining (rows, columns).
train = train.dropna(axis="index", how="any")
train.shape

(187, 21)

In [7]:
# Run the memory-reduction helper from utils.eda on both splits (train first);
# with verbose=1 it prints the before/after footprint of each frame.
train, test = [reduce_mem_usage(split, verbose=1) for split in (train, test)]

Memory usage of properties dataframe is : 0.0313873291015625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.014088630676269531  MB
This is  44.88636363636363 % of the initial size
Memory usage of properties dataframe is : 0.0123291015625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0049285888671875  MB
This is  39.975247524752476 % of the initial size


In [8]:
# Identifier and target columns; neither may be used as a model feature.
index = 'indexid'
label = 'scientific name'
non_features = [index, label]

# Split the training columns into categorical / numerical names, then strip
# the identifier and target from both lists.
cats, nums = get_cats_nums(train)
cats = [col for col in cats if col not in non_features]
nums = [col for col in nums if col not in non_features]

display(cats)
display(nums)

[]

['annual mean temperature',
 'mean diurnal range',
 'isothermality',
 'temperature seasonality',
 'max temperature of warmest month',
 'min temperature of coldest month',
 'temperature annual range',
 'mean temperature of wettest quarter',
 'mean temperature of driest quarter',
 'mean temperature of warmest quarter',
 'mean temperature of coldest quarter',
 'annual precipitation',
 'precipitation of wettest month',
 'precipitation of driest month',
 'precipitation seasonality',
 'precipitation of wettest quarter',
 'precipitation of driest quarter',
 'precipitation of warmest quarter',
 'precipitation of coldest quarter']

In [9]:
# Class balance of the target among the rows that survived dropna().
train[label].value_counts()

Pithecophaga jeffeyri     69
Cacatua haematuropygia    51
Anas luzonica             30
Egretta eulophotes        20
Alcedo argentata          17
Name: scientific name, dtype: int64

# Preprocessing

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# Learn the species-name -> integer-code mapping, then overwrite the target
# column with the codes. `enc` is kept so predictions can be decoded later.
# NOTE: this cell overwrites its own input; re-running it without reloading
# `train` would refit the encoder on the integer codes instead of the names.
enc = LabelEncoder().fit(train[label])
train[label] = enc.transform(train[label])

## Feature label encoding

In [12]:
# from sklearn.preprocessing import LabelEncoder

In [13]:
# enc = LabelEncoder()
# new_cols_0 = []
# for col in cats:
#     new_col = str(col) + '_0'
#     train[new_col] = enc.fit_transform(train[col])
#     test[new_col] = enc.transform(test[col])
#     new_cols_0.append(new_col)

## Frequency encoding

In [14]:
# new_cols_1 = []
# for col in cats:
#     new_col = str(col) + '_1'
#     encoding = train.groupby(col).size()/len(train)    
#     train[new_col] = train[col].map(encoding)
#     test[new_col] = test[col].map(encoding)
#     new_cols_1.append(new_col)

In [15]:
# train.drop(cats, axis=1, inplace=True)
# test.drop(cats, axis=1, inplace=True)
# cats = []
# cats.extend(new_cols_0)
# cats.extend(new_cols_1)

# Model

In [16]:
from sklearn.model_selection import KFold, StratifiedKFold
from utils.models import CatBoostCV

In [17]:
# Final model inputs: categorical names first, then numerical ones, with the
# identifier and target columns filtered out as a safeguard.
excluded = [index, label]
feats = [col for col in cats if col not in excluded]
feats += [col for col in nums if col not in excluded]

print(len(feats))
print(feats)

19
['annual mean temperature', 'mean diurnal range', 'isothermality', 'temperature seasonality', 'max temperature of warmest month', 'min temperature of coldest month', 'temperature annual range', 'mean temperature of wettest quarter', 'mean temperature of driest quarter', 'mean temperature of warmest quarter', 'mean temperature of coldest quarter', 'annual precipitation', 'precipitation of wettest month', 'precipitation of driest month', 'precipitation seasonality', 'precipitation of wettest quarter', 'precipitation of driest quarter', 'precipitation of warmest quarter', 'precipitation of coldest quarter']


In [18]:
# CatBoost hyper-parameters; forwarded unchanged to CatBoostCV through **params.
# NOTE(review): in the recorded run every fold stopped after only a handful of
# iterations (best iteration between 2 and 13) — the learning_rate /
# early_stopping_rounds combination may deserve another look.
params = {
    'eval_metric':'TotalF1',
   # 'task_type': 'GPU', if you want to train on GPU
    'iterations': 1000,
    'learning_rate': 0.01,
    'early_stopping_rounds':50,
    'cat_features': cats,
   # 'one_hot_max_size': 5, helpful in high cardinality features
    'bootstrap_type': 'Bayesian', #Bernoulli is a good alternative
    'depth': 6, #depth is useful if range 4~12
    'l2_leaf_reg': 0.3,  #reg_lambda is the alias of l2_leaf_reg
   # 'random_strength': 1,  #works like gamma in XGBOOST
 #   'subsample': 0.8 #works like feature_fraction in lightgbm if bootstrap_type = Bernoulli
}

# Stratified, shuffled 5-fold split so each fold keeps the class proportions
# of the (imbalanced) target.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
# split() yields the folds lazily and can be consumed only once. Materialise
# them so the same folds are still available if the model is fitted again or
# the wrapper walks over `cv` more than once.
skf_splitted = list(skf.split(train[feats], train[label]))

# Project wrapper (utils.models) that trains one CatBoost model per fold.
cb_model = CatBoostCV(
    cv=skf_splitted,
    cats=cats,
    nums=nums,
    random_state=21,
    obj='multiclass',
    **params
)

# The full frame is passed in; presumably the wrapper selects the cats/nums
# columns itself — verify against utils.models.CatBoostCV.
cb_model = cb_model.fit(
    train, train[label],
    verbose_eval=1000,
    use_best_model=True
)

0:	learn: 0.6151437	test: 0.3945708	best: 0.3945708 (0)	total: 65.9ms	remaining: 1m 5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5499222999
bestIteration = 5

Shrink model to first 6 iterations.
0:	learn: 0.4883640	test: 0.3959900	best: 0.3959900 (0)	total: 8.41ms	remaining: 8.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5292804203
bestIteration = 2

Shrink model to first 3 iterations.
0:	learn: 0.5747116	test: 0.4606607	best: 0.4606607 (0)	total: 4.49ms	remaining: 4.48s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6279279279
bestIteration = 13

Shrink model to first 14 iterations.
0:	learn: 0.5791124	test: 0.4550183	best: 0.4550183 (0)	total: 2.98ms	remaining: 2.97s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6337428337
bestIteration = 13

Shrink model to first 14 iterations.
0:	learn: 0.5850315	test: 0.4179894	best: 0.4179894 (0)	total: 2.84ms	remaining: 2.84s
Stopped by overfitting det

In [19]:
# Headline validation metric: the average of the per-fold scores. It is reused
# later when naming the submission file.
cv_score = np.asarray(cb_model.model_scores_).mean()
cv_score

0.5841647833461292

In [20]:
# Individual validation score of each fold, as collected by the CV wrapper.
cb_model.model_scores_

[0.5499222999222999,
 0.5292804203020921,
 0.627927927927928,
 0.6337428337428338,
 0.5799504348354922]

In [21]:
# Spread of the per-fold scores (np.std uses ddof=0 by default, i.e. the
# population standard deviation) — a rough stability indicator for the CV.
np.std(cb_model.model_scores_)

0.04141448446129832

In [22]:
# cb_model.feature_importances_

# Test set

In [23]:
# Predict on the test features and store the result in a new target column on
# `test`; at this point the values are still encoded class codes, not names.
test[label] = cb_model.predict(test[feats])

In [24]:
# Distribution of the predicted class codes — a quick sanity check against the
# class balance seen in training.
test[label].value_counts()

4    46
2    23
3     4
1     4
0     3
Name: scientific name, dtype: int64

In [25]:
# Translate the predicted class codes back to species names with the encoder
# that was fitted on the training labels.
test[label] = enc.inverse_transform(test[label])

In [26]:
# Switch to the header names used when the submission file is written.
submission_headers = {'indexid': 'indexId', label: 'Predicted'}
test = test.rename(columns=submission_headers)

In [26]:
# Destination folder for submissions, relative to the working directory.
sub_path = "submission/haribon_submission/"
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(sub_path, exist_ok=True)

# The mean CV score is embedded in the file name so each submission can be
# traced back to its validation result.
submission_file = sub_path + "cb_haribon_{0:.3f}_meanvoting.csv".format(cv_score)
test[['indexId', 'Predicted']].to_csv(submission_file, index=False)