In [1]:
import os

import numpy as np
import pandas as pd

from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

# Folder holding the competition CSVs, relative to the notebook's working directory.
path = "../data/mlhackfest-haribon1//"
# Read both splits in a fixed order: train first, then test.
train, test = (pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv"))

# Report (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep="\n")

(241, 21)
(80, 20)


In [2]:
# Lower-case every column header on both frames so that later cells can
# address columns by their lower-case names (e.g. 'indexid', 'scientific name').
train.columns = list(map(str.lower, train.columns))
test.columns = list(map(str.lower, test.columns))

In [3]:
# Preview the first rows to sanity-check the load and the lower-cased headers.
train.head()

Unnamed: 0,indexid,scientific name,annual mean temperature,mean diurnal range,isothermality,temperature seasonality,max temperature of warmest month,min temperature of coldest month,temperature annual range,mean temperature of wettest quarter,...,mean temperature of warmest quarter,mean temperature of coldest quarter,annual precipitation,precipitation of wettest month,precipitation of driest month,precipitation seasonality,precipitation of wettest quarter,precipitation of driest quarter,precipitation of warmest quarter,precipitation of coldest quarter
0,1,Cacatua haematuropygia,,,,,,,,,...,,,,,,,,,,
1,4,Anas luzonica,,,,,,,,,...,,,,,,,,,,
2,7,Cacatua haematuropygia,63.09,19.902,2432.0,25.639,117.493,420.0,7.148,25.357,...,178.0,896.0,11.33,26.975,52.334,31.232,53.0,305.0,26.028,23.993
3,8,Pithecophaga jeffeyri,66.983,20.849,2044.0,26.936,112.042,166.0,8.192,25.87,...,71.0,971.0,12.23,28.357,73.092,33.079,15.0,340.0,27.275,25.442
4,10,Cacatua haematuropygia,76.739,21.074,2113.0,26.943,63.116,569.0,9.091,27.528,...,429.0,663.0,11.847,27.757,26.627,32.921,135.0,298.0,27.095,26.139


# Missing values

In [4]:
# Summarise missing values in the training frame. The rendered table lists a
# Total and a Percent figure per column.
# NOTE(review): missing_data is a project helper (utils.eda); its exact
# sorting/truncation rules are not visible here.
missing_data(train)

Unnamed: 0,Total,Percent
precipitation of coldest quarter,54,22.406639
precipitation of warmest quarter,54,22.406639
annual mean temperature,54,22.406639
mean diurnal range,54,22.406639
isothermality,54,22.406639
temperature seasonality,54,22.406639
max temperature of warmest month,54,22.406639
min temperature of coldest month,54,22.406639
temperature annual range,54,22.406639
mean temperature of wettest quarter,54,22.406639


In [5]:
# Same missing-value summary for the test frame, to check whether the gaps
# seen in train also occur in the data we must predict on.
missing_data(test)

Unnamed: 0,Total,Percent
precipitation of coldest quarter,0,0.0
precipitation of warmest quarter,0,0.0
annual mean temperature,0,0.0
mean diurnal range,0,0.0
isothermality,0,0.0
temperature seasonality,0,0.0
max temperature of warmest month,0,0.0
min temperature of coldest month,0,0.0
temperature annual range,0,0.0
mean temperature of wettest quarter,0,0.0


In [6]:
# Discard every training row that contains at least one missing value;
# the shape shown afterwards tells how many rows survive.
train = train.dropna(axis=0, how="any")
train.shape

(187, 21)

In [7]:
# Run the project's memory-reduction helper over both frames (train first,
# then test) and keep whatever frame it returns for each.
# NOTE(review): reduce_mem_usage lives in utils.eda; presumably it downcasts
# numeric dtypes - confirm it does not lose precision the model relies on.
train, test = (reduce_mem_usage(frame, verbose=1) for frame in (train, test))

Memory usage of properties dataframe is : 0.0313873291015625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.014088630676269531  MB
This is  44.88636363636363 % of the initial size
Memory usage of properties dataframe is : 0.0123291015625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0049285888671875  MB
This is  39.975247524752476 % of the initial size


In [8]:
# Column roles: row identifier and prediction target.
index = 'indexid'
label = 'scientific name'

# Split the remaining columns into categorical and numeric groups using the
# project helper, then strip the identifier/target from both groups.
cats, nums = get_cats_nums(train)
cats = [col for col in cats if col not in (index, label)]
nums = [col for col in nums if col not in (index, label)]

display(cats)
display(nums)

[]

['annual mean temperature',
 'mean diurnal range',
 'isothermality',
 'temperature seasonality',
 'max temperature of warmest month',
 'min temperature of coldest month',
 'temperature annual range',
 'mean temperature of wettest quarter',
 'mean temperature of driest quarter',
 'mean temperature of warmest quarter',
 'mean temperature of coldest quarter',
 'annual precipitation',
 'precipitation of wettest month',
 'precipitation of driest month',
 'precipitation seasonality',
 'precipitation of wettest quarter',
 'precipitation of driest quarter',
 'precipitation of warmest quarter',
 'precipitation of coldest quarter']

In [9]:
# Class balance of the target after dropping incomplete rows. The counts shown
# below are uneven across species, which is why stratified folds are used later.
train[label].value_counts()

Pithecophaga jeffeyri     69
Cacatua haematuropygia    51
Anas luzonica             30
Egretta eulophotes        20
Alcedo argentata          17
Name: scientific name, dtype: int64

# Preprocessing

In [10]:
# Re-list the numeric feature columns that enter preprocessing/modelling.
display(nums)

['annual mean temperature',
 'mean diurnal range',
 'isothermality',
 'temperature seasonality',
 'max temperature of warmest month',
 'min temperature of coldest month',
 'temperature annual range',
 'mean temperature of wettest quarter',
 'mean temperature of driest quarter',
 'mean temperature of warmest quarter',
 'mean temperature of coldest quarter',
 'annual precipitation',
 'precipitation of wettest month',
 'precipitation of driest month',
 'precipitation seasonality',
 'precipitation of wettest quarter',
 'precipitation of driest quarter',
 'precipitation of warmest quarter',
 'precipitation of coldest quarter']

## Feature label encoding

In [11]:
# from sklearn.preprocessing import LabelEncoder

In [12]:
# enc = LabelEncoder()
# new_cols_0 = []
# for col in cats:
#     new_col = str(col) + '_0'
#     train[new_col] = enc.fit_transform(train[col])
#     test[new_col] = enc.transform(test[col])
#     new_cols_0.append(new_col)

## Frequency encoding

In [13]:
# new_cols_1 = []
# for col in cats:
#     new_col = str(col) + '_1'
#     encoding = train.groupby(col).size()/len(train)    
#     train[new_col] = train[col].map(encoding)
#     test[new_col] = test[col].map(encoding)
#     new_cols_1.append(new_col)

In [14]:
# train.drop(cats, axis=1, inplace=True)
# test.drop(cats, axis=1, inplace=True)
# cats = []
# cats.extend(new_cols_0)
# cats.extend(new_cols_1)

# Model

In [15]:
from sklearn.model_selection import KFold, StratifiedKFold
from utils.models import RandomForestCV

In [16]:
# Model inputs: categorical columns first, then numeric ones; the identifier
# and the target are never used as features.
feats = [col for col in cats + nums if col not in (index, label)]
print(len(feats), feats, sep="\n")

19
['annual mean temperature', 'mean diurnal range', 'isothermality', 'temperature seasonality', 'max temperature of warmest month', 'min temperature of coldest month', 'temperature annual range', 'mean temperature of wettest quarter', 'mean temperature of driest quarter', 'mean temperature of warmest quarter', 'mean temperature of coldest quarter', 'annual precipitation', 'precipitation of wettest month', 'precipitation of driest month', 'precipitation seasonality', 'precipitation of wettest quarter', 'precipitation of driest quarter', 'precipitation of warmest quarter', 'precipitation of coldest quarter']


In [17]:
# Hyper-parameters forwarded verbatim to RandomForestCV through **params.
params = dict(
    random_state=13,
    n_estimators=500,
    max_depth=6,
    n_jobs=-1,  # all cores
)

# Five shuffled, stratified folds with a fixed seed so the split is repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
# NOTE(review): split(...) hands back an iterator over folds that can be
# consumed only once - presumably fit() walks it a single time; confirm.
skf_splitted = skf.split(train[feats], train[label])

# NOTE(review): obj='binary' is passed although the target has five classes
# (see the value counts above) - confirm how RandomForestCV interprets it.
rf_model = RandomForestCV(cv=skf_splitted, obj='binary', **params)
rf_model.fit(train[feats], train[label])

# Average of the per-fold scores; reused later in the submission file name.
cv_score = np.mean(rf_model.model_scores_)
cv_score

0.5004945310383784

In [18]:
# Class labels recorded for each fold's model (one array per fold). Check that
# every fold lists the same classes in the same order, because the prediction
# decoding further down relies on the first fold's ordering only.
rf_model.model_classes_

[array(['Alcedo argentata', 'Anas luzonica', 'Cacatua haematuropygia', 'Egretta eulophotes', 'Pithecophaga jeffeyri'],
       dtype=object),
 array(['Alcedo argentata', 'Anas luzonica', 'Cacatua haematuropygia', 'Egretta eulophotes', 'Pithecophaga jeffeyri'],
       dtype=object),
 array(['Alcedo argentata', 'Anas luzonica', 'Cacatua haematuropygia', 'Egretta eulophotes', 'Pithecophaga jeffeyri'],
       dtype=object),
 array(['Alcedo argentata', 'Anas luzonica', 'Cacatua haematuropygia', 'Egretta eulophotes', 'Pithecophaga jeffeyri'],
       dtype=object),
 array(['Alcedo argentata', 'Anas luzonica', 'Cacatua haematuropygia', 'Egretta eulophotes', 'Pithecophaga jeffeyri'],
       dtype=object)]

In [19]:
# Per-fold validation scores (one number per fold).
# NOTE(review): the metric is chosen inside RandomForestCV - confirm which one.
rf_model.model_scores_

[0.3670995670995671,
 0.5089285714285714,
 0.6300395256916996,
 0.5339989759344598,
 0.46240601503759393]

In [20]:
# Spread of the fold scores (np.std default, i.e. population standard
# deviation) as a rough measure of how stable the CV estimate is.
np.std(rf_model.model_scores_)

0.0862792419533294

In [21]:
# rf_model.feature_importances_

# Test set

In [22]:
# Predict on the test features and store the result in a new target column.
# NOTE(review): judging by the value counts in the next cell, predict() yields
# integer class positions rather than species names; they are decoded later.
test[label] = rf_model.predict(test[feats])

In [23]:
# Distribution of the predicted classes - a quick sanity check against the
# class balance seen in the training data.
test[label].value_counts()

4    32
2    30
3     7
1     6
0     5
Name: scientific name, dtype: int64

In [24]:
# Integer position -> class name, taken from the first fold's class ordering.
# This is only valid if every fold reports its classes in the same order.
mapping = dict(enumerate(rf_model.model_classes_[0]))
mapping

{0: 'Alcedo argentata',
 1: 'Anas luzonica',
 2: 'Cacatua haematuropygia',
 3: 'Egretta eulophotes',
 4: 'Pithecophaga jeffeyri'}

In [25]:
# Decode the predicted integer codes into species names.
# Caution: this cell is not idempotent. Series.map turns any value that is not
# a key of `mapping` into NaN, so running it a second time (when the column
# already holds names) wipes the predictions - re-run the predict cell first.
test[label] = test[label].map(mapping)

In [26]:
# Rename the identifier and prediction columns to the headers written to the
# submission file. The result is assigned back explicitly instead of using
# inplace=True, so the cell reads as a plain transformation of `test` and the
# frame is not mutated behind the scenes.
test = test.rename(columns={
    'indexid': 'indexId',
    label: 'Predicted',
})

In [27]:
sub_path = "submission/haribon_submission/"
# Create the output folder on first use: to_csv does not create missing
# directories, so a fresh checkout would otherwise fail at this last step.
os.makedirs(sub_path, exist_ok=True)

# The mean CV score (3 decimals) is embedded in the file name so that
# submissions from different runs can be told apart.
sub_file = sub_path + "rf_haribon_{0:.3f}_meanvoting.csv".format(cv_score)

# Write only the identifier and prediction columns, without the row index.
test[['indexId', 'Predicted']].to_csv(sub_file, index=False)