In [1]:
import lightgbm as lgbm
import numpy as np
import pandas as pd
from sklearn import utils
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation metric: weighted F1 for a multiclass model.

    Parameters
    ----------
    y_hat : array-like
        Per-class scores supplied by LightGBM. Accepted layouts:
        * flat 1-D array laid out class by class (all rows of class 0, then
          all rows of class 1, ...), which is the layout this function has
          always decoded;
        * 2-D array of shape (n_samples, n_classes).
          NOTE(review): newer LightGBM releases are documented to pass 2-D
          scores to ``feval`` for multiclass tasks - confirm against the
          installed version.
    data : object with ``get_label()``
        The dataset being evaluated (a ``lightgbm.Dataset`` when called by
        ``lightgbm.train``).

    Returns
    -------
    tuple
        ``('f1', score, True)``; the trailing ``True`` marks the metric as
        "higher is better".
    """
    y_true = data.get_label()
    n_samples = len(y_true)
    scores = np.asarray(y_hat)

    if scores.ndim == 2:
        predicted = scores.argmax(axis=1)
    else:
        # Derive the number of classes from the size of the score vector, not
        # from the labels present: a fold that happens to miss a class would
        # otherwise produce a wrongly shaped (and silently wrong) matrix.
        predicted = scores.reshape(-1, n_samples).argmax(axis=0)

    # scikit-learn's f1_score needs hard labels, not probabilities.
    return 'f1', f1_score(y_true, predicted, average="weighted"), True

class MultiClassLGBMCV():
    """Cross-validated LightGBM multiclass ensemble: one booster per CV fold.

    Parameters
    ----------
    cv : iterable of (fit_idx, val_idx) pairs, or object with ``split(X, y)``
        Fold definition. Iterables (including one-shot generators) are copied
        into a list so the estimator can be fitted more than once. Objects
        exposing ``split`` are stored as given and asked for folds in ``fit``.
    **kwargs
        Forwarded to ``lightgbm.train`` as its ``params`` dict.

    Attributes (set by ``fit``)
    ---------------------------
    models_ : list
        The trained boosters, one per fold.
    feature_importances_ : pandas.DataFrame
        ``gain_<i>`` and ``split_<i>`` columns for every fold ``i``.
    evals_results_ : dict
        Evaluation history recorded by LightGBM, keyed by fold number.
    model_scores_ : list
        Best validation F1 of every fold.
    model_best_iterations_ : list
        ``best_iteration`` of every fold's booster.
    """

    def __init__(self, cv=None, **kwargs):
        # A generator can be walked only once; keep the folds in a list so
        # that a second `fit` does not silently train on zero folds.
        if cv is not None and not hasattr(cv, 'split'):
            cv = list(cv)
        self.cv = cv
        self.lgbm_params = kwargs
        # `metric` is optional for LightGBM, so do not fail when it is absent.
        self.metric = kwargs.get('metric')

    @staticmethod
    def _take(data, idx):
        """Select rows `idx` by position from a pandas object or an array."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.iloc[idx]
        return data[idx]

    def _folds(self, X, y):
        """Return the CV folds as a list of (fit_idx, val_idx) pairs."""
        if self.cv is None:
            return []
        if hasattr(self.cv, 'split'):
            return list(self.cv.split(X, y))
        return list(self.cv)

    def fit(self, X, y=None, **kwargs):
        """Train one booster per fold and record scores and importances.

        Parameters
        ----------
        X : pandas.DataFrame or array
            Feature matrix; rows are selected by position.
        y : pandas.Series or array
            Class labels aligned with ``X``.
        **kwargs
            Extra keyword arguments forwarded to ``lightgbm.train``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``cv`` provides no folds.
        """
        folds = self._folds(X, y)
        if not folds:
            raise ValueError(
                "cv provided no folds; pass an iterable of (fit_idx, val_idx) "
                "pairs or an object with a split(X, y) method"
            )

        self.models_ = []
        feature_names = X.columns if isinstance(X, pd.DataFrame) else list(range(X.shape[1]))
        self.feature_importances_ = pd.DataFrame(index=feature_names)
        self.evals_results_ = {}
        self.model_scores_ = []
        self.model_best_iterations_ = []

        for i, (fit_idx, val_idx) in enumerate(folds):
            # Split the dataset according to the fold indexes
            X_fit, X_val = self._take(X, fit_idx), self._take(X, val_idx)
            y_fit, y_val = self._take(y, fit_idx), self._take(y, val_idx)

            # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
            fit_set = lgbm.Dataset(X_fit, y_fit)
            val_set = lgbm.Dataset(X_val, y_val)

            # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
            self.evals_results_[i] = {}
            model = lgbm.train(
                params=self.lgbm_params,
                train_set=fit_set,
                valid_sets=(fit_set, val_set),
                valid_names=('train', 'eval'),
                evals_result=self.evals_results_[i],
                feval=lgb_f1_score,
                **kwargs
            )
            self.model_scores_.append(model.best_score['eval']['f1'])
            # Store the feature importances
            self.feature_importances_['gain_{}'.format(i)] = model.feature_importance('gain')
            self.feature_importances_['split_{}'.format(i)] = model.feature_importance('split')
            self.model_best_iterations_.append(model.best_iteration)
            # Store the model
            self.models_.append(model)

        return self

    def predict_proba(self, X):
        """Return the class scores averaged over all fold models.

        The result has one row per sample and one column per class, as
        returned by each booster's ``predict``.
        """
        utils.validation.check_is_fitted(self, ['models_'])
        if not self.models_:
            raise ValueError("no fitted models available; call fit first")

        total = None
        for model in self.models_:
            scores = np.asarray(model.predict(X, num_iteration=model.best_iteration))
            total = scores if total is None else total + scores
        return total / len(self.models_)

    def predict(self, X):
        """Predict class indices from the fold-averaged class scores.

        Averaging is done on the per-class scores, not on the predicted class
        indices: class indices are categorical, so the mean of "class 0" and
        "class 2" is not "class 1".

        Returns
        -------
        numpy.ndarray
            1-D array of class indices. The dtype stays float so the return
            value keeps the type this method has always produced.
        """
        return self.predict_proba(X).argmax(axis=1).astype(float)

In [2]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [3]:
# `return_X_y` must be passed by keyword: current scikit-learn rejects it as a
# positional argument.
X, proxy_y = load_iris(return_X_y=True)

In [4]:
X = pd.DataFrame(X, columns=['sepal-length', 'sepal-width', 'petal-length', 'petal-width'])

In [5]:
X['class'] = proxy_y

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing, stratified on the class label so both
# parts keep the same class proportions.
train, test, y, y_test = train_test_split(
    X.drop('class', axis=1),
    proxy_y,
    test_size=.1,
    random_state=13,
    stratify=X['class'],
)

In [7]:
# import pandas as pd
# import numpy as np

# path = "../data/titanic/"
# train = pd.read_csv(path+"train.csv")
# test = pd.read_csv(path+"test.csv")

In [8]:
from utils.models import CatBoostCV, LGBMCV
from utils.eda import reduce_mem_usage, missing_data, get_cats_nums

In [9]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of properties dataframe is : 0.00514984130859375  MB
******************************
Column:  sepal-length
dtype before:  float64
min for this col:  4.3
max for this col:  7.9
dtype after:  float32
******************************
******************************
Column:  sepal-width
dtype before:  float64
min for this col:  2.0
max for this col:  4.4
dtype after:  float32
******************************
******************************
Column:  petal-length
dtype before:  float64
min for this col:  1.0
max for this col:  6.7
dtype after:  float32
******************************
******************************
Column:  petal-width
dtype before:  float64
min for this col:  0.1
max for this col:  2.5
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.00308990478515625  MB
This is  60.0 % of the initial size
Memory usage of properties dataframe is : 0.00057220458984375  MB
******************************
Column:  sepal-length
dt

In [10]:
# Normalise column names to lower case in both splits.
train.columns = [column_name.lower() for column_name in list(train.columns)]
test.columns = [column_name.lower() for column_name in list(test.columns)]

In [11]:
train.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
46,5.1,3.8,1.6,0.2
123,6.3,2.7,4.9,1.8
11,4.8,3.4,1.6,0.2
47,4.6,3.2,1.4,0.2
132,6.4,2.8,5.6,2.2


In [12]:
train.columns

Index(['sepal-length', 'sepal-width', 'petal-length', 'petal-width'], dtype='object')

In [13]:
# drop_cols = ['name', 'ticket', 'cabin']

In [14]:
# train.drop(drop_cols, axis=1, inplace=True)
# test.drop(drop_cols, axis=1, inplace=True)

In [15]:
train

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
46,5.1,3.8,1.6,0.2
123,6.3,2.7,4.9,1.8
11,4.8,3.4,1.6,0.2
47,4.6,3.2,1.4,0.2
132,6.4,2.8,5.6,2.2
...,...,...,...,...
70,5.9,3.2,4.8,1.8
100,6.3,3.3,6.0,2.5
62,6.0,2.2,4.0,1.0
146,6.3,2.5,5.0,1.9


In [16]:
missing_data(train)

Unnamed: 0,Total,Percent
petal-width,0,0.0
petal-length,0,0.0
sepal-width,0,0.0
sepal-length,0,0.0


In [17]:
missing_data(test)

Unnamed: 0,Total,Percent
petal-width,0,0.0
petal-length,0,0.0
sepal-width,0,0.0
sepal-length,0,0.0


In [18]:
cats, nums = get_cats_nums(train)

In [19]:
cats

[]

In [20]:
nums

['sepal-length', 'sepal-width', 'petal-length', 'petal-width']

In [21]:
feats = train.columns

In [22]:
feats

Index(['sepal-length', 'sepal-width', 'petal-length', 'petal-width'], dtype='object')

In [23]:
### MODEL
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
# LightGBM training parameters for the 3-class problem.
# Tuning knobs left disabled from earlier experiments: eta (0.001),
# bagging_fraction (0.9), bagging_freq (5), feature_fraction (0.8),
# reg_alpha (0.3), reg_lambda (0.1), categorical_feature (cats).
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    random_state=13,
    num_leaves=20,  # alternative tried: (2**7)-20
    max_depth=-1,  # default
    n_jobs=-1,  # all cores
)

In [24]:
train.shape, test.shape, y.shape, y_test.shape

((135, 4), (15, 4), (135,), (15,))

In [25]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

In [26]:
# `skf.split` returns a one-shot generator. Materialise the folds so they can
# be inspected and so the fit cell can be re-run without re-running this one.
skf_splitted = list(skf.split(train[feats], y))
#train[cats] = train[cats].astype('category')

In [27]:
# for i, b in skf_splitted:
#     print(len(i), len(b))

In [28]:
lgb_model = MultiClassLGBMCV(cv=skf_splitted, **params)

In [29]:
train.shape, y.shape

((135, 4), (135,))

In [30]:
test.shape, y_test.shape

((15, 4), (15,))

In [31]:
# Train one booster per fold; stop a fold early after 20 rounds without
# improvement on its validation split.
lgb_model.fit(
    train[feats],
    y,
    num_boost_round=100,
    early_stopping_rounds=20,
    verbose_eval=1,
    categorical_feature=cats,
)

New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	train's multi_logloss: 0.972035	train's f1: 0.955506	eval's multi_logloss: 0.972226	eval's f1: 0.955357
Training until validation scores don't improve for 20 rounds
[2]	train's multi_logloss: 0.867096	train's f1: 0.955506	eval's multi_logloss: 0.867603	eval's f1: 0.955357
[3]	train's multi_logloss: 0.778354	train's f1: 0.955506	eval's multi_logloss: 0.777774	eval's f1: 0.955357
[4]	train's multi_logloss: 0.702327	train's f1: 0.955506	eval's multi_logloss: 0.69849	eval's f1: 0.955357
[5]	train's multi_logloss: 0.636456	train's f1: 0.955506	eval's multi_logloss: 0.63308	eval's f1: 0.955357
[6]	train's multi_logloss: 0.578926	train's f1: 0.955506	eval's multi_logloss: 0.572737	eval's f1: 0.955357
[7]	train's multi_logloss: 0.528288	train's f1: 0.955506	eval's multi_logloss: 0.522984	eval's f1: 0.955357
[8]	train's multi_logloss: 0.483379	train's f1: 0.955506	eval's multi_logloss: 0.474522	eval's f1: 0.955357
[9]	train's multi_logloss: 0.443588	train's f1: 0.955506	eval's multi_logloss

<__main__.MultiClassLGBMCV at 0x7fc37ec9fe10>

In [32]:
# Mean of the per-fold validation F1 scores.
cv_score = np.mean(lgb_model.model_scores_)
cv_score

0.9554564595582393

In [33]:
lgb_model.feature_importances_

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2
sepal-length,0.073707,2,0.219057,4,0.509789,2
sepal-width,0.0,0,0.982871,7,1.5e-05,1
petal-length,118.3848,4,471.863596,30,142.011143,7
petal-width,0.0,0,107.16309,7,82.9354,3


In [34]:
lgb_model.model_scores_

[0.9553571428571429, 0.9777530589543938, 0.9332591768631814]

In [35]:
lgb_model.predict(test[feats])

array([0., 0., 0., 2., 2., 1., 2., 1., 0., 0., 2., 1., 1., 2., 1.])

In [36]:
# test[cats] = test[cats].astype('category')

In [37]:
test_dataset_score = f1_score(y_test, lgb_model.predict(test[feats]), average='weighted')

In [38]:
print(f"test dataset performance is: {test_dataset_score}")

test dataset performance is: 1.0
