In [42]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

import lightgbm as lgb

In [2]:
# Directory holding the pre-processed CSV splits.
# NOTE(review): the name suggests rows with missing values were removed upstream — confirm.
folder = "without_na"

# Read the training split first, then the validation split.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "{}/train_clean.csv".format(folder),
        "{}/validation_clean.csv".format(folder),
    )
)

In [3]:
def get_class_weights(labels):
    """Map every distinct label to ``1 - relative_frequency``.

    Frequent classes therefore receive a smaller weight than rare ones.

    Parameters
    ----------
    labels : array-like
        Class labels, one per sample.

    Returns
    -------
    dict
        ``{label: weight}`` with one entry per distinct label, in the sorted
        order produced by ``np.unique``.
    """
    unique_labels, counts = np.unique(labels, return_counts=True)
    # Share of samples belonging to each class.
    frequencies = counts / counts.sum()
    return dict(zip(unique_labels, 1 - frequencies))

In [4]:
def split_datasets(train, test):
    """Turn a train/test pair of DataFrames into feature matrices and label vectors.

    The 'objid' and 'class' columns are removed from the features; the
    'class' column is returned separately as the target.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain 'objid' and 'class' columns.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """
    non_feature_columns = ['objid', 'class']

    def to_arrays(frame):
        # Features: everything except the id and the target column.
        features = frame.drop(non_feature_columns, axis=1).values
        target = frame['class'].values
        return features, target

    return to_arrays(train) + to_arrays(test)

In [5]:
# Partition the training and validation frames by the binary 'clean' flag
# (1 -> clean, 0 -> not clean). The flag column is dropped from each part.

clean_df = train.loc[train['clean'].eq(1)].drop('clean', axis='columns')
not_clean_df = train.loc[train['clean'].eq(0)].drop('clean', axis='columns')

val_clean_df = val.loc[val['clean'].eq(1)].drop('clean', axis='columns')
val_not_clean_df = val.loc[val['clean'].eq(0)].drop('clean', axis='columns')

#### Clean

In [6]:
# Feature matrices and label vectors for the 'clean' subset; the validation
# part plays the role of the test set.
(X_train_clean, y_train_clean,
 X_test_clean, y_test_clean) = split_datasets(train=clean_df, test=val_clean_df)

In [7]:
# Multiclass LightGBM model for the 'clean' subset; class weights are derived
# from this subset's training labels only.
gbm_for_clean = lgb.LGBMClassifier(
    boosting_type='dart',
    objective='multiclass',
    num_class=3,
    class_weight=get_class_weights(y_train_clean),
    num_leaves=90,
    learning_rate=0.07,
    n_estimators=700,
)

# The validation subset is passed as the early-stopping monitor.
# NOTE(review): the same subset is scored afterwards, so that score is not
# from data unseen during fitting.
# NOTE(review): newer LightGBM releases report early stopping as unavailable
# for boosting_type='dart' — confirm whether it takes effect here.
gbm_for_clean.fit(
    X=X_train_clean,
    y=y_train_clean,
    eval_set=[(X_test_clean, y_test_clean)],
    early_stopping_rounds=50,
)
                  

[1]	valid_0's multi_logloss: 1.02717
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_logloss: 0.964032
[3]	valid_0's multi_logloss: 0.907921
[4]	valid_0's multi_logloss: 0.859411
[5]	valid_0's multi_logloss: 0.81573
[6]	valid_0's multi_logloss: 0.775845
[7]	valid_0's multi_logloss: 0.740444
[8]	valid_0's multi_logloss: 0.756096
[9]	valid_0's multi_logloss: 0.722716
[10]	valid_0's multi_logloss: 0.69208
[11]	valid_0's multi_logloss: 0.664514
[12]	valid_0's multi_logloss: 0.675031
[13]	valid_0's multi_logloss: 0.648458
[14]	valid_0's multi_logloss: 0.62398
[15]	valid_0's multi_logloss: 0.601683
[16]	valid_0's multi_logloss: 0.58061
[17]	valid_0's multi_logloss: 0.560954
[18]	valid_0's multi_logloss: 0.543357
[19]	valid_0's multi_logloss: 0.527167
[20]	valid_0's multi_logloss: 0.512501
[21]	valid_0's multi_logloss: 0.516969
[22]	valid_0's multi_logloss: 0.503044
[23]	valid_0's multi_logloss: 0.490269
[24]	valid_0's multi_logloss: 0.478371
[25]	valid_0's m

[210]	valid_0's multi_logloss: 0.333383
[211]	valid_0's multi_logloss: 0.334057
[212]	valid_0's multi_logloss: 0.332949
[213]	valid_0's multi_logloss: 0.331899
[214]	valid_0's multi_logloss: 0.330971
[215]	valid_0's multi_logloss: 0.330393
[216]	valid_0's multi_logloss: 0.331109
[217]	valid_0's multi_logloss: 0.330551
[218]	valid_0's multi_logloss: 0.329675
[219]	valid_0's multi_logloss: 0.330243
[220]	valid_0's multi_logloss: 0.32943
[221]	valid_0's multi_logloss: 0.329961
[222]	valid_0's multi_logloss: 0.329422
[223]	valid_0's multi_logloss: 0.329059
[224]	valid_0's multi_logloss: 0.329652
[225]	valid_0's multi_logloss: 0.330321
[226]	valid_0's multi_logloss: 0.330862
[227]	valid_0's multi_logloss: 0.331263
[228]	valid_0's multi_logloss: 0.329901
[229]	valid_0's multi_logloss: 0.33045
[230]	valid_0's multi_logloss: 0.329315
[231]	valid_0's multi_logloss: 0.328557
[232]	valid_0's multi_logloss: 0.327906
[233]	valid_0's multi_logloss: 0.328404
[234]	valid_0's multi_logloss: 0.32776
[23

[417]	valid_0's multi_logloss: 0.320886
[418]	valid_0's multi_logloss: 0.320894
[419]	valid_0's multi_logloss: 0.320814
[420]	valid_0's multi_logloss: 0.320867
[421]	valid_0's multi_logloss: 0.320867
[422]	valid_0's multi_logloss: 0.320876
[423]	valid_0's multi_logloss: 0.320744
[424]	valid_0's multi_logloss: 0.320785
[425]	valid_0's multi_logloss: 0.320799
[426]	valid_0's multi_logloss: 0.320794
[427]	valid_0's multi_logloss: 0.320589
[428]	valid_0's multi_logloss: 0.320589
[429]	valid_0's multi_logloss: 0.32054
[430]	valid_0's multi_logloss: 0.32058
[431]	valid_0's multi_logloss: 0.320735
[432]	valid_0's multi_logloss: 0.320833
[433]	valid_0's multi_logloss: 0.320703
[434]	valid_0's multi_logloss: 0.320734
[435]	valid_0's multi_logloss: 0.320745
[436]	valid_0's multi_logloss: 0.320794
[437]	valid_0's multi_logloss: 0.320897
[438]	valid_0's multi_logloss: 0.320919
[439]	valid_0's multi_logloss: 0.320932
[440]	valid_0's multi_logloss: 0.32089
[441]	valid_0's multi_logloss: 0.320632
[44

LGBMClassifier(boosting_type='dart',
        class_weight={0: 0.7568048533872599, 1: 0.6343377148634985, 2: 0.6088574317492417},
        colsample_bytree=1.0, learning_rate=0.07, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=700, n_jobs=-1, num_class=3, num_leaves=90,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [44]:
# Macro-averaged F1 of the 'clean' model on the 'clean' validation subset.
prediction_clean = gbm_for_clean.predict(X_test_clean)
f1_score(y_true=y_test_clean, y_pred=prediction_clean, average='macro')

  if diff:


0.8776581862008542

#### Not clean

In [10]:
# Feature matrices and label vectors for the 'not clean' subset; the
# validation part plays the role of the test set.
(X_train_not_clean, y_train_not_clean,
 X_test_not_clean, y_test_not_clean) = split_datasets(train=not_clean_df,
                                                      test=val_not_clean_df)

In [46]:
# Candidate hyper-parameters: 3 learning rates x 2 boosting types x 3 leaf
# counts = 18 combinations, each evaluated with 4-fold cross-validation.
grid_params = {
    'learning_rate': [0.05, 0.07, 0.1],
    'boosting_type': ['dart', 'gbdt'],
    'num_leaves': [30, 80, 130],
}

# Shuffled folds with a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=4, shuffle=True, random_state=41)

# Base estimator; class weights are computed once from all 'not clean'
# training labels, not per fold.
gbm = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    class_weight=get_class_weights(y_train_not_clean),
    n_estimators=500,
)

# Candidates are ranked by macro-averaged F1.
gridSearch = GridSearchCV(
    estimator=gbm,
    param_grid=grid_params,
    cv=kf,
    scoring='f1_macro',
)

gridSearch.fit(X_train_not_clean, y_train_not_clean)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=KFold(n_splits=4, random_state=41, shuffle=True),
       error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt',
        class_weight={0: 0.24417061611374402, 1: 0.8181990521327014, 2: 0.9376303317535545},
        colsample_bytree=1.0, learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=50...     reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.07, 0.1], 'boosting_type': ['dart', 'gbdt'], 'num_leaves': [30, 80]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [47]:
# Best parameter combination and its mean cross-validated score, one per line.
print(gridSearch.best_params_, gridSearch.best_score_, sep="\n")

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'num_leaves': 30}
0.8100557027973075


In [48]:
# Multiclass LightGBM model for the 'not clean' subset. boosting_type,
# num_leaves and learning_rate are hard-coded here rather than read from the
# grid-search object.
gbm_for_not_clean = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    class_weight=get_class_weights(y_train_not_clean),
    num_leaves=30,
    learning_rate=0.1,
    n_estimators=700,
)

# The validation subset is passed as the early-stopping monitor.
# NOTE(review): the same subset is scored afterwards, so that score is not
# from data unseen during fitting.
gbm_for_not_clean.fit(
    X=X_train_not_clean,
    y=y_train_not_clean,
    eval_set=[(X_test_not_clean, y_test_not_clean)],
    early_stopping_rounds=150,
)
                  

[1]	valid_0's multi_logloss: 0.988274
Training until validation scores don't improve for 150 rounds.
[2]	valid_0's multi_logloss: 0.896486
[3]	valid_0's multi_logloss: 0.818769
[4]	valid_0's multi_logloss: 0.752875
[5]	valid_0's multi_logloss: 0.695356
[6]	valid_0's multi_logloss: 0.645915
[7]	valid_0's multi_logloss: 0.602268
[8]	valid_0's multi_logloss: 0.56429
[9]	valid_0's multi_logloss: 0.531177
[10]	valid_0's multi_logloss: 0.50163
[11]	valid_0's multi_logloss: 0.475836
[12]	valid_0's multi_logloss: 0.453116
[13]	valid_0's multi_logloss: 0.432773
[14]	valid_0's multi_logloss: 0.414823
[15]	valid_0's multi_logloss: 0.398858
[16]	valid_0's multi_logloss: 0.384076
[17]	valid_0's multi_logloss: 0.371995
[18]	valid_0's multi_logloss: 0.360312
[19]	valid_0's multi_logloss: 0.349964
[20]	valid_0's multi_logloss: 0.340776
[21]	valid_0's multi_logloss: 0.332063
[22]	valid_0's multi_logloss: 0.32477
[23]	valid_0's multi_logloss: 0.317897
[24]	valid_0's multi_logloss: 0.311304
[25]	valid_0'

[212]	valid_0's multi_logloss: 0.335137
[213]	valid_0's multi_logloss: 0.335743
Early stopping, best iteration is:
[63]	valid_0's multi_logloss: 0.252541


LGBMClassifier(boosting_type='gbdt',
        class_weight={0: 0.24417061611374402, 1: 0.8181990521327014, 2: 0.9376303317535545},
        colsample_bytree=1.0, learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=700, n_jobs=-1, num_class=3, num_leaves=30,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [49]:
# Macro-averaged F1 of the 'not clean' model on the 'not clean' validation subset.
prediction_not_clean = gbm_for_not_clean.predict(X_test_not_clean)
f1_score(y_true=y_test_not_clean, y_pred=prediction_not_clean, average='macro')

  if diff:


0.7530398663339963

#### Combine the two models

In [50]:
def predict(clean_cls, not_clean_cls, X_test):
    """Route each row to the classifier matching its 'clean' flag and merge the results.

    Parameters
    ----------
    clean_cls : object with a ``predict`` method
        Applied to the rows where ``X_test['clean'] == 1``.
    not_clean_cls : object with a ``predict`` method
        Applied to the rows where ``X_test['clean'] == 0``.
    X_test : pandas.DataFrame
        Must contain a 'clean' column; that column is dropped before the rows
        are handed to either classifier.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(X_test)`` with predictions in the row
        order of ``X_test``. Rows whose flag is neither 0 nor 1 keep the
        initial value 0.
    """
    # Boolean masks are positional, so results line up with the row order of
    # X_test whatever its index looks like (index labels are not array
    # positions once a frame has been filtered, sorted or re-indexed).
    clean_mask = (X_test['clean'] == 1).values
    not_clean_mask = (X_test['clean'] == 0).values

    predictions = np.zeros(len(X_test))

    for classifier, mask in ((clean_cls, clean_mask),
                             (not_clean_cls, not_clean_mask)):
        # Skip a group with no rows instead of calling predict on an empty
        # frame, which estimators commonly reject.
        if not mask.any():
            continue
        group_features = X_test[mask].drop('clean', axis=1)
        predictions[mask] = classifier.predict(group_features)

    return predictions
    

In [51]:
# Evaluate the two-model ensemble on the full validation set. The 'clean'
# column stays in X_test because predict() uses it to route the rows.
y_test = val['class']
X_test = val.drop(['objid', 'class'], axis=1)

predictions = predict(
    clean_cls=gbm_for_clean,
    not_clean_cls=gbm_for_not_clean,
    X_test=X_test,
)
f1_score(y_true=y_test, y_pred=predictions, average='macro')

  if diff:
  if diff:


0.8744739263474676