In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

In [2]:
# Read the raw train/test tables from the working directory.
# Both frames are later replaced by their encoded versions via create_data().
data, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
def create_data(df):
    """Encode the raw mushroom table into a purely numeric feature frame.

    The input frame is NOT modified (missing-value filling is done on local
    copies of the affected columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with the columns 'cap-shape', 'cap-color',
        'gill-attachment', 'gill-color', 'stem-color', 'ring-type',
        'habitat', 'edible-poisonous', 'does-bruise-or-bleed', 'has-ring',
        'cap-diameter', 'stem-height' and 'stem-width'. A 'season' column is
        optional.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index and the columns
        c0..c6 (integer-coded categoricals), b0..b2 (binary flags),
        n0..n4 (shifted/scaled numerics and two ratios) and 'y'
        (season code: a=0, u=1, w=2, s=3, missing or absent column = -1).
        Category letters that are not present in a mapping become NaN.
    """
    # Fill missing values on local Series so the caller's frame stays intact.
    gill_attachment = df['gill-attachment'].fillna('z')
    ring_type = df['ring-type'].fillna('a')

    data = pd.DataFrame()

    #---CAT---#
    data['c0'] = df['cap-shape'].map({'x':0, 'f':1, 's':2, 'b':3, 'o':4, 'p':5, 'c':6})

    data['c1'] = df['cap-color'].map({'n':0, 'y':1, 'w':2, 'g':3, 'e':4, 'o':5, 'r':6,
                                      'u':7, 'p':8, 'k':9, 'b':10, 'l':11})

    # 'z' is the placeholder used above for a missing gill-attachment.
    data['c2'] = gill_attachment.map({'a':0, 'd':1, 'x':2, 'p':3, 's':4, 'e':5, 'f':6, 'z':7})

    data['c3'] = df['gill-color'].map({'w':0, 'n':1, 'y':2, 'p':3, 'g':4, 'f':5, 'o':6,
                                       'k':7, 'r':8, 'e':9, 'b':10, 'u':11})

    data['c4'] = df['stem-color'].map({'w':0, 'n':1, 'y':2, 'g':3, 'o':4, 'e':5, 'u':6,
                                       'f':7, 'p':8, 'k':9, 'r':10, 'l':11, 'b':12})

    # 'a' is the placeholder used above for a missing ring-type.
    data['c5'] = ring_type.map({'f':0, 'e':1, 'z':2, 'l':3, 'r':4, 'p':5, 'g':6, 'm':7, 'a':8})

    data['c6'] = df['habitat'].map({'d':0, 'g':1, 'l':2, 'm':3, 'h':4, 'p':5, 'w':6, 'u':7})

    #---BIN---#
    data['b0'] = df['edible-poisonous'].map({'p':1, 'e':0})
    data['b1'] = df['does-bruise-or-bleed'].map({'t':1, 'f':0})
    data['b2'] = df['has-ring'].map({'t':1, 'f':0})

    #---NUM---#
    data['n0'] = (df['cap-diameter'] - 30) / 30.0
    data['n1'] = (df['stem-height'] - 10) / 20.0
    data['n2'] = (df['stem-width'] - 40) / 60.0
    # Ratios of the scaled values. n2 is exactly 0 when stem-width == 40, in
    # which case these become +/-inf (or NaN for 0/0); behaviour kept as-is.
    data['n3'] = data['n0'] / data['n2']
    data['n4'] = data['n1'] / data['n2']

    #---TARGET---#
    if 'season' in df.columns:
        # Missing seasons are encoded as -1.
        data['y'] = df['season'].fillna('z').map({'a':0, 'u':1, 'w':2, 's':3, 'z':-1})
    else:
        # No target column at all (e.g. an unlabeled table): same sentinel.
        data['y'] = -1

    return data.reset_index(drop=True)
    
    

# Replace the raw tables with their encoded counterparts.
# (Re-running this cell without re-loading the CSVs will fail, because the
# encoded frames no longer carry the raw column names.)
data = create_data(data)
test = create_data(test)

# Model inputs, in positional order: c0..c6, b0..b2, n0..n4.
cols = [f'c{i}' for i in range(7)] + ['b0', 'b1', 'b2'] + [f'n{i}' for i in range(5)]

# LightGBM settings for the 4-class season model.
lgb_params = dict(
    objective='multiclass',
    num_class=4,
    boosting_type='gbdt',
    seed=16,
    verbose=-1,
    learning_rate=0.21779455647179433,
    num_leaves=329,
    # NOTE(review): 'colsample' is not among LightGBM's documented aliases for
    # feature_fraction (those are 'colsample_bytree' / 'sub_feature'), so this
    # entry is presumably ignored - confirm before renaming, since enabling it
    # changes the model.
    colsample=0.18851366842620274,
    # NOTE(review): per the LightGBM docs bagging only happens when
    # bagging_freq (alias subsample_freq) is > 0; none is set here, so this
    # value presumably has no effect - confirm.
    subsample=0.25078255059092,
    max_depth=47,
    min_child_samples=1727,
    reg_alpha=1.4507346883966345e-05,
    reg_lambda=5.298258780575125e-07,
    cat_smooth=71,
)



def accuracy_lgbm(preds: np.ndarray, data: "lgb.Dataset"):
    """Custom LightGBM evaluation function: multiclass accuracy.

    Parameters
    ----------
    preds : np.ndarray
        Class scores for every sample. Accepted either as a flat array laid
        out class after class (all samples of class 0, then class 1, ...) or
        as a 2-D ``(n_samples, n_classes)`` array; the number of classes is
        inferred from the array size.
    data : lgb.Dataset
        Dataset being evaluated; only ``get_label()`` is used.

    Returns
    -------
    tuple
        ``('acc', accuracy, True)`` - metric name, value, and the
        "higher is better" flag.
    """
    label = data.get_label()
    # Column-major reshape turns the class-after-class flat layout into
    # (n_samples, n_classes); for an array that already has that shape it is
    # a no-op. -1 lets numpy derive the class count instead of hard-coding 4.
    scores = np.asarray(preds).reshape((len(label), -1), order='F')
    predicted = scores.argmax(axis=1)
    # Fraction of exact matches; same value as sklearn's accuracy_score but
    # without its per-call input validation (this runs every boosting round).
    score = float(np.mean(predicted == np.asarray(label)))

    return 'acc', score, True

In [None]:
# Repeated stratified K-fold training: for every seed the rows are re-split
# into N_folds folds, one model is trained per fold, and its predictions are
# accumulated into out-of-fold (data_preds) and test (test_preds) score sums.
N_folds = 5
seeds = [455, 485, 659, 16, 5659]

# Positional indices (into `cols`) of the features handed to LightGBM as
# categorical.
# NOTE(review): index 3 ('c3', gill-color) is integer-coded like the other
# c* columns but is not listed here, so it is treated as numeric. Kept as-is
# to preserve the model - confirm whether the omission is intentional.
cat_feature_idx = [0, 1, 2, 4, 5, 6]

# Accumulators: one column per class. Every row is in the validation fold
# exactly once per seed; the test set is scored by every model.
data_preds = np.zeros((data.shape[0], 4))
test_preds = np.zeros((test.shape[0], 4))

# One small frame per trained model; concatenated once after the loops.
importance_frames = []

for seed in seeds:
    print('---------------------------------------')
    print('SEED : ', seed)
    print('---------------------------------------')

    # Assign each row to a fold, stratified on the target.
    # (`data` has a 0..n-1 index from create_data, so the positional indices
    # returned by split() are valid labels for .loc.)
    data['fold'] = -1
    skf = StratifiedKFold(n_splits=N_folds, shuffle=True, random_state=seed)
    for f, (_, idxs) in enumerate(skf.split(data, data['y'])):
        data.loc[idxs, 'fold'] = f

    for F in range(N_folds):

        print('FOLD : ', F)

        is_valid = (data['fold'] == F).to_numpy()
        train = data[~is_valid].reset_index(drop=True)
        valid = data[is_valid].reset_index(drop=True)

        train_dataset = lgb.Dataset(train[cols], train['y'], categorical_feature=cat_feature_idx)
        valid_dataset = lgb.Dataset(valid[cols], valid['y'], categorical_feature=cat_feature_idx)

        # Early stopping / silent logging are passed as callbacks: the
        # `early_stopping_rounds=` and `verbose_eval=` keyword arguments were
        # removed from lgb.train() in LightGBM 4.0.
        model = lgb.train(params=lgb_params,
                          train_set=train_dataset,
                          num_boost_round=10000,
                          valid_sets=[train_dataset, valid_dataset],
                          feval=accuracy_lgbm,
                          callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False),
                                     lgb.log_evaluation(period=0)])

        del train_dataset, valid_dataset

        importance_frames.append(pd.DataFrame({
            "feature": cols,
            "importance": model.feature_importance(),
            "fold": F + 1,
        }))

        # Out-of-fold scores go to the rows that were held out in this fold.
        data_preds[is_valid] += model.predict(valid[cols])

        # Test scores are summed over all seeds and folds.
        test_preds += model.predict(test[cols])

# Single concat instead of growing the frame inside the loop.
feature_importance = pd.concat(importance_frames, axis=0)

---------------------------------------
SEED :  455
---------------------------------------
FOLD :  0








FOLD :  1




FOLD :  2




FOLD :  3




FOLD :  4




---------------------------------------
SEED :  485
---------------------------------------
FOLD :  0




FOLD :  1




FOLD :  2




FOLD :  3




FOLD :  4




---------------------------------------
SEED :  659
---------------------------------------
FOLD :  0




FOLD :  1




FOLD :  2




FOLD :  3




FOLD :  4




---------------------------------------
SEED :  16
---------------------------------------
FOLD :  0




FOLD :  1




FOLD :  2




FOLD :  3




FOLD :  4




---------------------------------------
SEED :  5659
---------------------------------------
FOLD :  0




FOLD :  1




FOLD :  2




FOLD :  3




In [None]:
# Turn the accumulated score sums into means.
# Each training row was predicted once per seed; each test row was predicted
# by every (seed, fold) model.
# Caution: running this cell twice divides twice.
n_seeds = len(seeds)
n_models = n_seeds * N_folds
data_preds = data_preds / n_seeds
test_preds = test_preds / n_models

In [None]:
# Plot the per-model feature importances collected during training.
#
# Two deliberate differences from a naive version of this cell:
#   * the scaling is applied to a copy, so re-running the cell does not
#     divide `feature_importance` again;
#   * the ranking is stored under its own name instead of `cols`, because
#     `cols` is the training feature list and the categorical features are
#     addressed by position within it.

# NOTE(review): the divisor 10 is kept from the original cell; it only
# rescales the x-axis and does not match the number of trained models
# (len(seeds) * N_folds) - confirm the intended normalisation.
IMPORTANCE_SCALE = 10

importance_scaled = feature_importance.assign(
    importance=feature_importance["importance"] / IMPORTANCE_SCALE
)

# Up to 50 features with the highest mean importance.
top_features = (
    importance_scaled[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)[:50]
    .index
)

best_features = importance_scaled.loc[importance_scaled.feature.isin(top_features)]

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x="importance", y="feature",
            data=best_features.sort_values(by="importance", ascending=False),
            ax=ax)
ax.set_title('LGB Features (avg over folds)');

In [None]:
# Build the submission: most likely class per test row, translated back to
# the season letter (inverse of the target encoding in create_data).
season_letters = {0:'a', 1:'u', 2:'w', 3:'s'}
predicted_class = test_preds.argmax(axis=1)

sub = pd.DataFrame({'season': pd.Series(predicted_class).map(season_letters)})
sub.to_csv('sub0.csv', index=False)

# Share of each predicted season, shown as the cell output.
sub['season'].value_counts(normalize=True)

In [None]:
# Persist the averaged out-of-fold and test class scores as .npy files.
for out_path, scores in (('lgb0_oof.npy', data_preds), ('lgb0_preds.npy', test_preds)):
    np.save(out_path, scores)