In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

# Raise pandas' display limits so the very wide feature frames shown below are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Render every float with four decimal places.
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Params

In [3]:
# NOTE(review): absolute, machine-specific locations — adjust before running elsewhere.
data_path = Path(r"/database/kaggle/PII/data")  # directory whose contents are listed below
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")  # root for OOF inputs and saved models
os.listdir(data_path)

['train.json',
 'mpware_mixtral8x7b_v1.1-no-i-username.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [4]:
# Fold scheme and backbone model; together they select the checkpoint sub-directory
# that holds the out-of-fold prediction runs loaded below.
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-large"
folder = CHECKPOINT_PATH / FOLD_NAME / model_name

In [5]:
# Label names; also the names of the per-class score columns read from each OOF file.
# 'O' sits at index 7, the value read_data_stack uses to fill missing labels
# (presumably the list index is the integer class id — TODO confirm).
LABEL2TYPE = ['NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O']

In [18]:
def read_data_stack(folder_path,df=None,name=None):
    """Load one run's out-of-fold predictions, optionally stacking them onto ``df``.

    Reads ``stack_oof.gzip`` (parquet) from ``folder_path``, fills missing
    ``label`` values with 7 and casts ``label``, ``document``, ``token`` and
    ``label_pred`` to int32.

    Parameters
    ----------
    folder_path : path-like supporting ``/``
        Directory containing ``stack_oof.gzip``.
    df : pandas.DataFrame, optional
        Frame to extend. When given, the loaded ``LABEL2TYPE`` columns are
        written into ``df`` IN PLACE as ``<label><name>``. Values are copied
        by position (``.values``), so both frames must share row order.
    name : str, optional
        Suffix for the added columns; required when ``df`` is given.

    Returns
    -------
    pandas.DataFrame
        The loaded frame (``df is None``) or the extended ``df``, in both
        cases without the ``doc_size``, ``row_id`` and ``I`` columns.
    """
    helper_cols = ('doc_size', 'row_id', 'I')
    int_cols = ['document', 'token', 'label_pred']

    oof_frame = pd.read_parquet(folder_path/'stack_oof.gzip')
    oof_frame['label'] = oof_frame['label'].fillna(7).astype(np.int32)
    oof_frame[int_cols] = oof_frame[int_cols].astype(np.int32)

    if df is None:
        # First run: the loaded frame itself becomes the base frame.
        return oof_frame[[col for col in oof_frame.columns if col not in helper_cols]]

    print('add')
    suffixed = [label + name for label in LABEL2TYPE]
    df[suffixed] = oof_frame[LABEL2TYPE].values
    return df[[col for col in df.columns if col not in helper_cols]]

In [19]:
# Base frame: OOF predictions of the first run; its per-class columns keep their bare label names.
df = read_data_stack(folder/"2024-02-04--vsub1")
df.shape

(4992533, 15)

In [20]:
# Second run: its per-class scores are appended as '<label>_dv3l' columns (8 more columns).
df = read_data_stack(folder/"2024-04-06--dv3l_cp_nbrod_add00_rep_00_v1",df=df,name='_dv3l')
df.shape

add


(4992533, 23)

In [21]:
# Third run: its per-class scores are appended as '<label>_dv3l_rep05' columns (8 more columns).
df = read_data_stack(folder/"2024-04-03--dv3l_cp_nbrod_add05_rep_05_v1",df=df,name='_dv3l_rep05')
df.shape

add


(4992533, 31)

In [22]:
# Preview the stacked frame: one row per token with the three runs' per-class scores side by side.
df.head()

Unnamed: 0,document,token,tokens,label_pred,score,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,O,label,fold_msk_5_seed_42,NAME_STUDENT_dv3l,EMAIL_dv3l,USERNAME_dv3l,ID_NUM_dv3l,PHONE_NUM_dv3l,URL_PERSONAL_dv3l,STREET_ADDRESS_dv3l,O_dv3l,NAME_STUDENT_dv3l_rep05,EMAIL_dv3l_rep05,USERNAME_dv3l_rep05,ID_NUM_dv3l_rep05,PHONE_NUM_dv3l_rep05,URL_PERSONAL_dv3l_rep05,STREET_ADDRESS_dv3l_rep05,O_dv3l_rep05
0,16,0,Reporting,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999
1,16,1,process,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,16,2,|,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,16,3,by,7,0.9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0001,0.0,0.0001,0.0,0.0,0.0,0.0,0.9997
4,16,4,Gilberto,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
def add_features(df_vsub1):
    """Add ensemble, neighbouring-token and token-shape features to a stacked OOF frame.

    For every label in the module-level ``LABEL2TYPE`` the frame must contain
    the columns ``<label>``, ``<label>_dv3l`` and ``<label>_dv3l_rep05``, plus
    ``document`` and ``tokens``. The following columns are added, in this order:

    * ``<label>_mean`` / ``_min`` / ``_max`` — row-wise statistics over the
      three runs' scores;
    * ``<label>_<stat>_<lag>`` for ``lag`` in ``[1, -1, 2, -2]`` — the same
      statistics shifted by ``lag`` rows *within each document*; rows with no
      neighbour at that offset are NaN;
    * ``token_size`` — character length of the token text;
    * ``token_as_int`` — 1 if the token contains a digit, else 0 (int16).

    Parameters
    ----------
    df_vsub1 : pandas.DataFrame
        Frame to extend. It is modified IN PLACE.

    Returns
    -------
    pandas.DataFrame
        The same ``df_vsub1`` object, with the new columns.
    """
    for lab in tqdm(LABEL2TYPE):
        model_cols = [lab, lab+'_dv3l', lab+'_dv3l_rep05']
        stat_cols = [lab+"_mean", lab+"_min", lab+"_max"]

        df_vsub1[stat_cols[0]] = df_vsub1[model_cols].mean(axis=1)
        df_vsub1[stat_cols[1]] = df_vsub1[model_cols].min(axis=1)
        df_vsub1[stat_cols[2]] = df_vsub1[model_cols].max(axis=1)

        # Group a private copy of the three statistic columns by document so the
        # grouping is unaffected by the columns added to df_vsub1 inside the loop.
        # GroupBy.shift is the vectorised equivalent of transform(lambda x: x.shift(lag)).
        per_document = df_vsub1[stat_cols].groupby(df_vsub1['document'])
        for lag in [1,-1,2,-2]:
            shifted = per_document.shift(lag)
            for col in stat_cols:
                df_vsub1[f"{col}_{lag}"] = shifted[col]

    # Token-shape features are written to the argument itself (not to a notebook
    # global), so the function works on any frame it is handed.
    df_vsub1['token_size'] = df_vsub1['tokens'].str.len()
    df_vsub1['token_as_int'] = df_vsub1['tokens'].str.contains(r'\d', regex=True).astype(np.int16)
    return df_vsub1

In [25]:
# Build the stacking features: 15 derived columns per label (8 labels) plus the
# two token-shape columns, taking the frame from 31 to 153 columns.
df = add_features(df)
df.shape

  0%|          | 0/8 [00:00<?, ?it/s]

(4992533, 153)

In [26]:
# Preview the engineered columns; the lag features are NaN where a document has no token at that offset.
df.head()

Unnamed: 0,document,token,tokens,label_pred,score,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,O,label,fold_msk_5_seed_42,NAME_STUDENT_dv3l,EMAIL_dv3l,USERNAME_dv3l,ID_NUM_dv3l,PHONE_NUM_dv3l,URL_PERSONAL_dv3l,STREET_ADDRESS_dv3l,O_dv3l,NAME_STUDENT_dv3l_rep05,EMAIL_dv3l_rep05,USERNAME_dv3l_rep05,ID_NUM_dv3l_rep05,PHONE_NUM_dv3l_rep05,URL_PERSONAL_dv3l_rep05,STREET_ADDRESS_dv3l_rep05,O_dv3l_rep05,NAME_STUDENT_mean,NAME_STUDENT_min,NAME_STUDENT_max,NAME_STUDENT_mean_1,NAME_STUDENT_min_1,NAME_STUDENT_max_1,NAME_STUDENT_mean_-1,NAME_STUDENT_min_-1,NAME_STUDENT_max_-1,NAME_STUDENT_mean_2,NAME_STUDENT_min_2,NAME_STUDENT_max_2,NAME_STUDENT_mean_-2,NAME_STUDENT_min_-2,NAME_STUDENT_max_-2,EMAIL_mean,EMAIL_min,EMAIL_max,EMAIL_mean_1,EMAIL_min_1,EMAIL_max_1,EMAIL_mean_-1,EMAIL_min_-1,EMAIL_max_-1,EMAIL_mean_2,EMAIL_min_2,EMAIL_max_2,EMAIL_mean_-2,EMAIL_min_-2,EMAIL_max_-2,USERNAME_mean,USERNAME_min,USERNAME_max,USERNAME_mean_1,USERNAME_min_1,USERNAME_max_1,USERNAME_mean_-1,USERNAME_min_-1,USERNAME_max_-1,USERNAME_mean_2,USERNAME_min_2,USERNAME_max_2,USERNAME_mean_-2,USERNAME_min_-2,USERNAME_max_-2,ID_NUM_mean,ID_NUM_min,ID_NUM_max,ID_NUM_mean_1,ID_NUM_min_1,ID_NUM_max_1,ID_NUM_mean_-1,ID_NUM_min_-1,ID_NUM_max_-1,ID_NUM_mean_2,ID_NUM_min_2,ID_NUM_max_2,ID_NUM_mean_-2,ID_NUM_min_-2,ID_NUM_max_-2,PHONE_NUM_mean,PHONE_NUM_min,PHONE_NUM_max,PHONE_NUM_mean_1,PHONE_NUM_min_1,PHONE_NUM_max_1,PHONE_NUM_mean_-1,PHONE_NUM_min_-1,PHONE_NUM_max_-1,PHONE_NUM_mean_2,PHONE_NUM_min_2,PHONE_NUM_max_2,PHONE_NUM_mean_-2,PHONE_NUM_min_-2,PHONE_NUM_max_-2,URL_PERSONAL_mean,URL_PERSONAL_min,URL_PERSONAL_max,URL_PERSONAL_mean_1,URL_PERSONAL_min_1,URL_PERSONAL_max_1,URL_PERSONAL_mean_-1,URL_PERSONAL_min_-1,URL_PERSONAL_max_-1,URL_PERSONAL_mean_2,URL_PERSONAL_min_2,URL_PERSONAL_max_2,URL_PERSONAL_mean_-2,URL_PERSONAL_min_-2,URL_PERSONAL_max_-2,STREET_ADDRESS_mean,STREET_ADDRESS_min,STREET_ADDRESS_max,STREET_ADDRESS_mean_1,STREET_ADDRESS_min_1,STREET_ADDRESS_max_1,STREET_ADD
RESS_mean_-1,STREET_ADDRESS_min_-1,STREET_ADDRESS_max_-1,STREET_ADDRESS_mean_2,STREET_ADDRESS_min_2,STREET_ADDRESS_max_2,STREET_ADDRESS_mean_-2,STREET_ADDRESS_min_-2,STREET_ADDRESS_max_-2,O_mean,O_min,O_max,O_mean_1,O_min_1,O_max_1,O_mean_-1,O_min_-1,O_max_-1,O_mean_2,O_min_2,O_max_2,O_mean_-2,O_min_-2,O_max_-2,token_size,token_as_int
0,16,0,Reporting,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,1.0,0.9999,1.0,,,,1.0,1.0,1.0,,,,1.0,1.0,1.0,9,0
1,16,1,process,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0001,0.0,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.9999,1.0,1.0,1.0,1.0,,,,0.9999,0.9997,1.0,7,0
2,16,2,|,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0,0.0001,0.0,0.0,0.0,0.9999,0.9999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9999,0.9997,1.0,1.0,0.9999,1.0,0.0001,0.0,0.0001,1,0
3,16,3,by,7,0.9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0001,0.0,0.0001,0.0,0.0,0.0,0.0,0.9997,0.0001,0.0,0.0001,0.0,0.0,0.0,0.9999,0.9999,1.0,0.0,0.0,0.0,0.9999,0.9999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999,0.9997,1.0,1.0,1.0,1.0,0.0001,0.0,0.0001,1.0,1.0,1.0,0.0001,0.0,0.0001,2,0
4,16,4,Gilberto,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9999,0.9999,1.0,0.0001,0.0,0.0001,0.9999,0.9999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0,0.0001,0.9999,0.9997,1.0,0.0001,0.0,0.0001,1.0,1.0,1.0,1.0,1.0,1.0,8,0


# Modeling

In [27]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import operator
import typing as tp
lgb.__version__

'3.3.5'

In [28]:
import pickle

#==============================================================================================
def save_pickle(name,var):
    """Serialise ``var`` with pickle to the file ``name + '.pkl'``.

    Parameters
    ----------
    name : str
        Output path without the ``.pkl`` extension (it is appended here).
    var : object
        Any picklable object.
    """
    target_file = name+'.pkl'
    with open(target_file,'wb') as handle:
        # The context manager closes the handle on exit; no explicit close() is needed.
        pickle.dump(var,handle)

In [29]:
#==============================================================================================
def lgbm_features_importance(clf,features,n=15,size=(15,12)):
    """Draw a horizontal bar chart of the ``n`` most important features.

    Parameters
    ----------
    clf : object exposing ``feature_importance()``
        Trained booster; its importances are paired with ``features`` by position.
    features : list of str
        Feature names, in the order used for training.
    n : int
        Number of top-ranked features to plot.
    size : tuple
        Matplotlib figure size.
    """
    importance_table = pd.DataFrame(
        {"Feature": features, "importance": clf.feature_importance()}
    )
    top_features = importance_table.sort_values("importance", ascending=False).head(n)

    plt.figure(figsize=size)
    sns.barplot(data=top_features, x="importance", y="Feature")
    plt.title('Features importance ')
    plt.tight_layout()
    plt.show()
    
    
#==============================================================================================
def fit_lgbm(param,
             dtrain,
             dval,
             features,
             categoricals,
             target,
             fold = 0,
             es = 100,
             vb_eval = 10,
             n_iter = 5_000_000,
             min_lr = 1e-3,
             checkpoints_path = None,
            ):
    """Train one LightGBM model on a train/validation split and return validation predictions.

    Parameters
    ----------
    param : dict
        LightGBM parameters; must contain ``"learning_rate"`` (start of the decay schedule).
    dtrain, dval : pandas.DataFrame
        Training and validation rows containing ``features`` and ``target``.
    features : list of str
        Feature column names.
    categoricals : list
        Passed to ``lgb.Dataset`` as ``categorical_feature``.
    target : str
        Label column name.
    fold : int
        Fold id; stored in the checkpoint and used in its file name.
    es : int or None
        Early-stopping patience in rounds; ``None`` or ``<= 0`` disables early stopping.
    vb_eval : int
        Log the evaluation metrics every ``vb_eval`` rounds.
    n_iter : int
        Maximum number of boosting rounds.
    min_lr : float
        Lower bound of the decayed learning rate.
    checkpoints_path : path-like, optional
        Directory in which ``fold_<fold>.pkl`` is written. When omitted, the
        notebook-level ``args.checkpoints_path`` is used, which keeps older
        call sites working.

    Returns
    -------
    numpy.ndarray
        Predictions of the best iteration for ``dval[features]``.

    Side effects
    ------------
    Prints ``param``, shows the feature-importance plot and pickles
    ``{"clf": booster, "fold": fold}`` to ``<checkpoints_path>/fold_<fold>.pkl``.
    """
    print(param)

    if checkpoints_path is None:
        # Backward-compatible fallback to the notebook's global configuration.
        checkpoints_path = args.checkpoints_path

    trn_data=lgb.Dataset(dtrain[features], label=dtrain[target],categorical_feature=categoricals)
    val_data=lgb.Dataset(dval[features], label=dval[target],categorical_feature=categoricals)

    def decayed_learning_rate(iteration):
        # Exponential decay (factor 0.99 per round) floored at min_lr.
        return max(param["learning_rate"] * (0.99 ** iteration), min_lr)

    # Callbacks replace the verbose_eval / early_stopping_rounds / learning_rates
    # keyword arguments, which LightGBM 3.3 deprecates and 4.0 removes; on 3.3.x
    # lgb.train builds exactly these callbacks from those keywords.
    callbacks = [
        lgb.log_evaluation(vb_eval),
        lgb.reset_parameter(learning_rate=decayed_learning_rate),
    ]
    if es is not None and es > 0:
        callbacks.append(lgb.early_stopping(es, verbose=bool(vb_eval)))

    clf=lgb.train(params = param, train_set=trn_data,
                        num_boost_round=n_iter,
                        valid_sets = [trn_data,val_data],
                        callbacks = callbacks,
                 )

    oof = clf.predict(dval[features],num_iteration=clf.best_iteration)

    lgbm_features_importance(clf,features,n=30,size=(15,12))

    dico = {
        "clf":clf,
        "fold":fold
       }

    save_pickle(str(Path(checkpoints_path)/f'fold_{fold}'),dico)
    return oof

#     save_pickle(str(name),clf)
    
    
#==============================================================================================        
def kfold(args,df):
    """Train one LightGBM model per selected fold.

    The number of folds ``k`` is the number of distinct values in
    ``df[args.kfold_name]`` other than ``-1``. For each fold id ``i`` in
    ``range(k)`` that is listed in ``args.selected_folds``, rows of fold ``i``
    form the validation set and all rows whose fold id is neither ``i`` nor
    ``-1`` form the training set.

    Parameters
    ----------
    args : namespace-like
        Must expose ``kfold_name``, ``selected_folds``, ``param``, ``features``,
        ``categoricals``, ``target``, ``es``, ``vb_eval``, ``n_iter`` and ``min_lr``.
    df : pandas.DataFrame
        Frame holding the fold column, the features and the target.

    Returns
    -------
    numpy.ndarray or None
        Validation predictions of the LAST fold that was trained, or ``None``
        when no fold in ``args.selected_folds`` matched (previously this case
        raised ``UnboundLocalError``). Predictions of earlier folds are not
        accumulated.
    """
    fold_ids = df[args.kfold_name]
    k = len(set(fold_ids.unique().tolist()) - {-1})

    # Defined up front so the function still returns when nothing is trained.
    oof = None

    print(f"----------- {args.kfold_name} ---------")
    for i in range(k):
        if i not in args.selected_folds:
            continue

        print(f"\n-------------   Fold {i+1} / {k}  -------------\n")

        # Rows tagged -1 are kept out of both the training and validation sets.
        train_df = df[~fold_ids.isin([i,-1])].reset_index(drop=True)
        valid_df = df[fold_ids==i].reset_index(drop=True)

        print(f"Train ({len(train_df)}) Validation ({len(valid_df)})")
        oof = fit_lgbm(args.param,
                    train_df,
                    valid_df,
                    features = args.features,
                    categoricals = args.categoricals,
                    target = args.target,
                    fold = i,
                    es = args.es,
                    vb_eval = args.vb_eval,
                    n_iter = args.n_iter,
                    min_lr = args.min_lr)
    return oof

In [33]:
# Inventory of all columns, used to choose the model inputs below.
df.columns.tolist()

['document',
 'token',
 'tokens',
 'label_pred',
 'score',
 'NAME_STUDENT',
 'EMAIL',
 'USERNAME',
 'ID_NUM',
 'PHONE_NUM',
 'URL_PERSONAL',
 'STREET_ADDRESS',
 'O',
 'label',
 'fold_msk_5_seed_42',
 'NAME_STUDENT_dv3l',
 'EMAIL_dv3l',
 'USERNAME_dv3l',
 'ID_NUM_dv3l',
 'PHONE_NUM_dv3l',
 'URL_PERSONAL_dv3l',
 'STREET_ADDRESS_dv3l',
 'O_dv3l',
 'NAME_STUDENT_dv3l_rep05',
 'EMAIL_dv3l_rep05',
 'USERNAME_dv3l_rep05',
 'ID_NUM_dv3l_rep05',
 'PHONE_NUM_dv3l_rep05',
 'URL_PERSONAL_dv3l_rep05',
 'STREET_ADDRESS_dv3l_rep05',
 'O_dv3l_rep05',
 'NAME_STUDENT_mean',
 'NAME_STUDENT_min',
 'NAME_STUDENT_max',
 'NAME_STUDENT_mean_1',
 'NAME_STUDENT_min_1',
 'NAME_STUDENT_max_1',
 'NAME_STUDENT_mean_-1',
 'NAME_STUDENT_min_-1',
 'NAME_STUDENT_max_-1',
 'NAME_STUDENT_mean_2',
 'NAME_STUDENT_min_2',
 'NAME_STUDENT_max_2',
 'NAME_STUDENT_mean_-2',
 'NAME_STUDENT_min_-2',
 'NAME_STUDENT_max_-2',
 'EMAIL_mean',
 'EMAIL_min',
 'EMAIL_max',
 'EMAIL_mean_1',
 'EMAIL_min_1',
 'EMAIL_max_1',
 'EMAIL_mean_-1',

In [34]:
# Model inputs: for every label, the ensemble statistics over the three runs
# followed by the same statistics at each neighbouring-token offset. The names
# are generated with the scheme add_features uses, so the list cannot drift out
# of sync with the engineered columns; the order matches the previous
# hand-written list (label -> mean/min/max -> lags 1, -1, 2, -2).
FEATURE_STATS = ("mean", "min", "max")
FEATURE_LAGS = (1, -1, 2, -2)

features = [
    feature_name
    for lab in LABEL2TYPE
    for feature_name in (
        [f"{lab}_{stat}" for stat in FEATURE_STATS]
        + [f"{lab}_{stat}_{lag}" for lag in FEATURE_LAGS for stat in FEATURE_STATS]
    )
]
len(features)

120

In [40]:
# Class balance of the target: label 7 accounts for almost every token (see the counts below).
df['label'].value_counts()

7    4989794
0       2461
5        111
3         79
1         39
6         22
4         21
2          6
Name: label, dtype: int64

In [43]:
class args:
    """Run configuration read by ``kfold`` and ``fit_lgbm``; the class itself is passed around as a namespace."""
    kfold_name = "fold_msk_5_seed_42"  # column of df holding each row's fold id
    target = "label"  # column to predict
    features = features  # the 120 feature names defined above
    categoricals = []  # forwarded to lgb.Dataset as categorical_feature
    selected_folds = [0]  # only these fold ids are trained by kfold
    es = 500  # early-stopping patience, in boosting rounds
    vb_eval = 200  # log evaluation metrics every 200 rounds
    n_iter = 20_000  # maximum number of boosting rounds
    min_lr = 1e-3  # floor of the decayed learning rate in fit_lgbm
    # LightGBM parameters; commented-out entries are disabled experiments.
    param = {
#                 'bagging_freq': 1,
#                 'bagging_fraction': 0.1,
                'boost_from_average':'true',
                'boost': 'gbdt',
                'feature_fraction': 0.2,
                'learning_rate': 5e-2,  # starting value; fit_lgbm decays it by 0.99 per round
#                 "min_data_per_leaf"
            #     "max_bins":60,
#                 'num_threads':20,
#                 'cat_smooth':500,
#                 'max_depth': 50,
            #     'cat_l2':10,
                'min_data_in_leaf': 6,
#                 'num_leaves': 132,
#                 'feature_name': 'auto', # that's actually the default
#                 'categorical_feature': 'auto',
#                 "lambda_l1":0.3,
#                 "lambda_l2":1.2,
                'objective': 'multiclass',
                "num_class":8,  # one class per entry of LABEL2TYPE
#                 "n_jobs": -1,
#                 'metric':'mse',
                'verbosity': -1}
# Directory where fit_lgbm pickles each fold's model; created if it does not exist.
args.checkpoints_path = (CHECKPOINT_PATH/args.kfold_name/"Lgb_test")
args.checkpoints_path.mkdir(parents=True,exist_ok=True)

In [None]:
# Train the LightGBM stacker on the selected fold(s); `oof` receives the validation predictions returned by kfold.
oof = kfold(args,df)

----------- fold_msk_5_seed_42 ---------

-------------   Fold 1 / 5  -------------

Train (3995201) Validation (997332)
{'boost_from_average': 'true', 'boost': 'gbdt', 'feature_fraction': 0.2, 'learning_rate': 0.05, 'min_data_in_leaf': 6, 'objective': 'multiclass', 'num_class': 8, 'verbosity': -1}




Training until validation scores don't improve for 500 rounds
