In [2]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb
import xgboost as xgb

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import display

pd.options.display.max_rows = 10000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = 1000

In [3]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

In [4]:
def read_data(na_values='?', target='income', drop_cols=['education-num']):
    train = pd.read_csv('data/kaggle/train.csv')
    train[target] = train[target].map({'<=50K':0, '>50K':1})
    y = train.pop(target)
    train.drop(drop_cols, axis='columns', inplace=True)
    print('train.shape:', train.shape)
    display(train.head())
    
    test = pd.read_csv('data/kaggle/test.csv')
    test.drop(drop_cols, axis='columns', inplace=True)
    print('test.shape:', test.shape)
    display(test.head())
    
    return train, test, y

In [5]:
train, test, y = read_data()

train.shape: (29305, 14)


Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,1,25,Private,219199,11th,Divorced,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States
1,2,39,Private,52978,Some-college,Divorced,Other-service,Not-in-family,White,Female,0,1721,55,United-States
2,3,35,Private,196899,Bachelors,Never-married,Handlers-cleaners,Not-in-family,Asian-Pac-Islander,Female,0,0,50,Haiti
3,4,64,Private,135527,Assoc-voc,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States
4,5,24,Private,60783,Some-college,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,70,United-States


test.shape: (19537, 14)


Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,29306,18,?,245274,Some-college,Never-married,?,Own-child,White,Male,0,0,16,United-States
1,29307,29,Private,83003,HS-grad,Married-civ-spouse,Other-service,Wife,White,Female,0,0,40,United-States
2,29308,45,Private,35136,Bachelors,Married-civ-spouse,Tech-support,Husband,Black,Male,0,0,40,United-States
3,29309,42,Self-emp-not-inc,64631,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States
4,29310,41,Private,195821,Doctorate,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,1902,40,United-States


In [6]:
train.isna().sum()

no                0
age               0
workclass         0
fnlwgt            0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [7]:
test.isna().sum()

no                0
age               0
workclass         0
fnlwgt            0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [8]:
features = [col for col in train.columns if col != 'id']
pd.concat([train,y], axis=1)[train[features].duplicated(keep=False)].sort_values(features)

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [9]:
features = [col for col in test.columns if col != 'id']
test[test[features].duplicated(keep=False)].sort_values(features)

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country


In [10]:
grouped_trn = pd.concat([train,y], axis=1).groupby(features)['income'].agg(['count','mean'])
grouped_trn.head()
grouped_trn_dup = grouped_trn[grouped_trn['count']>1].reset_index()
grouped_trn_dup

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,count,mean


In [11]:
crit = (train['age']==39)&(train['workclass']=='Private')&(train['fnlwgt']==138192)
pd.concat([train,y], axis=1)[crit]

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [12]:
train_dedup = train.drop_duplicates(features, keep='last')
y_dedup = y[train_dedup.index]
train_dedup.reset_index(drop=True, inplace=True)
y_dedup.reset_index(drop=True, inplace=True)
print(train_dedup.shape, y_dedup.shape)
train_dedup.head()

(29305, 14) (29305,)


Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,1,25,Private,219199,11th,Divorced,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States
1,2,39,Private,52978,Some-college,Divorced,Other-service,Not-in-family,White,Female,0,1721,55,United-States
2,3,35,Private,196899,Bachelors,Never-married,Handlers-cleaners,Not-in-family,Asian-Pac-Islander,Female,0,0,50,Haiti
3,4,64,Private,135527,Assoc-voc,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States
4,5,24,Private,60783,Some-college,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,70,United-States


In [13]:
crit = (train_dedup['age']==39)&(train_dedup['workclass']=='Private')&(train_dedup['fnlwgt']==138192)
pd.concat([train_dedup,y_dedup], axis=1)[crit]

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [14]:
def freq_encode_full(df1, df2, col, normalize=True):
    df = pd.concat([df1[col],df2[col]])
    vc = df.value_counts(dropna=False, normalize=normalize).to_dict()
    nm = col + '_FE_FULL'
    df1[nm] = df1[col].map(vc)
    df1[nm] = df1[nm].astype('float32')
    df2[nm] = df2[col].map(vc)
    df2[nm] = df2[nm].astype('float32')
    return nm

In [17]:
for df in [train_dedup,test]:
    df['workclass_occupation'] = df['workclass'] + '#' + df['occupation']
    df['workclass_education'] = df['workclass'] + '#' + df['education']
    df['occupation_education'] = df['occupation'] + '#' + df['education']
    df['marital_status_relationship'] = df['marital-status'] + '#' + df['relationship']
    df['race_sex'] = df['race'] + '#' + df['sex']
    #df['native_country_race'] = df['native_country'] + '#' + df['race']
    #df['workclass_na'] = df['workclass'].isna().astype(int)
    #df['occupation_na'] = df['occupation'].isna().astype(int)
    #df['native_country_na'] = df['native_country'].isna().astype(int)
    #df['na_cnt'] = df.isna().sum(axis=1)
    df['capital_margin'] = df['capital-gain'] - df['capital-loss']
    df['capital_total'] = df['capital-gain'] + df['capital-loss']
    df['capital_margin_flag'] = np.nan
    df.loc[df['capital_margin']==0, 'capital_margin_flag'] = 'zero'
    df.loc[df['capital_margin']>0, 'capital_margin_flag'] = 'positive'
    df.loc[df['capital_margin']<0, 'capital_margin_flag'] = 'negative'
    #df['fnlwgt_log'] = np.log1p(df['fnlwgt'])

In [15]:
cate_cols = []
# LABEL ENCODING
for col in train_dedup.columns:
    if (train_dedup[col].dtype.name == 'object' or test[col].dtype.name == 'object' \
        or train_dedup[col].nunique() < 300) and col != 'age':
        cate_cols.append(col)
        le = LabelEncoder()
        le.fit(list(train_dedup[col].values) + list(test[col].values))
        train_dedup[col] = le.transform(list(train_dedup[col].values))
        test[col] = le.transform(list(test[col].values))
        
print('categorical feature:', cate_cols)

categorical feature: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'workclass_occupation', 'workclass_education', 'occupation_education', 'marital_status_relationship', 'race_sex', 'capital_margin', 'capital_total', 'capital_margin_flag']


In [16]:
for col in [col for col in cate_cols if col not in ['capital_gain','capital_loss','race']]:
    freq_encode_full(train_dedup, test, col)

In [17]:
remove_features = ['id','income']
features = [col for col in list(train_dedup) if col not in remove_features]
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'workclass_occupation',
 'workclass_education',
 'occupation_education',
 'marital_status_relationship',
 'race_sex',
 'capital_margin',
 'capital_total',
 'capital_margin_flag',
 'workclass_FE_FULL',
 'education_FE_FULL',
 'marital_status_FE_FULL',
 'occupation_FE_FULL',
 'relationship_FE_FULL',
 'sex_FE_FULL',
 'hours_per_week_FE_FULL',
 'native_country_FE_FULL',
 'workclass_occupation_FE_FULL',
 'workclass_education_FE_FULL',
 'occupation_education_FE_FULL',
 'marital_status_relationship_FE_FULL',
 'race_sex_FE_FULL',
 'capital_margin_FE_FULL',
 'capital_total_FE_FULL',
 'capital_margin_flag_FE_FULL']

In [18]:
def make_cat_prediction(train, y, test, features, categorical_features=None, model_params=None, folds=5):
    skf = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)
        
        clf = CatBoostClassifier(**model_params)
        clf.fit(x_tr, y_tr, eval_set=(x_val, y_val),
                cat_features=categorical_features,
                use_best_model=True,
                verbose=True)

        feature_importance[f'fold_{fold+1}'] = clf.feature_importances_

        best_iteration = clf.best_iteration_
        y_pred_val = clf.predict_proba(x_val)[:,1]

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict_proba(x_test)[:,1] / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [19]:
def make_lgb_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=5):
    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) 
        return 'f1', f1_score(y_true, y_hat, average='binary'), True
    
    skf = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            categorical_feature=categorical_features,
            verbose_eval=200,
            early_stopping_rounds=100,
            feval=lgb_f1_score
        )

        feature_importance[f'fold_{fold+1}'] = clf.feature_importance()

        y_pred_val = clf.predict(x_val)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [20]:
def make_xgb_prediction(train, y, test, features, model_params=None, folds=5):
    def xgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) 
        return 'f1', 1-f1_score(y_true, y_hat, average='micro')
    
    skf = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dvalid = xgb.DMatrix(x_val, label=y_val)

        clf = xgb.train(
            model_params,
            dtrain,
            num_boost_round=10000, 
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            verbose_eval=200,
            early_stopping_rounds=100,
            feval=xgb_f1_score
        )

        #feature_importance[f'fold_{fold+1}'] = clf.get_score()

        y_pred_val = clf.predict(dvalid)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict(xgb.DMatrix(x_test)) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [21]:
# catboost model params
cat_params = {
    'n_estimators': 10000,
    'learning_rate': 0.07,
    #'eval_metric': 'TotalF1', # 'TotalF1'
    'eval_metric': 'F1',
    #'eval_metric': 'Logloss',
    'loss_function': 'Logloss',
    'random_seed': SEED,
    'metric_period': 100,
    'od_wait': 100,
    #'task_type': 'GPU',
    'depth': 6,
    #'bootstrap_type': 'Bayesian',
    'rsm': 0.8,
    #'colsample_bylevel': 0.7,
}

In [22]:
y_oof_cat, y_preds_cat, fi_cat = make_cat_prediction(train_dedup, y_dedup, test, features, \
                                                     categorical_features=cate_cols, model_params=cat_params)

Fold: 1
(20824, 37) (5206, 37)




0:	learn: 0.6258236	test: 0.6338999	best: 0.6338999 (0)	total: 131ms	remaining: 21m 47s
100:	learn: 0.7140000	test: 0.7137150	best: 0.7146028 (92)	total: 6.22s	remaining: 10m 10s
200:	learn: 0.7336541	test: 0.7238512	best: 0.7256792 (190)	total: 12.9s	remaining: 10m 28s
300:	learn: 0.7476108	test: 0.7221980	best: 0.7259194 (237)	total: 19s	remaining: 10m 12s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7259194396
bestIteration = 237

Shrink model to first 238 iterations.
Fold 1 | F1 Score: 0.8797541298501729
Fold: 2
(20824, 37) (5206, 37)




0:	learn: 0.6241860	test: 0.6138798	best: 0.6138798 (0)	total: 64.4ms	remaining: 10m 44s
100:	learn: 0.7151260	test: 0.7059859	best: 0.7078553 (87)	total: 6.36s	remaining: 10m 23s
200:	learn: 0.7335890	test: 0.7075306	best: 0.7120419 (154)	total: 12.6s	remaining: 10m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7120418848
bestIteration = 154

Shrink model to first 155 iterations.
Fold 2 | F1 Score: 0.8732232039953899
Fold: 3
(20824, 37) (5206, 37)




0:	learn: 0.6231109	test: 0.6231618	best: 0.6231618 (0)	total: 61.7ms	remaining: 10m 17s
100:	learn: 0.7173239	test: 0.7112861	best: 0.7128452 (97)	total: 6.38s	remaining: 10m 24s
200:	learn: 0.7402229	test: 0.7183038	best: 0.7186823 (182)	total: 12.7s	remaining: 10m 16s
300:	learn: 0.7519861	test: 0.7208602	best: 0.7208602 (285)	total: 19.2s	remaining: 10m 17s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7208602151
bestIteration = 285

Shrink model to first 286 iterations.
Fold 3 | F1 Score: 0.8753361505954668
Fold: 4
(20824, 37) (5206, 37)




0:	learn: 0.6285120	test: 0.6047170	best: 0.6047170 (0)	total: 56.9ms	remaining: 9m 29s
100:	learn: 0.7181015	test: 0.7090272	best: 0.7090272 (100)	total: 6.28s	remaining: 10m 15s
200:	learn: 0.7369794	test: 0.7195492	best: 0.7198612 (190)	total: 12.2s	remaining: 9m 55s
300:	learn: 0.7493232	test: 0.7260982	best: 0.7283685 (290)	total: 18.9s	remaining: 10m 9s
400:	learn: 0.7584640	test: 0.7263339	best: 0.7292294 (336)	total: 25.7s	remaining: 10m 14s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7292294447
bestIteration = 336

Shrink model to first 337 iterations.
Fold 4 | F1 Score: 0.8791778716865155
Fold: 5
(20824, 37) (5206, 37)




0:	learn: 0.6213953	test: 0.6395187	best: 0.6395187 (0)	total: 61.5ms	remaining: 10m 15s
100:	learn: 0.7158105	test: 0.7108326	best: 0.7108326 (100)	total: 6.3s	remaining: 10m 17s
200:	learn: 0.7346626	test: 0.7212389	best: 0.7224436 (180)	total: 12.4s	remaining: 10m 2s
300:	learn: 0.7520157	test: 0.7215134	best: 0.7238179 (204)	total: 18.9s	remaining: 10m 10s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7238179408
bestIteration = 204

Shrink model to first 205 iterations.
Fold 5 | F1 Score: 0.8799462159047253

Mean F1 score = 0.877487514406454
OOF F1 score = 0.8774875144064541


In [23]:
# lgb model params
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 63,
    'max_depth': -1,
    'max_bin': 255,
    'min_data_in_leaf': 40,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'n_estimators': 10000,
    'early_stopping_rounds': 100, 
    'seed': SEED,
    'verbose': -1,
    'n_jobs': -1,    
} 

In [24]:
y_oof_lgb, y_preds_lgb, fi_lgb = make_lgb_prediction(train_dedup, y_dedup, test, features, \
                                                     model_params=lgb_params)

Fold: 1
(20824, 37) (5206, 37)
Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.191258	training's f1: 0.824571	valid_1's binary_logloss: 0.278734	valid_1's f1: 0.722056
Early stopping, best iteration is:
[110]	training's binary_logloss: 0.226197	training's f1: 0.774698	valid_1's binary_logloss: 0.276755	valid_1's f1: 0.723974
Fold 1 | F1 Score: 0.8772570111409912
Fold: 2
(20824, 37) (5206, 37)
Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.189721	training's f1: 0.824448	valid_1's binary_logloss: 0.283718	valid_1's f1: 0.706339
Early stopping, best iteration is:
[149]	training's binary_logloss: 0.207675	training's f1: 0.79851	valid_1's binary_logloss: 0.28091	valid_1's f1: 0.710594
Fold 2 | F1 Score: 0.8709181713407607
Fold: 3
(20824, 37) (5206, 37)
Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.189693	training's f1: 0.823542	valid_1's binar

In [25]:
# xgb model params
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'max_depth': 6,
    #'colsample_bytree': 0.8,
    'subsample': 0.8,
    'disable_default_eval_metric': 1,
    #'eval_metric': 'logloss',
    'seed': SEED,
} 

In [26]:
y_oof_xgb, y_preds_xgb, fi_xgb = make_xgb_prediction(train_dedup, y_dedup, test, features, \
                                                     model_params=xgb_params)

Fold: 1
(20824, 37) (5206, 37)
[0]	train-f1:0.24212	valid-f1:0.24222
Multiple eval metrics have been passed: 'valid-f1' will be used for early stopping.

Will train until valid-f1 hasn't improved in 100 rounds.
Stopping. Best iteration:
[0]	train-f1:0.24212	valid-f1:0.24222

Fold 1 | F1 Score: 0.8784095274683058
Fold: 2
(20824, 37) (5206, 37)
[0]	train-f1:0.24212	valid-f1:0.24222
Multiple eval metrics have been passed: 'valid-f1' will be used for early stopping.

Will train until valid-f1 hasn't improved in 100 rounds.
Stopping. Best iteration:
[0]	train-f1:0.24212	valid-f1:0.24222

Fold 2 | F1 Score: 0.8693814829043411
Fold: 3
(20824, 37) (5206, 37)
[0]	train-f1:0.24212	valid-f1:0.24222
Multiple eval metrics have been passed: 'valid-f1' will be used for early stopping.

Will train until valid-f1 hasn't improved in 100 rounds.
Stopping. Best iteration:
[0]	train-f1:0.24212	valid-f1:0.24222

Fold 3 | F1 Score: 0.8728390318862851
Fold: 4
(20824, 37) (5206, 37)
[0]	train-f1:0.24217	valid-

In [27]:
cat_weight = 0.4
lgb_weight = 0.4
xgb_weight = 0.2

y_oof = cat_weight*y_oof_cat + lgb_weight*y_oof_lgb + xgb_weight*y_oof_xgb
y_preds = cat_weight*y_preds_cat + lgb_weight*y_preds_lgb + xgb_weight*y_preds_xgb

In [28]:
val_thres = 0.5
val_score = f1_score(y_dedup, np.where(y_oof>val_thres, 1, 0), average='micro')

print('val_score:', val_score)

val_score: 0.8765655013446023


In [29]:
submission = pd.read_csv('/kaggle/input/kakr-4th-competition/sample_submission.csv')
submission.prediction = np.where(y_preds > val_thres, 1, 0)
submission.head()

Unnamed: 0,id,prediction
0,0,0
1,1,1
2,2,0
3,3,1
4,4,1


In [30]:
submission.prediction.value_counts()

0    5238
1    1274
Name: prediction, dtype: int64

In [31]:
submission.to_csv('submission.csv', index=False)