# LightGBM Model

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import numpy as np
import os
import pandas as pd
import pickle

from tqdm import tqdm
import time

from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb

%matplotlib inline

from utils import write_train_file, create_submission_file

In [3]:
data_dir = "/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation"
pkl_file = os.path.join(data_dir, "lgbm_6m.pkl")

prepare_data = False
inp_seq_len, tgt_seq_len = 12, 12

In [5]:
if prepare_data:
    df_prod = pd.read_csv(os.path.join(data_dir, "articles.csv"), dtype={'article_id': str, 'product_code': str})
    # df_prod = df_prod.set_index('article_id').T.to_dict()
    # print(f"Total {len(df_prod)} products")
    
    df_tr = pd.read_csv(os.path.join(data_dir, "transactions_train.csv"), dtype={'article_id': str})
    df_tr['t_dat'] = pd.to_datetime(df_tr['t_dat'])

    print(f"Total {len(df_tr['customer_id'].unique())} customers and {len(df_tr['article_id'].unique())} products")
    print(df_tr['t_dat'].min(), df_tr['t_dat'].max())


Select a shorter duration before the training cutoff

In [45]:
include_meta = [
        "product_type_name",
        "product_group_name",
        "graphical_appearance_name",
        "colour_group_name",
        "department_name",
        "index_name",
        "index_group_name",
        "section_name",
        "garment_group_name",
    ]

# df_short = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-08-31')].copy()  # last 3 weeks
df_short = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-03-22')].copy()  # last 6 months
df_short.reset_index(drop=True, inplace=True)
print(f"Total {len(df_short['customer_id'].unique())} customers and {len(df_short['article_id'].unique())} products")

# temp_dict = df_prod[['article_id'] + include_meta].set_index('article_id').T.to_dict('list')
# df_short["meta"] = df_short["article_id"].map(temp_dict)
# df_short[['meta'+str(ii+1) for ii in range(len(include_meta))]] = pd.DataFrame(df_short.meta.tolist(), index= df_short.index)
# df_short.drop(['meta'], axis=1, inplace=True)
# df_short.head()

Total 748053 customers and 51478 products


In [46]:
dfg = df_short.groupby("customer_id")["article_id"].apply(' '.join).reset_index()
dfg['length'] = dfg['article_id'].apply(lambda x: len(x.split()))
dfg = dfg[dfg['length'] >= tgt_seq_len+1].reset_index(drop=True)
dfg['input'] = dfg['article_id'].apply(lambda x: ' '.join(x.split()[-tgt_seq_len-inp_seq_len:-tgt_seq_len]))
dfg['target'] = dfg['article_id'].apply(lambda x: ' '.join(x.split()[-tgt_seq_len:]))

# create inputs and targets
dfg[['S'+str(ii+1) for ii in range(inp_seq_len)]] = dfg['input'].str.split(' ', tgt_seq_len, expand=True)
dfg[['Y'+str(ii+1) for ii in range(tgt_seq_len)]] = dfg['target'].str.split(' ', tgt_seq_len, expand=True)
dfg.drop(['input', 'target'], axis=1, inplace=True)
dfg

Unnamed: 0,customer_id,article_id,length,S1,S2,S3,S4,S5,S6,S7,...,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0599580055 0599580055 0811835004 0811835004 08...,15,0599580055,0599580055,0811835004,,,,,...,0811835004,0723529001,0559630026,0599580083,0811927004,0811927004,0811925005,0811925005,0351484002,0826211002
1,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,0562245099 0797892001 0554477035 0751471023 05...,51,0706016001,0881244001,0903326005,0903326005,0904736002,0904736002,0903428001,...,0836997006,0836997006,0730863038,0730863038,0562245099,0516859008,0921226007,0889652001,0797892001,0568597007
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0839402001 0715411001 0751471035 0719629016 06...,16,0839402001,0715411001,0751471035,0719629016,,,,...,0837686001,0782616019,0857448005,0807244012,0807241026,0807241026,0706016038,0914441005,0706016015,0778476005
3,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0841383002 0749699001 0889714001 0832732003 08...,18,0841383002,0749699001,0889714001,0832732003,0864716001,0832482001,,...,0895418003,0875469002,0835851001,0870345002,0902325002,0926847002,0909924004,0917434002,0946748003,0722803001
4,0001076e215991bad544dd3e7312f78d9f576a1cc3ddc4...,0822355002 0841173002 0854356001 0492897001 08...,13,0822355002,,,,,,,...,0492897001,0808698004,0810838010,0854356006,0878733003,0701472001,0658298001,0843373001,0843373003,0658298007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202888,fffcd556af797bddc25d6d56600b6e298a19b90624a8ee...,0867023001 0806388002 0806388001 0806388003 08...,19,0867023001,0806388002,0806388001,0806388003,0812207002,0827968001,0827968004,...,0806388001,0806388001,0715624013,0715624013,0827370001,0827370001,0827968001,0806388001,0715624013,0827370001
202889,fffe7116f9f68e8ad287fd7b6e33aad4871d7080e77d2d...,0832453003 0887949004 0832453001 0808624005 08...,31,0773170009,0818029003,0717490064,0860322001,0620197056,0717490057,0620197057,...,0860322004,0871581001,0859400007,0682236023,0875287001,0841298005,0855081003,0824490001,0896169002,0896152003
202890,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0837941001 0591466033 0702932003 0832362002 05...,59,0905811002,0876411001,0932243002,0717490071,0649439006,0895610005,0895610005,...,0873276003,0905365002,0905365002,0863646005,0748269009,0881919001,0803757023,0748269009,0881919001,0898573003
202891,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0826955010 0826955010 0797565001 0822344003 05...,23,0826955010,0826955010,0797565001,0822344003,0557599022,0253448003,0840567001,...,0756320028,0822344001,0684209019,0822344010,0713997002,0720125039,0740922009,0791587007,0804992033,0557599022


In [19]:
dfg['length'].describe()

count    202893.000000
mean         27.076198
std          19.620027
min          13.000000
25%          16.000000
50%          21.000000
75%          31.000000
max         550.000000
Name: length, dtype: float64

In [52]:
for kk, mname in enumerate(include_meta):
    temp_dict = {row['article_id']: row[mname] for ii, row in df_prod[['article_id', mname]].iterrows()}
    for col in range(1, inp_seq_len+1):
        dfg["M"+str(kk+1)+str(col)] = dfg["S"+str(col)].map(temp_dict)
dfg

  after removing the cwd from sys.path.


Unnamed: 0,customer_id,article_id,length,S1,S2,S3,S4,S5,S6,S7,...,M93,M94,M95,M96,M97,M98,M99,M910,M911,M912
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0599580055 0599580055 0811835004 0811835004 08...,15,0599580055,0599580055,0811835004,,,,,...,Swimwear,,,,,,,,,
1,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,0562245099 0797892001 0554477035 0751471023 05...,51,0706016001,0881244001,0903326005,0903326005,0904736002,0904736002,0903428001,...,Jersey Fancy,Jersey Fancy,Accessories,Accessories,Jersey Fancy,Jersey Fancy,Shirts,Shirts,Trousers Denim,Trousers Denim
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0839402001 0715411001 0751471035 0719629016 06...,16,0839402001,0715411001,0751471035,0719629016,,,,...,Trousers,Swimwear,,,,,,,,
3,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0841383002 0749699001 0889714001 0832732003 08...,18,0841383002,0749699001,0889714001,0832732003,0864716001,0832482001,,...,Shoes,Jersey Fancy,Swimwear,Jersey Fancy,,,,,,
4,0001076e215991bad544dd3e7312f78d9f576a1cc3ddc4...,0822355002 0841173002 0854356001 0492897001 08...,13,0822355002,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202888,fffcd556af797bddc25d6d56600b6e298a19b90624a8ee...,0867023001 0806388002 0806388001 0806388003 08...,19,0867023001,0806388002,0806388001,0806388003,0812207002,0827968001,0827968004,...,Jersey Basic,Jersey Basic,Trousers Denim,Jersey Basic,Jersey Basic,,,,,
202889,fffe7116f9f68e8ad287fd7b6e33aad4871d7080e77d2d...,0832453003 0887949004 0832453001 0808624005 08...,31,0773170009,0818029003,0717490064,0860322001,0620197056,0717490057,0620197057,...,Jersey Basic,Jersey Fancy,Jersey Basic,Jersey Basic,Jersey Basic,Jersey Basic,Trousers,Trousers,Shoes,Blouses
202890,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0837941001 0591466033 0702932003 0832362002 05...,59,0905811002,0876411001,0932243002,0717490071,0649439006,0895610005,0895610005,...,Jersey Fancy,Jersey Basic,Jersey Fancy,Trousers,Trousers,"Under-, Nightwear",Blouses,"Under-, Nightwear","Under-, Nightwear",Jersey Basic
202891,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0826955010 0826955010 0797565001 0822344003 05...,23,0826955010,0826955010,0797565001,0822344003,0557599022,0253448003,0840567001,...,"Under-, Nightwear","Under-, Nightwear","Under-, Nightwear","Under-, Nightwear","Under-, Nightwear",Swimwear,Swimwear,Swimwear,Swimwear,


In [57]:
with open(pkl_file, "wb") as fw:
    pickle.dump(dfg, fw)


### Load Prepared Dataset

In [4]:
with open(pkl_file, "rb") as fr:
    data_df = pickle.load(fr)

data_df.drop(['customer_id', 'article_id'], axis=1, inplace=True)
data_df.head()

Unnamed: 0,length,S1,S2,S3,S4,S5,S6,S7,S8,S9,...,M93,M94,M95,M96,M97,M98,M99,M910,M911,M912
0,15,599580055,599580055.0,811835004.0,,,,,,,...,Swimwear,,,,,,,,,
1,51,706016001,881244001.0,903326005.0,903326005.0,904736002.0,904736002.0,903428001.0,903428001.0,695324013.0,...,Jersey Fancy,Jersey Fancy,Accessories,Accessories,Jersey Fancy,Jersey Fancy,Shirts,Shirts,Trousers Denim,Trousers Denim
2,16,839402001,715411001.0,751471035.0,719629016.0,,,,,,...,Trousers,Swimwear,,,,,,,,
3,18,841383002,749699001.0,889714001.0,832732003.0,864716001.0,832482001.0,,,,...,Shoes,Jersey Fancy,Swimwear,Jersey Fancy,,,,,,
4,13,822355002,,,,,,,,,...,,,,,,,,,,


In [30]:
class Objective(object):
    def __init__(self, 
                 df_train,
                 df_valid,
                 categoricals,
                 fixed_params,
                 target_col="target",
                 num_classes=None,
                 param_set={}, verbose_eval=50):
        self.categoricals = categoricals
        self.fixed_params = fixed_params
        self.param_set = param_set
        self.verbose_eval = verbose_eval
        self.target_col = target_col
        self.num_classes = num_classes
        self.dtrain = lgb.Dataset(
            df_train.drop([self.target_col], axis=1),
            label = df_train[self.target_col],
            categorical_feature=self.categoricals,
            free_raw_data=False
        )
        self.dvalid = lgb.Dataset(
            df_valid.drop([self.target_col], axis=1),
            label = df_valid[self.target_col],
            categorical_feature=self.categoricals,
            reference=self.dtrain,
            free_raw_data=False
        )
        self.default_ranges = {
            "num_leaves":(2, 256),
            "min_data_in_leaf":(5, 100),
            "learning_rate":(1e-3, 1e-1),
            "feature_fraction":(0.4, 1.0),
            "bagging_freq":(1, 7),
            "bagging_fraction":(0.4, 1.0)
        }
        
    def get_params(self, trial):
        param_funcs = {
            "num_leaves":trial.suggest_int,
            "min_data_in_leaf":trial.suggest_int,
            "learning_rate":trial.suggest_loguniform,
            "feature_fraction":trial.suggest_float,
            "bagging_freq":trial.suggest_int,
            "bagging_fraction":trial.suggest_float
        }
        params = {}
        for param, rng in self.param_set.items():
            if rng is None:
                default_rng = self.default_ranges[param]
                params[param] = param_funcs[param](param, default_rng[0], default_rng[1])
            else:
                params[param] = param_funcs[param](param, rng[0], rng[1])

        params.update(self.fixed_params)
        return params
    
    def __call__(self, trial):
        params = self.get_params(trial)
        bst = lgb.train(
            params,
            self.dtrain,
            valid_sets=[self.dvalid],
            verbose_eval=self.verbose_eval,
#             num_classes=self.num_classes,
        )
        # get best value of objective
        valid_0 = bst.best_score['valid_0']
        score = valid_0[list(valid_0)[0]]
        
        trial.set_user_attr('best_iteration', bst.best_iteration)
        trial.set_user_attr('features', self.dtrain.feature_name)
        trial.set_user_attr('importance', bst.feature_importance().tolist())
        
        return score

In [31]:
class EarlyStoppingExceeded(optuna.exceptions.OptunaError):
    pass

class EarlyStoppingCallback(object):
    # from https://github.com/optuna/optuna/issues/1001#issuecomment-596478792
    
    def __init__(self, early_stopping_rounds, min_delta):
        self.early_stopping_rounds = early_stopping_rounds
        self.min_delta = min_delta
        self.early_stopping_count = 0
        self.best_score = None
    
    def __call__(self, study, trial):
        if self.best_score == None:
            self.best_score = study.best_value

        if study.best_value < self.best_score - self.min_delta:
            self.best_score = study.best_value
            self.early_stopping_count = 0
        else:
            if self.early_stopping_count > self.early_stopping_rounds:
                self.early_stopping_count = 0
                best_score = None
                raise EarlyStoppingExceeded()
            else:
                self.early_stopping_count += 1
        return
    

def tune_model(df_train, 
               df_valid,
               categoricals,
               fixed_params,
               param_set,
               target_col="target",
               num_classes=None,
               n_trials=50,
               verbose_eval=50,
               show_progress=True,
               early_stop_callback=None,
               tpe_mode="independent"):
    multivariate_flag = True if tpe_mode == "multivariate" else False
    sampler = optuna.samplers.TPESampler(multivariate=multivariate_flag)
    study = optuna.create_study(sampler=sampler)
    callbacks = None
    if early_stop_callback is not None:
        callbacks = [early_stop_callback]
    else:
        callbacks = []
    try:
        study.optimize(
            Objective(
                df_train=df_train,
                df_valid=df_valid,
                categoricals=categoricals,
                fixed_params=fixed_params,
                param_set=param_set,
                target_col=target_col,
                num_classes=num_classes,
                verbose_eval=verbose_eval
            ),
            n_trials=n_trials,
            show_progress_bar=show_progress,
            callbacks=callbacks
        )
    except EarlyStoppingExceeded:
        print(f'EarlyStopping Exceeded: No new best scores on iters {early_stop_callback.early_stopping_rounds}')
    return study


In [51]:
def build_single_model(df, **kwargs):
    
    eval_days = kwargs.get("eval_days", 30)
    obj_func = kwargs.get("obj_func", 'multiclass')
    num_rounds = kwargs.get("num_rounds", 1000)
    early_stopping_rounds = kwargs.get("early_stopping_rounds", 50)
    model_type = kwargs.get("model_type", 'single')
    target_col = kwargs.get("target_col", "target")

    res_df_train = df[[target_col]].rename(columns={target_col:"actuals"})
    categoricals = [col for col in df.columns if col not in ["length", target_col]]
    num_classes = len(df[target_col].unique())
    for cat in categoricals:
        df[cat] = df[cat].astype('category')
    
    df[target_col] = df[target_col].astype('category')
    df_train, df_val = train_test_split(df, test_size=0.20, random_state=42)
    
    print(categoricals, num_classes)
    print(df_train.shape, df_val.shape)

#     params = {
#         "objective":obj_func,
#         "metric":['auc_mu', 'multi_logloss'],
#         "num_rounds":num_rounds,
#         "num_class": num_classes,
#         "early_stopping_rounds":early_stopping_rounds,
#         "first_metric_only":True,
#         "force_row_wise":True,
#         "feature_pre_filter":False,
#         "verbose":-1,
#     }
    
#     dtrain = lgb.Dataset(
#             df.drop(columns=[target_col]),
#             label=df[target_col],
#             categorical_feature=categoricals
#         )
#     bst = lgb.train(params, dtrain, verbose_eval=0)
#     sys.exit("KK")
    
    print("Tune hyperparameters...")
    param_set = {
        "num_leaves":None, 
        "min_data_in_leaf":None, 
        "learning_rate":None, 
        "feature_fraction":None,
        "bagging_freq":None, 
        "bagging_fraction":None
    }

    fixed_params = {
        "objective":obj_func,
        "metric":['auc_mu', 'multi_logloss'],
        "num_rounds":num_rounds,
        "num_class": num_classes,
        "early_stopping_rounds":early_stopping_rounds,
        "first_metric_only":True,
        "force_row_wise":True,
        "feature_pre_filter":False,
        "verbose":-1,
    }

    early_stopping = EarlyStoppingCallback(10, 0.001)

    study = tune_model(
                        df_train,
                        df_val,
                        categoricals,
                        fixed_params,
                        param_set,
                        target_col,
                        num_classes,
                        n_trials=100, verbose_eval=0,
                        show_progress=False, early_stop_callback=early_stopping,
                    )

    print("Saving best model parameters...")
    best_params = {k: [v] for (k,v) in study.best_params.items()}
    print(best_params)
    num_rounds = study.best_trial.user_attrs["best_iteration"]

    fixed_params["num_rounds"] = num_rounds
    # fixed_params["early_stopping_rounds"] = 0
    params = study.best_params.copy()

    params.update(fixed_params)
    del params["early_stopping_rounds"] # = 0        

    # Now we can use train+valid data for retraining with the 
    # parameters learnt using only train
    dtrain = lgb.Dataset(
        df.drop(columns=[target_col]),
        label=df[target_col],
        categorical_feature=categoricals
    )
    bst = lgb.train(params, dtrain, verbose_eval=0)
    
    res_df_val = df_val[[target_col]]
    res_df_val = res_df_val.rename(columns={target_col:"actuals"})
    res_df_val["predictions"] = bst.predict(df_val.drop(columns=[target_col]))

#     res_df_test = df_test[[target_col, baseline_col]]
#     res_df_test = res_df_test.rename(columns={target_col:"actuals"})
#     res_df_test["predictions"] = bst.predict(df_test.drop(columns=[target_col]))
#     res_df_train["predictions"] = bst.predict(df.drop(columns=[target_col]))
    
    return res_df_train, res_df_val, bst


### Build Multiple Models

One model for each future step - total 12 models

In [33]:
def get_metrics(df):
    df_ = df.copy()
    metrics = (mae(df_["actuals"], df_["predictions"]),
               rmse(df_["actuals"], df_["predictions"]),
               nrmse(df_["actuals"], df_["predictions"]),
               mape(df_["actuals"], df_["predictions"]))
    print("prediction:", metrics)


In [34]:
target_cols = [col for col in data_df.columns if 'Y' in col]
target_cols

['Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9', 'Y10', 'Y11', 'Y12']

In [35]:
feature_cols = [col for col in data_df.columns if col not in target_cols]

In [52]:
res_dfs, models = [], []
for ii in range(tgt_seq_len):
    tgt = f"Y{ii+1}"
    cols = copy.deepcopy(feature_cols)
    cols.append(tgt)
    df_ii = data_df[cols].copy()
    res_df_train, res_df_val, model = build_single_model(df_ii, target_col=tgt, obj_func='multiclass') 
    print(f"Finished modeling for {tgt}")
    sys.exit()
    get_metrics(res_df_test, bsl, remove_nonop=False)
    res_dfs.append(res_df_test)
    models.append(model)

[32m[I 2022-02-18 11:18:57,941][0m A new study created in memory with name: no-name-7b5808ee-9f4a-4d03-ac3d-de970600c5bf[0m


['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12', 'M1', 'M11', 'M12', 'M13', 'M14', 'M15', 'M16', 'M17', 'M18', 'M19', 'M110', 'M111', 'M112', 'M21', 'M22', 'M23', 'M24', 'M25', 'M26', 'M27', 'M28', 'M29', 'M210', 'M211', 'M212', 'M31', 'M32', 'M33', 'M34', 'M35', 'M36', 'M37', 'M38', 'M39', 'M310', 'M311', 'M312', 'M41', 'M42', 'M43', 'M44', 'M45', 'M46', 'M47', 'M48', 'M49', 'M410', 'M411', 'M412', 'M51', 'M52', 'M53', 'M54', 'M55', 'M56', 'M57', 'M58', 'M59', 'M510', 'M511', 'M512', 'M61', 'M62', 'M63', 'M64', 'M65', 'M66', 'M67', 'M68', 'M69', 'M610', 'M611', 'M612', 'M71', 'M72', 'M73', 'M74', 'M75', 'M76', 'M77', 'M78', 'M79', 'M710', 'M711', 'M712', 'M81', 'M82', 'M83', 'M84', 'M85', 'M86', 'M87', 'M88', 'M89', 'M810', 'M811', 'M812', 'M91', 'M92', 'M93', 'M94', 'M95', 'M96', 'M97', 'M98', 'M99', 'M910', 'M911', 'M912'] 24126
(162314, 123) (40579, 123)
Tune hyperparameters...


[33m[W 2022-02-18 11:19:00,707][0m Trial 0 failed because of the following error: ValueError('Series.dtypes must be int, float or bool')[0m
Traceback (most recent call last):
  File "/anaconda/envs/py37_tensorflow/lib/python3.7/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-30-826d2a587277>", line 64, in __call__
    verbose_eval=self.verbose_eval,
  File "/anaconda/envs/py37_tensorflow/lib/python3.7/site-packages/lightgbm/engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "/anaconda/envs/py37_tensorflow/lib/python3.7/site-packages/lightgbm/basic.py", line 2605, in __init__
    train_set.construct()
  File "/anaconda/envs/py37_tensorflow/lib/python3.7/site-packages/lightgbm/basic.py", line 1819, in construct
    categorical_feature=self.categorical_feature, params=self.params)
  File "/anaconda/envs/py37_tensorflow/lib/python3.7/site-packages/lightgbm/basic.py", l

ValueError: Series.dtypes must be int, float or bool

In [None]:
from sklearn.preprocessing import LabelEncoder

df = df_ii[['S1',"length", "Y1"]].copy()
target_col = "Y1"
obj_func = "multiclass"
num_rounds = 1000
early_stopping_rounds = 50

categoricals = [col for col in df.columns if col not in ["length", target_col]]
num_classes = len(df[target_col].unique())
all_encoders = {}
for cat in categoricals:
    le = LabelEncoder()
    df[cat] = le.fit_transform(df[cat])
    df[cat] = df[cat].astype('category')
    all_encoders[cat] = le

le_tgt = LabelEncoder()
df[target_col] = le_tgt.fit_transform(df[target_col])
# df[target_col] = df[target_col].astype('category')

df_train, df_val = train_test_split(df, test_size=0.20, random_state=42)

print(categoricals, num_classes)
print(df_train.shape, df_val.shape)

params = {
    "objective":obj_func,
    "metric":['auc_mu', 'multi_logloss'],
    "num_rounds":num_rounds,
    "num_class": num_classes,
    "early_stopping_rounds":early_stopping_rounds,
    "first_metric_only":True,
    "force_row_wise":True,
    "feature_pre_filter":False,
    "verbose":-1,
}

dtrain = lgb.Dataset(
        df.drop(columns=[target_col]),
        label=df[target_col],
        categorical_feature=categoricals
    )

bst = lgb.train(params, dtrain, verbose_eval=0)


['S1'] 24126
(162314, 3) (40579, 3)




In [59]:
df

Unnamed: 0,S1,length,Y1
0,2176,15,11979
1,4953,51,21671
2,15622,16,2485
3,15816,18,23463
4,13668,13,15212
...,...,...,...
202888,19170,19,12042
202889,9135,31,15185
202890,22647,59,18737
202891,14275,23,13171


In [12]:
import math 

n1, c1 = 80000, 1600
n2, c2 = 80000, 1696

p1, p2 = c1/n1, c2/n2
q1, q2 = 1-p1, 1-p2
s1, s2 = math.sqrt(p1*(1-p1)/n1), math.sqrt(p2*(1-p2)/n2)
se_diff = math.sqrt(s1*s1 + s2*s2)
t_val = (p1-p2)/se_diff

In [13]:
t_val

-1.689668300098093