In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
import time
import joblib
import itertools
from tqdm.auto import tqdm
import lightgbm as lgb
from itertools import combinations

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Static settings read by the functions of this notebook."""

    # Seed handed to seed_everything().
    seed = 42
    # Fold count; not referenced by any of the visible cells.
    n_folds = 5
    # Label column name; it is excluded from the feature list passed to the model.
    target = 'target'

    # Dataset locations. File names are appended with plain string
    # concatenation, so the trailing slash is required.
    input_dir = '../input/amex-compress/'
    input_dir_model = '../input/lgbm-split4-seed42/'

# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the global ``random`` and NumPy generators and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random.seed`` and ``np.random.seed``.

    Note:
        ``PYTHONHASHSEED`` is only read when an interpreter starts, so setting
        it here affects processes launched afterwards, not the hashing of the
        already running kernel.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
# ====================================================
# Read test only
# ====================================================
def read_test(x, splits):
    """Load the engineered test set and return section ``x`` out of ``splits``.

    The rows are partitioned into ``splits`` contiguous sections using the
    same sizing rule as ``np.array_split``: the first ``len(test) % splits``
    sections hold one extra row.

    Args:
        x: Section index. Negative values count from the end, like list
            indexing.
        splits: Number of sections (integer >= 1).

    Returns:
        An independent copy of the requested rows, keeping their original
        index labels.

    Raises:
        ValueError: If ``splits`` is smaller than 1.
        IndexError: If ``x`` is outside ``[-splits, splits)``.
    """
    if splits < 1:
        raise ValueError('number sections must be larger than 0.')
    if not -splits <= x < splits:
        raise IndexError(f'section index {x} out of range for {splits} splits')
    x %= splits

    test = pd.read_parquet(CFG.input_dir + 'test_fe_compress.parquet')

    # Compute only the requested section's bounds instead of building every
    # section just to keep one of them.
    base, extra = divmod(len(test), splits)
    start = x * base + min(x, extra)
    stop = start + base + (1 if x < extra else 0)

    # A plain slice would be a view that keeps the whole parent frame alive;
    # copying lets the full test set be released below and makes later
    # column assignment on the section safe.
    test_section = test.iloc[start:stop].copy()

    del test
    gc.collect()
    return test_section
# ====================================================
# Read model only
# ====================================================
def read_model(filename):
    """Load the model stored as ``filename`` under ``CFG.input_dir_model``.

    Args:
        filename: File name appended to ``CFG.input_dir_model``.

    Returns:
        Whatever object ``joblib.load`` restores from that path.

    NOTE(review): ``joblib.load`` unpickles the file, which can execute
    arbitrary code; only load model files from a trusted source.
    """
    model_path = CFG.input_dir_model + filename
    return joblib.load(model_path)
# ====================================================
#  Split predictions
# ====================================================
def test_split(test,model):
    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f"{cf}_last" for cf in cat_features]
    num_cols = list(test.dtypes[(test.dtypes == 'float32') | (test.dtypes == 'float64')].index)
    num_cols = [col for col in num_cols if 'last' in col]
    for col in num_cols:
        test[col + '_round2'] = test[col].round(2)
    num_cols = [col for col in test.columns if 'last' in col]
    num_cols = [col[:-5] for col in num_cols if 'round' not in col]
    for col in num_cols:
        try:
            test[f'{col}_last_mean_diff'] = test[f'{col}_last'] - test[f'{col}_mean']
        except:
            pass
        
    # Transform float64 and float32 to float16
    num_cols = list(test.dtypes[(test.dtypes == 'float32') | (test.dtypes == 'float64')].index)
    
    for col in num_cols:
        test[col] = test[col].astype(np.float16)
    features = [col for col in test.columns if col not in ['customer_ID', CFG.target]]
    
    test_pred = model.predict(test[features])
    
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_pred})
    test_df.set_index('customer_ID')
    
    del test, test_pred, cat_features, num_cols, features
    gc.collect()

    
    return test_df

    
    
# ====================================================
#  Helper
# ====================================================

def helper(splits, model_filenames=None):
    """Score the test set with every model and write the averaged predictions.

    For each model the test set is read and scored in ``splits`` sections
    (via ``read_test`` and ``test_split``) to limit memory use. Every
    customer is scored once per model, so each model's predictions are
    weighted by ``1 / number_of_models`` and summed per ``customer_ID``,
    which yields the mean prediction across models.

    Args:
        splits: Number of sections the test set is processed in (>= 1).
        model_filenames: Optional list of model file names passed to
            ``read_model``. Defaults to the four ``lgbm_split4_seed*`` models.

    Raises:
        ValueError: If ``splits`` is smaller than 1 or no model is given.

    Side effects:
        Writes ``./ensembled_pred_<YYYYmmdd-HHMMSS>.csv`` with the columns
        ``customer_ID`` and ``prediction``.
    """
    if model_filenames is None:
        model_filenames = ['lgbm_split4_seed42.pkl','lgbm_split4_seed39.pkl','lgbm_split4_seed420.pkl','lgbm_split4_seed73.pkl']
    if splits < 1:
        raise ValueError('splits must be at least 1')
    n_models = len(model_filenames)
    if n_models == 0:
        raise ValueError('model_filenames must not be empty')

    ensembled_pred_df = None

    for model_filename in model_filenames:
        model = read_model(model_filename)

        # Collect the section outputs and concatenate once per model instead
        # of growing a frame (seeded with an empty object-dtype frame) inside
        # the loop.
        split_outputs = []
        for j in range(splits):
            test = read_test(j, splits)
            print(f'Predicting split {j+1} of {splits} with model {model_filename}')
            split_outputs.append(test_split(test, model))
            del test
            gc.collect()

        del model
        gc.collect()

        model_pred_df = pd.concat(split_outputs, axis=0, ignore_index=True)
        del split_outputs

        # Weight by the number of models (not the number of sections): the
        # sections partition the rows, the models are what gets averaged.
        model_pred_df['prediction'] = model_pred_df['prediction'] / n_models

        # Fold this model into the running per-customer sum so that only one
        # model's predictions are held next to the accumulator at a time.
        frames = [model_pred_df] if ensembled_pred_df is None else [ensembled_pred_df, model_pred_df]
        ensembled_pred_df = (
            pd.concat(frames, axis=0, ignore_index=True)
            .groupby(by='customer_ID', as_index=False)
            .sum()
        )

        del model_pred_df, frames
        gc.collect()

    timestr = time.strftime("%Y%m%d-%H%M%S")
    ensembled_pred_df.to_csv(f'./ensembled_pred_{timestr}.csv', index = False)
    

In [4]:
# Fix the RNG seeds, then run the full inference with the test set processed in 5 sections.
seed_everything(CFG.seed)
helper(5)

In [None]:
# Scratch experiment: an empty 1-D array (zero elements).
x = np.zeros(0)

In [None]:
# Scratch check: shape of the empty array, i.e. (0,).
x.shape

In [None]:
# Scratch experiment: appending an empty array to itself yields another empty 1-D array.
y = np.append(x,x)

In [None]:
# Scratch check: shape of the appended result, i.e. (0,).
y.shape

In [2]:
# Toy per-model prediction frames (two customers each) for trying out the ensembling logic.
pred_df_1 = pd.DataFrame({'customer_ID': [123, 101], 'prediction': [0.5, 1]})
pred_df_2 = pd.DataFrame({'customer_ID': [123, 101], 'prediction': [1, 0]})
pred_df_3 = pd.DataFrame({'customer_ID': [123, 101], 'prediction': [1, 0]})

In [6]:
# Average the toy prediction frames per customer_ID.
# Each frame is weighted by 1 / number_of_frames on a copy (DataFrame.assign),
# so the input frames stay untouched and re-running this cell gives the same
# result; dividing their 'prediction' column in place would shrink it again
# on every run.
dfs = [pred_df_1,pred_df_2,pred_df_3]
ensembled_pred_df = (
    pd.concat(
        [df.assign(prediction=df['prediction'] / len(dfs)) for df in dfs],
        axis=0,
        ignore_index=True,
    )
    .groupby(by='customer_ID', as_index=False)
    .sum()
)

In [7]:
# Display the ensembled toy predictions as the cell output.
ensembled_pred_df