In [1]:
import warnings
# warnings.simplefilter('ignore')

import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
import polars as pl
from utils import *

In [2]:
# pd.set_option('display.max_column', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_seq_items', None)
# pd.set_option('display.max_colwidth', None) # 500
# pd.set_option('expand_frame_repr', True)

## Config

In [3]:
# Directory that holds the `<name>_feature` parquet files; the cells below read
# them from here and write their results back to the same paths.
root = '.'

## DataFrames size reduction and other functions

In [4]:
# https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    * NumPy integer columns are cast to the narrowest integer type that holds
      both their minimum and maximum: an unsigned type when every value is
      >= 0, a signed type otherwise. Bounds are inclusive, so a column whose
      maximum is exactly 255 becomes ``uint8``.
    * Float columns, and pandas nullable-integer columns, are cast to
      ``float32``. This is lossy for values that need float64 precision.
    * Every other column (strings/objects, booleans, datetimes, categoricals,
      ...) is left untouched. Casting those to float32 would either raise or
      make the column larger, which defeats the purpose of this helper.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object, returned for call chaining.
    """
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        dtype = props[col].dtype

        if isinstance(dtype, np.dtype) and dtype.kind in 'iu':
            mn = props[col].min()
            mx = props[col].max()
            if pd.isna(mn) or pd.isna(mx):
                # Empty column: there is no value range to fit, keep the dtype.
                continue

            # Compare as Python ints so the check cannot overflow or be
            # affected by NumPy scalar promotion rules.
            mn, mx = int(mn), int(mx)
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    props[col] = props[col].astype(candidate)
                    break

        elif pd.api.types.is_float_dtype(dtype) or pd.api.types.is_integer_dtype(dtype):
            # Floats of any width, plus nullable integers (which may hold NA
            # and therefore cannot become a plain NumPy integer).
            props[col] = props[col].astype(np.float32)

    return props

def pad_target(x, length=13):
    """Left-pad a sequence of values with zeros up to ``length`` entries.

    The values of ``x`` keep their order and occupy the *last* ``len(x)``
    positions of the result; the leading positions are filled with 0.0
    (not NaN).

    Parameters
    ----------
    x : sequence of numbers (list, ndarray, pandas Series, ...)
        Values to pad. May be empty, in which case the result is all zeros.
    length : int, default 13
        Size of the returned list.

    Returns
    -------
    list
        ``length`` float values.

    Raises
    ------
    ValueError
        If ``x`` holds more than ``length`` values.
    """
    values = np.asarray(x, dtype=float)
    if values.size > length:
        raise ValueError(
            'cannot pad %d values into a target of length %d' % (values.size, length)
        )

    padded = np.zeros(length)
    if values.size:
        # A negative-slice form (padded[-n:]) would address the whole array
        # when n == 0, so compute the start offset explicitly.
        padded[length - values.size:] = values
    return list(padded)

# Greedy binning: reduce a feature's distinct values to at most `max_bin` bins by
# merging neighbouring infrequent values, and return the boundaries between bins.
def GreedyFindBin(distinct_values, counts, num_distinct_values, max_bin, total_cnt, min_data_in_bin=3):
    """Compute bin boundaries for a feature from its sorted value counts.

    NOTE(review): the structure and parameter names match LightGBM's
    ``GreedyFindBin``; this looks like a Python port of it - confirm.

    Parameters
    ----------
    distinct_values : sequence of numbers
        Distinct feature values in ascending order
        (e.g. ``vc.index.values`` of a sorted ``value_counts``).
    counts : sequence of int
        Number of samples for each distinct value (e.g. ``vc.values``).
    num_distinct_values : int
        ``len(distinct_values)``.
    max_bin : int
        Maximum number of bins; must be > 0.
    total_cnt : int
        Total number of samples, i.e. ``sum(counts)``.
    min_data_in_bin : int, default 3
        Minimum number of samples a bin should hold.

    Returns
    -------
    list of float
        Ascending interior boundaries. Each boundary is the midpoint between
        the largest value of one bin and the smallest value of the next. The
        open-ended outer edges (-inf / +inf) are NOT included; callers add
        them if they need them. ``k`` boundaries describe ``k + 1`` bins.

    Notes
    -----
    Fixes relative to the previous revision:

    * In the ``num_distinct_values > max_bin`` branch the boundary recorded
      last was never emitted, because the bin counter was not advanced past
      the final open bin before boundaries were collected. The last two bins
      were therefore merged and one fewer bin than intended was produced.
    * The running mean bin size no longer divides by zero once the bin
      budget for non-big values is used up.
    * Empty input (``num_distinct_values == 0``) returns ``[]`` instead of
      raising ``IndexError``.
    """
    bin_upper_bound = []
    assert max_bin > 0

    # Few distinct values: every value may get its own bin. Put a boundary at
    # the midpoint between neighbours, merging values until a bin holds at
    # least `min_data_in_bin` samples.
    if num_distinct_values <= max_bin:
        cur_cnt_inbin = 0
        for i in range(num_distinct_values - 1):
            cur_cnt_inbin += counts[i]
            if cur_cnt_inbin >= min_data_in_bin:
                bin_upper_bound.append((distinct_values[i] + distinct_values[i + 1]) / 2.0)
                cur_cnt_inbin = 0
        return bin_upper_bound

    # More distinct values than bins: several values have to share a bin.
    if min_data_in_bin > 0:
        max_bin = min(max_bin, total_cnt // min_data_in_bin)
        max_bin = max(max_bin, 1)
    max_bin = int(max_bin)  # may have become a NumPy integer above

    mean_bin_size = total_cnt / max_bin
    rest_bin_cnt = max_bin
    rest_sample_cnt = total_cnt

    # A "big" value holds at least an average bin's worth of samples on its
    # own, so it gets a dedicated bin and is excluded from the remaining budget.
    is_big_count_value = [False] * num_distinct_values
    for i in range(num_distinct_values):
        if counts[i] >= mean_bin_size:
            is_big_count_value[i] = True
            rest_bin_cnt -= 1
            rest_sample_cnt -= counts[i]

    # Average size of the bins left for the non-big values. With no bins left
    # the threshold becomes infinite, so only big values can close a bin.
    mean_bin_size = rest_sample_cnt / rest_bin_cnt if rest_bin_cnt != 0 else float('inf')

    upper_bounds = [float('Inf')] * max_bin
    lower_bounds = [float('Inf')] * max_bin
    bin_cnt = 0
    lower_bounds[bin_cnt] = distinct_values[0]
    cur_cnt_inbin = 0

    for i in range(num_distinct_values - 1):
        if not is_big_count_value[i]:
            rest_sample_cnt -= counts[i]
        cur_cnt_inbin += counts[i]

        # Close the current bin after value i when
        #  - value i is big (it gets a bin of its own), or
        #  - the bin has reached the current mean bin size, or
        #  - the next value is big and this bin is already at least half full.
        if is_big_count_value[i] or cur_cnt_inbin >= mean_bin_size or \
                (is_big_count_value[i + 1] and cur_cnt_inbin >= max(1.0, mean_bin_size * 0.5)):
            upper_bounds[bin_cnt] = distinct_values[i]      # largest value of this bin
            bin_cnt += 1
            lower_bounds[bin_cnt] = distinct_values[i + 1]  # smallest value of the next bin
            if bin_cnt >= max_bin - 1:
                break
            cur_cnt_inbin = 0
            if not is_big_count_value[i]:
                rest_bin_cnt -= 1
                mean_bin_size = rest_sample_cnt / rest_bin_cnt if rest_bin_cnt != 0 else float('inf')

    # Account for the final, still-open bin so that the boundary recorded last
    # is emitted as well.
    bin_cnt += 1

    # Boundary between bin i and bin i + 1: midpoint of their adjacent values.
    for i in range(bin_cnt - 1):
        bin_upper_bound.append((upper_bounds[i] + lower_bounds[i + 1]) / 2.0)

    return bin_upper_bound

In [6]:
# Downcast every feature table and write it back to the same file. For the
# "lastN_*" tables, every column except the key is prefixed with "lastN_" so
# the names stay unique when the tables are joined later.
for fn in ['cat','num','diff','rank_num','last3_cat','last3_num','last3_diff', 'last6_num','ym_rank_num']:
    df = reduce_mem_usage(pd.read_parquet(f'{root}/{fn}_feature'))

    if 'last' in fn:
        # 'last3_cat' -> 'last3_', 'last6_num' -> 'last6_'
        pre = '_'.join(fn.split('_')[:-1]) + '_'
        # This cell overwrites its own input, so it must be safe to run twice:
        # columns that already carry the prefix are left alone instead of
        # becoming 'last3_last3_...'.
        df = df.rename(
            columns={
                col: pre + col
                for col in df.columns
                if col != 'customer_ID' and not col.startswith(pre)
            }
        )

    df.to_parquet(f'{root}/{fn}_feature', compression='gzip', index=False)
del df
_ = gc.collect()
# also, copy ['cat','num','diff','rank_num','last3_cat','last3_num','last3_diff', 'last6_num','ym_rank_num'] files to the inside of folder 'extra'.

## Processing

In [14]:
# Replace every feature by its normalised bin index in [0, 1] and write the
# table back to the same file.
# NOTE: the input file is overwritten, so this cell is not idempotent - running
# it a second time re-bins values that are already binned.
for fn in ['cat','num','diff','rank_num','last3_cat','last3_num','last3_diff', 'last6_num','ym_rank_num']:
    df = pd.read_parquet(f'{root}/{fn}_feature')
    for col in tqdm(df.columns):
        if col in ('customer_ID', 'S_2'):
            continue

        # Sorted distinct values and their frequencies (NaN is not counted).
        # e.g. for 'last3_P_2_sum': len(vc) => 284, vc.sum() => 1383534
        vc = df[col].value_counts().sort_index()
        if vc.empty:
            # Column holds no values at all: there is nothing to bin, and
            # missing values are encoded as 0 below anyway.
            df[col] = 0.0
            continue

        bins = GreedyFindBin(vc.index.values, vc.values, len(vc), 255, vc.sum())

        # Open-ended outer edges so that every finite value lands in 1 .. len(bins) - 1.
        if -np.inf not in bins:
            bins = [-np.inf] + bins
        if np.inf not in bins:
            bins = bins + [np.inf]

        # np.digitize returns, for each value, the index i with bins[i-1] <= value < bins[i].
        codes = np.digitize(df[col], bins)

        # Values that fall past the last edge get index len(bins); that is where
        # NaN ends up. Encode them as 0.
        codes[codes == len(bins)] = 0

        # Normalise to [0, 1]; guard against a column whose codes are all 0.
        top = codes.max()
        df[col] = codes / top if top > 0 else codes.astype(float)

    df = reduce_mem_usage(df)
    df.to_parquet(f'{root}/{fn}_feature', compression='gzip', index=False)

100%|█████████████████████████████████████████| 208/208 [00:10<00:00, 19.11it/s]
100%|███████████████████████████████████████| 1063/1063 [08:45<00:00,  2.02it/s]
100%|███████████████████████████████████████| 1063/1063 [08:38<00:00,  2.05it/s]
100%|█████████████████████████████████████████| 178/178 [00:14<00:00, 12.59it/s]
100%|█████████████████████████████████████████| 148/148 [00:04<00:00, 32.16it/s]
100%|█████████████████████████████████████████| 886/886 [06:00<00:00,  2.46it/s]
100%|█████████████████████████████████████████| 886/886 [05:58<00:00,  2.47it/s]
100%|█████████████████████████████████████████| 886/886 [06:12<00:00,  2.38it/s]
100%|███████████████████████████████████████| 1063/1063 [08:45<00:00,  2.02it/s]


In [6]:
# Out-of-fold predictions (train customers) and submission predictions (test
# customers), downcast right after loading.
oof = reduce_mem_usage(pd.read_csv('./o_debug/LGB_with_series_feature/oof.csv'))
sub = reduce_mem_usage(pd.read_csv('./o_debug/LGB_with_series_feature/submission.csv.zip'))

In [7]:
# One row per customer: collect the customer's values (train: 'target' from oof,
# test: 'prediction' from the submission) into a fixed-length list of 13,
# left-padded by pad_target.
oof = oof.groupby('customer_ID', sort=False)['target'].agg(pad_target)
sub = sub.groupby('customer_ID', sort=False)['prediction'].agg(pad_target)
tmp = pd.concat([oof, sub])
    # each element of tmp is a list of 13 values, e.g.
    # [0.0017161552068444, 0.00154039329695, 0.001630075701532, ..., 0.0007328170974785, 0.0009232219338176]
del oof, sub
_ = gc.collect()

# Expand the lists into 13 columns target1 .. target13, keyed by customer_ID.
tmp = pd.DataFrame(data=tmp.tolist(), index=tmp.index, columns=['target%s'%i for i in range(1,14)])
tmp['customer_ID'] = tmp.index
tmp = reduce_mem_usage(tmp)

# Write next to the other feature tables: the join cell reads '{root}/tmp_feature'.
tmp.to_parquet(f'{root}/tmp_feature', compression='gzip', index=False)
# also, copy tmp file to the inside of folder 'extra'.

In [23]:
# # loading manual features (pandas library).
# dfs = []
# for fn in ['cat','num','diff','rank_num','last3_cat','last3_num','last3_diff', 'last6_num','ym_rank_num']:
#     if len(dfs) == 0:
#         dfs.append(pd.read_parquet(f'{root}/{fn}_feature'))
#     else:
#         dfs.append(pd.read_parquet(f'{root}/{fn}_feature').drop([id_name],axis=1))

In [4]:
# loading manual features (polars library).
# Start from the 'cat' table and left-join every other feature table on customer_ID.
# All tables, including the first one, are read from `root`.
df = pl.read_parquet(f'{root}/cat_feature')
for fn in ['num','diff','rank_num','last3_cat','last3_num','last3_diff', 'last6_num','ym_rank_num','tmp']:
    df = df.join(pl.read_parquet(f'{root}/{fn}_feature'), on="customer_ID", how="left")

df.write_parquet(f'{root}/nn_all_feature', compression='gzip')