In [1]:
#Import
import gc
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
#Read the train and test parquet tables, then the label CSV
train, test = (
    pd.read_parquet(path)
    for path in ("./data/train.parquet", "./data/test.parquet")
)
train_labels = pd.read_csv("./data/train_labels.csv")

In [3]:
#Column groups
#Categorical feature names; this order is the order used wherever cat_cols is iterated
cat_cols = [
    "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
    "D_126", "D_63", "D_64", "D_66", "D_68",
]
#Every column of train other than 'customer_ID' and 'S_2'
all_cols = [name for name in train.columns if name not in ('customer_ID', 'S_2')]
#Whatever remains once the categorical names are taken out
num_cols = [name for name in all_cols if name not in cat_cols]

In [4]:
#Finding repetition: compare the number of distinct customers with the number of rows
row_count = len(train)
customer_ID_count = train['customer_ID'].nunique()

#A ratio below 1 means at least one customer_ID appears on more than one row
print(
    'Count of unique customers: {x}'.format(x=customer_ID_count),
    'Row count: {x}'.format(x=row_count),
    'Ratio = {x}'.format(x=customer_ID_count/row_count),
    sep='\n',
)

Count of unique customers: 458913
Row count: 5531451
Ratio = 0.08296430719534531


In [None]:
#Exploration (TODO: placeholder cell with no code yet; add the exploratory analysis here or remove the cell)

In [5]:
#Run a full garbage-collection pass; the value the call returns is what the cell displays
gc.collect()

0

In [6]:
#Preprocessing and feature engineering
#Label-encode every categorical column of train and test in place, one encoder per column.
#The encoder is fitted on the train and test values together: fitting on train alone
#makes transform() raise ValueError as soon as test holds a category train never saw.
#LabelEncoder sorts its classes, so the codes are identical to a train-only fit
#whenever test introduces no new category.
for col in cat_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])
         

In [17]:
def feature_engineering(df):
    """Collapse ``df`` into one feature row per ``customer_ID``.

    Reads the module-level ``num_cols`` and ``cat_cols`` lists.

    * Every ``num_cols`` column is aggregated with first/mean/std/min/max/last
      and the results are downcast via ``pd.to_numeric(..., downcast='float')``.
    * Every ``cat_cols`` column is aggregated with count/last/nunique, cast to
      int32, and then downcast via ``pd.to_numeric(..., downcast='unsigned')``.

    Aggregate columns are named ``<column>_<aggregation>``. The numeric and
    categorical tables are inner-joined on ``customer_ID``; the joined frame
    (with ``customer_ID`` as a regular column) is returned.
    """
    by_customer = df.groupby('customer_ID')

    # --- numeric aggregates ---
    numeric_stats = by_customer[num_cols].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
    # Flatten the (column, aggregation) MultiIndex into "column_aggregation" names.
    numeric_stats.columns = ['_'.join(parts) for parts in numeric_stats.columns]
    numeric_stats = numeric_stats.reset_index()
    # (The last-vs-first lag features, difference and ratio, are currently disabled.)
    numeric_names = [name for name in numeric_stats.columns if name != 'customer_ID']
    numeric_stats[numeric_names] = numeric_stats[numeric_names].apply(pd.to_numeric, downcast='float')

    # --- categorical aggregates ---
    categorical_stats = by_customer[cat_cols].agg(['count', 'last', 'nunique'])
    categorical_stats = categorical_stats.astype('int32')
    categorical_stats.columns = ['_'.join(parts) for parts in categorical_stats.columns]
    categorical_stats = categorical_stats.reset_index()
    categorical_names = [name for name in categorical_stats.columns if name != 'customer_ID']
    categorical_stats[categorical_names] = categorical_stats[categorical_names].apply(
        pd.to_numeric, downcast='unsigned'
    )

    # --- combine, then release the two intermediate tables ---
    features = numeric_stats.merge(categorical_stats, how='inner', on='customer_ID')
    del numeric_stats, categorical_stats
    gc.collect()

    return features

    
def preprocess(df, train = True):
    """Build the per-customer feature table and, for training data, attach labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed unchanged to ``feature_engineering``.
    train : bool, default True
        When true, the columns of the module-level ``train_labels`` frame are
        appended to the right of the features.

    Returns
    -------
    pandas.DataFrame
        The engineered features. With ``train=True`` the ``train_labels``
        columns follow them; this includes a second ``customer_ID`` column,
        kept so the column layout seen by consumers of this frame is unchanged.
        A customer with no row in ``train_labels`` gets NaN label values.

    Raises
    ------
    pandas.errors.MergeError
        If ``train_labels`` holds the same ``customer_ID`` more than once.
    """
    df1 = feature_engineering(df)

    if train:
        # Look the labels up by customer_ID instead of gluing the two frames
        # together by row position: a positional concat silently assigns labels
        # to the wrong customers whenever train_labels is ordered differently
        # from the grouped feature table.
        labels = df1[['customer_ID']].merge(
            train_labels, how='left', on='customer_ID', validate='one_to_one'
        )
        # Same row order and length as df1, so share its index for the concat.
        labels.index = df1.index
        df1 = pd.concat([df1, labels], axis=1)

    return df1
    
    
    
    

In [18]:
#Build the engineered tables; the flag is spelled out so the two calls read unambiguously
train_new = preprocess(train, train=True)
test_new = preprocess(test, train=False)

In [20]:
#Scan train_new's column names in order and record each repeated occurrence in dupes
seen = set()
dupes = []
for x in train_new.columns:
    if x in seen:
        dupes.append(x)
        continue
    seen.add(x)

In [21]:
#Display the repeated column names collected in dupes
dupes

['customer_ID']

In [20]:
#Remove repeated column names from train_new, keeping the first occurrence of each.
#Dropping train_new.columns[-2] drops by *label*, which removes every 'customer_ID'
#column rather than just the duplicate, and on a second run of the cell it silently
#removes whichever feature column is then second from the end.
#Masking on duplicated() keeps one 'customer_ID' and is safe to run repeatedly.
train_new = train_new.loc[:, ~train_new.columns.duplicated()]

In [21]:
#Write both engineered tables to gzip-compressed parquet files
for frame, target_path in (
    (train_new, "./data/train_fe_compress.parquet"),
    (test_new, "./data/test_fe_compress.parquet"),
):
    frame.to_parquet(target_path, compression="gzip")