In [1]:
import os
import gc
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from contextlib import contextmanager

%matplotlib inline

In [2]:
@contextmanager
def timer(title):
    """Context manager that reports the wall-clock duration of its block.

    Args:
        title: Label placed at the start of the printed message.

    Yields:
        None. The managed block runs while the generator is suspended.

    Note:
        The message is printed only when the managed block finishes without
        raising; an exception propagates before the print is reached.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))
    
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode the text-like columns of ``df``.

    A column is treated as categorical when its dtype is ``object`` or a
    pandas ``StringDtype``. The latter matters because newer pandas versions
    may infer a dedicated string dtype for text columns instead of ``object``;
    comparing against ``'object'`` alone would leave such columns unencoded.

    Args:
        df: Input DataFrame. It is not modified.
        nan_as_category: Passed to ``pd.get_dummies`` as ``dummy_na``; when
            True an extra indicator column is created for missing values.

    Returns:
        A tuple ``(encoded_df, new_columns)`` where ``new_columns`` lists the
        column labels present in ``encoded_df`` but not in the input.
    """
    # A set gives O(1) membership checks when computing the new columns below.
    original_columns = set(df.columns)
    categorical_columns = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    encoded = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in encoded.columns if c not in original_columns]
    return encoded, new_columns

In [3]:
# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True,
                        path = './data/credit_card_balance.csv'):
    """Aggregate the credit card balance table to one row per SK_ID_CURR.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the
            whole file.
        nan_as_category: Forwarded to ``one_hot_encoder``; when True, missing
            values in categorical columns get their own indicator column.
        path: Location of the credit card balance CSV.

    Returns:
        DataFrame indexed by ``SK_ID_CURR``. For every remaining column it
        holds the min, max, mean, sum and var, named
        ``CC_<column>_<AGG>``, plus ``CC_COUNT`` with the number of rows per
        ``SK_ID_CURR``.
    """
    cc = pd.read_csv(path, nrows = num_rows)
    # Honour the caller's nan_as_category instead of always passing True.
    cc, _ = one_hot_encoder(cc, nan_as_category= nan_as_category)
    # SK_ID_PREV is removed before aggregating so no statistics are computed on it.
    cc = cc.drop(columns=['SK_ID_PREV'])
    # General aggregations
    grouped = cc.groupby('SK_ID_CURR')
    cc_agg = grouped.agg(['min', 'max', 'mean', 'sum', 'var'])
    # Flatten the (column, aggregation) MultiIndex into single string labels.
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = grouped.size()
    # Release the row-level frame before returning the aggregated result.
    del grouped, cc
    gc.collect()
    return cc_agg

In [4]:
def main(sample=False, filename='preprocess/credit_card_features.csv'):
    """Return the aggregated credit card features, using a CSV file as cache.

    If ``filename`` already exists it is loaded and returned; otherwise the
    features are computed with ``credit_card_balance``, written to
    ``filename`` and returned.

    Args:
        sample: When True, only the first 10000 rows of the source file are
            processed.
        filename: Path of the cache CSV.

    Returns:
        The feature DataFrame. In both branches the first CSV column (the
        index written by ``to_csv``) is the index of the returned frame.
    """
    # NOTE(review): sample and full runs share the same cache file, so a
    # cached result is returned regardless of ``sample`` - confirm this is
    # intended.
    if os.path.exists(filename):
        # index_col=0 restores the index that to_csv stored as the first
        # column, so the cached frame has the same layout as a fresh one.
        return pd.read_csv(filename, index_col=0)

    num_rows = 10000 if sample else None
    with timer("Process credit card balance"):
        credit_card_features = credit_card_balance(num_rows)
        print("Previous credit_card df shape:", credit_card_features.shape)
        # Create the target directory if needed; to_csv does not do it.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        credit_card_features.to_csv(filename)
        return credit_card_features

In [5]:
# Compute the credit card features, or load them from the cache file if it exists.
df = main()

Previous credit_card df shape: (103558, 141)
Process credit card balance - done in 100s


In [6]:
# Display the number of columns together with the column labels.
len(df.columns), df.columns

(141, Index(['CC_MONTHS_BALANCE_MIN', 'CC_MONTHS_BALANCE_MAX',
        'CC_MONTHS_BALANCE_MEAN', 'CC_MONTHS_BALANCE_SUM',
        'CC_MONTHS_BALANCE_VAR', 'CC_AMT_BALANCE_MIN', 'CC_AMT_BALANCE_MAX',
        'CC_AMT_BALANCE_MEAN', 'CC_AMT_BALANCE_SUM', 'CC_AMT_BALANCE_VAR',
        ...
        'CC_NAME_CONTRACT_STATUS_Signed_MAX',
        'CC_NAME_CONTRACT_STATUS_Signed_MEAN',
        'CC_NAME_CONTRACT_STATUS_Signed_SUM',
        'CC_NAME_CONTRACT_STATUS_Signed_VAR', 'CC_NAME_CONTRACT_STATUS_nan_MIN',
        'CC_NAME_CONTRACT_STATUS_nan_MAX', 'CC_NAME_CONTRACT_STATUS_nan_MEAN',
        'CC_NAME_CONTRACT_STATUS_nan_SUM', 'CC_NAME_CONTRACT_STATUS_nan_VAR',
        'CC_COUNT'],
       dtype='object', length=141))