In [1]:
import os
import gc
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from contextlib import contextmanager

%matplotlib inline

In [2]:
@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label placed in front of the timing message.

    On normal exit from the ``with`` body a line of the form
    ``"<title> - done in <seconds>s"`` is printed, with the elapsed wall-clock
    time rounded to whole seconds. If the body raises, the exception
    propagates and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    message = "{} - done in {:.0f}s".format(title, elapsed_seconds)
    print(message)
    
# One-hot encode every object-dtype column of a frame with pd.get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """Replace the object-dtype columns of ``df`` with indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Columns whose dtype compares equal to ``'object'``
        are treated as categorical; all other columns are passed through.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded, new_columns)`` where ``encoded`` is the frame returned by
        ``pd.get_dummies`` and ``new_columns`` lists, in order, the column
        labels of ``encoded`` that were not column labels of the input.
    """
    columns_before = list(df.columns)

    # Collect the columns that get_dummies should expand.
    object_columns = []
    for name in columns_before:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)

    # Anything that was not a label of the input frame was created by the encoding.
    added_columns = []
    for name in encoded.columns:
        if name not in columns_before:
            added_columns.append(name)
    return encoded, added_columns

In [3]:
# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category=True, data_path='./data/POS_CASH_balance.csv'):
    """Aggregate the POS cash balance table to one row per ``SK_ID_CURR``.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the whole file.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``; controls whether the encoded
        categorical columns get an extra NaN indicator column.
    data_path : str, default './data/POS_CASH_balance.csv'
        Location of the CSV to read. It must contain the columns
        ``SK_ID_CURR``, ``MONTHS_BALANCE``, ``SK_DPD`` and ``SK_DPD_DEF``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Columns are named
        ``POS_<source column>_<STATISTIC>``, plus ``POS_COUNT`` holding the
        number of rows per ``SK_ID_CURR``.
    """
    pos = pd.read_csv(data_path, nrows=num_rows)
    # Forward the caller's choice; this used to be hard-coded to True, which
    # silently ignored the ``nan_as_category`` argument.
    pos, cat_cols = one_hot_encoder(pos, nan_as_category=nan_as_category)

    # Numeric features, plus the share of rows in each one-hot encoded category.
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    aggregations.update({cat: ['mean'] for cat in cat_cols})

    # Build the groupby once and reuse it for both the aggregation and the count.
    grouped = pos.groupby('SK_ID_CURR')
    pos_agg = grouped.agg(aggregations)
    # Flatten the (column, statistic) MultiIndex into single string labels.
    pos_agg.columns = pd.Index(
        ['POS_' + column + "_" + statistic.upper() for column, statistic in pos_agg.columns.tolist()]
    )
    # Count pos cash accounts (same per-group row count as POS_MONTHS_BALANCE_SIZE;
    # kept so the output columns stay unchanged).
    pos_agg['POS_COUNT'] = grouped.size()

    # Drop the references to the raw frame before returning so it can be reclaimed.
    del pos, grouped
    gc.collect()
    return pos_agg

In [4]:
def main(sample=False, filename='preprocess/pos_cash_features.csv'):
    """Return the POS cash aggregate features, using ``filename`` as a CSV cache.

    Parameters
    ----------
    sample : bool, default False
        When True, only the first 10000 rows of the raw table are processed.
    filename : str, default 'preprocess/pos_cash_features.csv'
        Cache location. If the file exists it is loaded and returned without
        recomputing; otherwise the features are computed and written there.

    Returns
    -------
    pandas.DataFrame
        The features indexed by ``SK_ID_CURR``, whether freshly computed or
        loaded from the cache.
    """
    # NOTE(review): sample and full runs share the same cache file, so a cached
    # sample run is returned by a later full run (and vice versa) — confirm
    # whether that is intended.
    if os.path.exists(filename):
        # to_csv below writes the SK_ID_CURR index as the first column; read it
        # back as the index so a cached load has the same layout as a fresh run
        # instead of gaining an extra SK_ID_CURR data column.
        return pd.read_csv(filename, index_col=0)

    num_rows = 10000 if sample else None
    with timer("Process POS Cash balance"):
        pos_cash_features = pos_cash(num_rows)
        print("Previous pos_cash df shape:", pos_cash_features.shape)
        # Create the cache directory first so the write cannot fail on a
        # missing folder after the expensive computation has finished.
        cache_dir = os.path.dirname(filename)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        pos_cash_features.to_csv(filename)
        return pos_cash_features

In [5]:
# Build the POS cash features with the default arguments (no row sampling),
# or load them from the CSV cache if it already exists.
df = main()

Previous pos_cash df shape: (337252, 18)
Process POS Cash balance - done in 45s


In [6]:
# Display the number of feature columns together with their names.
len(df.columns), df.columns

(18, Index(['POS_MONTHS_BALANCE_MAX', 'POS_MONTHS_BALANCE_MEAN',
        'POS_MONTHS_BALANCE_SIZE', 'POS_SK_DPD_MAX', 'POS_SK_DPD_MEAN',
        'POS_SK_DPD_DEF_MAX', 'POS_SK_DPD_DEF_MEAN',
        'POS_NAME_CONTRACT_STATUS_Active_MEAN',
        'POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN',
        'POS_NAME_CONTRACT_STATUS_Approved_MEAN',
        'POS_NAME_CONTRACT_STATUS_Canceled_MEAN',
        'POS_NAME_CONTRACT_STATUS_Completed_MEAN',
        'POS_NAME_CONTRACT_STATUS_Demand_MEAN',
        'POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN',
        'POS_NAME_CONTRACT_STATUS_Signed_MEAN',
        'POS_NAME_CONTRACT_STATUS_XNA_MEAN',
        'POS_NAME_CONTRACT_STATUS_nan_MEAN', 'POS_COUNT'],
       dtype='object'))