In [None]:
# Version tag substituted into the names of the feature CSV files written by the export cells below.
version = "3"

In [2]:
import gc
import os
import warnings
from contextlib import contextmanager
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, Normalizer
%matplotlib inline

In [3]:
# Wall-clock checkpoints: every timer() call appends one and reports the gap to the previous one.
timesheet = [time()]
def timer(statement):
    """Print the time elapsed since the previous checkpoint and record a new one.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.

    Side effects
    ------------
    Appends the current ``time()`` to the module-level ``timesheet`` list and
    prints ``"<statement> : <elapsed> seconds"``.
    """
    # The list is only mutated, never rebound, so no ``global`` declaration is needed.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # A single pre-formatted argument prints the same text whether ``print`` is the
    # Python 2 statement or the Python 3 function.
    print("{} : {} seconds".format(statement, elapsed))

# Feature Generators

In [4]:
from numba import njit, prange,jit
@jit(nopython=True, parallel=True)
def get_regression_coefficient(ys, xs, counts):
    """Least-squares slope of ``ys`` against ``xs`` for each consecutive segment.

    ``counts`` holds segment boundaries: segment ``g`` covers the half-open
    slice ``counts[g]:counts[g + 1]`` of both ``xs`` and ``ys``. The result has
    ``counts.shape[0] - 1`` entries; an entry stays NaN when its segment is
    empty or when the slope denominator ``n * sum(x*x) - sum(x)**2`` is zero.
    """
    n_groups = counts.shape[0] - 1
    # Start from NaN everywhere; only well-defined slopes overwrite it.
    slopes = np.full(n_groups, np.nan)
    for g in prange(n_groups):
        start = counts[g]
        stop = counts[g + 1]
        seg_x = xs[start:stop]
        seg_y = ys[start:stop]

        n = seg_y.shape[0]
        sum_x = np.sum(seg_x)
        denom = (n * np.dot(seg_x, seg_x)) - (sum_x * sum_x)

        if n > 0 and denom != 0:
            slopes[g] = ((n * np.dot(seg_x, seg_y)) - (sum_x * np.sum(seg_y))) / denom
    return slopes

In [5]:
def get_trend_values(data):
    """Return, per group, the least-squares slope of the non-null values against their position.

    Parameters
    ----------
    data : grouped column
        Object whose ``apply`` runs a function once per group, such as the
        ``dataGroups[colname]`` passed in by ``generate_features``.

    Returns
    -------
    pandas.Series
        float64 Series indexed like the result of ``data.apply``. Each value is
        the slope of the group's non-null values regressed on ``0, 1, 2, ...``;
        NaN where ``get_regression_coefficient`` leaves the slope undefined.
    """
    # One array of non-null values per group, keyed by the group label.
    values_per_group = data.apply(lambda x: x.dropna().values)
    if len(values_per_group) == 0:
        # np.concatenate rejects an empty sequence; there is nothing to fit anyway.
        return pd.Series(index=values_per_group.index, dtype=np.float64)

    lengths = values_per_group.apply(lambda x: x.shape[0]).values
    # Leading 0 so the cumulative sum below yields [start_0, start_1, ..., end_last].
    counts = np.insert(lengths, 0, 0)
    # Pass the list straight to concatenate: wrapping ragged arrays in np.array()
    # raises on recent NumPy releases and was never needed.
    x = np.concatenate([np.arange(c) for c in counts]).ravel().astype(np.float64)
    y = np.concatenate(values_per_group.values).ravel().astype(np.float64)
    boundaries = np.cumsum(counts)

    # Build the result with an explicit float dtype instead of filling a dtype-less Series.
    return pd.Series(
        get_regression_coefficient(y, x, boundaries),
        index=values_per_group.index,
        dtype=np.float64,
    )

In [6]:
def generate_features(prefix, dataGroups, num_lags, colname, get_trend = False, get_kurtosis = True, to_calculate = ["sum", "count","nunique","mean","median","max","min","std","skew"]):
    """Compute per-customer statistics of one column and add them to the global ``output`` frame.

    Parameters
    ----------
    prefix : str
        First component of every generated column name.
    dataGroups : grouped frame
        Indexable by ``colname``; the selection must support ``agg`` and ``apply``.
        Group labels are matched against ``output["SK_ID_CURR"]``.
    num_lags : int
        Only used as the last component of the generated column names and in log lines.
    colname : str
        Column of ``dataGroups`` to summarise.
    get_trend : bool
        Add ``<prefix>_<colname>_trend_<num_lags>`` from ``get_trend_values``.
    get_kurtosis : bool
        Add ``<prefix>_<colname>_kurtosis_<num_lags>``.
    to_calculate : list of str
        Aggregations handed to ``agg``; one column
        ``<prefix>_<colname>_<agg>_<num_lags>`` each. The default list is
        shared between calls but never mutated here.

    Side effects
    ------------
    Rebinds the module-level ``output`` DataFrame (which must already exist and
    contain ``SK_ID_CURR``) and prints progress via ``timer``.
    """
    global output
    if len(to_calculate)> 0 :
        # Single pre-formatted argument: valid as a Python 2 statement and a Python 3 call.
        print("Getting Aggregates for column: {}, {}".format(colname, num_lags))
        aggregates = dataGroups[colname].agg(to_calculate)
        aggregates.columns = ["{}_{}_{}_{}".format(prefix,colname, c, num_lags) for c in aggregates.columns]
        # A repeated call for the same column would otherwise make merge() emit
        # "_x"/"_y" suffixed duplicates; overwrite instead, as kurtosis/trend do.
        stale = [c for c in aggregates.columns if c in output.columns]
        if stale:
            output = output.drop(stale, axis=1)
        output = output.merge(aggregates, how = "left", left_on = "SK_ID_CURR", right_index=True)
        timer("Done Aggregating Column {}".format(colname))
    if get_kurtosis:
        print("Getting Kurtosis for Column: {}, {}".format(colname, num_lags))
        output["{}_{}_kurtosis_{}".format(prefix,colname,num_lags)] = output["SK_ID_CURR"].map(dataGroups[colname].apply(lambda x: x.kurt()))
        timer("Done Getting Kurtosis for Column {}".format(colname))

    if get_trend:
        print("Getting Trend for column: {}, {}".format(colname, num_lags))
        output["{}_{}_trend_{}".format(prefix, colname, num_lags)] = output["SK_ID_CURR"].map(get_trend_values(dataGroups[colname]))
        timer("Done Getting Trend for Column {}".format(colname))


# Feature Preprocessors

In [7]:
def one_hot_encoder(data, nan_as_category = True):
    """One-hot encode every ``object``-dtype column of ``data``.

    Columns of any other dtype are passed through untouched. When
    ``nan_as_category`` is true, each encoded column also gets an indicator
    column for missing values (``dummy_na``). Returns the encoded DataFrame.
    """
    object_columns = []
    for name in data.columns:
        if data[name].dtype == 'object':
            object_columns.append(name)
    return pd.get_dummies(data, columns=object_columns, dummy_na=nan_as_category)

def normalize_features(data, not_to_normalize):
    """Return a cleaned copy of ``data`` with every feature column scaled to unit L2 norm.

    Steps, in order:
      1. +/-inf are replaced by NaN.
      2. Columns that are entirely NaN are dropped.
      3. In each remaining column not listed in ``not_to_normalize``, NaNs are
         filled with the column mean (then 0 as a fallback) and the column is
         divided by its Euclidean norm. All-zero columns are left as zeros.

    The previous implementation fed each column to sklearn's ``Normalizer`` as
    an ``(n, 1)`` matrix. ``Normalizer`` rescales every *row* to unit norm, so
    each value collapsed to -1, 0 or 1. Scaling the column as a whole keeps the
    L2 normalisation while preserving the relative magnitudes.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame; it is not modified.
    not_to_normalize : list of str
        Column labels to pass through unscaled (they must be present after
        step 2, otherwise ``Index.drop`` raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
    """
    data = data.replace([np.inf, -np.inf], np.nan)
    # Single pre-formatted argument: valid as a Python 2 statement and a Python 3 call.
    print("{} Before dropping Na's".format(data.shape))
    data = data.dropna(axis=1, how = "all")
    print("{} After dropping Na's".format(data.shape))
    for col in data.columns.drop(not_to_normalize):
        values = data[col].fillna(data[col].mean()).fillna(0).values.astype(np.float64)
        norm = np.sqrt(np.dot(values, values))
        # A zero norm means the column is all zeros; dividing would only produce NaNs.
        data[col] = values / norm if norm > 0 else values
    return data

# Get Installment Features

In [None]:
def GetInstallmentFeatures(installment_path = "../data/installments_payments.csv"):
    """Generate lagged installment-payment features into the global ``output`` frame.

    Reads the CSV at ``installment_path``, orders it by customer and
    ``DAYS_INSTALMENT``, derives lateness / over-payment columns, and adds
    per-customer first differences, second differences and 6/12-row rolling
    means. For each lag in 12, 60 and 90 it keeps the last ``lags`` rows per
    customer (in that sort order) and calls ``generate_features`` for every
    derived column, which merges the statistics into ``output``.

    Returns nothing; the module-level ``output`` DataFrame must exist beforehand.
    """
    installments = pd.read_csv(installment_path).sort_values(by=["SK_ID_CURR","DAYS_INSTALMENT"]).reset_index(drop = True)

    # Some Preprocessing
    installments['installment_paid_late_in_days'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
    installments['installment_paid_over_amount'] = installments['AMT_PAYMENT'] - installments['AMT_INSTALMENT']
    installments['installment_paid_over_amount_ratio'] = installments['AMT_PAYMENT'] / (1+installments['AMT_INSTALMENT'])

    # Columns that get first difference, second difference and rolling means.
    base_columns = [
        "installment_paid_late_in_days",
        "installment_paid_over_amount",
        "installment_paid_over_amount_ratio",
        "NUM_INSTALMENT_VERSION",
    ]
    # DAYS_INSTALMENT contributes a second difference and rolling means only.
    windowed_columns = base_columns + ["DAYS_INSTALMENT"]

    # Every column selected from this groupby exists at this point; derived columns
    # are differenced through ``customer_ids`` rather than through this object.
    groups = installments.groupby("SK_ID_CURR")
    customer_ids = installments["SK_ID_CURR"]

    for col in windowed_columns:
        first_diff = groups[col].diff()
        if col in base_columns:
            installments[col + "_diff"] = first_diff
        installments[col + "_diff_diff"] = first_diff.groupby(customer_ids).diff()
        for window in (6, 12):
            # transform() always returns a result aligned to the original rows, whereas
            # apply() prepends the group key to the index on recent pandas versions.
            installments["{}_rolling{}".format(col, window)] = groups[col].transform(lambda s: s.rolling(window).mean())

    # Same order as the feature columns were generated before, so the layout of ``output`` is unchanged.
    feature_columns = (
        [c + "_diff" for c in base_columns]
        + [c + "_diff_diff" for c in windowed_columns]
        + [c + "_rolling6" for c in windowed_columns]
        + [c + "_rolling12" for c in windowed_columns]
    )
    statistics = ["sum","mean","median","max","min","std","skew"]

    # For Lagged Items
    for lags in [12, 60, 90]:
        groups = installments.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")

        for col in feature_columns:
            generate_features("installment", groups, lags, col, get_trend = True, get_kurtosis = True, to_calculate = statistics)

        # Release the per-lag copy before building the next one, as the other generators do.
        del groups
        gc.collect()


In [None]:
# Switches selecting which variants of the feature file are written.
get_normalized = True
get_unnormalized = True

# Feature generation runs for a long time; make sure the target folder exists
# before starting so the final save cannot fail on a missing directory.
if not os.path.isdir("csv"):
    os.makedirs("csv")

# Calling Generator
# ``output`` starts as one row per customer id; GetInstallmentFeatures adds the feature columns to it.
output = pd.read_csv("../data/installments_payments.csv", usecols = ["SK_ID_CURR"]).drop_duplicates().reset_index(drop=True)
GetInstallmentFeatures(installment_path = "../data/installments_payments.csv")


# Saving Data
if get_unnormalized:
    output.to_csv("csv/installment_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalize_features(output, ["SK_ID_CURR"]).to_csv("csv/installment_features_normalized_V{}.csv".format(version), index=False)

# Get POS Cash Features

In [8]:
def GetPOSFeatures(pos_cash_path = "../data/POS_CASH_balance.csv"):
    """Generate lagged POS-cash balance features into the global ``output`` frame.

    Reads the CSV at ``pos_cash_path``, orders it by ``SK_ID_CURR``,
    ``SK_ID_PREV`` and ``MONTHS_BALANCE``, and adds per-customer differences,
    second differences and 6/12-row rolling means of the days-past-due and
    instalment-count columns. For each lag in 6 and 12 it keeps the last
    ``lags`` rows per customer (in that sort order) and calls
    ``generate_features`` for every derived column.

    Returns nothing; the module-level ``output`` DataFrame must exist beforehand.
    """
    pos_cash = pd.read_csv(pos_cash_path).sort_values(by=["SK_ID_CURR","SK_ID_PREV","MONTHS_BALANCE"]).reset_index(drop = True)

    # First difference is only kept as a feature for the days-past-due columns.
    dpd_columns = ["SK_DPD", "SK_DPD_DEF"]
    count_columns = ["CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE"]
    source_columns = dpd_columns + count_columns

    # All Aggregate Features
    # Only columns that already exist are selected from this groupby; derived
    # columns are differenced through ``customer_ids`` instead.
    groups = pos_cash.groupby("SK_ID_CURR")
    customer_ids = pos_cash["SK_ID_CURR"]

    for col in source_columns:
        first_diff = groups[col].diff()
        if col in dpd_columns:
            pos_cash[col + "_diff"] = first_diff
        pos_cash[col + "_diff_diff"] = first_diff.groupby(customer_ids).diff()
        for window in (6, 12):
            # transform() always returns a result aligned to the original rows, whereas
            # apply() prepends the group key to the index on recent pandas versions.
            pos_cash["{}_rolling{}".format(col, window)] = groups[col].transform(lambda s: s.rolling(window).mean())

    # Same order as the feature columns were generated before, so the layout of ``output`` is unchanged.
    feature_columns = (
        [c + "_diff" for c in dpd_columns]
        + [c + "_diff_diff" for c in source_columns]
        + [c + "_rolling6" for c in source_columns]
        + [c + "_rolling12" for c in source_columns]
    )
    statistics = ["sum","mean","median","max","min","std","skew"]

    #For Lagged Items
    for lags in [6, 12]:
        groups = pos_cash.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")

        for col in feature_columns:
            generate_features("pos_cash", groups, lags, col, get_trend = True, get_kurtosis = True, to_calculate = statistics)

        del groups
        gc.collect()
    

In [9]:
# Switches selecting which variants of the feature file are written.
get_normalized = True
get_unnormalized = True

# Feature generation runs for a long time; make sure the target folder exists
# before starting so the final save cannot fail on a missing directory.
if not os.path.isdir("csv"):
    os.makedirs("csv")

# Calling Generator
# ``output`` starts as one row per customer id; GetPOSFeatures adds the feature columns to it.
output = pd.read_csv("../data/POS_CASH_balance.csv", usecols = ["SK_ID_CURR"]).drop_duplicates().reset_index(drop=True)
GetPOSFeatures(pos_cash_path = "../data/POS_CASH_balance.csv")

# Saving Data
if get_unnormalized:
    output.to_csv("csv/pos_cash_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalize_features(output, ["SK_ID_CURR"]).to_csv("csv/pos_cash_features_normalized_V{}.csv".format(version), index=False)

Getting Aggregates for column: SK_DPD_diff, 6
Done Aggregating Column SK_DPD_diff : 1828.79173899 seconds
Getting Kurtosis for Column: SK_DPD_diff, 6
Done Getting Kurtosis for Column SK_DPD_diff : 74.5211980343 seconds
Getting Trend for column: SK_DPD_diff, 6
Done Getting Trend for Column SK_DPD_diff : 101.981404066 seconds
Getting Aggregates for column: SK_DPD_DEF_diff, 6
Done Aggregating Column SK_DPD_DEF_diff : 74.5928058624 seconds
Getting Kurtosis for Column: SK_DPD_DEF_diff, 6
Done Getting Kurtosis for Column SK_DPD_DEF_diff : 74.2076630592 seconds
Getting Trend for column: SK_DPD_DEF_diff, 6
Done Getting Trend for Column SK_DPD_DEF_diff : 101.479686975 seconds
Getting Aggregates for column: SK_DPD_diff_diff, 6
Done Aggregating Column SK_DPD_diff_diff : 74.2912449837 seconds
Getting Kurtosis for Column: SK_DPD_diff_diff, 6
Done Getting Kurtosis for Column SK_DPD_diff_diff : 74.5988090038 seconds
Getting Trend for column: SK_DPD_diff_diff, 6
Done Getting Trend for Column SK_DPD_di

Done Getting Kurtosis for Column SK_DPD_DEF_rolling6 : 74.7521190643 seconds
Getting Trend for column: SK_DPD_DEF_rolling6, 12
Done Getting Trend for Column SK_DPD_DEF_rolling6 : 117.866225958 seconds
Getting Aggregates for column: CNT_INSTALMENT_rolling6, 12
Done Aggregating Column CNT_INSTALMENT_rolling6 : 74.8060290813 seconds
Getting Kurtosis for Column: CNT_INSTALMENT_rolling6, 12
Done Getting Kurtosis for Column CNT_INSTALMENT_rolling6 : 77.163779974 seconds
Getting Trend for column: CNT_INSTALMENT_rolling6, 12
Done Getting Trend for Column CNT_INSTALMENT_rolling6 : 119.07503891 seconds
Getting Aggregates for column: CNT_INSTALMENT_FUTURE_rolling6, 12
Done Aggregating Column CNT_INSTALMENT_FUTURE_rolling6 : 74.6169810295 seconds
Getting Kurtosis for Column: CNT_INSTALMENT_FUTURE_rolling6, 12
Done Getting Kurtosis for Column CNT_INSTALMENT_FUTURE_rolling6 : 78.7400350571 seconds
Getting Trend for column: CNT_INSTALMENT_FUTURE_rolling6, 12
Done Getting Trend for Column CNT_INSTALME

# Credit Card Features

In [10]:
def GetCreditCardFeatures(credit_card_path = "../data/credit_card_balance.csv"):
    """Generate lagged credit-card balance features into the global ``output`` frame.

    Reads the CSV at ``credit_card_path``, orders it by ``SK_ID_CURR``,
    ``SK_ID_PREV`` and ``MONTHS_BALANCE``, replaces missing values with 0.0 and
    builds ratio columns (each denominator is offset by 1.0, which avoids
    division by zero for non-negative amounts). Every ratio and selected raw
    column then gets a per-customer difference, second difference and 6/12-row
    rolling mean. For each lag in 6 and 12 the last ``lags`` rows per customer
    (in that sort order) are passed to ``generate_features`` column by column.

    Returns nothing; the module-level ``output`` DataFrame must exist beforehand.
    """
    credit_card = pd.read_csv(credit_card_path).sort_values(by=["SK_ID_CURR","SK_ID_PREV","MONTHS_BALANCE"]).reset_index(drop = True)
    credit_card = credit_card.fillna(0.0)


    # Generating Ratio Features
    credit_card['cnt_pos_ratio'] = credit_card['CNT_DRAWINGS_POS_CURRENT']/(credit_card['CNT_DRAWINGS_CURRENT']+1.0)
    credit_card['cnt_atm_ratio'] = credit_card['CNT_DRAWINGS_ATM_CURRENT']/(credit_card['CNT_DRAWINGS_CURRENT']+1.0)
    credit_card['amt_pos_ratio'] = credit_card['AMT_DRAWINGS_POS_CURRENT']/(credit_card['AMT_DRAWINGS_CURRENT']+1.0)
    credit_card['amt_atm_ratio'] = credit_card['AMT_DRAWINGS_ATM_CURRENT']/(credit_card['AMT_DRAWINGS_CURRENT']+1.0)
    credit_card['balance_to_drawing_ratio'] = credit_card['AMT_BALANCE']/(credit_card['AMT_DRAWINGS_CURRENT']+1.0)
    credit_card['balance_to_limit_ratio'] = credit_card['AMT_BALANCE']/(credit_card['AMT_CREDIT_LIMIT_ACTUAL']+1.0)
    credit_card['balance_to_total_payment_ratio'] = credit_card['AMT_BALANCE']/(credit_card['AMT_PAYMENT_TOTAL_CURRENT']+1.0)
    credit_card['balance_to_minimum_installment_ratio'] = credit_card['AMT_BALANCE']/(credit_card['AMT_INST_MIN_REGULARITY']+1.0)
    credit_card['minimum_installment_to_total_payment_ratio'] = credit_card['AMT_INST_MIN_REGULARITY']/(credit_card['AMT_PAYMENT_TOTAL_CURRENT']+1.0)
    credit_card['minimum_installment_to_current_payment_ratio'] = credit_card['AMT_INST_MIN_REGULARITY']/(credit_card['AMT_PAYMENT_CURRENT']+1.0)
    credit_card['current_to_total_payment_ratio'] = credit_card['AMT_PAYMENT_CURRENT']/(credit_card['AMT_PAYMENT_TOTAL_CURRENT']+1.0)
    credit_card['payment_to_receivable_min_ratio'] = credit_card['AMT_INST_MIN_REGULARITY']/(credit_card['AMT_RECEIVABLE_PRINCIPAL']+1.0)
    credit_card['payment_to_receivable_curr_ratio'] = credit_card['AMT_PAYMENT_CURRENT']/(credit_card['AMT_RECIVABLE']+1.0)
    credit_card['payment_to_receivable_total_ratio'] = credit_card['AMT_PAYMENT_TOTAL_CURRENT']/(credit_card['AMT_TOTAL_RECEIVABLE']+1.0)

    all_ratio_features = ["cnt_pos_ratio","cnt_atm_ratio","amt_pos_ratio","amt_atm_ratio","balance_to_drawing_ratio","balance_to_limit_ratio","balance_to_total_payment_ratio","balance_to_minimum_installment_ratio","minimum_installment_to_total_payment_ratio","minimum_installment_to_current_payment_ratio","current_to_total_payment_ratio","payment_to_receivable_min_ratio","payment_to_receivable_curr_ratio","payment_to_receivable_total_ratio"]
    other_features = ["SK_DPD","SK_DPD_DEF","CNT_DRAWINGS_ATM_CURRENT","AMT_DRAWINGS_CURRENT","AMT_TOTAL_RECEIVABLE","AMT_PAYMENT_TOTAL_CURRENT","CNT_INSTALMENT_MATURE_CUM","AMT_BALANCE","AMT_DRAWINGS_ATM_CURRENT","AMT_DRAWINGS_POS_CURRENT","CNT_DRAWINGS_POS_CURRENT","CNT_DRAWINGS_CURRENT"]

    # Order-preserving de-duplication. A column listed twice would be processed
    # twice, and the second merge inside generate_features would produce
    # "_x"/"_y" suffixed copies of its aggregate columns.
    source_columns = []
    for col in all_ratio_features + other_features:
        if col not in source_columns:
            source_columns.append(col)

    # Generating Features that will be used
    # Only columns that already exist are selected from this groupby; derived
    # columns are differenced through ``customer_ids`` instead.
    groups = credit_card.groupby("SK_ID_CURR")
    customer_ids = credit_card["SK_ID_CURR"]

    for col in source_columns:
        first_diff = groups[col].diff()
        credit_card[col + "_diff"] = first_diff
        credit_card[col + "_diff_diff"] = first_diff.groupby(customer_ids).diff()
        for window in (6, 12):
            # transform() always returns a result aligned to the original rows, whereas
            # apply() prepends the group key to the index on recent pandas versions.
            credit_card["{}_rolling{}".format(col, window)] = groups[col].transform(lambda s: s.rolling(window).mean())

    using_columns = (
        ["{}_diff".format(c) for c in source_columns]
        + ["{}_diff_diff".format(c) for c in source_columns]
        + ["{}_rolling6".format(c) for c in source_columns]
        + ["{}_rolling12".format(c) for c in source_columns]
    )
    statistics = ["sum","mean","median","max","min","std","skew"]

    # Lag Aggregate Features
    for lags in [6, 12]:
        groups = credit_card.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")

        for col in using_columns:
            generate_features("credit_card", groups, lags, col, get_trend = True, get_kurtosis = True, to_calculate = statistics)

        del groups
        gc.collect()

In [11]:
# Switches selecting which variants of the feature file are written.
get_normalized = True
get_unnormalized = True

# Feature generation runs for a long time; make sure the target folder exists
# before starting so the final save cannot fail on a missing directory.
if not os.path.isdir("csv"):
    os.makedirs("csv")

# Calling Generator
# ``output`` starts as one row per customer id; GetCreditCardFeatures adds the feature columns to it.
output = pd.read_csv("../data/credit_card_balance.csv", usecols = ["SK_ID_CURR"]).drop_duplicates().reset_index(drop=True)
GetCreditCardFeatures(credit_card_path = "../data/credit_card_balance.csv")

# Saving Data
if get_unnormalized:
    output.to_csv("csv/credit_card_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalize_features(output, ["SK_ID_CURR"]).to_csv("csv/credit_card_features_normalized_V{}.csv".format(version), index=False)

Getting Aggregates for column: cnt_pos_ratio_diff, 6
Done Aggregating Column cnt_pos_ratio_diff : 4025.372962 seconds
Getting Kurtosis for Column: cnt_pos_ratio_diff, 6
Done Getting Kurtosis for Column cnt_pos_ratio_diff : 23.1981739998 seconds
Getting Trend for column: cnt_pos_ratio_diff, 6
Done Getting Trend for Column cnt_pos_ratio_diff : 30.7753500938 seconds
Getting Aggregates for column: cnt_atm_ratio_diff, 6
Done Aggregating Column cnt_atm_ratio_diff : 22.8141238689 seconds
Getting Kurtosis for Column: cnt_atm_ratio_diff, 6
Done Getting Kurtosis for Column cnt_atm_ratio_diff : 23.4760601521 seconds
Getting Trend for column: cnt_atm_ratio_diff, 6
Done Getting Trend for Column cnt_atm_ratio_diff : 30.9736218452 seconds
Getting Aggregates for column: amt_pos_ratio_diff, 6
Done Aggregating Column amt_pos_ratio_diff : 23.0098161697 seconds
Getting Kurtosis for Column: amt_pos_ratio_diff, 6
Done Getting Kurtosis for Column amt_pos_ratio_diff : 23.4225189686 seconds
Getting Trend for c

Done Aggregating Column AMT_PAYMENT_TOTAL_CURRENT_diff : 22.7141890526 seconds
Getting Kurtosis for Column: AMT_PAYMENT_TOTAL_CURRENT_diff, 6
Done Getting Kurtosis for Column AMT_PAYMENT_TOTAL_CURRENT_diff : 23.2596149445 seconds
Getting Trend for column: AMT_PAYMENT_TOTAL_CURRENT_diff, 6
Done Getting Trend for Column AMT_PAYMENT_TOTAL_CURRENT_diff : 30.6682829857 seconds
Getting Aggregates for column: CNT_INSTALMENT_MATURE_CUM_diff, 6
Done Aggregating Column CNT_INSTALMENT_MATURE_CUM_diff : 22.9886181355 seconds
Getting Kurtosis for Column: CNT_INSTALMENT_MATURE_CUM_diff, 6
Done Getting Kurtosis for Column CNT_INSTALMENT_MATURE_CUM_diff : 22.7338769436 seconds
Getting Trend for column: CNT_INSTALMENT_MATURE_CUM_diff, 6
Done Getting Trend for Column CNT_INSTALMENT_MATURE_CUM_diff : 30.6263930798 seconds
Getting Aggregates for column: AMT_BALANCE_diff, 6
Done Aggregating Column AMT_BALANCE_diff : 22.7345058918 seconds
Getting Kurtosis for Column: AMT_BALANCE_diff, 6
Done Getting Kurtosi

Done Getting Kurtosis for Column current_to_total_payment_ratio_diff_diff : 23.3833069801 seconds
Getting Trend for column: current_to_total_payment_ratio_diff_diff, 6
Done Getting Trend for Column current_to_total_payment_ratio_diff_diff : 31.4475779533 seconds
Getting Aggregates for column: payment_to_receivable_min_ratio_diff_diff, 6
Done Aggregating Column payment_to_receivable_min_ratio_diff_diff : 23.0856020451 seconds
Getting Kurtosis for Column: payment_to_receivable_min_ratio_diff_diff, 6
Done Getting Kurtosis for Column payment_to_receivable_min_ratio_diff_diff : 23.8681018353 seconds
Getting Trend for column: payment_to_receivable_min_ratio_diff_diff, 6
Done Getting Trend for Column payment_to_receivable_min_ratio_diff_diff : 31.1894190311 seconds
Getting Aggregates for column: payment_to_receivable_curr_ratio_diff_diff, 6
Done Aggregating Column payment_to_receivable_curr_ratio_diff_diff : 23.2975800037 seconds
Getting Kurtosis for Column: payment_to_receivable_curr_ratio_d

Done Getting Trend for Column cnt_atm_ratio_rolling6 : 32.1439340115 seconds
Getting Aggregates for column: amt_pos_ratio_rolling6, 6
Done Aggregating Column amt_pos_ratio_rolling6 : 23.4448108673 seconds
Getting Kurtosis for Column: amt_pos_ratio_rolling6, 6
Done Getting Kurtosis for Column amt_pos_ratio_rolling6 : 23.0632650852 seconds
Getting Trend for column: amt_pos_ratio_rolling6, 6
Done Getting Trend for Column amt_pos_ratio_rolling6 : 30.7999999523 seconds
Getting Aggregates for column: amt_atm_ratio_rolling6, 6
Done Aggregating Column amt_atm_ratio_rolling6 : 22.9326930046 seconds
Getting Kurtosis for Column: amt_atm_ratio_rolling6, 6
Done Getting Kurtosis for Column amt_atm_ratio_rolling6 : 23.0919570923 seconds
Getting Trend for column: amt_atm_ratio_rolling6, 6
Done Getting Trend for Column amt_atm_ratio_rolling6 : 31.1421928406 seconds
Getting Aggregates for column: balance_to_drawing_ratio_rolling6, 6
Done Aggregating Column balance_to_drawing_ratio_rolling6 : 23.05122017

Done Getting Kurtosis for Column AMT_PAYMENT_TOTAL_CURRENT_rolling6 : 23.2582700253 seconds
Getting Trend for column: AMT_PAYMENT_TOTAL_CURRENT_rolling6, 6
Done Getting Trend for Column AMT_PAYMENT_TOTAL_CURRENT_rolling6 : 32.3962330818 seconds
Getting Aggregates for column: CNT_INSTALMENT_MATURE_CUM_rolling6, 6
Done Aggregating Column CNT_INSTALMENT_MATURE_CUM_rolling6 : 23.5043439865 seconds
Getting Kurtosis for Column: CNT_INSTALMENT_MATURE_CUM_rolling6, 6
Done Getting Kurtosis for Column CNT_INSTALMENT_MATURE_CUM_rolling6 : 23.4560709 seconds
Getting Trend for column: CNT_INSTALMENT_MATURE_CUM_rolling6, 6
Done Getting Trend for Column CNT_INSTALMENT_MATURE_CUM_rolling6 : 32.2745280266 seconds
Getting Aggregates for column: AMT_BALANCE_rolling6, 6
Done Aggregating Column AMT_BALANCE_rolling6 : 23.5225989819 seconds
Getting Kurtosis for Column: AMT_BALANCE_rolling6, 6
Done Getting Kurtosis for Column AMT_BALANCE_rolling6 : 23.4139239788 seconds
Getting Trend for column: AMT_BALANCE_r

Done Getting Kurtosis for Column current_to_total_payment_ratio_rolling12 : 23.2862579823 seconds
Getting Trend for column: current_to_total_payment_ratio_rolling12, 6
Done Getting Trend for Column current_to_total_payment_ratio_rolling12 : 34.3081169128 seconds
Getting Aggregates for column: payment_to_receivable_min_ratio_rolling12, 6
Done Aggregating Column payment_to_receivable_min_ratio_rolling12 : 24.2449738979 seconds
Getting Kurtosis for Column: payment_to_receivable_min_ratio_rolling12, 6
Done Getting Kurtosis for Column payment_to_receivable_min_ratio_rolling12 : 23.1749081612 seconds
Getting Trend for column: payment_to_receivable_min_ratio_rolling12, 6
Done Getting Trend for Column payment_to_receivable_min_ratio_rolling12 : 34.3585150242 seconds
Getting Aggregates for column: payment_to_receivable_curr_ratio_rolling12, 6
Done Aggregating Column payment_to_receivable_curr_ratio_rolling12 : 23.7921118736 seconds
Getting Kurtosis for Column: payment_to_receivable_curr_ratio_r

Done Aggregating Column amt_pos_ratio_diff : 23.8572189808 seconds
Getting Kurtosis for Column: amt_pos_ratio_diff, 12
Done Getting Kurtosis for Column amt_pos_ratio_diff : 23.1319220066 seconds
Getting Trend for column: amt_pos_ratio_diff, 12
Done Getting Trend for Column amt_pos_ratio_diff : 32.3809468746 seconds
Getting Aggregates for column: amt_atm_ratio_diff, 12
Done Aggregating Column amt_atm_ratio_diff : 24.6854791641 seconds
Getting Kurtosis for Column: amt_atm_ratio_diff, 12
Done Getting Kurtosis for Column amt_atm_ratio_diff : 24.0071468353 seconds
Getting Trend for column: amt_atm_ratio_diff, 12
Done Getting Trend for Column amt_atm_ratio_diff : 32.5114409924 seconds
Getting Aggregates for column: balance_to_drawing_ratio_diff, 12
Done Aggregating Column balance_to_drawing_ratio_diff : 23.8969120979 seconds
Getting Kurtosis for Column: balance_to_drawing_ratio_diff, 12
Done Getting Kurtosis for Column balance_to_drawing_ratio_diff : 23.773786068 seconds
Getting Trend for co

Done Getting Trend for Column CNT_INSTALMENT_MATURE_CUM_diff : 32.6838009357 seconds
Getting Aggregates for column: AMT_BALANCE_diff, 12
Done Aggregating Column AMT_BALANCE_diff : 24.1234760284 seconds
Getting Kurtosis for Column: AMT_BALANCE_diff, 12
Done Getting Kurtosis for Column AMT_BALANCE_diff : 23.5105919838 seconds
Getting Trend for column: AMT_BALANCE_diff, 12
Done Getting Trend for Column AMT_BALANCE_diff : 32.6429300308 seconds
Getting Aggregates for column: AMT_DRAWINGS_ATM_CURRENT_diff, 12
Done Aggregating Column AMT_DRAWINGS_ATM_CURRENT_diff : 24.3502039909 seconds
Getting Kurtosis for Column: AMT_DRAWINGS_ATM_CURRENT_diff, 12
Done Getting Kurtosis for Column AMT_DRAWINGS_ATM_CURRENT_diff : 23.5922119617 seconds
Getting Trend for column: AMT_DRAWINGS_ATM_CURRENT_diff, 12
Done Getting Trend for Column AMT_DRAWINGS_ATM_CURRENT_diff : 32.5759050846 seconds
Getting Aggregates for column: AMT_DRAWINGS_POS_CURRENT_diff, 12
Done Aggregating Column AMT_DRAWINGS_POS_CURRENT_diff 

Done Getting Trend for Column payment_to_receivable_min_ratio_diff_diff : 32.7106151581 seconds
Getting Aggregates for column: payment_to_receivable_curr_ratio_diff_diff, 12
Done Aggregating Column payment_to_receivable_curr_ratio_diff_diff : 24.3627560139 seconds
Getting Kurtosis for Column: payment_to_receivable_curr_ratio_diff_diff, 12
Done Getting Kurtosis for Column payment_to_receivable_curr_ratio_diff_diff : 23.5754787922 seconds
Getting Trend for column: payment_to_receivable_curr_ratio_diff_diff, 12
Done Getting Trend for Column payment_to_receivable_curr_ratio_diff_diff : 32.7276821136 seconds
Getting Aggregates for column: payment_to_receivable_total_ratio_diff_diff, 12
Done Aggregating Column payment_to_receivable_total_ratio_diff_diff : 24.7696800232 seconds
Getting Kurtosis for Column: payment_to_receivable_total_ratio_diff_diff, 12
Done Getting Kurtosis for Column payment_to_receivable_total_ratio_diff_diff : 23.4227769375 seconds
Getting Trend for column: payment_to_rec

Done Getting Kurtosis for Column amt_atm_ratio_rolling6 : 23.475659132 seconds
Getting Trend for column: amt_atm_ratio_rolling6, 12
Done Getting Trend for Column amt_atm_ratio_rolling6 : 34.300814867 seconds
Getting Aggregates for column: balance_to_drawing_ratio_rolling6, 12
Done Aggregating Column balance_to_drawing_ratio_rolling6 : 24.7188382149 seconds
Getting Kurtosis for Column: balance_to_drawing_ratio_rolling6, 12
Done Getting Kurtosis for Column balance_to_drawing_ratio_rolling6 : 23.345307827 seconds
Getting Trend for column: balance_to_drawing_ratio_rolling6, 12
Done Getting Trend for Column balance_to_drawing_ratio_rolling6 : 33.772149086 seconds
Getting Aggregates for column: balance_to_limit_ratio_rolling6, 12
Done Aggregating Column balance_to_limit_ratio_rolling6 : 24.8995690346 seconds
Getting Kurtosis for Column: balance_to_limit_ratio_rolling6, 12
Done Getting Kurtosis for Column balance_to_limit_ratio_rolling6 : 23.5958600044 seconds
Getting Trend for column: balanc

Done Getting Trend for Column CNT_INSTALMENT_MATURE_CUM_rolling6 : 33.3519659042 seconds
Getting Aggregates for column: AMT_BALANCE_rolling6, 12
Done Aggregating Column AMT_BALANCE_rolling6 : 25.7396380901 seconds
Getting Kurtosis for Column: AMT_BALANCE_rolling6, 12
Done Getting Kurtosis for Column AMT_BALANCE_rolling6 : 24.5962049961 seconds
Getting Trend for column: AMT_BALANCE_rolling6, 12
Done Getting Trend for Column AMT_BALANCE_rolling6 : 32.8393819332 seconds
Getting Aggregates for column: AMT_DRAWINGS_ATM_CURRENT_rolling6, 12
Done Aggregating Column AMT_DRAWINGS_ATM_CURRENT_rolling6 : 24.320235014 seconds
Getting Kurtosis for Column: AMT_DRAWINGS_ATM_CURRENT_rolling6, 12
Done Getting Kurtosis for Column AMT_DRAWINGS_ATM_CURRENT_rolling6 : 22.8960969448 seconds
Getting Trend for column: AMT_DRAWINGS_ATM_CURRENT_rolling6, 12
Done Getting Trend for Column AMT_DRAWINGS_ATM_CURRENT_rolling6 : 32.1323881149 seconds
Getting Aggregates for column: AMT_DRAWINGS_POS_CURRENT_rolling6, 12

Done Getting Kurtosis for Column payment_to_receivable_min_ratio_rolling12 : 22.8654119968 seconds
Getting Trend for column: payment_to_receivable_min_ratio_rolling12, 12
Done Getting Trend for Column payment_to_receivable_min_ratio_rolling12 : 34.2308580875 seconds
Getting Aggregates for column: payment_to_receivable_curr_ratio_rolling12, 12
Done Aggregating Column payment_to_receivable_curr_ratio_rolling12 : 24.4986400604 seconds
Getting Kurtosis for Column: payment_to_receivable_curr_ratio_rolling12, 12
Done Getting Kurtosis for Column payment_to_receivable_curr_ratio_rolling12 : 22.9332590103 seconds
Getting Trend for column: payment_to_receivable_curr_ratio_rolling12, 12
Done Getting Trend for Column payment_to_receivable_curr_ratio_rolling12 : 35.0303618908 seconds
Getting Aggregates for column: payment_to_receivable_total_ratio_rolling12, 12
Done Aggregating Column payment_to_receivable_total_ratio_rolling12 : 24.4589579105 seconds
Getting Kurtosis for Column: payment_to_receiva

# Previous Application Features

In [None]:
def GetPreviousAppFeatures(prev_app_path = "../data/previous_application.csv"):
    """Generate lagged previous-application features into the global ``output`` frame.

    Reads the CSV at ``prev_app_path``, orders it by customer and
    ``DAYS_DECISION``, adds ``proportion_granted`` (``AMT_APPLICATION`` over
    ``AMT_CREDIT + 1.0``), and derives per-customer differences, second
    differences and 6/12-row rolling means of four columns. For each lag in
    12, 60 and 90 the last ``lags`` rows per customer (in that sort order) are
    passed to ``generate_features`` column by column.

    Returns nothing; the module-level ``output`` DataFrame must exist beforehand.
    """
    prev_apps = pd.read_csv(prev_app_path).sort_values(by=["SK_ID_CURR","DAYS_DECISION"]).reset_index(drop = True)

    # Generating Some Features
    prev_apps["proportion_granted"] = prev_apps["AMT_APPLICATION"]/(prev_apps["AMT_CREDIT"]+1.0)

    # All Aggregate Features
    usecols = ["CNT_PAYMENT","DAYS_DECISION","DAYS_FIRST_DRAWING","proportion_granted"]
    # Only columns that already exist are selected from this groupby; derived
    # columns are differenced through ``customer_ids`` instead.
    groups = prev_apps.groupby("SK_ID_CURR")
    customer_ids = prev_apps["SK_ID_CURR"]

    for col in usecols:
        first_diff = groups[col].diff()
        prev_apps[col + "_diff"] = first_diff
        prev_apps[col + "_diff_diff"] = first_diff.groupby(customer_ids).diff()
        for window in (6, 12):
            # transform() always returns a result aligned to the original rows, whereas
            # apply() prepends the group key to the index on recent pandas versions.
            prev_apps["{}_rolling{}".format(col, window)] = groups[col].transform(lambda s: s.rolling(window).mean())

    using_columns = (
        ["{}_diff".format(c) for c in usecols]
        + ["{}_diff_diff".format(c) for c in usecols]
        + ["{}_rolling6".format(c) for c in usecols]
        + ["{}_rolling12".format(c) for c in usecols]
    )
    statistics = ["sum","mean","median","max","min","std","skew"]

    # For Lagged Items
    for lags in [12,60,90]:
        groups = prev_apps.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")

        for col in using_columns:
            generate_features("prev_apps", groups, lags, col, get_trend = True, get_kurtosis = True, to_calculate = statistics)

        del groups
        gc.collect()
    
 

In [None]:
# Switches selecting which variants of the feature file are written.
get_normalized = True
get_unnormalized = True

# Feature generation runs for a long time; make sure the target folder exists
# before starting so the final save cannot fail on a missing directory.
if not os.path.isdir("csv"):
    os.makedirs("csv")

# Calling Generator
# ``output`` starts as one row per customer id; GetPreviousAppFeatures adds the feature columns to it.
output = pd.read_csv("../data/previous_application.csv", usecols = ["SK_ID_CURR"]).drop_duplicates().reset_index(drop=True)
GetPreviousAppFeatures(prev_app_path = "../data/previous_application.csv")

# Saving Data
if get_unnormalized:
    output.to_csv("csv/previous_application_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalize_features(output, ["SK_ID_CURR"]).to_csv("csv/previous_application_features_normalized_V{}.csv".format(version), index=False)