In [1]:
# Suffix substituted into the "_V{}" part of every CSV file name written by the cells below.
version = "1"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, Normalizer
%matplotlib inline

In [3]:
# Running list of checkpoint timestamps; the first entry marks when this cell was run.
timesheet = [time()]
def timer(statement):
    """Report the seconds elapsed since the previous checkpoint and record a new one.

    Appends the current time to the module-level ``timesheet`` list and prints
    the difference between its two most recent entries.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.
    """
    # The list is only mutated, never rebound, so no ``global`` declaration is needed.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # One pre-formatted argument prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print("{} : {} seconds".format(statement, elapsed))

# Feature Generators

In [4]:
def get_regression_coefficient(x):
    """Least-squares slope of ``x`` against its positions ``0 .. len(x) - 1``.

    The design matrix is the single column of positions, so the fitted line has
    no intercept term (it is forced through the origin).

    Parameters
    ----------
    x : pandas.Series
        Values to fit; must expose ``.shape`` and ``.values``. Callers in this
        notebook drop NaN before passing the series in.

    Returns
    -------
    float
        The fitted coefficient, or ``np.nan`` when ``x`` is empty.
    """
    if x.shape[0] == 0:
        return np.nan

    positions = np.arange(0, len(x)).reshape(-1, 1)
    observed = x.values.reshape(-1, 1)
    # rcond is passed explicitly: leaving it out makes NumPy 1.14+ emit a
    # FutureWarning on every call, and -1 keeps the machine-precision cutoff
    # that the implicit default used to mean.
    solution = np.linalg.lstsq(positions, observed, rcond=-1)[0]
    return solution[0][0]

In [5]:
def generate_features(prefix, dataGroups, num_lags, colname, get_trend = False, get_kurtosis = True, to_calculate = ["sum", "count","nunique","mean","median","max","min","std","skew"]):
    """Add per-group statistics of one column to the module-level ``output`` frame.

    New columns are named ``{prefix}_{colname}_{statistic}_{num_lags}``.
    ``output`` is rebound (by the merge) and extended in place; nothing is
    returned.

    Parameters
    ----------
    prefix : str
        Leading part of every generated column name.
    dataGroups : pandas GroupBy
        Grouped data; results are matched to ``output["SK_ID_CURR"]``, so the
        group keys are presumably SK_ID_CURR values (confirm against callers).
    num_lags : int or str
        Window label used in the column names and log messages.
    colname : str
        Column of ``dataGroups`` to summarise.
    get_trend : bool
        When true, also store ``get_regression_coefficient`` of each group's
        non-null values.
    get_kurtosis : bool
        When true, also store each group's kurtosis.
    to_calculate : list of str
        Aggregation names passed to ``GroupBy.agg``; an empty list skips the
        aggregation step. The default list is never mutated here.
    """
    global output
    # Every message is formatted into a single string so the print call behaves
    # identically under Python 2 and Python 3.
    if len(to_calculate) > 0:
        print("Getting Aggregates for column: {}, {}".format(colname, num_lags))
        aggregates = dataGroups[colname].agg(to_calculate)
        aggregates.columns = ["{}_{}_{}_{}".format(prefix, colname, c, num_lags) for c in aggregates.columns]
        # The aggregate index holds the group keys, hence right_index=True.
        output = output.merge(aggregates, how = "left", left_on = "SK_ID_CURR", right_index=True)
        timer("Done Aggregating Column {}".format(colname))

    if get_kurtosis:
        print("Getting Kurtosis for Column: {}, {}".format(colname, num_lags))
        kurtosis_by_group = dataGroups[colname].apply(lambda x: x.kurt())
        output["{}_{}_kurtosis_{}".format(prefix, colname, num_lags)] = output["SK_ID_CURR"].map(kurtosis_by_group)
        timer("Done Getting Kurtosis for Column {}".format(colname))

    if get_trend:
        print("Getting Trend for column: {}, {}".format(colname, num_lags))
        trend_by_group = dataGroups[colname].apply(lambda x: get_regression_coefficient(x.dropna()))
        output["{}_{}_trend_{}".format(prefix, colname, num_lags)] = output["SK_ID_CURR"].map(trend_by_group)
        timer("Done Getting Trend for Column {}".format(colname))


# Feature Preprocessors

In [6]:
def one_hot_encoder(data, nan_as_category = True):
    """One-hot encode every object-dtype column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode; it is not modified.
    nan_as_category : bool
        Forwarded to ``pd.get_dummies`` as ``dummy_na``, which adds an extra
        indicator column for missing values.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pd.get_dummies``.
    """
    object_columns = []
    for name in data.columns:
        if data[name].dtype == 'object':
            object_columns.append(name)
    return pd.get_dummies(data, columns=object_columns, dummy_na=nan_as_category)

def normalize_features(data, not_to_normalize):
    """Return a copy of ``data`` with the selected columns standardised.

    Infinite values are first turned into NaN across the whole frame. Each
    column not listed in ``not_to_normalize`` then has its NaN filled with the
    column mean and is rescaled to zero mean and unit (population) standard
    deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    not_to_normalize : list of str
        Column labels to leave unscaled. Every label must exist in ``data``.

    Returns
    -------
    pandas.DataFrame
        The cleaned and scaled frame.

    Raises
    ------
    KeyError
        If a label in ``not_to_normalize`` is not a column of ``data``.
    """
    columns_to_normalize = data.columns.drop(not_to_normalize)
    data = data.replace([np.inf, -np.inf], np.nan)

    for col in columns_to_normalize:
        filled = data[col].fillna(data[col].mean()).astype("float64")
        centered = filled - filled.mean()
        spread = filled.std(ddof=0)
        # Scaling is done per column on purpose. sklearn's Normalizer rescales
        # each *row* to unit norm, so on a single column reshaped to (n, 1) it
        # would turn every value into its sign.
        # A constant column (spread 0) becomes all zeros and an all-NaN column
        # (spread NaN) stays NaN, instead of dividing by zero.
        data[col] = centered / spread if spread > 0 else centered
    return data

# Get Application Features

In [None]:
def GetApplicationFeatures(get_ohe = False, normalize = False, get_flags = False, train_path = "../data/application_train.csv", test_path = "../data/application_test.csv"):
    """Build the application feature table from the train and test CSV files.

    The two files are stacked, sentinel values are replaced by NaN, ratio and
    summary features are derived, and categorical columns are encoded.

    Parameters
    ----------
    get_ohe : bool
        One-hot encode object columns (via ``one_hot_encoder``) when true,
        otherwise label-encode them.
    normalize : bool
        When true, pass the frame through ``normalize_features``. SK_ID_CURR
        and TARGET are never scaled, nor are label-encoded columns.
    get_flags : bool
        When false, the FLAG_DOCUMENT_* columns listed at the end are dropped.
    train_path, test_path : str
        Locations of the two application CSV files.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with the engineered columns added.
    """
    # Loading Data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    data = pd.concat([train, test], axis=0)

    # Cleaning {To be improved later}
    # Each result is assigned back to the frame: an in-place replace on
    # ``data[col]`` works on a temporary Series and does not update ``data``
    # when pandas copy-on-write is active.
    sentinel_values = [
        ('CODE_GENDER', 'XNA'),
        ('DAYS_EMPLOYED', 365243),
        ('DAYS_LAST_PHONE_CHANGE', 0),
        ('NAME_FAMILY_STATUS', 'Unknown'),
        ('ORGANIZATION_TYPE', 'XNA'),
    ]
    for column, sentinel in sentinel_values:
        data[column] = data[column].replace(sentinel, np.nan)

    # Feature Engineering on groupby
    inc_by_org = data[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
    data['new_inc_by_org'] = data['ORGANIZATION_TYPE'].map(inc_by_org)

    # Feature Engineering on Flag Columns
    docs = [c for c in data.columns if 'FLAG_DOC' in c]
    live = [c for c in data.columns if ('FLAG_' in c) and ('FLAG_DOC' not in c) and ('_FLAG_' not in c)]
    data['new_doc_ind_kurt'] = data[docs].kurtosis(axis=1)
    # ``live`` also matches FLAG_OWN_CAR / FLAG_OWN_REALTY, which are only
    # factorized further down and so may still be non-numeric here.
    # numeric_only=True leaves such columns out explicitly rather than relying
    # on pandas dropping them silently.
    data['new_live_ind_sum'] = data[live].sum(axis=1, numeric_only=True)

    # Feature Engineering on feature interactions
    data['annuity_income_percentage'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL'].astype("float32")
    data['car_to_birth_ratio'] = data['OWN_CAR_AGE'] / data['DAYS_BIRTH'].astype("float32")
    data['car_to_employ_ratio'] = data['OWN_CAR_AGE'] / data['DAYS_EMPLOYED'].astype("float32")
    data['children_ratio'] = data['CNT_CHILDREN'] / data['CNT_FAM_MEMBERS'].astype("float32")
    data['credit_to_annuity_ratio'] = data['AMT_CREDIT'] / data['AMT_ANNUITY'].astype("float32")
    data['credit_to_goods_ratio'] = data['AMT_CREDIT'] / data['AMT_GOODS_PRICE'].astype("float32")
    data['credit_to_income_ratio'] = data['AMT_CREDIT'] / data['AMT_INCOME_TOTAL'].astype("float32")
    data['days_employed_percentage'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH'].astype("float32")
    data['income_credit_percentage'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT'].astype("float32")
    data['income_per_child'] = data['AMT_INCOME_TOTAL'] / (1 + data['CNT_CHILDREN']).astype("float32")
    data['income_per_person'] = data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS'].astype("float32")
    data['payment_rate'] = data['AMT_ANNUITY'] / data['AMT_CREDIT'].astype("float32")
    data['phone_to_birth_ratio'] = data['DAYS_LAST_PHONE_CHANGE'] / data['DAYS_BIRTH'].astype("float32")
    data['phone_to_employ_ratio'] = data['DAYS_LAST_PHONE_CHANGE'] / data['DAYS_EMPLOYED'].astype("float32")
    data['external_sources_weighted'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
    data['external_sources_product'] = data.EXT_SOURCE_1 * data.EXT_SOURCE_2 *  data.EXT_SOURCE_3
    data['cnt_non_child'] = data['CNT_FAM_MEMBERS'] - data['CNT_CHILDREN']
    data['child_to_non_child_ratio'] = data['CNT_CHILDREN'] / data['cnt_non_child'].astype("float32")
    data['income_per_non_child'] = data['AMT_INCOME_TOTAL'] / data['cnt_non_child'].astype("float32")
    data['credit_per_person'] = data['AMT_CREDIT'] / data['CNT_FAM_MEMBERS'].astype("float32")
    data['credit_per_child'] = data['AMT_CREDIT'] / (1 + data['CNT_CHILDREN']).astype("float32")
    data['credit_per_non_child'] = data['AMT_CREDIT'] / data['cnt_non_child'].astype("float32")
    # Row-wise summaries of the three external scores; the NumPy function is
    # looked up by name with getattr instead of eval.
    for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian',"std"]:
        reducer = getattr(np, function_name)
        data['external_sources_{}'.format(function_name)] = reducer(data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

    # Generating Binary Indicator Variables for Certain Columns
    # NOTE(review): if the DAYS_* columns are negative offsets (assumed, not
    # shown here), "< -2000" / "< -14000" select long employment and older
    # applicants, the opposite of what the column names suggest. Confirm.
    data['short_employment'] = (data['DAYS_EMPLOYED'] < -2000).astype(int)
    data['young_age'] = (data['DAYS_BIRTH'] < -14000).astype(int)

    # Handling Categorical Data
    # Captured before the factorize step, so the three binary columns below are
    # part of this list as long as they are object-typed at this point.
    categorical_columns = [c for c in data.columns if data[c].dtype == "object"]

    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        data[bin_feature], uniques = pd.factorize(data[bin_feature])

    if get_ohe:
        data = one_hot_encoder(data, True)
    else:
        # NOTE(review): columns cleaned above can hold NaN next to strings;
        # confirm the installed scikit-learn LabelEncoder accepts that mix.
        for col in categorical_columns:
            data[col] = LabelEncoder().fit_transform(data[col])

    # Normalizing Features
    if normalize:
        if get_ohe:
            data = normalize_features(data,["SK_ID_CURR","TARGET"])
        else:
            data = normalize_features(data,["SK_ID_CURR","TARGET"]+categorical_columns)

    # Handling Flag Columns {Work some more on it}
    # FLAG_DOCUMENT_2 and FLAG_DOCUMENT_4 .. FLAG_DOCUMENT_21; FLAG_DOCUMENT_3
    # is not in the list, so it is always kept.
    flag_columns = ['FLAG_DOCUMENT_{}'.format(number) for number in [2] + list(range(4, 22))]

    if get_flags:
        return data
    return data.drop(flag_columns, axis=1)

In [None]:
# Switches selecting which variants of the application feature table to export.
get_normalized = False
get_unnormalized = True

# Both variants read the same input files; only the encoding and scaling options differ.
if get_normalized:
    output = GetApplicationFeatures(get_ohe = True, normalize = True, get_flags = False, train_path = "../data/application_train.csv", test_path = "../data/application_test.csv")
    output.to_csv("csv/application_features_normalized_V{}.csv".format(version), index=False)
if get_unnormalized:
    output = GetApplicationFeatures(get_ohe = False, normalize = False, get_flags = False, train_path = "../data/application_train.csv", test_path = "../data/application_test.csv")
    output.to_csv("csv/application_features_V{}.csv".format(version), index=False)

# Get Installment Features

In [7]:
def GetInstallmentFeatures(installment_path = "../data/installments_payments.csv"):
    """Derive installment-payment features and add them to the global ``output``.

    Rows are sorted by SK_ID_CURR and DAYS_INSTALMENT, per-row features are
    derived, and ``generate_features`` is called for three kinds of window:
    every row ("all"), the last N rows of each SK_ID_CURR (one pass per lag),
    and the last row of each SK_ID_PREV ("last").

    The function returns nothing. ``generate_features`` writes into the
    module-level ``output`` frame, which must already exist with an SK_ID_CURR
    column.

    Parameters
    ----------
    installment_path : str
        Location of the installments CSV file.
    """
    installments = pd.read_csv(installment_path).sort_values(by=["SK_ID_CURR","DAYS_INSTALMENT"]).reset_index(drop = True)

    # Generating Some Features
    installments['installment_paid_late_in_days'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
    installments['installment_paid_late'] = (installments['installment_paid_late_in_days'] > 0).astype(int)

    installments['installment_paid_over_amount'] = installments['AMT_PAYMENT'] - installments['AMT_INSTALMENT']
    installments['installment_paid_over_amount_ratio'] = installments['AMT_PAYMENT'] / (1+installments['AMT_INSTALMENT'])
    installments['installment_paid_over'] = (installments['installment_paid_over_amount'] > 0).astype(int)

    installments["amt_instalment_pct_change"] = installments.groupby("SK_ID_CURR")["AMT_INSTALMENT"].pct_change().replace([np.inf, -np.inf], np.nan)
    installments["days_instalment_diff"] = installments.groupby("SK_ID_CURR")["DAYS_INSTALMENT"].diff()

    # Column sets shared by all windows, declared once so the windows cannot drift apart.
    full_stats = ["sum","mean","median","max","min","std","skew"]
    indicator_stats = ["sum","mean"]
    numeric_columns = [
        "installment_paid_late_in_days",
        "installment_paid_over_amount",
        "installment_paid_over_amount_ratio",
        "NUM_INSTALMENT_VERSION",
        "amt_instalment_pct_change",
        "days_instalment_diff",
    ]
    indicator_columns = ["installment_paid_late", "installment_paid_over"]

    def summarise_window(grouped, window, columns):
        # Full statistics, kurtosis and trend for numeric columns; sum and mean only for 0/1 indicators.
        for column in columns:
            generate_features("installment", grouped, window, column, get_trend = True, get_kurtosis = True, to_calculate = list(full_stats))
        for column in indicator_columns:
            generate_features("installment", grouped, window, column, get_trend = False, get_kurtosis = False, to_calculate = list(indicator_stats))

    # All Aggregate Features
    groups = installments.groupby("SK_ID_CURR")

    generate_features("installment",groups, "all", "SK_ID_PREV", get_trend = False, get_kurtosis = False, to_calculate = ["count","nunique"])
    generate_features("installment",groups, "all", "DAYS_INSTALMENT", get_trend = False, get_kurtosis = False, to_calculate = ["min","max","mean"])
    summarise_window(groups, "all", numeric_columns)

    # For Lagged Items
    for lags in [2, 3, 6, 12, 18, 36, 48, 60, 72]:
        groups = installments.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")
        summarise_window(groups, lags, numeric_columns)
        del groups
        gc.collect()

    # For Last Item of Each Previous ID
    # The last row is taken per SK_ID_PREV. Taking it per SK_ID_CURR leaves a
    # single row in every group, which makes std, skew, kurtosis and trend
    # meaningless for this window.
    groups = installments.groupby("SK_ID_PREV").tail(1).reset_index(drop=True).groupby("SK_ID_CURR")

    # days_instalment_diff is not summarised for this window.
    summarise_window(groups, "last", [column for column in numeric_columns if column != "days_instalment_diff"])
    del groups
    gc.collect()

In [None]:
# Switches selecting which variants of the installment feature table to export.
get_normalized = True
get_unnormalized = True

# Seed the global feature table with one row per SK_ID_CURR, then let the
# generator append its columns to it.
installment_csv = "../data/installments_payments.csv"
output = (
    pd.read_csv(installment_csv, usecols = ["SK_ID_CURR"])
    .drop_duplicates()
    .reset_index(drop=True)
)
GetInstallmentFeatures(installment_path = installment_csv)

# Distinct SK_ID_PREV values divided by the number of SK_ID_PREV entries per SK_ID_CURR.
distinct_prev_ids = output["installment_SK_ID_PREV_nunique_all"]
counted_prev_ids = output["installment_SK_ID_PREV_count_all"].astype("float32")
output["installment_IDprevUNIQUE_to_IDprevALL_ratio"] = distinct_prev_ids / counted_prev_ids

# Export the raw table first, then the scaled copy.
if get_unnormalized:
    output.to_csv("csv/installment_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalized_output = normalize_features(output, ["SK_ID_CURR"])
    normalized_output.to_csv("csv/installment_features_normalized_V{}.csv".format(version), index=False)

Getting Aggregates for column: SK_ID_PREV
Done Aggregating Column SK_ID_PREV : 400.187371969 seconds
Getting Aggregates for column: DAYS_INSTALMENT
Done Aggregating Column DAYS_INSTALMENT : 0.374568939209 seconds
Getting Aggregates for column: installment_paid_late_in_days
Done Aggregating Column installment_paid_late_in_days : 67.5650811195 seconds
Getting Kurtosis for Column: installment_paid_late_in_days
Done Getting Kurtosis for Column installment_paid_late_in_days : 72.50918293 seconds
Getting Trend for column: installment_paid_late_in_days


  This is separate from the ipykernel package so we can avoid doing imports until


Done Getting Trend for Column installment_paid_late_in_days : 183.567909956 seconds
Getting Aggregates for column: installment_paid_over_amount
Done Aggregating Column installment_paid_over_amount : 68.6528539658 seconds
Getting Kurtosis for Column: installment_paid_over_amount
Done Getting Kurtosis for Column installment_paid_over_amount : 70.6632070541 seconds
Getting Trend for column: installment_paid_over_amount
Done Getting Trend for Column installment_paid_over_amount : 184.799141169 seconds
Getting Aggregates for column: installment_paid_over_amount_ratio
Done Aggregating Column installment_paid_over_amount_ratio : 67.7051548958 seconds
Getting Kurtosis for Column: installment_paid_over_amount_ratio
Done Getting Kurtosis for Column installment_paid_over_amount_ratio : 71.645029068 seconds
Getting Trend for column: installment_paid_over_amount_ratio
Done Getting Trend for Column installment_paid_over_amount_ratio : 184.607728004 seconds
Getting Aggregates for column: NUM_INSTALME

Done Aggregating Column installment_paid_over_amount : 66.8332810402 seconds
Getting Kurtosis for Column: installment_paid_over_amount
Done Getting Kurtosis for Column installment_paid_over_amount : 66.8879468441 seconds
Getting Trend for column: installment_paid_over_amount
Done Getting Trend for Column installment_paid_over_amount : 154.007226229 seconds
Getting Aggregates for column: installment_paid_over_amount_ratio
Done Aggregating Column installment_paid_over_amount_ratio : 66.3323447704 seconds
Getting Kurtosis for Column: installment_paid_over_amount_ratio
Done Getting Kurtosis for Column installment_paid_over_amount_ratio : 67.4568190575 seconds
Getting Trend for column: installment_paid_over_amount_ratio
Done Getting Trend for Column installment_paid_over_amount_ratio : 154.346445084 seconds
Getting Aggregates for column: NUM_INSTALMENT_VERSION
Done Aggregating Column NUM_INSTALMENT_VERSION : 66.5296199322 seconds
Getting Kurtosis for Column: NUM_INSTALMENT_VERSION
Done Gett

Done Getting Kurtosis for Column installment_paid_over_amount : 68.4471569061 seconds
Getting Trend for column: installment_paid_over_amount
Done Getting Trend for Column installment_paid_over_amount : 170.282650948 seconds
Getting Aggregates for column: installment_paid_over_amount_ratio
Done Aggregating Column installment_paid_over_amount_ratio : 68.0835461617 seconds
Getting Kurtosis for Column: installment_paid_over_amount_ratio
Done Getting Kurtosis for Column installment_paid_over_amount_ratio : 70.1743240356 seconds
Getting Trend for column: installment_paid_over_amount_ratio
Done Getting Trend for Column installment_paid_over_amount_ratio : 169.885516882 seconds
Getting Aggregates for column: NUM_INSTALMENT_VERSION
Done Aggregating Column NUM_INSTALMENT_VERSION : 68.3648459911 seconds
Getting Kurtosis for Column: NUM_INSTALMENT_VERSION
Done Getting Kurtosis for Column NUM_INSTALMENT_VERSION : 70.2151651382 seconds
Getting Trend for column: NUM_INSTALMENT_VERSION
Done Getting Tr

# Get POS Cash Features

In [None]:
def GetPOSFeatures(pos_cash_path = "../data/POS_CASH_balance.csv"):
    """Derive POS/cash balance features and add them to the global ``output``.

    Rows are sorted by SK_ID_CURR, SK_ID_PREV and MONTHS_BALANCE, per-row
    indicators and differences are derived, and ``generate_features`` is called
    for three kinds of window: every row ("all"), the last N rows of each
    SK_ID_CURR (one pass per lag), and the last row of each SK_ID_PREV ("last").

    The function returns nothing. ``generate_features`` writes into the
    module-level ``output`` frame, which must already exist with an SK_ID_CURR
    column.

    Parameters
    ----------
    pos_cash_path : str
        Location of the POS/cash balance CSV file.
    """
    pos_cash = pd.read_csv(pos_cash_path).sort_values(by=["SK_ID_CURR","SK_ID_PREV","MONTHS_BALANCE"]).reset_index(drop = True)

    # Generating Some Features
    pos_cash['pos_cash_paid_late'] = (pos_cash['SK_DPD'] > 0).astype(int)
    pos_cash['pos_cash_paid_late_with_tolerance'] = (pos_cash['SK_DPD_DEF'] > 0).astype(int)
    pos_cash['contract_is_active'] = (pos_cash['NAME_CONTRACT_STATUS'] == "Active").astype(int)
    pos_cash['contract_is_completed'] = (pos_cash['NAME_CONTRACT_STATUS'] == "Completed").astype(int)
    pos_cash['contract_is_signed'] = (pos_cash['NAME_CONTRACT_STATUS'] == "Signed").astype(int)
    pos_cash["cnt_installment_diff_by_IDprev"] = pos_cash.groupby("SK_ID_PREV")["CNT_INSTALMENT"].diff()
    pos_cash["cnt_installment_future_diff_by_IDprev"] = pos_cash.groupby("SK_ID_PREV")["CNT_INSTALMENT_FUTURE"].diff()

    # Column sets shared by all windows, declared once so the windows cannot drift apart.
    full_stats = ["sum","mean","median","max","min","std","skew"]
    indicator_stats = ["sum","mean"]
    numeric_columns = [
        "SK_DPD",
        "SK_DPD_DEF",
        "CNT_INSTALMENT",
        "CNT_INSTALMENT_FUTURE",
        "cnt_installment_diff_by_IDprev",
        "cnt_installment_future_diff_by_IDprev",
    ]
    indicator_columns = [
        "pos_cash_paid_late",
        "pos_cash_paid_late_with_tolerance",
        "contract_is_active",
        "contract_is_completed",
        "contract_is_signed",
    ]

    def summarise_window(grouped, window):
        # Full statistics, kurtosis and trend for numeric columns; sum and mean only for 0/1 indicators.
        for column in numeric_columns:
            generate_features("pos_cash", grouped, window, column, get_trend = True, get_kurtosis = True, to_calculate = list(full_stats))
        for column in indicator_columns:
            generate_features("pos_cash", grouped, window, column, get_trend = False, get_kurtosis = False, to_calculate = list(indicator_stats))

    # All Aggregate Features
    groups = pos_cash.groupby("SK_ID_CURR")

    generate_features("pos_cash",groups, "all", "SK_ID_PREV", get_trend = False, get_kurtosis = False, to_calculate = ["count","nunique"])
    generate_features("pos_cash",groups, "all", "MONTHS_BALANCE", get_trend = False, get_kurtosis = False, to_calculate = ["min","max","mean"])
    summarise_window(groups, "all")

    #For Lagged Items
    # NOTE(review): rows are ordered by SK_ID_PREV before MONTHS_BALANCE, so
    # tail(lags) follows that order rather than a purely chronological one
    # across previous loans. Confirm this is the intended window.
    for lags in [2, 3, 6, 12, 18, 36, 48, 60, 72]:
        groups = pos_cash.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")
        summarise_window(groups, lags)

        del groups
        gc.collect()

    # For Last Item of Each Previous ID
    # The last row is taken per SK_ID_PREV. Taking it per SK_ID_CURR leaves a
    # single row in every group, which makes std, skew, kurtosis and trend
    # meaningless for this window.
    groups = pos_cash.groupby("SK_ID_PREV").tail(1).reset_index(drop=True).groupby("SK_ID_CURR")

    summarise_window(groups, "last")
    del groups
    gc.collect()

In [None]:
# Switches selecting which variants of the POS/cash feature table to export.
get_normalized = True
get_unnormalized = True

# Seed the global feature table with one row per SK_ID_CURR, then let the
# generator append its columns to it.
pos_cash_csv = "../data/POS_CASH_balance.csv"
output = (
    pd.read_csv(pos_cash_csv, usecols = ["SK_ID_CURR"])
    .drop_duplicates()
    .reset_index(drop=True)
)
GetPOSFeatures(pos_cash_path = pos_cash_csv)

# Distinct SK_ID_PREV values divided by the number of SK_ID_PREV entries per SK_ID_CURR.
distinct_prev_ids = output["pos_cash_SK_ID_PREV_nunique_all"]
counted_prev_ids = output["pos_cash_SK_ID_PREV_count_all"].astype("float32")
output["pos_cash_IDprevUNIQUE_to_IDprevALL_ratio"] = distinct_prev_ids / counted_prev_ids

# Export the raw table first, then the scaled copy.
if get_unnormalized:
    output.to_csv("csv/pos_cash_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalized_output = normalize_features(output, ["SK_ID_CURR"])
    normalized_output.to_csv("csv/pos_cash_features_normalized_V{}.csv".format(version), index=False)

# Credit Card Features

In [None]:
def GetCreditCardFeatures(credit_card_path = "../data/credit_card_balance.csv"):
    """Derive credit-card features and hand them to generate_features().

    The CSV at ``credit_card_path`` is read, sorted by
    (SK_ID_CURR, SK_ID_PREV, MONTHS_BALANCE) and every NaN is replaced by 0.0.
    Binary flags and "+1 smoothed" ratio columns are then added, and
    generate_features() is called per column for the whole history ("all")
    and for the last 3 / 12 / 24 / 36 rows of each customer.

    Nothing is returned: generate_features() merges its results into the
    global ``output`` frame, which therefore has to exist before this call.
    """
    prefix = "credit_card"
    full_stats = ["sum","mean","median","max","min","std","skew"]
    ratio_stats = ["mean","max","min","std","skew"]
    flag_stats = ["sum","mean"]

    cc = pd.read_csv(credit_card_path)
    cc = cc.sort_values(by=["SK_ID_CURR","SK_ID_PREV","MONTHS_BALANCE"]).reset_index(drop = True)
    cc = cc.fillna(0.0)

    # ---- Binary features: (new column, source column or status value) ----
    positive_flags = [
        ("credit_card_paid_late", "SK_DPD"),
        ("credit_card_paid_late_with_tolerance", "SK_DPD_DEF"),
    ]
    status_flags = [
        ("contract_is_active", "Active"),
        ("contract_is_completed", "Completed"),
        ("contract_is_signed", "Signed"),
    ]
    zero_flags = [
        ("amt_drawings_atm_current_is_zero", "AMT_DRAWINGS_ATM_CURRENT"),
        ("amt_drawings_pos_current_is_zero", "AMT_DRAWINGS_POS_CURRENT"),
        ("amt_drawing_current_is_zero", "AMT_DRAWINGS_CURRENT"),
        ("cnt_drawings_atm_current_is_zero", "CNT_DRAWINGS_ATM_CURRENT"),
        ("cnt_drawings_pos_current_is_zero", "CNT_DRAWINGS_POS_CURRENT"),
        ("cnt_drawing_current_is_zero", "CNT_DRAWINGS_CURRENT"),
        ("cnt_installment_mature_cum_is_zero", "CNT_INSTALMENT_MATURE_CUM"),
    ]
    for flag_name, source_col in positive_flags:
        cc[flag_name] = (cc[source_col] > 0).astype(int)
    for flag_name, status_value in status_flags:
        cc[flag_name] = (cc["NAME_CONTRACT_STATUS"] == status_value).astype(int)
    for flag_name, source_col in zero_flags:
        cc[flag_name] = (cc[source_col] == 0).astype(int)
    all_binary_features = [spec[0] for spec in positive_flags + status_flags + zero_flags]

    # ---- Ratio features: numerator / (denominator + 1.0) ----
    ratio_specs = [
        ("cnt_pos_ratio", "CNT_DRAWINGS_POS_CURRENT", "CNT_DRAWINGS_CURRENT"),
        ("cnt_atm_ratio", "CNT_DRAWINGS_ATM_CURRENT", "CNT_DRAWINGS_CURRENT"),
        ("amt_pos_ratio", "AMT_DRAWINGS_POS_CURRENT", "AMT_DRAWINGS_CURRENT"),
        ("amt_atm_ratio", "AMT_DRAWINGS_ATM_CURRENT", "AMT_DRAWINGS_CURRENT"),
        ("balance_to_drawing_ratio", "AMT_BALANCE", "AMT_DRAWINGS_CURRENT"),
        ("balance_to_limit_ratio", "AMT_BALANCE", "AMT_CREDIT_LIMIT_ACTUAL"),
        ("balance_to_total_payment_ratio", "AMT_BALANCE", "AMT_PAYMENT_TOTAL_CURRENT"),
        ("balance_to_minimum_installment_ratio", "AMT_BALANCE", "AMT_INST_MIN_REGULARITY"),
        ("minimum_installment_to_total_payment_ratio", "AMT_INST_MIN_REGULARITY", "AMT_PAYMENT_TOTAL_CURRENT"),
        ("minimum_installment_to_current_payment_ratio", "AMT_INST_MIN_REGULARITY", "AMT_PAYMENT_CURRENT"),
        ("current_to_total_payment_ratio", "AMT_PAYMENT_CURRENT", "AMT_PAYMENT_TOTAL_CURRENT"),
        ("payment_to_receivable_min_ratio", "AMT_INST_MIN_REGULARITY", "AMT_RECEIVABLE_PRINCIPAL"),
        ("payment_to_receivable_curr_ratio", "AMT_PAYMENT_CURRENT", "AMT_RECIVABLE"),
        ("payment_to_receivable_total_ratio", "AMT_PAYMENT_TOTAL_CURRENT", "AMT_TOTAL_RECEIVABLE"),
    ]
    for ratio_name, numerator, denominator in ratio_specs:
        cc[ratio_name] = cc[numerator]/(cc[denominator]+1.0)
    all_ratio_features = [spec[0] for spec in ratio_specs]

    # Raw columns that receive the full statistic set (plus kurtosis and trend).
    numeric_cols = [
        "SK_DPD",
        "SK_DPD_DEF",
        "CNT_DRAWINGS_ATM_CURRENT",
        "CNT_DRAWINGS_POS_CURRENT",
        "CNT_DRAWINGS_CURRENT",
        "AMT_DRAWINGS_ATM_CURRENT",
        "AMT_DRAWINGS_POS_CURRENT",
        "AMT_DRAWINGS_CURRENT",
        "CNT_INSTALMENT_MATURE_CUM",
        "AMT_BALANCE",
        "AMT_PAYMENT_TOTAL_CURRENT",
        "AMT_TOTAL_RECEIVABLE",
    ]

    def _aggregate_window(grouped, window):
        # Shared part of the "all" and lagged passes: raw columns first, then
        # ratios, then binary flags. A fresh stats list is passed on every call.
        for col in numeric_cols:
            generate_features(prefix, grouped, window, col, get_trend = True, get_kurtosis = True, to_calculate = list(full_stats))
        for col in all_ratio_features:
            generate_features(prefix, grouped, window, col, get_trend = True, get_kurtosis = True, to_calculate = list(ratio_stats))
        for col in all_binary_features:
            generate_features(prefix, grouped, window, col, get_trend = False, get_kurtosis = False, to_calculate = list(flag_stats))

    # ---- Aggregates over each customer's complete history ----
    grouped = cc.groupby("SK_ID_CURR")
    generate_features(prefix, grouped, "all", "SK_ID_PREV", get_trend = False, get_kurtosis = False, to_calculate = ["count","nunique"])
    generate_features(prefix, grouped, "all", "MONTHS_BALANCE", get_trend = False, get_kurtosis = False, to_calculate = ["min","max","mean"])
    _aggregate_window(grouped, "all")

    # ---- Aggregates over the last `lags` rows of each customer ----
    # NOTE(review): "last" follows the sort order above (SK_ID_PREV before
    # MONTHS_BALANCE), so for customers with several SK_ID_PREV values these
    # are not necessarily the most recent months overall -- confirm intent.
    for lags in [3, 12, 24, 36]:
        grouped = cc.groupby("SK_ID_CURR").tail(lags).reset_index(drop=True).groupby("SK_ID_CURR")
        _aggregate_window(grouped, lags)

        del grouped
        gc.collect()

In [None]:
# Output switches: write the raw feature table, the normalized one, or both.
get_unnormalized = True
get_normalized = True

# Calling Generator
# `output` is the global frame generate_features() merges its columns into;
# it starts as one row per distinct SK_ID_CURR found in the credit card file.
output = pd.read_csv("../data/credit_card_balance.csv", usecols = ["SK_ID_CURR"])
output = output.drop_duplicates().reset_index(drop=True)
GetCreditCardFeatures(credit_card_path = "../data/credit_card_balance.csv")

# Some Additional Features
# nunique / count of SK_ID_PREV per customer; the float32 cast keeps the
# division fractional.
output["credit_card_IDprevUNIQUE_to_IDprevALL_ratio"] = (
    output["credit_card_SK_ID_PREV_nunique_all"]
    / output["credit_card_SK_ID_PREV_count_all"].astype("float32")
)

# Saving Data
if get_unnormalized:
    output.to_csv(
        "csv/credit_card_features_V{}.csv".format(version),
        index=False,
    )
if get_normalized:
    # SK_ID_CURR is passed to normalize_features alongside the frame
    # (presumably as the columns to leave unscaled -- verify against its definition).
    normalize_features(output, ["SK_ID_CURR"]).to_csv(
        "csv/credit_card_features_normalized_V{}.csv".format(version),
        index=False,
    )

# Bureau Features

In [None]:
def GetBandBBFeatures(bureau_path = "../data/bureau.csv",bureau_balance_path = "../data/bureau_balance.csv"):
    """Build per-customer features from bureau.csv and bureau_balance.csv.

    Two kinds of results are produced:

    * count / diversification / flag aggregates, returned as a DataFrame
      indexed by SK_ID_CURR;
    * statistical aggregates created through generate_features(), which
      merges them into the global ``output`` frame. ``output`` therefore has
      to exist (with an SK_ID_CURR column) before this function is called.

    Parameters
    ----------
    bureau_path : str
        Path of the bureau CSV file.
    bureau_balance_path : str
        Path of the bureau balance CSV file.

    Returns
    -------
    pandas.DataFrame
        ``aggregates_df``: one row per SK_ID_CURR with the count, ratio and
        sum features computed directly in this function.
    """
    bureau = pd.read_csv(bureau_path).sort_values(by =["SK_ID_CURR","DAYS_CREDIT"]).reset_index(drop = True)
    bureau_balance = pd.read_csv(bureau_balance_path)

    # Get Bureau Balance Aggregations
    # NOTE(review): assumes one_hot_encoder returns the encoded DataFrame itself
    # (not a (frame, new_columns) tuple) -- confirm against its definition.
    bureau_balance = one_hot_encoder(bureau_balance, nan_as_category = True)

    # Select the column first, then aggregate: the original passed a set literal
    # ({"MONTHS_BALANCE", [...]}) to agg(), which cannot even be constructed.
    months_balance_aggregation = bureau_balance.groupby("SK_ID_BUREAU")["MONTHS_BALANCE"].agg(["min","max","mean"])
    months_balance_aggregation.columns = ["bb_MONTHS_BALANCE_min", "bb_MONTHS_BALANCE_max", "bb_MONTHS_BALANCE_mean"]
    # Keep the names in a plain list: "+" / "+=" on a pandas Index is an
    # element-wise operation, not a concatenation.
    bb_cols = list(months_balance_aggregation.columns)

    status_cols = list(bureau_balance.columns.drop(["SK_ID_BUREAU","MONTHS_BALANCE"]))
    status_aggregation = bureau_balance.groupby("SK_ID_BUREAU")[status_cols].mean()
    status_aggregation.columns = ["bb_{}_mean".format(c) for c in status_aggregation.columns]
    bb_cols += list(status_aggregation.columns)

    bureau_balance = status_aggregation.merge(months_balance_aggregation, how = "left", left_index=True, right_index = True)
    bureau = bureau.merge(bureau_balance, how = "left", left_on = "SK_ID_BUREAU", right_index = True)

    del months_balance_aggregation, status_aggregation, bureau_balance
    gc.collect()

    # Aggregating Data Part 1 (done before the categoricals are collapsed below,
    # so the nunique counts see the raw category values)
    bureau_groupby = bureau.groupby("SK_ID_CURR")
    aggregates_df = pd.DataFrame()
    aggregates_df["total_counts"] = bureau_groupby["CREDIT_ACTIVE"].count()
    aggregates_df["unique_credit_active_counts"] = bureau_groupby["CREDIT_ACTIVE"].nunique()
    aggregates_df["currency_unique_counts"] = bureau_groupby["CREDIT_CURRENCY"].nunique()
    aggregates_df["credit_type_unique_counts"] = bureau_groupby["CREDIT_TYPE"].nunique()
    aggregates_df["min_days_credit"] = bureau_groupby["DAYS_CREDIT"].min()
    aggregates_df["loan_type_diversification"] = aggregates_df["credit_type_unique_counts"]/aggregates_df["total_counts"].astype("float32")
    aggregates_df["currency_type_diversification"] = aggregates_df["currency_unique_counts"]/aggregates_df["total_counts"].astype("float32")
    aggregates_df["active_type_diversification"] = aggregates_df["unique_credit_active_counts"]/aggregates_df["total_counts"].astype("float32")

    # Handling Categoricals: everything outside the listed values becomes "Other"
    bureau.loc[~bureau.CREDIT_TYPE.isin(["Consumer credit", "Credit card", "Car loan", "Mortgage","Microloan"]), "CREDIT_TYPE"] = "Other"
    bureau.loc[~bureau.CREDIT_ACTIVE.isin(["Closed","Active","Sold"]), "CREDIT_ACTIVE"] = "Other"
    bureau["CREDIT_CURRENCY"] = (bureau["CREDIT_CURRENCY"] == "currency 1").astype(int)

    # Generating Integer Columns
    bureau["credit_is_active"] = (bureau["CREDIT_ACTIVE"] == "Active").astype(int)
    bureau["credit_is_closed"] = (bureau["CREDIT_ACTIVE"] == "Closed").astype(int)
    bureau["credit_is_sold"] = (bureau["CREDIT_ACTIVE"] == "Sold").astype(int)
    bureau["credit_is_other"] = (bureau["CREDIT_ACTIVE"] == "Other").astype(int)
    bureau["credit_type_is_consumer_credit"] = (bureau["CREDIT_TYPE"] == "Consumer credit").astype(int)
    bureau["credit_type_is_credit_card"] = (bureau["CREDIT_TYPE"] == "Credit card").astype(int)
    bureau["credit_type_is_car_loan"] = (bureau["CREDIT_TYPE"] == "Car loan").astype(int)
    bureau["credit_type_is_mortgage"] = (bureau["CREDIT_TYPE"] == "Mortgage").astype(int)
    bureau["credit_type_is_microloan"] = (bureau["CREDIT_TYPE"] == "Microloan").astype(int)
    bureau["credit_type_is_other"] = (bureau["CREDIT_TYPE"] == "Other").astype(int)

    bureau.drop(["CREDIT_TYPE", "CREDIT_ACTIVE"], axis=1, inplace = True)
    int_columns = ["credit_is_active", "credit_is_closed","credit_is_sold","credit_is_other","credit_type_is_consumer_credit","credit_type_is_credit_card","credit_type_is_car_loan","credit_type_is_mortgage","credit_type_is_microloan","credit_type_is_other"]

    # Aggregating Data Part 2: share and count of each flag per customer
    bureau_groupby = bureau.groupby("SK_ID_CURR")

    aggregates_df_mean = bureau_groupby[int_columns].mean()
    aggregates_df_mean.columns = ["ratio_{}".format(c) for c in aggregates_df_mean.columns]
    aggregates_df = aggregates_df.merge(aggregates_df_mean, how = "left", left_index=True, right_index=True)

    aggregates_df_sum = bureau_groupby[int_columns].sum()
    aggregates_df_sum.columns = ["sum_{}".format(c) for c in aggregates_df_sum.columns]
    aggregates_df = aggregates_df.merge(aggregates_df_sum, how = "left", left_index=True, right_index=True)
    del aggregates_df_mean, aggregates_df_sum
    gc.collect()

    # Ratio Features
    # NOTE(review): these four columns are computed but not part of any
    # aggregation list below, so they never reach `output` or the returned
    # frame -- confirm whether they were meant to be aggregated.
    bureau["debt_ratio"] = bureau["AMT_CREDIT_SUM_DEBT"]/(1.0+bureau["AMT_CREDIT_SUM"])
    bureau["limit_ratio"] = bureau["AMT_CREDIT_SUM_LIMIT"]/(1.0+bureau["AMT_CREDIT_SUM"])
    bureau["overdue_ratio"] = bureau["AMT_CREDIT_SUM_OVERDUE"]/(1.0+bureau["AMT_CREDIT_SUM"])
    bureau["annuity_ratio"] = bureau["AMT_ANNUITY"]/(1.0+bureau["AMT_CREDIT_SUM"])

    # Get Feature Aggregates
    # "AMT_ANNUITY" and "CNT_CREDIT_PROLONG" are two separate columns (the
    # original lacked the comma and fused them into one non-existent name).
    # AMT_CREDIT_MAX_OVERDUE lives only in complete_set: listing it in both
    # sets would merge identically named "*_all" columns into `output` twice.
    min_max_mean_sum_std = ["CREDIT_DAY_OVERDUE", "DAYS_CREDIT_ENDDATE","DAYS_ENDDATE_FACT","DAYS_CREDIT_UPDATE","AMT_ANNUITY","CNT_CREDIT_PROLONG"]
    complete_set = ["DAYS_CREDIT","AMT_CREDIT_SUM","AMT_CREDIT_MAX_OVERDUE"]

    # Re-group so the groupby sees the columns added since Part 2.
    bureau_groupby = bureau.groupby("SK_ID_CURR")

    for col in min_max_mean_sum_std+bb_cols:
        generate_features("bureau",bureau_groupby, "all", col, get_trend = True, get_kurtosis = False, to_calculate = ["sum","mean","max","min","std"])

    for col in complete_set:
        generate_features("bureau",bureau_groupby, "all", col, get_trend = True, get_kurtosis = True, to_calculate = ["sum","mean","median","max","min","std","skew"])

    # Get Lag Aggregates: last `lags` rows per customer in DAYS_CREDIT order
    for lags in [5, 15, 30]:
        bureau_groupby = bureau.groupby("SK_ID_CURR").tail(lags).groupby("SK_ID_CURR")

        for col in complete_set:
            generate_features("bureau",bureau_groupby, lags, col, get_trend = True, get_kurtosis = True, to_calculate = ["sum","mean","median","max","min","std","skew"])

    return aggregates_df

In [None]:
# Output switches: write the raw feature table, the normalized one, or both.
get_normalized = True
get_unnormalized = True

# Calling Generator
# `output` has to exist before GetBandBBFeatures runs: the generate_features()
# calls made inside it merge their columns into this global frame.
output = pd.read_csv("../data/bureau.csv", usecols = ["SK_ID_CURR"]).drop_duplicates().reset_index(drop=True)
# (A stray trailing ":" after this call made the whole cell a SyntaxError.)
aggregates_df = GetBandBBFeatures(bureau_path = "../data/bureau.csv",bureau_balance_path = "../data/bureau_balance.csv")
# The returned aggregates are indexed by SK_ID_CURR, hence right_index=True.
output = output.merge(aggregates_df, how = "left", left_on = "SK_ID_CURR", right_index=True)

# Saving Data
if get_unnormalized:
    output.to_csv("csv/bureau_features_V{}.csv".format(version), index=False)
if get_normalized:
    normalize_features(output, ["SK_ID_CURR"]).to_csv("csv/bureau_features_normalized_V{}.csv".format(version), index=False)