In [1]:
# Version tag interpolated into the "V{}" part of the feature CSV filenames written by this notebook.
version = "2"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, Normalizer
%matplotlib inline

In [3]:
# Timestamps of every checkpoint so far; seeded with the time this cell was run.
timesheet = [time()]
def timer(statement):
    """Record a checkpoint and print the seconds elapsed since the previous one.

    Appends the current time to the module-level ``timesheet`` list and prints
    ``"<statement> : <elapsed> seconds"``, where ``elapsed`` is the difference
    between the two most recent entries.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.

    Returns
    -------
    None
    """
    # No ``global`` needed: the list is mutated in place, never rebound.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s seconds" % (statement + " :", elapsed))

# Feature Generators

In [4]:
from numba import njit, prange,jit
@jit(nopython=True, parallel=True)
def get_regression_coefficient(ys, xs, counts):
    """Compute one least-squares slope per contiguous segment of (xs, ys).

    Segment ``i`` covers the half-open index range ``counts[i]:counts[i+1]`` of
    both ``xs`` and ``ys``. For each segment the slope of ``y`` regressed on
    ``x`` is computed as
    ``(n*sum(x*y) - sum(x)*sum(y)) / (n*sum(x*x) - sum(x)**2)``.

    Parameters
    ----------
    ys, xs : 1-D arrays
        Flattened dependent / independent values of all segments. The caller
        visible in this notebook passes float64 arrays.
    counts : 1-D integer array
        Segment boundaries (cumulative offsets); ``len(counts) - 1`` segments.

    Returns
    -------
    1-D float array of length ``len(counts) - 1``. An entry is NaN when its
    segment is empty or when the denominator is exactly zero (e.g. a segment
    with a single point, or all-equal x values).
    """
    output = np.zeros(counts.shape[0]-1)
    # prange: segments are independent, so iterations may run in parallel.
    for i in prange(counts.shape[0]-1):
        x = xs[counts[i] : counts[i+1]]
        y = ys[counts[i] : counts[i+1]]

        # Default to NaN; only overwritten below when the slope is defined.
        output[i] = np.nan
        length = y.shape[0]
        sx = np.sum(x)
        # n*sum(x^2) - (sum x)^2
        denominator = ((length*np.dot(x, x)) - (sx*sx))
        
        if length >0 and denominator != 0:
            # n*sum(x*y) - sum(x)*sum(y)
            numerator = ((length*np.dot(x, y)) - (sx*np.sum(y)))
            output[i] = numerator/denominator
    return output

In [5]:
def get_trend_values(data):
    """Return, per group, the linear trend (slope) of the group's non-null values.

    Each group's values are regressed against their position ``0, 1, 2, ...``
    (after dropping NaNs) by ``get_regression_coefficient``.

    Parameters
    ----------
    data : grouped pandas column
        Object whose ``apply`` yields one array of values per group
        (presumably a ``SeriesGroupBy`` -- confirm against the caller).

    Returns
    -------
    pandas.Series
        float64 Series indexed like the result of ``data.apply``; NaN where the
        slope is undefined (fewer than two non-null points).
    """
    observed = data.apply(lambda x: x.dropna().values)
    # Explicit dtype: a dtype-less empty Series is deprecated and may default
    # to object instead of float.
    trends = pd.Series(index=observed.index, dtype=np.float64)

    # np.concatenate refuses an empty sequence, so short-circuit "no groups".
    if observed.shape[0] == 0:
        return trends

    lengths = observed.apply(lambda x: x.shape[0]).values

    # Concatenate the per-group position vectors straight from a list.
    # Wrapping them in np.array() first builds a ragged array, which raises
    # ValueError on NumPy >= 1.24 whenever group sizes differ.
    xs = np.concatenate([np.arange(n) for n in lengths]).astype(np.float64)
    ys = np.concatenate(observed.values).ravel().astype(np.float64)

    # Segment boundaries: [0, n0, n0+n1, ...]
    boundaries = np.insert(np.cumsum(lengths), 0, 0)

    trends.iloc[:] = get_regression_coefficient(ys, xs, boundaries)
    return trends

In [6]:
def generate_features(prefix, dataGroups, num_lags, colname, get_trend = False, get_kurtosis = True, to_calculate = ["sum", "count","nunique","mean","median","max","min","std","skew"]):
    """Append per-group statistics of ``colname`` to the global ``output`` frame.

    New columns are named ``<prefix>_<colname>_<stat>_<num_lags>`` and are
    attached to ``output`` through its ``SK_ID_CURR`` column, so ``dataGroups``
    is presumably grouped by that key (confirm against the caller).

    Parameters
    ----------
    prefix : str
        Leading part of every generated column name.
    dataGroups : grouped DataFrame
        Indexed with ``[colname]`` to obtain the grouped column.
    num_lags : any
        Only used as a suffix in column names and log messages.
    colname : str
        Column to summarise.
    get_trend : bool
        Also add the slope returned by ``get_trend_values``.
    get_kurtosis : bool
        Also add the per-group kurtosis.
    to_calculate : list of str
        Aggregations passed to ``agg``; an empty list skips that step.
    """
    global output

    wants_aggregates = len(to_calculate) > 0
    if wants_aggregates:
        print("Getting Aggregates for column: {}, {}".format(colname, num_lags))
        stats = dataGroups[colname].agg(to_calculate)
        renamed = []
        for stat_name in stats.columns:
            renamed.append("{}_{}_{}_{}".format(prefix, colname, stat_name, num_lags))
        stats.columns = renamed
        output = output.merge(stats, how="left", left_on="SK_ID_CURR", right_index=True)
        timer("Done Aggregating Column {}".format(colname))

    if get_kurtosis:
        print("Getting Kurtosis for Column: {}, {}".format(colname, num_lags))
        kurtosis_column = "{}_{}_kurtosis_{}".format(prefix, colname, num_lags)
        kurtosis_by_group = dataGroups[colname].apply(lambda x: x.kurt())
        output[kurtosis_column] = output["SK_ID_CURR"].map(kurtosis_by_group)
        timer("Done Getting Kurtosis for Column {}".format(colname))

    if get_trend:
        print("Getting Trend for column: {}, {}".format(colname, num_lags))
        trend_column = "{}_{}_trend_{}".format(prefix, colname, num_lags)
        trend_by_group = get_trend_values(dataGroups[colname])
        output[trend_column] = output["SK_ID_CURR"].map(trend_by_group)
        timer("Done Getting Trend for Column {}".format(colname))


# Feature Preprocessors

In [7]:
def one_hot_encoder(data, nan_as_category = True):
    """One-hot encode every object-dtype column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode; it is not modified.
    nan_as_category : bool
        Forwarded to ``pd.get_dummies`` as ``dummy_na``: when True each encoded
        column also gets a ``<col>_nan`` indicator.

    Returns
    -------
    pandas.DataFrame
        New frame with the object columns replaced by their dummy columns.
    """
    object_columns = []
    for name in data.columns:
        if data[name].dtype == 'object':
            object_columns.append(name)
    return pd.get_dummies(data, columns=object_columns, dummy_na=nan_as_category)

def normalize_features(data, not_to_normalize):
    """Clean a feature frame and rescale every feature column to unit L2 norm.

    Steps, applied to a copy of ``data``:

    1. replace +/-inf with NaN;
    2. drop columns that are entirely NaN;
    3. fill remaining NaNs with the column mean (0 as a last resort);
    4. divide each feature column by its Euclidean norm.

    The earlier version fed each column to sklearn's ``Normalizer`` as an
    ``(n, 1)`` matrix. ``Normalizer`` rescales *rows*, so every one-element row
    became ``value / |value|`` and each feature collapsed to -1, 0 or +1.
    Scaling by the column norm keeps the relative magnitudes within a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding numeric feature columns plus any protected columns.
    not_to_normalize : label or list of labels
        Columns left untouched (ids, targets, label-encoded categoricals).
        A label that is absent from ``data`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame; feature columns are float64 and all-zero columns stay zero.
    """
    # Resolved before any column is dropped so unknown labels still raise.
    candidate_columns = data.columns.drop(not_to_normalize)

    data = data.replace([np.inf, -np.inf], np.nan)
    print("{} Before dropping Na's".format(data.shape))
    data = data.dropna(axis=1, how = "all")
    print("{} After dropping Na's".format(data.shape))

    # Only the candidates that survived the all-NaN drop. This also tolerates
    # a protected column having been dropped above, which used to raise.
    surviving = set(data.columns)
    feature_columns = [col for col in candidate_columns if col in surviving]
    if not feature_columns:
        return data

    features = data[feature_columns].astype(np.float64)
    features = features.fillna(features.mean()).fillna(0)

    norms = np.sqrt((features ** 2).sum(axis=0))
    # An all-zero column has norm 0; divide by 1 so it stays all zeros.
    norms[norms == 0] = 1.0

    data[feature_columns] = features / norms
    return data

# Get Application Features

In [8]:
def GetApplicationFeatures(get_ohe = False, normalize = False, get_flags = False, train_path = "../data/application_train.csv", test_path = "../data/application_test.csv"):
    """Build the application-level feature table from the train and test CSVs.

    The two files are stacked (train rows first), sentinel values are turned
    into NaN, a set of ratio / aggregate / indicator features is added, and the
    categorical columns are encoded.

    Parameters
    ----------
    get_ohe : bool
        True: one-hot encode object columns via ``one_hot_encoder``.
        False: label-encode them with ``LabelEncoder``.
    normalize : bool
        When True, pass the frame through ``normalize_features``; ``SK_ID_CURR``
        and ``TARGET`` (plus the label-encoded columns when ``get_ohe`` is
        False) are excluded from scaling.
    get_flags : bool
        When False, the ``FLAG_DOCUMENT_*`` columns listed at the end of the
        function are dropped from the result (``FLAG_DOCUMENT_3`` is not in
        that list and is therefore always kept).
    train_path, test_path : str
        Locations of the two input CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per input row, train rows followed by test rows.
    """
    # Loading Data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # ignore_index: both files carry a 0..n row index, so a plain concat would
    # leave duplicated row labels behind for every later index-aligned step.
    data = pd.concat([train, test], axis=0, ignore_index=True)

    # Cleaning {To be improved later}
    # Placeholder values that the code treats as "missing".
    # Assigning the result back (instead of ``data[col].replace(...,
    # inplace=True)``) matters: the chained in-place form operates on a
    # temporary and silently does nothing under pandas Copy-on-Write.
    sentinel_values = [
        ('CODE_GENDER', 'XNA'),
        ('DAYS_EMPLOYED', 365243),
        ('DAYS_LAST_PHONE_CHANGE', 0),
        ('NAME_FAMILY_STATUS', 'Unknown'),
        ('ORGANIZATION_TYPE', 'XNA'),
    ]
    for column, sentinel in sentinel_values:
        data[column] = data[column].replace(sentinel, np.nan)

    # Feature Engineering on groupby
    inc_by_org = data.groupby('ORGANIZATION_TYPE')['AMT_INCOME_TOTAL'].median()
    data['new_inc_by_org'] = data['ORGANIZATION_TYPE'].map(inc_by_org)

    # Feature Engineering on Flag Columns
    docs = [c for c in data.columns if 'FLAG_DOC' in c]
    live = [c for c in data.columns if ('FLAG_' in c) and ('FLAG_DOC' not in c) and ('_FLAG_' not in c)]
    data['new_doc_ind_kurt'] = data[docs].kurtosis(axis=1)
    # ``live`` also matches FLAG_OWN_CAR / FLAG_OWN_REALTY, which are still
    # object-typed here. numeric_only=True states explicitly that only the
    # numeric flags are summed; older pandas did this as a silent fallback,
    # newer pandas raises TypeError without it.
    data['new_live_ind_sum'] = data[live].sum(axis=1, numeric_only=True)

    # Feature Engineering on feature interactions
    # Denominators are cast to float32 exactly as before so values are unchanged.
    data['annuity_income_percentage'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL'].astype("float32")
    data['car_to_birth_ratio'] = data['OWN_CAR_AGE'] / data['DAYS_BIRTH'].astype("float32")
    data['car_to_employ_ratio'] = data['OWN_CAR_AGE'] / data['DAYS_EMPLOYED'].astype("float32")
    data['children_ratio'] = data['CNT_CHILDREN'] / data['CNT_FAM_MEMBERS'].astype("float32")
    data['credit_to_annuity_ratio'] = data['AMT_CREDIT'] / data['AMT_ANNUITY'].astype("float32")
    data['credit_to_goods_ratio'] = data['AMT_CREDIT'] / data['AMT_GOODS_PRICE'].astype("float32")
    data['credit_to_income_ratio'] = data['AMT_CREDIT'] / data['AMT_INCOME_TOTAL'].astype("float32")
    data['days_employed_percentage'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH'].astype("float32")
    data['income_credit_percentage'] = data['AMT_INCOME_TOTAL'] / data['AMT_CREDIT'].astype("float32")
    data['income_per_child'] = data['AMT_INCOME_TOTAL'] / (1 + data['CNT_CHILDREN']).astype("float32")
    data['income_per_person'] = data['AMT_INCOME_TOTAL'] / data['CNT_FAM_MEMBERS'].astype("float32")
    data['payment_rate'] = data['AMT_ANNUITY'] / data['AMT_CREDIT'].astype("float32")
    data['phone_to_birth_ratio'] = data['DAYS_LAST_PHONE_CHANGE'] / data['DAYS_BIRTH'].astype("float32")
    data['phone_to_employ_ratio'] = data['DAYS_LAST_PHONE_CHANGE'] / data['DAYS_EMPLOYED'].astype("float32")
    data['external_sources_weighted'] = data.EXT_SOURCE_1 * 2 + data.EXT_SOURCE_2 * 3 + data.EXT_SOURCE_3 * 4
    data['external_sources_product'] = data.EXT_SOURCE_1 * data.EXT_SOURCE_2 *  data.EXT_SOURCE_3
    data['cnt_non_child'] = data['CNT_FAM_MEMBERS'] - data['CNT_CHILDREN']
    data['child_to_non_child_ratio'] = data['CNT_CHILDREN'] / data['cnt_non_child'].astype("float32")
    data['income_per_non_child'] = data['AMT_INCOME_TOTAL'] / data['cnt_non_child'].astype("float32")
    data['credit_per_person'] = data['AMT_CREDIT'] / data['CNT_FAM_MEMBERS'].astype("float32")
    data['credit_per_child'] = data['AMT_CREDIT'] / (1 + data['CNT_CHILDREN']).astype("float32")
    data['credit_per_non_child'] = data['AMT_CREDIT'] / data['cnt_non_child'].astype("float32")

    # Row-wise summaries of the three external scores. getattr replaces the
    # former eval('np.<name>'): same function lookup, no string evaluation.
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian',"std"]:
        reducer = getattr(np, function_name)
        data['external_sources_{}'.format(function_name)] = reducer(data[ext_sources], axis=1)

    # Generating Binary Indicator Variables for Certain Columns
    # NOTE(review): despite the names, these flag DAYS_EMPLOYED < -2000 and
    # DAYS_BIRTH < -14000. If the DAYS_* columns are negative day offsets this
    # selects *longer* employment / *older* applicants -- confirm the intent.
    data['short_employment'] = (data['DAYS_EMPLOYED'] < -2000).astype(int)
    data['young_age'] = (data['DAYS_BIRTH'] < -14000).astype(int)

    # Handling Categorical Data
    # Captured before the factorize step below, so the three binary columns
    # are part of this list as well.
    categorical_columns = [c for c in data.columns if data[c].dtype == "object"]

    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        # factorize returns (codes, uniques); only the integer codes are kept.
        data[bin_feature] = pd.factorize(data[bin_feature])[0]

    if get_ohe:
        data = one_hot_encoder(data, True)
    else:
        for col in categorical_columns:
            data[col] = LabelEncoder().fit_transform(data[col])

    # Normalizing Features
    if normalize:
        protected = ["SK_ID_CURR","TARGET"]
        if not get_ohe:
            # Label-encoded columns hold category codes; leave them unscaled.
            protected = protected + categorical_columns
        data = normalize_features(data, protected)

    # Handling Flag Columns {Work some more on it}
    flag_columns = ['FLAG_DOCUMENT_2','FLAG_DOCUMENT_4','FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7','FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10','FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13','FLAG_DOCUMENT_14','FLAG_DOCUMENT_15','FLAG_DOCUMENT_16','FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21']

    if get_flags:
        return data
    return data.drop(flag_columns, axis=1)

In [9]:
get_normalized = True
get_unnormalized = True

# One entry per export: (enabled?, one-hot + normalize?, output filename template).
# The normalized export is produced first, so ``output`` ends up holding the
# un-normalized frame when both switches are on.
export_plan = [
    (get_normalized, True, "csv/application_features_normalized_V{}.csv"),
    (get_unnormalized, False, "csv/application_features_V{}.csv"),
]

for enabled, encode_and_scale, filename_template in export_plan:
    if not enabled:
        continue
    output = GetApplicationFeatures(
        get_ohe = encode_and_scale,
        normalize = encode_and_scale,
        get_flags = True,
        train_path = "../data/application_train.csv",
        test_path = "../data/application_test.csv",
    )
    output.to_csv(filename_template.format(version), index=False)

  r = func(a, **kwargs)


(356255, 286) Before dropping Na's
(356255, 286) After dropping Na's
