In [1]:
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd

In [2]:
@contextmanager
def timer(title):
    """Context manager that reports the wall-clock duration of the wrapped block.

    Args:
        title: Label printed in front of the timing message.

    The message is printed only when the block completes normally; if the
    block raises, the exception propagates and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

In [None]:
def create_features(df):
    """Add interaction / aggregate columns to ``df`` and return it.

    The columns are written onto the passed-in frame (it is modified in
    place) and the same object is returned.

    Columns added, in this order:
        FLAG_IMPORTANT_DOCS: product of FLAG_DOCUMENT_18, FLAG_DOCUMENT_8
            and FLAG_DOCUMENT_3.
        AMT_REQ: sum of the six AMT_REQ_CREDIT_BUREAU_* columns using plain
            ``+``, so a NaN in any of them gives NaN for that row.
        ACCOM_SCORE_AVG / ACCOM_SCORE_MODE / ACCOM_SCORE_MEDI: row-wise
            ``mean(axis=1)`` over the *_AVG / *_MODE / *_MEDI columns listed
            below.

    Raises:
        KeyError: if any of the source columns is missing from ``df``.
    """
    # Interactions between similarly distributed features in train and test
    document_flags = ['FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_3']
    docs_product = df[document_flags[0]]
    for flag_col in document_flags[1:]:
        docs_product = docs_product * df[flag_col]
    df['FLAG_IMPORTANT_DOCS'] = docs_product

    bureau_request_cols = [
        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    ]
    # Accumulate left to right with `+` so NaN propagates exactly as in a chained sum
    request_total = df[bureau_request_cols[0]]
    for request_col in bureau_request_cols[1:]:
        request_total = request_total + df[request_col]
    df['AMT_REQ'] = request_total

    # (new column, source columns) pairs; processed in order so the new
    # columns are appended as AVG, MODE, MEDI
    accom_groups = [
        ('ACCOM_SCORE_AVG', [
            'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
            'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',
            'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
            'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
        ]),
        ('ACCOM_SCORE_MODE', [
            'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
            'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
            'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
            'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
            'TOTALAREA_MODE',
        ]),
        ('ACCOM_SCORE_MEDI', [
            'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
            'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
            'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
            'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
        ]),
    ]
    for score_col, source_cols in accom_groups:
        df[score_col] = df[source_cols].mean(axis=1)

    return df

In [None]:
def remove_on_missing(df):
    """Return a copy of ``df`` without the columns flagged as mostly missing.

    Args:
        df: DataFrame to prune. It is not modified.

    Returns:
        A new DataFrame with the listed columns removed. Listed columns that
        are not present are skipped, so the step can be re-applied to an
        already-pruned frame, or to a frame loaded without the
        'Unnamed: 0' index column, without raising KeyError.
    """
    # Removing features with high percentage of missing values (> 95% missing in both train and test)
    missing_list = ['NEW_RATIO_BURO_CNT_CREDIT_PROLONG_SUM', 'NEW_RATIO_BURO_CREDIT_TYPE_Mortgage_MEAN',
                    'NEW_RATIO_BURO_CREDIT_TYPE_Microloan_MEAN', 'ACTIVE_DAYS_ENDDATE_FACT_MEAN',
                    'NEW_RATIO_BURO_DAYS_ENDDATE_FACT_MEAN', 'Unnamed: 0']

    # errors='ignore': absent labels are skipped instead of raising KeyError
    return df.drop(missing_list, axis=1, errors='ignore')

In [None]:
def consolidate_features(df):
    """Return a copy of ``df`` without the raw columns that feed the aggregates.

    The list below is the *_AVG / *_MODE / *_MEDI columns plus the six
    AMT_REQ_CREDIT_BUREAU_* columns, i.e. the same columns that
    ``create_features`` reads to build ACCOM_SCORE_* and AMT_REQ. Call this
    only after those aggregates have been created.

    Args:
        df: DataFrame to prune. It is not modified.

    Returns:
        A new DataFrame with the listed columns removed. Listed columns that
        are not present in ``df`` are skipped rather than raising KeyError.
    """
    consolidate_list = ['APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
                        'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',
                        'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
                        'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
                        'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
                        'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
                        'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
                        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
                        'TOTALAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
                        'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
                        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
                        'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
                        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
                        'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
                        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

    # errors='ignore': a raw column that an earlier step already removed must not abort the run
    return df.drop(consolidate_list, axis=1, errors='ignore')

In [None]:
def remove_on_distributions(df):
    """Return a copy of ``df`` without features distributed differently in train and test.

    Args:
        df: DataFrame to prune. It is not modified.

    Returns:
        A new DataFrame with the listed columns removed. Listed columns that
        are not present are skipped: 'ACTIVE_DAYS_ENDDATE_FACT_MEAN' is also
        removed by ``remove_on_missing``, so a strict drop would raise
        KeyError whenever that step has already run.
    """
    # Remove features that are dissimilarly distributed between train and test
    dist_list = ['BURO_MONTHS_BALANCE_SIZE_MEAN', 'ACTIVE_DAYS_ENDDATE_FACT_MEAN',
                 'CLOSED_MONTHS_BALANCE_SIZE_MEAN']

    # errors='ignore': absent labels are skipped instead of raising KeyError
    return df.drop(dist_list, axis=1, errors='ignore')

In [10]:
def remove_on_importance(df, importance_path='../output/importance_3.3.csv', importance_threshold=2):
    """Return a copy of ``df`` without the features recorded as low-importance.

    Args:
        df: DataFrame to prune. It is not modified.
        importance_path: CSV file with at least a 'feature' and an
            'importance' column. Defaults to the file from the most recent
            run.
        importance_threshold: Features whose 'importance' value is less than
            or equal to this are removed.

    Returns:
        A new DataFrame without the low-importance features. Features named
        in the importance file but absent from ``df`` are skipped.
    """
    # Remove all features that weren't split on above some threshold times in most recent run
    importance = pd.read_csv(importance_path)
    is_low = importance['importance'] <= importance_threshold
    low_importance_features = importance.loc[is_low, 'feature'].tolist()

    # errors='ignore': absent labels are skipped instead of raising KeyError
    return df.drop(low_importance_features, axis=1, errors='ignore')

In [None]:
def split_dataset(df):
    """Placeholder for splitting the dataset into groups; not implemented yet.

    The argument is currently ignored and the function returns None.
    """
    # TODO: split dataset based on similar features that make sense to group together
    return None
    

In [11]:
def main(split_data=False, importance_prune=False):
    """Run the feature pipeline on processed_data_3.2.csv and save processed_data_3.3.csv.

    Steps, each timed and followed by a print of the frame's shape:
        1. remove_on_missing
        2. create_features
        3. consolidate_features
        4. remove_on_distributions
        5. remove_on_importance (only when ``importance_prune`` is truthy)
        6. split_dataset (only when ``split_data`` is truthy)

    Args:
        split_data: When truthy, call ``split_dataset`` on the final frame.
        importance_prune: When truthy, also drop low-importance features.
    """
    df = pd.read_csv('../data/processed_data_3.2.csv')

    with timer('Removed features with high number of missing values'):
        df = remove_on_missing(df)
        print("df shape:", df.shape)
    # The aggregates must be built before consolidation: consolidate_features
    # drops exactly the raw columns that create_features reads.
    with timer('Created feature interactions'):
        df = create_features(df)
        print("df shape:", df.shape)
    with timer('Removed features after consolidation'):
        df = consolidate_features(df)
        print("df shape:", df.shape)
    with timer('Removed dissimilarly distributed features'):
        df = remove_on_distributions(df)
        print("df shape:", df.shape)
    if importance_prune:
        with timer("Post-processing"):
            df = remove_on_importance(df)
            print("df shape:", df.shape)
    if split_data:
        with timer('Split dataset into similar groups and saved as csv files'):
            split_result = split_dataset(df)
        # split_dataset may return None (it is currently a stub); keep the
        # processed frame in that case so the save below does not fail.
        if split_result is not None:
            df = split_result

    # Save processed data to csv
    # NOTE: the index is written too (pandas default) and comes back as an
    # 'Unnamed: 0' column when the file is read without index_col.
    df.to_csv('../data/processed_data_3.3.csv')

# Entry point: run the pipeline once, with both optional stages disabled, and time the whole run.
if __name__ == "__main__":
    with timer("Processing pipeline run"):
        main(split_data=False, importance_prune=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


df shape: (356255, 149)
Process application_train and _test: - done in 16s
df shape: (356255, 380)
Process bureau and bureau_balance - done in 51s
df shape: (356255, 483)
Process credit card balance - done in 27s
df shape: (356255, 525)
Process installments payments - done in 44s
df shape: (356255, 541)
Process POS_CASH_balance - done in 27s
df shape: (356255, 820)
Process previous_applications - done in 53s
df shape: (356255, 637)
Post-processing - done in 8s
Processing pipeline run - done in 397s


In [3]:
# import pandas as pd
# df = pd.read_csv('../data/processed_data_3.2.csv')

In [4]:
# list(df.columns)

['Unnamed: 0',
 'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'APARTMENTS_AVG',
 'APARTMENTS_MEDI',
 'APARTMENTS_MODE',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_MODE',
 'CNT_CHILDREN',
 'CNT_FAM_MEMBERS',
 'CODE_GENDER',
 'COMMONAREA_AVG',
 'COMMONAREA_MEDI',
 'COMMONAREA_MODE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'ELEVATORS_AVG',
 'ELEVATORS_MEDI',
 'ELEVATORS_MODE',
 'EMERGENCYSTATE_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'ENTRANCES_MODE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_8',
 'FLAG_EMAIL',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FL

In [2]:
# Load the feature-importance table (the same file remove_on_importance reads) for inspection.
importance = pd.read_csv('../output/importance_3.3.csv')

In [7]:
# Display the 500 rows with the highest importance, largest first.
importance.sort_values(by='importance', ascending=False).head(500)

Unnamed: 0,feature,importance
828,NEW_EXT_SOURCES_MEAN,170545.783054
827,NEW_EXT_SOURCES_INSTAL_PAYMENT_PERC_MEAN,62055.472539
826,NEW_CREDIT_TO_GOODS_RATIO,16620.461701
825,EXT_SOURCE_2_3,14238.653084
824,PAYMENT_RATE,13740.987094
823,NEW_EXT_SOURCES_MEAN_NEW_EXT_SOURCES_MEAN,10217.017486
822,POS_CNT_INSTALMENT_FUTURE_MEAN,10088.787774
821,NEW_CREDIT_TO_ANNUITY_RATIO,10035.974734
820,NEW_EXT_SOURCES_MEAN_DAYS_EMPLOYED_PROD,9678.043861
819,NEW_EXT_SOURCES_DAYS_EMPLOYED_PERC,9087.377145
