In [50]:
import gc
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import warnings

warnings.filterwarnings('ignore')

In [2]:
#Feature Engineering
def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

def do_sum(dataframe, group_cols, counted, agg_name):
    gp = dataframe[group_cols + [counted]].groupby(group_cols)[counted].sum().reset_index().rename(columns={counted: agg_name})
    dataframe = dataframe.merge(gp, on=group_cols, how='left')
    return dataframe

def reduce_mem_usage(dataframe):
    m_start = dataframe.memory_usage().sum() / 1024 ** 2
    for col in dataframe.columns:
        col_type = dataframe[col].dtype
        if col_type != object:
            c_min = dataframe[col].min()
            c_max = dataframe[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dataframe[col] = dataframe[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dataframe[col] = dataframe[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dataframe[col] = dataframe[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dataframe[col] = dataframe[col].astype(np.int64)
            elif str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    dataframe[col] = dataframe[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dataframe[col] = dataframe[col].astype(np.float32)
                else:
                    dataframe[col] = dataframe[col].astype(np.float64)

    m_end = dataframe.memory_usage().sum() / 1024 ** 2
    return dataframe

nan_as_category = True


def risk_groupanizer(dataframe, column_names, target_val=1, upper_limit_ratio=8.2, lower_limit_ratio=8.2):
    # one-hot encoder killer :-)
    all_cols = dataframe.columns
    for col in column_names:

        temp_df = dataframe.groupby([col] + ['TARGET'])[['SK_ID_CURR']].count().reset_index()
        temp_df['ratio%'] = round(temp_df['SK_ID_CURR']*100/temp_df.groupby([col])['SK_ID_CURR'].transform('sum'), 1)
        col_groups_high_risk = temp_df[(temp_df['TARGET'] == target_val) &
                                       (temp_df['ratio%'] >= upper_limit_ratio)][col].tolist()
        col_groups_low_risk = temp_df[(temp_df['TARGET'] == target_val) &
                                      (lower_limit_ratio >= temp_df['ratio%'])][col].tolist()
        if upper_limit_ratio != lower_limit_ratio:
            col_groups_medium_risk = temp_df[(temp_df['TARGET'] == target_val) &
                (upper_limit_ratio > temp_df['ratio%']) & (temp_df['ratio%'] > lower_limit_ratio)][col].tolist()

            for risk, col_groups in zip(['_high_risk', '_medium_risk', '_low_risk'],
                                        [col_groups_high_risk, col_groups_medium_risk, col_groups_low_risk]):
                dataframe[col + risk] = [1 if val in col_groups else 0 for val in dataframe[col].values]
        else:
            for risk, col_groups in zip(['_high_risk', '_low_risk'], [col_groups_high_risk, col_groups_low_risk]):
                dataframe[col + risk] = [1 if val in col_groups else 0 for val in dataframe[col].values]
        if dataframe[col].dtype == 'O' or dataframe[col].dtype == 'object':
            dataframe.drop(col, axis=1, inplace=True)
    return dataframe, list(set(dataframe.columns).difference(set(all_cols)))


def ligthgbm_feature_selection(dataframe, index_cols, auc_limit=0.7):
    dataframe = dataframe.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))
    clf = LGBMClassifier(random_state=0)
    train_df = dataframe[dataframe['TARGET'].notnull()]
    X = dataframe.drop('TARGET', axis=1)
    y = dataframe['TARGET']
    train_columns = [col for col in X.columns if col not in index_cols]

    max_auc_score = 1
    best_cols = []
    while max_auc_score > auc_limit:
        train_columns = [col for col in train_columns if col not in best_cols]
        clf.fit(X[train_columns], y)
        feats_imp = pd.Series(clf.feature_importances_, index=train_columns)
        max_auc_score = roc_auc_score(y, clf.predict_proba(X[train_columns])[:, 1])
        bad_cols = feats_imp[feats_imp <12].index.tolist()

    dataframe.drop(train_columns, axis=1, inplace=True)
    return dataframe, bad_cols

In [65]:
def model1(df, n_folds = 5):
    
    features = df[df['TARGET'].notnull()]
    test_features = df[df['TARGET'].isnull()]
    #features, test_features = train.align(test_features, join = 'inner', axis = 1)
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    
    
    ''' One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    
    
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)'''
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = LGBMClassifier(n_estimators=2266, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.01, 
                                   reg_alpha = 0.02, reg_lambda = 0.9, 
                                   subsample = 0.86667, n_jobs = -1, random_state = 500)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'])
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    ''' metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})''' 
    fi_drop=feature_importances[feature_importances['importance']< 12]
    dropfeat=fi_drop['feature'].tolist()
    return submission, dropfeat

In [4]:
#Data Transformation
def group(df_to_agg, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    agg_df = df_to_agg.groupby(aggregate_by).agg(aggregations)
    agg_df.columns = pd.Index(['{}{}_{}'.format(prefix, e[0], e[1].upper())
                               for e in agg_df.columns.tolist()])
    return agg_df.reset_index()

def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    agg_df = group(df_to_agg, prefix, aggregations, aggregate_by= aggregate_by)
    return df_to_merge.merge(agg_df, how='left', on= aggregate_by)

In [5]:
#Ingestion address


train=pd.read_csv(r"C:\Users\aksha\notebooks\data\application_train.csv")
test=pd.read_csv(r"C:\Users\aksha\notebooks\data\application_test.csv")
bureau=pd.read_csv(r"C:\Users\aksha\notebooks\data\bureau.csv")
bureau_bal=pd.read_csv(r"C:\Users\aksha\notebooks\data\bureau_balance.csv")
cc_bal=pd.read_csv(r"C:\Users\aksha\notebooks\data\credit_card_balance.csv")
payments=pd.read_csv(r"C:\Users\aksha\notebooks\data\installments_payments.csv")
cash=pd.read_csv(r"C:\Users\aksha\notebooks\data\POS_CASH_balance.csv")
prev_app=pd.read_csv(r"C:\Users\aksha\notebooks\data\previous_application.csv")

In [6]:
def remove_missing_columns(df, threshold = 75):
    # Calculate missing stats for train and test (remember to calculate a percent!)
    df_miss= pd.DataFrame(df.isnull().sum())
    df_miss['percent'] = 100 * df_miss[0] / len(df)
    
    '''test_miss = pd.DataFrame(test.isnull().sum())
    test_miss['percent'] = 100 * test_miss[0] / len(test)
    
    # list of missing columns for train and test'''
    missing_columns = list(df_miss.index[df_miss['percent'] > threshold])
    s = list(df_miss.index[df_miss['percent'] < threshold])
    ''' missing_test_columns = list(test_miss.index[test_miss['percent'] > threshold])
    
    # Combine the two lists together
    missing_columns = list(set(missing_train_columns + missing_test_columns))
    '''
    # Print information
    print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))
    
    # Drop the missing columns and return
    df = df.drop(columns = missing_columns)
    #test = test.drop(columns = missing_columns)
    
    for m in [df]:
        for col in s:
            if m[col].isnull().sum()>0:
                if m[col].dtype=='object' or m[col].dtype=='category':
                    md= df[col].mode()
                    m[col].fillna(md, inplace=True)
            
    
    '''for m in [test]:
        for col in missing_test_columns:
            med = test[col].median()
            m[col].fillna(med, inplace=True)'''
    
    return df


# Function to calculate missing values by column# Funct 
def missing_values_table(df, print_info = False):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        if print_info:
            # Print some summary information
            print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
                "There are " + str(mis_val_table_ren_columns.shape[0]) +
                  " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table

In [7]:
def application(df1, df2):
    '''df = pd.read_csv(r'../input/home-credit-default-risk/application_train.csv')
    test_df = pd.read_csv(r'../input/home-credit-default-risk/application_test.csv')
    df = df.append(test_df).reset_index()'''

    df = pd.concat([df1, df2], ignore_index=True)
    # general cleaning procedures
    df = df[df['CODE_GENDER'] != 'XNA']
    df = df[df['AMT_INCOME_TOTAL'] < 20000000] # remove a outlier 117M
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) # set null value
    df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True) # set null value

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # Flag_document features - count and kurtosis
    docs = [f for f in df.columns if 'FLAG_DOC' in f]
    df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)
    df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)

    def get_age_label(days_birth):
        """ Return the age group label (int). """
        age_years = -days_birth / 365
        if age_years < 27: return 1
        elif age_years < 40: return 2
        elif age_years < 50: return 3
        elif age_years < 65: return 4
        elif age_years < 99: return 5
        else: return 0
    # Categorical age - based on target=1 plot
    df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(lambda x: get_age_label(x))

    # New features based on External sources
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_WEIGHTED'] = df.EXT_SOURCE_1 * 2 + df.EXT_SOURCE_2 * 1 + df.EXT_SOURCE_3 * 3
    #np.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
        df[feature_name] = eval('np.{}'.format(function_name))(
            df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # Credit ratios
    df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    
    # Income ratios
    df['INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['INCOME_TO_BIRTH_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
    
    # Time ratios
    df['ID_TO_BIRTH_RATIO'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']

    # EXT_SOURCE_X FEATURE
    df['APPS_EXT_SOURCE_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['APPS_EXT_SOURCE_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['APPS_EXT_SOURCE_STD'] = df['APPS_EXT_SOURCE_STD'].fillna(df['APPS_EXT_SOURCE_STD'].mean())
    df['APP_SCORE1_TO_BIRTH_RATIO'] = df['EXT_SOURCE_1'] / (df['DAYS_BIRTH'] / 365.25)
    df['APP_SCORE2_TO_BIRTH_RATIO'] = df['EXT_SOURCE_2'] / (df['DAYS_BIRTH'] / 365.25)
    df['APP_SCORE3_TO_BIRTH_RATIO'] = df['EXT_SOURCE_3'] / (df['DAYS_BIRTH'] / 365.25)
    df['APP_SCORE1_TO_EMPLOY_RATIO'] = df['EXT_SOURCE_1'] / (df['DAYS_EMPLOYED'] / 365.25)
    df['APP_EXT_SOURCE_2*EXT_SOURCE_3*DAYS_BIRTH'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['DAYS_BIRTH']
    df['APP_SCORE1_TO_FAM_CNT_RATIO'] = df['EXT_SOURCE_1'] / df['CNT_FAM_MEMBERS']
    df['APP_SCORE1_TO_GOODS_RATIO'] = df['EXT_SOURCE_1'] / df['AMT_GOODS_PRICE']
    df['APP_SCORE1_TO_CREDIT_RATIO'] = df['EXT_SOURCE_1'] / df['AMT_CREDIT']
    df['APP_SCORE1_TO_SCORE2_RATIO'] = df['EXT_SOURCE_1'] / df['EXT_SOURCE_2']
    df['APP_SCORE1_TO_SCORE3_RATIO'] = df['EXT_SOURCE_1'] / df['EXT_SOURCE_3']
    df['APP_SCORE2_TO_CREDIT_RATIO'] = df['EXT_SOURCE_2'] / df['AMT_CREDIT']
    df['APP_SCORE2_TO_REGION_RATING_RATIO'] = df['EXT_SOURCE_2'] / df['REGION_RATING_CLIENT']
    df['APP_SCORE2_TO_CITY_RATING_RATIO'] = df['EXT_SOURCE_2'] / df['REGION_RATING_CLIENT_W_CITY']
    df['APP_SCORE2_TO_POP_RATIO'] = df['EXT_SOURCE_2'] / df['REGION_POPULATION_RELATIVE']
    df['APP_SCORE2_TO_PHONE_CHANGE_RATIO'] = df['EXT_SOURCE_2'] / df['DAYS_LAST_PHONE_CHANGE']
    df['APP_EXT_SOURCE_1*EXT_SOURCE_2'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2']
    df['APP_EXT_SOURCE_1*EXT_SOURCE_3'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_3']
    df['APP_EXT_SOURCE_2*EXT_SOURCE_3'] = df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['APP_EXT_SOURCE_1*DAYS_EMPLOYED'] = df['EXT_SOURCE_1'] * df['DAYS_EMPLOYED']
    df['APP_EXT_SOURCE_2*DAYS_EMPLOYED'] = df['EXT_SOURCE_2'] * df['DAYS_EMPLOYED']
    df['APP_EXT_SOURCE_3*DAYS_EMPLOYED'] = df['EXT_SOURCE_3'] * df['DAYS_EMPLOYED']

    # AMT_INCOME_TOTAL : income
    # CNT_FAM_MEMBERS  : the number of family members
    df['APPS_GOODS_INCOME_RATIO'] = df['AMT_GOODS_PRICE'] / df['AMT_INCOME_TOTAL']
    df['APPS_CNT_FAM_INCOME_RATIO'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    
    # DAYS_BIRTH : Client's age in days at the time of application
    # DAYS_EMPLOYED : How many days before the application the person started current employment
    df['APPS_INCOME_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']

    # other feature from better than 0.8
    df['CREDIT_TO_GOODS_RATIO_2'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['APP_AMT_INCOME_TOTAL_12_AMT_ANNUITY_ratio'] = df['AMT_INCOME_TOTAL'] / 12. - df['AMT_ANNUITY']
    df['APP_INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['APP_DAYS_LAST_PHONE_CHANGE_DAYS_EMPLOYED_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['APP_DAYS_EMPLOYED_DAYS_BIRTH_diff'] = df['DAYS_EMPLOYED'] - df['DAYS_BIRTH']
    
    # Groupby the client id (SK_ID_CURR), count the number of previous loans, and rename the column
    previous_loan_counts = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
    #Merging cell
    df = df.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')


    dropfeatures = ['FLAG_OWN_CAR','OWN_CAR_AGE','FLAG_MOBIL','FLAG_EMP_PHONE','FLAG_WORK_PHONE','FLAG_PHONE','FLAG_EMAIL','HOUR_APPR_PROCESS_START','APARTMENTS_AVG','BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG','YEARS_BUILD_AVG','COMMONAREA_AVG','ELEVATORS_AVG','ENTRANCES_AVG','FLOORSMAX_AVG','FLOORSMIN_AVG','LANDAREA_AVG','LIVINGAPARTMENTS_AVG','LIVINGAREA_AVG','NONLIVINGAPARTMENTS_AVG','NONLIVINGAREA_AVG','APARTMENTS_MODE','BASEMENTAREA_MODE','YEARS_BEGINEXPLUATATION_MODE','YEARS_BUILD_MODE','COMMONAREA_MODE','ELEVATORS_MODE','ENTRANCES_MODE','FLOORSMAX_MODE','FLOORSMIN_MODE','LANDAREA_MODE','LIVINGAPARTMENTS_MODE','LIVINGAREA_MODE','NONLIVINGAPARTMENTS_MODE','APARTMENTS_MEDI','BASEMENTAREA_MEDI','YEARS_BEGINEXPLUATATION_MEDI','YEARS_BUILD_MEDI','COMMONAREA_MEDI','ELEVATORS_MEDI','ENTRANCES_MEDI','FLOORSMAX_MEDI','FLOORSMIN_MEDI','LANDAREA_MEDI','LIVINGAPARTMENTS_MEDI','LIVINGAREA_MEDI','NONLIVINGAPARTMENTS_MEDI','NONLIVINGAREA_MEDI','TOTALAREA_MODE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_3','FLAG_DOCUMENT_4','FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7','FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10','FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13','FLAG_DOCUMENT_14','FLAG_DOCUMENT_15','FLAG_DOCUMENT_16','FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21']
    df=df.drop(columns=dropfeatures)
    print('Final shape:', df.shape)
    return df

In [8]:
df=application(train,test)

Final shape: (356250, 239)


In [9]:
def bureaubal(bureau, bb):
    # Credit duration and credit/account end date difference
    bureau['CREDIT_DURATION'] = -bureau['DAYS_CREDIT'] + bureau['DAYS_CREDIT_ENDDATE']
    bureau['ENDDATE_DIF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']
    
    # Credit to debt ratio and difference
    bureau['DEBT_PERCENTAGE'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_CREDIT_SUM_DEBT']
    bureau['DEBT_CREDIT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
    bureau['CREDIT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_ANNUITY']
    bureau['BUREAU_CREDIT_FACT_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_ENDDATE_FACT']
    bureau['BUREAU_CREDIT_ENDDATE_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_CREDIT_ENDDATE']
    bureau['BUREAU_CREDIT_DEBT_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / bureau['AMT_CREDIT_SUM']

    # CREDIT_DAY_OVERDUE :
    bureau['BUREAU_IS_DPD'] = bureau['CREDIT_DAY_OVERDUE'].apply(lambda x: 1 if x > 0 else 0)
    bureau['BUREAU_IS_DPD_OVER120'] = bureau['CREDIT_DAY_OVERDUE'].apply(lambda x: 1 if x > 120 else 0)

    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size', 'mean']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']

    #Status of Credit Bureau loan during the month
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean', 'min'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean', 'max'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean', 'max', 'sum'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean', 'sum'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
        'SK_ID_BUREAU': ['count'],
        'DAYS_ENDDATE_FACT': ['min', 'max', 'mean'],
        'ENDDATE_DIF': ['min', 'max', 'mean'],
        'BUREAU_CREDIT_FACT_DIFF': ['min', 'max', 'mean'],
        'BUREAU_CREDIT_ENDDATE_DIFF': ['min', 'max', 'mean'],
        'BUREAU_CREDIT_DEBT_RATIO': ['min', 'max', 'mean'],
        'DEBT_CREDIT_DIFF': ['min', 'max', 'mean'],
        'BUREAU_IS_DPD': ['mean', 'sum'],
        'BUREAU_IS_DPD_OVER120': ['mean', 'sum']
        }

    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')

    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')

    print('"Bureau/Bureau Balance" final shape:', bureau_agg.shape)
    return bureau_agg

In [10]:
bureau = bureaubal(bureau, bureau_bal)

"Bureau/Bureau Balance" final shape: (305811, 200)


In [11]:
catVars = [var for var in bureau.columns if bureau[var].dtype=='object']

print('There are {} categorical variables\n'.format(len(catVars)), '\n', catVars)

There are 0 categorical variables
 
 []


In [12]:
def previous_application(prev):
    
    prev, cat_cols = one_hot_encoder(prev, nan_as_category=True)

    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Feature engineering: ratios and difference
    prev['APPLICATION_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
    prev['CREDIT_TO_ANNUITY_RATIO'] = prev['AMT_CREDIT'] / prev['AMT_ANNUITY']
    prev['DOWN_PAYMENT_TO_CREDIT'] = prev['AMT_DOWN_PAYMENT'] / prev['AMT_CREDIT']

    # Interest ratio on previous application (simplified)
    total_payment = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
    prev['SIMPLE_INTERESTS'] = (total_payment / prev['AMT_CREDIT'] - 1) / prev['CNT_PAYMENT']

    # Days last due difference (scheduled x done)
    prev['DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']

    # from off
    prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
    prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
    prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE'] / prev['AMT_APPLICATION']

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean', 'sum'],
        'AMT_APPLICATION': ['min', 'max', 'mean', 'sum'],
        'AMT_CREDIT': ['min', 'max', 'mean', 'sum'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean', 'sum'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
        'SK_ID_PREV': ['nunique'],
        'DAYS_TERMINATION': ['max'],
        'CREDIT_TO_ANNUITY_RATIO': ['mean', 'max'],
        'APPLICATION_CREDIT_DIFF': ['min', 'max', 'mean', 'sum'],
        'DOWN_PAYMENT_TO_CREDIT': ['mean'],
        'PREV_GOODS_DIFF': ['mean', 'max', 'sum'],
        'PREV_GOODS_APPL_RATIO': ['mean', 'max'],
        'DAYS_LAST_DUE_DIFF': ['mean', 'max', 'sum'],
        'SIMPLE_INTERESTS': ['mean', 'max']
    }

    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')

    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')

    print('"Previous Applications" final shape:', prev_agg.shape)
    return prev_agg

In [13]:
prev_app = previous_application(prev_app)

"Previous Applications" final shape: (338857, 321)


In [14]:
catVars = [var for var in prev_app.columns if prev_app[var].dtype=='object']

print('There are {} categorical variables\n'.format(len(catVars)), '\n', catVars)

There are 0 categorical variables
 
 []


In [15]:

def pos_cash(pos):
    
    pos, cat_cols = one_hot_encoder(pos, nan_as_category=True)

    # Flag months with late payment
    pos['LATE_PAYMENT'] = pos['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    pos['POS_IS_DPD'] = pos['SK_DPD'].apply(lambda x: 1 if x > 0 else 0) # <-- same with ['LATE_PAYMENT']
    pos['POS_IS_DPD_UNDER_120'] = pos['SK_DPD'].apply(lambda x: 1 if (x > 0) & (x < 120) else 0)
    pos['POS_IS_DPD_OVER_120'] = pos['SK_DPD'].apply(lambda x: 1 if x >= 120 else 0)

    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size', 'min'],
        'SK_DPD': ['max', 'mean', 'sum', 'var', 'min'],
        'SK_DPD_DEF': ['max', 'mean', 'sum'],
        'SK_ID_PREV': ['nunique'],
        'LATE_PAYMENT': ['mean'],
        'SK_ID_CURR': ['count'],
        'CNT_INSTALMENT': ['min', 'max', 'mean', 'sum'],
        'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean', 'sum'],
        'POS_IS_DPD': ['mean', 'sum'],
        'POS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'POS_IS_DPD_OVER_120': ['mean', 'sum'],
    }

    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])

    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()


    sort_pos = pos.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
    gp = sort_pos.groupby('SK_ID_PREV')
    df_pos = pd.DataFrame()
    df_pos['SK_ID_CURR'] = gp['SK_ID_CURR'].first()
    df_pos['MONTHS_BALANCE_MAX'] = gp['MONTHS_BALANCE'].max()

    # Percentage of previous loans completed and completed before initial term
    df_pos['POS_LOAN_COMPLETED_MEAN'] = gp['NAME_CONTRACT_STATUS_Completed'].mean()
    df_pos['POS_COMPLETED_BEFORE_MEAN'] = gp['CNT_INSTALMENT'].first() - gp['CNT_INSTALMENT'].last()
    df_pos['POS_COMPLETED_BEFORE_MEAN'] = df_pos.apply(lambda x: 1 if x['POS_COMPLETED_BEFORE_MEAN'] > 0 \
                                                                      and x['POS_LOAN_COMPLETED_MEAN'] > 0 else 0, axis=1)
    # Number of remaining installments (future installments) and percentage from total
    df_pos['POS_REMAINING_INSTALMENTS'] = gp['CNT_INSTALMENT_FUTURE'].last()
    df_pos['POS_REMAINING_INSTALMENTS_RATIO'] = gp['CNT_INSTALMENT_FUTURE'].last()/gp['CNT_INSTALMENT'].last()

    # Group by SK_ID_CURR and merge
    df_gp = df_pos.groupby('SK_ID_CURR').sum().reset_index()
    df_gp.drop(['MONTHS_BALANCE_MAX'], axis=1, inplace= True)
    pos_agg = pd.merge(pos_agg, df_gp, on= 'SK_ID_CURR', how= 'left')

    # Percentage of late payments for the 3 most recent applications
    pos = do_sum(pos, ['SK_ID_PREV'], 'LATE_PAYMENT', 'LATE_PAYMENT_SUM')

    # Last month of each application
    last_month_df = pos.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax()

    # Most recent applications (last 3)
    sort_pos = pos.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
    gp = sort_pos.iloc[last_month_df].groupby('SK_ID_CURR').tail(3)
    gp_mean = gp.groupby('SK_ID_CURR').mean().reset_index()
    pos_agg = pd.merge(pos_agg, gp_mean[['SK_ID_CURR', 'LATE_PAYMENT_SUM']], on='SK_ID_CURR', how='left')

    print('"Pos-Cash" balance final shape:', pos_agg.shape) 
    return pos_agg


In [16]:
cash=pos_cash(cash)

"Pos-Cash" balance final shape: (337252, 46)


In [17]:
catVars = [var for var in cash.columns if cash[var].dtype=='object']

print('There are {} categorical variables\n'.format(len(catVars)), '\n', catVars)

There are 0 categorical variables
 
 []


In [18]:
def installment(ins):
    
    ins, cat_cols = one_hot_encoder(ins, nan_as_category=True)

    # Group payments and get Payment difference
    ins = do_sum(ins, ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'], 'AMT_PAYMENT', 'AMT_PAYMENT_GROUPED')
    ins['PAYMENT_DIFFERENCE'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT_GROUPED']
    ins['PAYMENT_RATIO'] = ins['AMT_INSTALMENT'] / ins['AMT_PAYMENT_GROUPED']
    ins['PAID_OVER_AMOUNT'] = ins['AMT_PAYMENT'] - ins['AMT_INSTALMENT']
    ins['PAID_OVER'] = (ins['PAID_OVER_AMOUNT'] > 0).astype(int)

    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due (no negative values)
    ins['DPD_diff'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD_diff'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD_diff'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD_diff'].apply(lambda x: x if x > 0 else 0)

    # Flag late payment
    ins['LATE_PAYMENT'] = ins['DBD'].apply(lambda x: 1 if x > 0 else 0)
    ins['INSTALMENT_PAYMENT_RATIO'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['LATE_PAYMENT_RATIO'] = ins.apply(lambda x: x['INSTALMENT_PAYMENT_RATIO'] if x['LATE_PAYMENT'] == 1 else 0, axis=1)

    # Flag late payments that have a significant amount
    ins['SIGNIFICANT_LATE_PAYMENT'] = ins['LATE_PAYMENT_RATIO'].apply(lambda x: 1 if x > 0.05 else 0)
    
    # Flag k threshold late payments
    ins['DPD_7'] = ins['DPD'].apply(lambda x: 1 if x >= 7 else 0)
    ins['DPD_15'] = ins['DPD'].apply(lambda x: 1 if x >= 15 else 0)

    ins['INS_IS_DPD_UNDER_120'] = ins['DPD'].apply(lambda x: 1 if (x > 0) & (x < 120) else 0)
    ins['INS_IS_DPD_OVER_120'] = ins['DPD'].apply(lambda x: 1 if (x >= 120) else 0)

    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum', 'var'],
        'DBD': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum', 'min'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum', 'min'],
        'SK_ID_PREV': ['size', 'nunique'],
        'PAYMENT_DIFFERENCE': ['mean'],
        'PAYMENT_RATIO': ['mean', 'max'],
        'LATE_PAYMENT': ['mean', 'sum'],
        'SIGNIFICANT_LATE_PAYMENT': ['mean', 'sum'],
        'LATE_PAYMENT_RATIO': ['mean'],
        'DPD_7': ['mean'],
        'DPD_15': ['mean'],
        'PAID_OVER': ['mean'],
        'DPD_diff':['mean', 'min', 'max'],
        'DBD_diff':['mean', 'min', 'max'],
        'DAYS_INSTALMENT': ['mean', 'max', 'sum'],
        'INS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'INS_IS_DPD_OVER_120': ['mean', 'sum']
    }

    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])

    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

    # from oof (DAYS_ENTRY_PAYMENT)
    cond_day = ins['DAYS_ENTRY_PAYMENT'] >= -365
    ins_d365_grp = ins[cond_day].groupby('SK_ID_CURR')
    ins_d365_agg_dict = {
        'SK_ID_CURR': ['count'],
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DAYS_ENTRY_PAYMENT': ['mean', 'max', 'sum'],
        'DAYS_INSTALMENT': ['mean', 'max', 'sum'],
        'AMT_INSTALMENT': ['mean', 'max', 'sum'],
        'AMT_PAYMENT': ['mean', 'max', 'sum'],
        'PAYMENT_DIFF': ['mean', 'min', 'max', 'sum'],
        'PAYMENT_PERC': ['mean', 'max'],
        'DPD_diff': ['mean', 'min', 'max'],
        'DPD': ['mean', 'sum'],
        'INS_IS_DPD_UNDER_120': ['mean', 'sum'],
        'INS_IS_DPD_OVER_120': ['mean', 'sum']}

    ins_d365_agg = ins_d365_grp.agg(ins_d365_agg_dict)
    ins_d365_agg.columns = ['INS_D365' + ('_').join(column).upper() for column in ins_d365_agg.columns.ravel()]

    ins_agg = ins_agg.merge(ins_d365_agg, on='SK_ID_CURR', how='left')

    print('"Installments Payments" final shape:', ins_agg.shape)
    return ins_agg


In [19]:
payments=installment(payments)

"Installments Payments" final shape: (339587, 85)


In [20]:
catVars = [var for var in payments.columns if payments[var].dtype=='object']

print('There are {} categorical variables\n'.format(len(catVars)), '\n', catVars)

There are 0 categorical variables
 
 []


In [21]:
def credit_card(cc):    
    
    cc, cat_cols = one_hot_encoder(cc, nan_as_category=True)

    # Amount used from limit
    cc['LIMIT_USE'] = cc['AMT_BALANCE'] / cc['AMT_CREDIT_LIMIT_ACTUAL']
    # Current payment / Min payment
    cc['PAYMENT_DIV_MIN'] = cc['AMT_PAYMENT_CURRENT'] / cc['AMT_INST_MIN_REGULARITY']
    # Late payment <-- 'CARD_IS_DPD'
    cc['LATE_PAYMENT'] = cc['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    # How much drawing of limit
    cc['DRAWING_LIMIT_RATIO'] = cc['AMT_DRAWINGS_ATM_CURRENT'] / cc['AMT_CREDIT_LIMIT_ACTUAL']

    cc['CARD_IS_DPD_UNDER_120'] = cc['SK_DPD'].apply(lambda x: 1 if (x > 0) & (x < 120) else 0)
    cc['CARD_IS_DPD_OVER_120'] = cc['SK_DPD'].apply(lambda x: 1 if x >= 120 else 0)

    # General aggregations
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])

    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()

    # Last month balance of each credit card application
    last_ids = cc.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax()
    last_months_df = cc[cc.index.isin(last_ids)]
    cc_agg = group_and_merge(last_months_df,cc_agg,'CC_LAST_', {'AMT_BALANCE': ['mean', 'max']})

    CREDIT_CARD_TIME_AGG = {
        'AMT_BALANCE': ['mean', 'max'],
        'LIMIT_USE': ['max', 'mean'],
        'AMT_CREDIT_LIMIT_ACTUAL':['max'],
        'AMT_DRAWINGS_ATM_CURRENT': ['max', 'sum'],
        'AMT_DRAWINGS_CURRENT': ['max', 'sum'],
        'AMT_DRAWINGS_POS_CURRENT': ['max', 'sum'],
        'AMT_INST_MIN_REGULARITY': ['max', 'mean'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['max','sum'],
        'AMT_TOTAL_RECEIVABLE': ['max', 'mean'],
        'CNT_DRAWINGS_ATM_CURRENT': ['max','sum', 'mean'],
        'CNT_DRAWINGS_CURRENT': ['max', 'mean', 'sum'],
        'CNT_DRAWINGS_POS_CURRENT': ['mean'],
        'SK_DPD': ['mean', 'max', 'sum'],
        'LIMIT_USE': ['min', 'max'],
        'DRAWING_LIMIT_RATIO': ['min', 'max'],
        'LATE_PAYMENT': ['mean', 'sum'],
        'CARD_IS_DPD_UNDER_120': ['mean', 'sum'],
        'CARD_IS_DPD_OVER_120': ['mean', 'sum']
    }

    for months in [12, 24, 48]:
        cc_prev_id = cc[cc['MONTHS_BALANCE'] >= -months]['SK_ID_PREV'].unique()
        cc_recent = cc[cc['SK_ID_PREV'].isin(cc_prev_id)]
        prefix = 'INS_{}M_'.format(months)
        cc_agg = group_and_merge(cc_recent, cc_agg, prefix, CREDIT_CARD_TIME_AGG)


    print('"Credit Card Balance" final shape:', cc_agg.shape)
    return cc_agg

In [22]:
cc_bal=credit_card(cc_bal)

"Credit Card Balance" final shape: (103558, 284)


In [23]:
catVars = [var for var in cc_bal.columns if cc_bal[var].dtype=='object']

print('There are {} categorical variables\n'.format(len(catVars)), '\n', catVars)

There are 0 categorical variables
 
 []


In [78]:
def data_post_processing(dataframe):
    print(f'---=> the DATA POST-PROCESSING is beginning, the dataset has {dataframe.shape[1]} features')
    # keep index related columns
    index_cols = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']

    dataframe = dataframe.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))
    print('names of feature are renamed')

    '''# Reduced memory usage
    dataframe = reduce_mem_usage(dataframe)
    print(f'---=> pandas data types of features in the dataset are converted for a reduced memory usage')
'''
    # Remove non-informative columns
    noninformative_cols = []
    for col in dataframe.columns:
        if len(dataframe[col].value_counts()) < 2:
            noninformative_cols.append(col)

    dataframe.drop(noninformative_cols, axis=1, inplace=True)
    print(f'---=> {dataframe.shape[1]} features are remained after removing non-informative features')

    # Removing features not interesting for classifier
    feature_num = dataframe.shape[1]
    #this function does not work reason of insufficient memory, I added selected_feature manually!
    '''auc_limit = 0.7
    sub_m, selected_features = model1(dataframe, n_folds=5)'''
    all_features = dataframe.columns.tolist()
    #selected_feature_df = pd.read_csv('../input/homecredit-best-subs/removed_cols_lgbm.csv')
    #selected_features = selected_feature_df.removed_cols.tolist()
    #remained_features = set(all_features).difference(set(selected_features))
    #dataframe = dataframe.drop(feat, axis=1, inplace=True)
    #dataframe = dataframe[selected_features]
    print(f'{feature_num - dataframe.shape[1]} features are eliminated by LightGBM classifier in step I')
    print(f'---=> {dataframe.shape[1]} features are remained after removing features not interesting for LightGBM classifier')


    # generate new columns with risk_groupanizer
    start_feats_num = dataframe.shape[1]
    cat_cols = [col for col in dataframe.columns if 3 < len(dataframe[col].value_counts()) < 20 and col not in index_cols]
    dataframe, _ = risk_groupanizer(dataframe, column_names=cat_cols, upper_limit_ratio=8.1, lower_limit_ratio=8.1)
    print(f'---=> {dataframe.shape[1] - start_feats_num} features are generated with the risk_groupanizer')


    # ending message of DATA POST-PROCESSING
    print(f'---=> the DATA POST-PROCESSING is ended!, now the dataset has a total {dataframe.shape[1]} features')

    gc.collect()
    return dataframe

In [91]:
def Kfold_LightGBM(df):
    print('===============================================', '\n', '##### the ML in processing...')

    # loading predicted result 
    df_subx = pd.read_csv(r"C:\Users\aksha\notebooks\data\submission.csv")
    df_sub = df_subx[['SK_ID_CURR', 'TARGET']]
    df_sub.columns = ['SK_ID_CURR', 'TARGET']
     
    # split train, and test datasets
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    # delete main dataframe for saving memory
    del df_subx
    gc.collect()

        # Expand train dataset with two times of test dataset including predicted results
    test_df.TARGET = np.where(df_sub.TARGET > 0.75, 1, 0)
    train_df = pd.concat([train_df, test_df], axis=0)
    train_df = pd.concat([train_df, test_df], axis=0)
    train_df = pd.concat([train_df, test_df], axis=0)
    print(f'Train shape: {train_df.shape}, test shape: {test_df.shape} are loaded.')

    # Cross validation model
    folds = KFold(n_splits=6, shuffle=True, random_state=666)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    # limit number of feature to only 174!!!
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV']]
    
    # print final shape of dataset to evaluate by LightGBM
    print(f'only {len(feats)} features from a total {train_df.shape[1]} features are used for ML analysis')

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        clf = LGBMClassifier(nthread=-1,
                            #device_type='gpu',
                            n_estimators=5000,
                            learning_rate=0.01,
                            max_depth=11,
                            num_leaves=58,
                            colsample_bytree=0.613,
                            subsample=0.708,
                            max_bin=407,
                            reg_alpha=3.564,
                            reg_lambda=4.930,
                            min_child_weight=6,
                            min_child_samples=165,
                            #keep_training_booster=True,
                            silent=-1,
                            verbose=-1,)

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc')

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # create submission file
    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv('submission.csv', index=False)
    print('a submission file is created')

In [26]:
#A
def findnan(df):
        # Total missing values
        val = df.isnull().sum()
        
        # Percentage of missing values
        val_percent = 100 * df.isnull().sum() / len(df)
        ty=df.dtypes
        
        # Make a table with the results
        table = pd.concat([val, val_percent, ty], axis=1)
        
        # Rename the columns
        table = table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values', 2: 'Data Type'})
        
        # Sort the table by percentage of missing descending
        table = table[
            table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print(df.shape)
        print(df.columns)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(table.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return table

In [42]:
df1 = df2.merge(cc_bal, how='left', on='SK_ID_CURR')

del df2
gc.collect()

969

In [43]:
catVars1 = [var for var in df1.columns if df1[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
print(df1.shape)

There are 0 categorical variables in Train 
 
 []
(356250, 1173)


In [44]:
findnan(df1)

(356250, 1173)
Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_REALTY',
       'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       ...
       'INS_48M_SK_DPD_MAX', 'INS_48M_SK_DPD_SUM',
       'INS_48M_DRAWING_LIMIT_RATIO_MIN', 'INS_48M_DRAWING_LIMIT_RATIO_MAX',
       'INS_48M_LATE_PAYMENT_MEAN', 'INS_48M_LATE_PAYMENT_SUM',
       'INS_48M_CARD_IS_DPD_UNDER_120_MEAN',
       'INS_48M_CARD_IS_DPD_UNDER_120_SUM',
       'INS_48M_CARD_IS_DPD_OVER_120_MEAN',
       'INS_48M_CARD_IS_DPD_OVER_120_SUM'],
      dtype='object', length=1173)
Your selected dataframe has 1173 columns.
There are 1001 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
REFUSED_DAYS_LAST_DUE_DIFF_MAX,356250,100.0,float64
REFUSED_DAYS_LAST_DUE_DIFF_MEAN,356250,100.0,float64
REFUSED_DAYS_TERMINATION_MAX,356250,100.0,float64
ACTIVE_ENDDATE_DIF_MEAN,354347,99.5,float64
ACTIVE_ENDDATE_DIF_MAX,354347,99.5,float64
...,...,...,...
ANNUITY_INCOME_PERC,36,0.0,float64
APP_AMT_INCOME_TOTAL_12_AMT_ANNUITY_ratio,36,0.0,float64
INCOME_PER_PERSON,2,0.0,float64
CNT_FAM_MEMBERS,2,0.0,float64


In [45]:
df1 = remove_missing_columns(df1)
findnan(df1)

There are 77 columns with greater than 75% missing values.
(356250, 1096)
Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_REALTY',
       'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       ...
       'INS_48M_CNT_DRAWINGS_CURRENT_SUM', 'INS_48M_SK_DPD_MEAN',
       'INS_48M_SK_DPD_MAX', 'INS_48M_SK_DPD_SUM', 'INS_48M_LATE_PAYMENT_MEAN',
       'INS_48M_LATE_PAYMENT_SUM', 'INS_48M_CARD_IS_DPD_UNDER_120_MEAN',
       'INS_48M_CARD_IS_DPD_UNDER_120_SUM',
       'INS_48M_CARD_IS_DPD_OVER_120_MEAN',
       'INS_48M_CARD_IS_DPD_OVER_120_SUM'],
      dtype='object', length=1096)
Your selected dataframe has 1096 columns.
There are 924 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
ACTIVE_AMT_ANNUITY_MEAN,263524,74.0,float64
ACTIVE_AMT_ANNUITY_MAX,263524,74.0,float64
CLOSED_AMT_ANNUITY_MAX,261053,73.3,float64
CLOSED_AMT_ANNUITY_MEAN,261053,73.3,float64
CC_LIMIT_USE_VAR,255107,71.6,float64
...,...,...,...
AMT_ANNUITY,36,0.0,float64
APP_AMT_INCOME_TOTAL_12_AMT_ANNUITY_ratio,36,0.0,float64
INCOME_PER_PERSON,2,0.0,float64
CNT_FAM_MEMBERS,2,0.0,float64


In [60]:
train_df = df1[df1['TARGET'].notnull()]
test_df = df1[df1['TARGET'].isnull()]

print(train_df.shape)
print(test_df.shape)

(307506, 1096)
(48744, 1096)


In [62]:
test_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,INS_48M_CNT_DRAWINGS_CURRENT_SUM,INS_48M_SK_DPD_MEAN,INS_48M_SK_DPD_MAX,INS_48M_SK_DPD_SUM,INS_48M_LATE_PAYMENT_MEAN,INS_48M_LATE_PAYMENT_SUM,INS_48M_CARD_IS_DPD_UNDER_120_MEAN,INS_48M_CARD_IS_DPD_UNDER_120_SUM,INS_48M_CARD_IS_DPD_OVER_120_MEAN,INS_48M_CARD_IS_DPD_OVER_120_SUM
307506,100001,,1,0,0,135000.0,568800.0,20560.5,450000.0,0.01885,...,,,,,,,,,,
307507,100005,,0,0,0,99000.0,222768.0,17370.0,180000.0,0.035792,...,,,,,,,,,,
307508,100013,,0,0,0,202500.0,663264.0,69777.0,630000.0,0.019101,...,23.0,0.010417,1.0,1.0,0.010417,1.0,0.010417,1.0,0.0,0.0
307509,100028,,1,0,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,...,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307510,100038,,0,1,1,180000.0,625500.0,32067.0,625500.0,0.010032,...,,,,,,,,,,


In [66]:
sub, feat = model1(df1, n_folds=5)

[LightGBM] [Info] Number of positive: 19911, number of negative: 226093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.988665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 163734
[LightGBM] [Info] Number of data points in the train set: 246004, number of used features: 1081
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 19889, number of negative: 226116
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.983238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 163870
[LightGBM] [Info] Number of data points in the train set: 246005, number of used features: 1082
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[L


df = df.merge(bureau, how='left', on='SK_ID_CURR')
df = df.merge(prev_app, how='left', on='SK_ID_CURR')
df = df.merge(cash, how='left', on='SK_ID_CURR')
df = df.merge(payments, how='left', on='SK_ID_CURR')
'''test1 = test1.merge(bureau, how='left', on='SK_ID_CURR')
print('--=> Train after merge with bureau:', train.shape)
print('--=> Test after merge with bureau:', test.shape)

catVars = [var for var in train1.columns if train1[var].dtype=='object']
catVars1 = [var for var in test1.columns if test1[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
print('There are {} categorical variables in Test \n'.format(len(catVars1)), '\n', catVars1)'''


'''test1 = test1.merge(prev_app, how='left', on='SK_ID_CURR')
print('--=> Train after merge with Previous Applications:', train.shape)
print('--=> Test after merge with Previous Applications:', test.shape)

catVars = [var for var in train1.columns if train1[var].dtype=='object']
catVars1 = [var for var in test1.columns if test1[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
print('There are {} categorical variables in Test \n'.format(len(catVars1)), '\n', catVars1)'''



'''test1 = test1.merge(cash, how='left', on='SK_ID_CURR')
print('--=> Train after merge with POS_CASH:', train1.shape)
print('--=> Test after merge with POS_CASH:', test1.shape)

catVars = [var for var in train1.columns if train1[var].dtype=='object']
catVars1 = [var for var in test1.columns if test1[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
print('There are {} categorical variables in Test \n'.format(len(catVars1)), '\n', catVars1)'''


'''test1 = test1.merge(payments, how='left', on='SK_ID_CURR')
print('--=> Train after merge with Installment Payments:', train1.shape)
print('--=> Test after merge with Installment Payments', test1.shape)

catVars = [var for var in train1.columns if train1[var].dtype=='object']
catVars1 = [var for var in test1.columns if test1[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
print('There are {} categorical variables in Test \n'.format(len(catVars1)), '\n', catVars1)'''

cc_bal.head()

del temp1, temp2
gc.collect()

findnan(train1)

df1 = df.merge(cc_bal, how='left', on='SK_ID_CURR')
print('--=> Df after merge with Credit Card:', df.shape)
catVars = [var for var in df1.columns if df1[var].dtype=='object']
print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)

'''test2 = test1.merge(cc_bal, how='left', on='SK_ID_CURR')
print('--=> Train after merge with Credit Card:', train2.shape)
print('--=> Test after merge with Credit Card:', test2.shape)

catVars = [var for var in train2.columns if train2[var].dtype=='object']
catVars1 = [var for var in test2.columns if test2[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
print('There are {} categorical variables in Test \n'.format(len(catVars1)), '\n', catVars1)'''

findnan(train2)

#findnan(ftrain)

print('--=> Train after merge with Credit Card:', df.shape)
#print('--=> Test after merge with Credit Card:', ftest.shape)

catVars = [var for var in df.columns if df[var].dtype=='object']
#catVars1 = [var for var in ftest.columns if ftest[var].dtype=='object']

print('There are {} categorical variables in Train \n'.format(len(catVars)), '\n', catVars)
#print('There are {} categorical variables in Test \n'.format(len(catVars1)), '\n', catVars1)

for col in catVars:
    df[col] = pd.to_numeric(df[col], errors='coerce').convert_dtypes() 
    
'''for col in catVars1:
    ftest[col] = pd.to_numeric(ftest[col], errors='coerce').convert_dtypes() 
'''

df1 = remove_missing_columns(df1)
findnan(df1)

'''del df, bureau, bureau_bal, train, test, cash, prev_app, cc_bal, payments
gc.collect()'''


train = train.merge(cash, how='left', on='SK_ID_CURR')
test = test.merge(cash, how='left', on='SK_ID_CURR')
print('--=> Train after merge with POS_CASH:', train.shape)
print('--=> Test after merge with POS_CASH:', test.shape)
train = train.merge(payments, how='left', on='SK_ID_CURR')
test = test.merge(payments, how='left', on='SK_ID_CURR')
print('--=> Train after merge with Installment Payments:', train.shape)
print('--=> Test after merge with Installment Payments', test.shape)
train = train.merge(cc_bal, how='left', on='SK_ID_CURR')
test = test.merge(cc_bal, how='left', on='SK_ID_CURR')
print('--=> Train after merge with Credit Card:', train.shape)
print('--=> Test after merge with Credit Card:', test.shape)


train.describe()

catVars = [var for var in df1.columns if df1[var].dtype=='object']

print('There are {} categorical variables\n'.format(len(catVars)), '\n', catVars)

In [75]:
df1.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,CC_LIMIT_USE_MIN,CC_LIMIT_USE_MAX,CC_LIMIT_USE_MEAN,CC_LIMIT_USE_SUM,CC_LIMIT_USE_VAR,CC_PAYMENT_DIV_MIN_SUM,CC_LATE_PAYMENT_SUM,CC_DRAWING_LIMIT_RATIO_SUM,CC_LAST_AMT_BALANCE_MEAN,CC_LAST_AMT_BALANCE_MAX
0,100002,1.0,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637.0,...,,,,,,,,,,
1,100003,0.0,1,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188.0,...,,,,,,,,,,
2,100004,0.0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225.0,...,,,,,,,,,,
3,100006,0.0,1,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0.0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038.0,...,,,,,,,,,,


In [79]:
df2 = data_post_processing(df1)

print('='*50, '\n')
print('---=> Train final shape:', df2.shape, ' <=---', '\n')
print('=' * 50)

---=> the DATA POST-PROCESSING is beginning, the dataset has 629 features
names of feature are renamed
---=> 629 features are remained after removing non-informative features
0 features are eliminated by LightGBM classifier in step I
---=> 629 features are remained after removing features not interesting for LightGBM classifier
---=> 18 features are generated with the risk_groupanizer
---=> the DATA POST-PROCESSING is ended!, now the dataset has a total 647 features

---=> Train final shape: (356250, 647)  <=--- 



df2,col=one_hot_encoder(df1)

findnan(df2)

In [92]:
Kfold_LightGBM(df2)
print('--=> all calculations are done!! <=--')

 ##### the ML in processing...
Train shape: (453738, 647), test shape: (48744, 647) are loaded.
only 645 features from a total 647 features are used for ML analysis
Fold  1 AUC : 0.879417
Fold  2 AUC : 0.876873
Fold  3 AUC : 0.879510
Fold  4 AUC : 0.874981
Fold  5 AUC : 0.877054
Fold  6 AUC : 0.881072
Full AUC score 0.878152
a submission file is created
--=> all calculations are done!! <=--
