## Imports

In [104]:
import pandas as pd
import pdcast as pdc   # Firstly you should run 'pip install pandas-downcast'
import numpy as np


## Helper Functions

In [118]:

def df_optimize(df):
    """Shrink the column dtypes of a dataframe to reduce its memory footprint.

    The work is delegated to ``pdc.downcast``; whatever dataframe it
    produces is handed back to the caller.
    """
    return pdc.downcast(df)

def df_optimize_path(df_path: str):
    """Load a CSV file, downcast its column dtypes and report the memory saved.

    Parameters
    ----------
    df_path : str
        Path of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    The dataframe produced by ``pdc.downcast``.

    Side effect: prints the relative drop of ``memory_usage().sum()``
    (called with its default arguments) as a percentage.
    """
    raw = pd.read_csv(df_path)
    # Measure before downcasting so the baseline reflects the dtypes as read.
    bytes_before = raw.memory_usage().sum()

    compact = pdc.downcast(raw)
    bytes_after = compact.memory_usage().sum()

    print("Memory saved by : ", ((1 - bytes_after / bytes_before) * 100), "%")
    return compact

def is_unique_key(df: pd.DataFrame, columnName: str):
    """Tell whether a column could serve as a primary key of the dataframe.

    The column qualifies when dropping its duplicated values leaves exactly
    as many entries as the dataframe has rows.

    Returns True when every value of ``columnName`` is distinct, False otherwise.
    """
    distinct_values = df[columnName].drop_duplicates()
    return len(df) == len(distinct_values)

def df_aggreg(df, on, aggreg_dict):
    """Group ``df`` by ``on``, aggregate it, and return a frame with flat column names.

    Parameters
    ----------
    df : dataframe to aggregate (it is not modified).
    on : column name, or list of column names, to group by.
    aggreg_dict : mapping ``column -> function or list of functions`` passed
        to ``DataFrameGroupBy.agg``.

    Returns
    -------
    A new dataframe with one row per group. Aggregated columns are named
    ``<column>_<function>`` when ``agg`` yields two-level column labels, and
    the lower-cased column name otherwise. Group-key columns keep their
    plain name.
    """
    aggregated = df.groupby(on).agg(aggreg_dict).reset_index()

    def _flat_name(col):
        # Single-level labels are only lower-cased.
        if not isinstance(col, tuple):
            return col.lower()
        base, func = col[0], col[1].lower()
        # After reset_index() the group keys carry an empty second level;
        # return the bare key instead of leaving a dangling "_" suffix.
        return f'{base}_{func}' if func else base

    aggregated.columns = [_flat_name(col) for col in aggregated.columns]

    # The identifier picked with 'first' is restored to its original name; the
    # remaining entries are kept so previously handled labels map as before.
    id_renames = {
        "sk_id_prev_": "sk_id_prev",
        "sk_id_curr_first": "sk_id_curr",
        "sk_id_curr_": "sk_id_curr",
        "sk_id_bureau_": "sk_id_bureau",
    }
    return aggregated.rename(columns=id_renames)
     
      
def last(x):
    """Return the element at the final position of ``x``.

    The lookup is purely positional (``x.iloc[-1]``), so whatever value sits
    in the last slot is returned as-is.
    """
    final_position = -1
    return x.iloc[final_position]

## Loading and Downcasting the Datasets

In [119]:
# Directory holding the raw CSV files; change it here to relocate the data.
DATASET_DIR = "Dataset"

# Each call reads one CSV, downcasts its dtypes and prints the memory saved.
loan_applications = df_optimize_path(f"{DATASET_DIR}/loan_applications_train.csv")
previous_credits = df_optimize_path(f"{DATASET_DIR}/previous_credits.csv")
credit_bureau_balance = df_optimize_path(f"{DATASET_DIR}/credit_bureau_balance.csv")
previous_pos_cash_loans = df_optimize_path(f"{DATASET_DIR}/previous_POS_cash_loans.csv")
previous_credit_cards = df_optimize_path(f"{DATASET_DIR}/previous_credit_cards.csv")
previous_loan_applications = df_optimize_path(f"{DATASET_DIR}/previous_loan_applications.csv")
repayment_history = df_optimize_path(f"{DATASET_DIR}/repayment_history.csv")

Memory saved by :  68.95278420949069 %
Memory saved by :  60.29362021532256 %
Memory saved by :  74.99992979225063 %
Memory saved by :  71.87492750985867 %
Memory saved by :  61.41298197275421 %
Memory saved by :  68.57983431938032 %
Memory saved by :  62.49999081247355 %


## Searching for Unique IDs (Primary Keys)

In [107]:
# Check, table by table, whether the candidate identifier column holds only
# distinct values. The trailing comment on each line is the result printed
# by this run.
print(is_unique_key(loan_applications,"sk_id_curr"))  # True
print(is_unique_key(previous_loan_applications,"sk_id_prev")) # True
print(is_unique_key(previous_credits,"sk_id_bureau")) # True
print(is_unique_key(previous_credit_cards,"sk_id_curr")) # False
print(is_unique_key(previous_credit_cards,"sk_id_prev")) # False
print(is_unique_key(credit_bureau_balance,"sk_id_bureau")) # False
print(is_unique_key(previous_pos_cash_loans,"sk_id_prev")) # False
print(is_unique_key(repayment_history,"sk_id_curr")) # False

True
True
True
False
False
False
False
False


## Groupby and Aggregation

In [108]:
# Aggregation specs, one per table: column -> function(s) applied per group.
# Key order is significant because it fixes the order of the output columns.
aggregations_prev_credit_cards = {
    'sk_id_curr': 'first',
    'months_balance': 'max',
    'amt_balance': ['max', 'sum'],
    'amt_credit_limit_actual': 'max',
    'amt_payment_current': 'mean',
    'amt_inst_min_regularity': 'mean',
    'sk_dpd': 'max',
    'sk_dpd_def': 'max',
}
aggregations_prev_pos_cash_loans = {
    'sk_id_curr': 'first',
    'months_balance': 'max',
    'cnt_instalment': 'mean',
    'cnt_instalment_future': 'sum',
    'name_contract_status': 'last',
    'sk_dpd': ['min', 'max', 'mean'],
    'sk_dpd_def': ['min', 'max', 'mean'],
}
aggregations_repayment_history = {
    'sk_id_curr': 'first',
    'num_instalment_version': 'nunique',
    'num_instalment_number': 'max',
    'days_instalment': ['min', 'max'],
    'days_entry_payment': ['min', 'max'],
    'amt_instalment': 'sum',
    'amt_payment': 'sum',
}
# 'status' uses the notebook's own `last` helper (a function object), not the
# string 'last' used for name_contract_status above.
aggregations_credit_bureau_balance = {
    'months_balance': ['min', 'max'],
    'status': last,
}

# NOTE: every statement below overwrites a raw table with its aggregated
# version, so this cell is not idempotent. The aggregated frames no longer
# have the original column names; reload the data before running it again.

# Collapse each history table to one row per key, then confirm the key is unique.
previous_credit_cards = df_aggreg(
    df=previous_credit_cards,
    on="sk_id_prev",
    aggreg_dict=aggregations_prev_credit_cards,
)
print(is_unique_key(previous_credit_cards, "sk_id_prev"))

previous_pos_cash_loans = df_aggreg(
    df=previous_pos_cash_loans,
    on="sk_id_prev",
    aggreg_dict=aggregations_prev_pos_cash_loans,
)
print(is_unique_key(previous_pos_cash_loans, "sk_id_prev"))

repayment_history = df_aggreg(
    df=repayment_history,
    on="sk_id_prev",
    aggreg_dict=aggregations_repayment_history,
)
print(is_unique_key(repayment_history, "sk_id_prev"))

credit_bureau_balance = df_aggreg(
    df=credit_bureau_balance,
    on='sk_id_bureau',
    aggreg_dict=aggregations_credit_bureau_balance,
)
print(is_unique_key(credit_bureau_balance, "sk_id_bureau"))



True
True
True
True


## In-place Merging and Downcasting the Results

In [109]:

# Keys shared by the previous-application table and its aggregated histories.
prev_loan_keys = ["sk_id_prev", "sk_id_curr"]

# Section 2: attach the per-application aggregates to the previous applications.
section2 = previous_loan_applications.merge(previous_credit_cards, on=prev_loan_keys, how="left")
print("First merge done")

section2 = section2.merge(previous_pos_cash_loans, on=prev_loan_keys, how="left")
print("Second merge done")

section2 = section2.merge(repayment_history, on=prev_loan_keys, how='left')
print("Third merge done")

# NOTE(review): previous_credits is joined on sk_id_curr alone, so a client
# with several previous applications and several bureau credits yields one
# row per combination. Confirm this row multiplication is intended.
section2 = section2.merge(previous_credits, on="sk_id_curr", how="left")
print("Fourth merge done")
print('Section Two merge Done')
print("___________________________")
print("Merging Section 2 with Section 3 ...")

# Section 3: add the aggregated bureau balance to each bureau credit.
section3 = section2.merge(credit_bureau_balance, on="sk_id_bureau", how="left")
print("Fifth merge done")
print("___________________________")
print("Merging Section 1 with the other Sections ...")

# Section 1: left-join everything onto the current loan applications.
section1 = loan_applications.merge(section3, on="sk_id_curr", how="left")
print("Merge Complete")

First merge done
Second merge done
Third merge done
Fourth merge done
Section Two merge Done
___________________________
Merging Section 2 with Section 3 ...
Fifth merge done
___________________________
Merging Section 1 with the other Sections ...
Merge Complete


In [110]:
# Show the fully merged dataframe; the notebook renders a truncated preview
# (first and last rows, with elided columns) rather than the whole table.
section1

Unnamed: 0,sk_id_curr,target,name_contract_type_x,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit_x,amt_annuity,...,amt_credit_sum,amt_credit_sum_debt,amt_credit_sum_limit,amt_credit_sum_overdue,credit_type,days_credit_update,amt_annuity_y,months_balance_min,months_balance_max,status_last
0,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,40761.0,,,0.0,Credit card,-1038.0,0.0,-36.0,-15.0,X
1,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.00000,,0.0,Credit card,-47.0,,-15.0,0.0,0
2,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,135000.0,0.00000,0.0,0.0,Consumer credit,-1185.0,0.0,-47.0,-32.0,X
3,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,19071.0,,,0.0,Consumer credit,-906.0,0.0,-36.0,-21.0,X
4,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,120735.0,0.00000,0.0,0.0,Consumer credit,-34.0,0.0,-21.0,-18.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8044095,456255,False,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,436032.0,363573.53125,0.0,0.0,Consumer credit,-25.0,0.0,-11.0,0.0,X
8044096,456255,False,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,450000.0,191005.46875,0.0,0.0,Consumer credit,-55.0,3244.5,-14.0,0.0,X
8044097,456255,False,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,900000.0,,,0.0,Consumer credit,-781.0,0.0,-49.0,-17.0,X
8044098,456255,False,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,38925.0,,,0.0,Credit card,-779.0,3244.5,-76.0,-44.0,X
