## Imports

In [1]:
import pandas as pd
import pdcast as pdc   # Firstly you should run 'pip install pandas-downcast'
import numpy as np


## Helper Functions

In [2]:

def df_optimize(df):
    """Downcast the column dtypes of a DataFrame to smaller equivalents.

    The work is delegated entirely to ``pdc.downcast``.

    Parameters
    ----------
    df : the DataFrame to downcast.

    Returns
    -------
    The DataFrame returned by ``pdc.downcast(df)``.
    """
    return pdc.downcast(df)

def df_optimize_path(df_path: str):
    """Read a CSV file, downcast its column dtypes and report the memory saved.

    Parameters
    ----------
    df_path : path of the CSV file, passed to ``pd.read_csv``.

    Returns
    -------
    The downcast DataFrame returned by ``pdc.downcast``.

    Side effects
    ------------
    Prints the percentage of memory saved by the downcast.
    """
    df = pd.read_csv(df_path)

    # deep=True makes pandas measure the real payload of object (string)
    # columns instead of only the size of their pointers; without it the
    # "before" figure is under-reported and the saving looks wrong.
    bytes_before = df.memory_usage(deep=True).sum()
    df = pdc.downcast(df)
    bytes_after = df.memory_usage(deep=True).sum()

    print("Memory saved by : ", ((1 - bytes_after / bytes_before) * 100), "%")

    return df

def is_unique_key(df: pd.DataFrame, columnName: str):
    """Tell whether a column could serve as a unique key of a DataFrame.

    Parameters
    ----------
    df : the DataFrame to inspect.
    columnName : label of the column to check.

    Returns
    -------
    True when no value of ``df[columnName]`` occurs more than once,
    False otherwise.
    """
    # A column is a unique key exactly when dropping duplicates would not
    # remove any row, i.e. when nothing is flagged as a duplicate.
    has_repeats = df[columnName].duplicated().any()
    return not has_repeats

def df_aggreg(df, on, aggreg_dict):
    """Group a DataFrame and aggregate its columns, returning flat column names.

    Parameters
    ----------
    df : the DataFrame to aggregate.
    on : column label (or list of labels) passed to ``DataFrame.groupby``.
    aggreg_dict : mapping ``column -> function name`` or
        ``column -> list of function names`` passed to ``.agg``.

    Returns
    -------
    A new DataFrame with one row per group and the group key(s) restored as
    regular columns. Column names are flattened as follows:

    * two-level columns (produced when at least one entry of ``aggreg_dict``
      is a list) become ``"<column>_<function>"`` with the function part
      lower-cased;
    * group keys, whose second level is empty, keep their bare name;
    * single-level columns are lower-cased.
    """
    aggregated = df.groupby(on).agg(aggreg_dict).reset_index()

    flat_names = []
    for col in aggregated.columns:
        if isinstance(col, tuple):
            base, func = col[0], col[1]
            # reset_index() gives the group keys an empty second level; use
            # the bare name instead of producing "key_" for arbitrary keys.
            flat_names.append(f'{base}_{func.lower()}' if func else base)
        else:
            flat_names.append(col.lower())
    aggregated.columns = flat_names

    # Legacy renames kept for backward compatibility: the customer id is
    # carried with 'first', and callers expect it under its original name.
    return aggregated.rename(
        columns={
            "sk_id_prev_": "sk_id_prev",
            "sk_id_curr_first": "sk_id_curr",
            "sk_id_curr_": "sk_id_curr",
            "sk_id_bureau_": "sk_id_bureau",
        }
    )
     
      
def last(x):
    """Return the element at the final position of ``x`` (``x.iloc[-1]``)."""
    final_position = -1
    return x.iloc[final_position]

## Loading and Downcasting the Datasets

In [3]:
# Each CSV is read and downcast in one step; df_optimize_path also prints the
# share of memory saved for that file.
loan_applications = df_optimize_path(df_path="Dataset/loan_applications_train.csv")
previous_credits = df_optimize_path(df_path="Dataset/previous_credits.csv")
credit_bureau_balance = df_optimize_path(df_path="Dataset/credit_bureau_balance.csv")
previous_pos_cash_loans = df_optimize_path(df_path="Dataset/previous_POS_cash_loans.csv")
previous_credit_cards = df_optimize_path(df_path="Dataset/previous_credit_cards.csv")
previous_loan_applications = df_optimize_path(df_path="Dataset/previous_loan_applications.csv")
repayment_history = df_optimize_path(df_path="Dataset/repayment_history.csv")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

## Searching for unique IDs (Primary Keys)

In [None]:
# Check, table by table, whether the candidate id column is unique per row.
# The trailing comments record the result observed for each check.
print(is_unique_key(df=loan_applications, columnName="sk_id_curr"))  # True
print(is_unique_key(df=previous_loan_applications, columnName="sk_id_prev"))  # True
print(is_unique_key(df=previous_credits, columnName="sk_id_bureau"))  # True
print(is_unique_key(df=previous_credit_cards, columnName="sk_id_curr"))  # False
print(is_unique_key(df=previous_credit_cards, columnName="sk_id_prev"))  # False
print(is_unique_key(df=credit_bureau_balance, columnName="sk_id_bureau"))  # False
print(is_unique_key(df=previous_pos_cash_loans, columnName="sk_id_prev"))  # False
print(is_unique_key(df=repayment_history, columnName="sk_id_curr"))  # False

True
True
True
False
False
False
False
False


## Get Dummies

In [None]:
# One-hot encode every column of dtype 'category' in each table. Each frame is
# rebound immediately so the un-encoded copy can be released before the next
# table is processed.
# NOTE(review): the category dtype presumably comes from the downcast applied
# at load time — confirm.
loan_applications = pd.get_dummies(
    data=loan_applications,
    columns=loan_applications.select_dtypes(include=['category']).columns,
)
previous_credits = pd.get_dummies(
    data=previous_credits,
    columns=previous_credits.select_dtypes(include=['category']).columns,
)
credit_bureau_balance = pd.get_dummies(
    data=credit_bureau_balance,
    columns=credit_bureau_balance.select_dtypes(include=['category']).columns,
)
previous_pos_cash_loans = pd.get_dummies(
    data=previous_pos_cash_loans,
    columns=previous_pos_cash_loans.select_dtypes(include=['category']).columns,
)
previous_credit_cards = pd.get_dummies(
    data=previous_credit_cards,
    columns=previous_credit_cards.select_dtypes(include=['category']).columns,
)
previous_loan_applications = pd.get_dummies(
    data=previous_loan_applications,
    columns=previous_loan_applications.select_dtypes(include=['category']).columns,
)
repayment_history = pd.get_dummies(
    data=repayment_history,
    columns=repayment_history.select_dtypes(include=['category']).columns,
)

## Groupby and Aggregation

In [None]:
# Aggregation spec for previous_credit_cards (applied per sk_id_prev below).
aggregations_prev_credit_cards = {
    # Carry the customer id through the aggregation.
    'sk_id_curr': 'first',
    'months_balance': 'max',
    # Both statistics must be requested in ONE entry: a dict literal cannot
    # hold the same key twice, so separate 'max' and 'sum' entries silently
    # kept only 'sum'.
    # Because this entry is a list, the aggregated frame gets two-level
    # columns, which df_aggreg flattens to '<column>_<function>'
    # (e.g. 'amt_balance_max', 'amt_balance_sum', 'months_balance_max').
    'amt_balance': ['max', 'sum'],
    'amt_credit_limit_actual': 'max',
    'amt_payment_current': 'mean',
    'amt_inst_min_regularity': 'mean',
    'sk_dpd': 'max',
    'sk_dpd_def': 'max'
}
# Aggregation spec for previous_pos_cash_loans.
aggregations_prev_pos_cash_loans = {
    'sk_id_curr': 'first',
    'months_balance': 'max',
    'cnt_instalment': 'mean',
    'cnt_instalment_future': 'sum',
    'sk_dpd': 'max',
    'sk_dpd_def': 'max',
    # One 'sum' per contract-status column.
    # NOTE(review): these are presumably the indicator columns created by
    # get_dummies, so the sum is a per-status row count — confirm.
    **{
        f'name_contract_status_{status}': 'sum'
        for status in (
            'Active',
            'Amortized debt',
            'Approved',
            'Canceled',
            'Completed',
            'Demand',
            'Returned to the store',
            'Signed',
            'XNA',
        )
    },
}

# Aggregation spec for repayment_history.
aggregations_repayment_history = {
    'sk_id_curr': 'first',
    'num_instalment_version': 'nunique',
    'num_instalment_number': 'max',
    # Earliest and latest value of each day column (a fresh list per key).
    **{day_col: ['min', 'max'] for day_col in ('days_instalment', 'days_entry_payment')},
    # Totals of the amount columns.
    **dict.fromkeys(('amt_instalment', 'amt_payment'), 'sum'),
}

# Aggregation spec for credit_bureau_balance.
# NOTE(review): 'last' depends on the row order inside each group — confirm
# the frame is ordered so that the last row is the one intended.
aggregations_credit_bureau_balance = {
    'months_balance': ['min', 'max'],
    **{f'status_{code}': 'last' for code in '012345CX'},
}
# Collapse each history table to one row per id, then print whether that id is
# now unique in the aggregated frame.
# NOTE: every frame is overwritten by its aggregate, so this cell is meant to
# be run once per kernel session.
previous_credit_cards = df_aggreg(
    df=previous_credit_cards,
    on="sk_id_prev",
    aggreg_dict=aggregations_prev_credit_cards,
)
print(is_unique_key(df=previous_credit_cards, columnName="sk_id_prev"))

previous_pos_cash_loans = df_aggreg(
    df=previous_pos_cash_loans,
    on="sk_id_prev",
    aggreg_dict=aggregations_prev_pos_cash_loans,
)
print(is_unique_key(df=previous_pos_cash_loans, columnName="sk_id_prev"))

repayment_history = df_aggreg(
    df=repayment_history,
    on="sk_id_prev",
    aggreg_dict=aggregations_repayment_history,
)
print(is_unique_key(df=repayment_history, columnName="sk_id_prev"))

credit_bureau_balance = df_aggreg(
    df=credit_bureau_balance,
    on='sk_id_bureau',
    aggreg_dict=aggregations_credit_bureau_balance,
)
print(is_unique_key(df=credit_bureau_balance, columnName="sk_id_bureau"))



True
True
True
True


## Inplace merging and downcasting the results

In [None]:

# Section 2: previous loan applications enriched with the per-sk_id_prev
# aggregates. The first join is a left join (applications without credit-card
# history are kept); the next two are inner joins (applications without a
# match in the right-hand table are dropped).
section2 = previous_loan_applications.merge(
    previous_credit_cards, on=["sk_id_prev", "sk_id_curr"], how="left"
)
print("First merge done")

section2 = section2.merge(
    previous_pos_cash_loans, on=["sk_id_prev", "sk_id_curr"], how="inner"
)
print("Second merge done")

section2 = section2.merge(
    repayment_history, on=["sk_id_prev", "sk_id_curr"], how='inner'
)
print("Third merge done")

print('Section Two merge Done')
print("___________________________")
print("Merging inside Section 3 ...")

# Section 3: bureau credits joined with their aggregated monthly balances.
section3 = previous_credits.merge(
    credit_bureau_balance, on="sk_id_bureau", how="inner"
)
print("Fourth merge done")

# NOTE(review): section2 is keyed by sk_id_prev and section3 by sk_id_bureau,
# so neither appears to be unique on sk_id_curr; this join may multiply rows
# per customer — confirm that is intended.
section3_2 = section2.merge(section3, on="sk_id_curr", how="inner")

section1 = loan_applications

full = section1.merge(section3_2, on="sk_id_curr", how="inner")
print("Merge Complete")

First merge done
Second merge done
Third merge done
Fourth merge done
Section Two merge Done
___________________________
Merging Section 2 with Section 3 ...
Fifth merge done
___________________________
Merging Section 1 with the other Sections ...
Merge Complete
