# Loan Prediction Competition (Zindi)

In [13]:
import pandas as pd
import numpy as np

from datetime import datetime


# Show up to 500 columns / rows when a dataframe is displayed.
# The fully qualified option names are used because the short forms
# ('max_columns', 'max_rows') can match more than one pandas option.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## 1. Exploring data

In [14]:
# Directory holding the competition CSV files. Kept in a single place so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/Users/ayoubelqadi/PycharmProjects/zindi_paper/data'

# Loan performance (target), previous loans and demographics of each customer
train_perf = pd.read_csv(DATA_DIR + '/trainperf.csv', low_memory=False)
train_prevloans = pd.read_csv(DATA_DIR + '/trainprevloans.csv', low_memory=False)
train_demographics = pd.read_csv(DATA_DIR + '/traindemographics.csv', low_memory=False)


**Each customer is labelled as a good or bad customer using their past loans and demographic data. The prevloans table contains all the previous loans of each customer.**

In [106]:
# Preview the first two rows of the performance table
train_perf.head(2)

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,15000.0,17250.0,30,,Good


In [107]:
# Preview the first two rows of the demographics table
train_demographics.head(2)

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10 00:00:00.000000,Savings,3.319219,6.528604,GT Bank,,,
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21 00:00:00.000000,Savings,3.325598,7.119403,Sterling Bank,,Permanent,


In [108]:
# Preview the first two rows of the previous-loans table
train_prevloans.head(2)

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15 18:22:40.000000,2016-08-15 17:22:32.000000,10000.0,13000.0,30,2016-09-01 16:06:48.000000,,2016-09-14 00:00:00.000000,2016-09-01 15:51:43.000000
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28 18:39:07.000000,2017-04-28 17:38:53.000000,10000.0,13000.0,30,2017-05-28 14:44:49.000000,,2017-05-30 00:00:00.000000,2017-05-26 00:00:00.000000


## Data Preparation (Paper)

**Processing Demographic data**

1. Drop columns with % of missing data greater than 80%
2. Get age of the customer (birthdate given)


In [109]:
def counter_nan(series):
    '''Return how many entries of ``series`` are missing (as reported by ``isna``).'''
    missing_mask = series.isna()
    return int(missing_mask.sum())

def get_percentage_missing_data(df, threshold, show_percentage=False):
    '''Return the columns of ``df`` whose share of missing values exceeds ``threshold``.

    Inputs:
        df: dataframe to inspect.
        threshold: fraction of missing rows; a column is reported when its
            missing fraction is strictly greater than this value.
        show_percentage: when True, print a dict mapping every column to its
            percentage of missing values (rounded to two decimals).

    Returns:
        List of column names, in the order they appear in ``df``. A dataframe
        without rows reports no column instead of raising ZeroDivisionError.
    '''
    rows = df.shape[0]
    missing_features = []
    perc_missing_features = {}
    for column in df.columns:
        # With zero rows there is nothing to measure: treat the column as complete
        missing_fraction = int(df[column].isna().sum()) / rows if rows else 0.0
        perc_missing_features[column] = round(missing_fraction * 100, 2)
        if missing_fraction > threshold:
            missing_features.append(column)

    if show_percentage:
        print(perc_missing_features)
    return missing_features
# Label customers without an employment status as 'non-defined'
def fill_nan_employment(df):
    '''Replace missing values of ``employment_status_clients`` with 'non-defined', in place.'''
    status = df['employment_status_clients']
    df['employment_status_clients'] = status.fillna('non-defined')
            
    

In [110]:
# Demographic columns missing in more than 80% of the rows
# (the per-column missing percentages are printed as well)
columns_to_drop = get_percentage_missing_data(
    train_demographics,
    threshold=0.8,
    show_percentage=True,
)

{'customerid': 0.0, 'birthdate': 0.0, 'bank_account_type': 0.0, 'longitude_gps': 0.0, 'latitude_gps': 0.0, 'bank_name_clients': 0.0, 'bank_branch_clients': 98.83, 'employment_status_clients': 14.91, 'level_of_education_clients': 86.49}


In [111]:
#Label missing employment status (modifies train_demographics in place)
fill_nan_employment(train_demographics)
#Drop columns with high percentage of missing values
train_demographics = train_demographics.drop(columns_to_drop, axis='columns')

----

In [112]:
def birthday_to_age(string_date, current_year=2018):
    '''Return the age in years implied by a birthdate string.

    Only the year is used (``current_year`` minus the birth year); the month
    and day of the birthdate are ignored.

    Inputs:
        string_date: date string that starts with the year followed by '-',
            e.g. '1973-10-10 00:00:00.000000'.
        current_year: reference year; defaults to 2018, the value that was
            previously hard-coded.
    '''
    year_birthday = int(string_date.split('-')[0])
    return current_year - year_birthday

In [113]:
#Transform birthdate into the current age of each customer
#(overwrites the column, so this cell cannot be run twice on the same frame)
train_demographics['birthdate'] = train_demographics['birthdate'].map(birthday_to_age)

**Performance dataset + Previous Loan dataset**
1. Create new features: interest, rate of interest using total due and loan amount
2. Convert term days to year. (scale to year)
3. Extract dates and treat them


In [114]:
def get_interest_data(previous_loan_df):
    '''Add the columns ``interest`` and ``interest_rate`` to the loans dataframe, in place.

    ``interest`` is ``totaldue - loanamount``; ``interest_rate`` is that amount
    expressed as a percentage of ``loanamount``, rounded to two decimals.
    Nothing is returned.
    '''
    principal = previous_loan_df['loanamount']
    interest_amount = previous_loan_df['totaldue'] - principal
    previous_loan_df['interest'] = interest_amount
    previous_loan_df['interest_rate'] = ((interest_amount / principal) * 100).round(2)

def scale_termdays(previous_loan_df):
    '''Overwrite ``termdays`` with the term expressed in years (days / 365, two decimals), in place.'''
    term_in_years = previous_loan_df['termdays'] / 365
    previous_loan_df['termdays'] = term_in_years.round(2)
    


In [115]:
# Adds the 'interest' and 'interest_rate' columns to train_prevloans in place
get_interest_data(previous_loan_df=train_prevloans)


In [116]:
# Overwrites 'termdays' in place: running this cell again rescales the already scaled values
scale_termdays(previous_loan_df=train_prevloans)

In [117]:
#Date features

In [118]:
def get_date_features(df, keep_customer_id=False):
    '''Return the names of the columns of ``df`` whose name contains 'date'.

    When ``keep_customer_id`` is True, the returned list starts with 'customerid'.
    '''
    leading = ['customerid'] if keep_customer_id else []
    return leading + [name for name in df.columns if 'date' in name]

def get_non_date_features(df, keep_customer_id=False):
    '''Return the names of the columns of ``df`` whose name does not contain 'date'.

    The first column of ``df`` is always skipped, whatever its name
    (presumably it is the customer id - verify against the caller).
    When ``keep_customer_id`` is True, the returned list starts with 'customerid'.
    '''
    leading = ['customerid'] if keep_customer_id else []
    candidate_columns = df.columns[1:]
    return leading + [name for name in candidate_columns if 'date' not in name]


##### Create a series of date features using the date features extracted from prevloans dataframe

In [119]:
# Keep only the customer id and the date columns of the previous loans
date_features_df = train_prevloans.drop(
    get_non_date_features(train_prevloans),
    axis='columns',
)

In [120]:
def string_to_datetime(df):
    '''Parse every date column of ``df`` (as listed by ``get_date_features``), in place.

    Only the part before the first space ('YYYY-MM-DD') of each string is
    parsed, so the time of day is discarded.
    '''
    def _parse_day(raw_value):
        day_part = raw_value.split(' ')[0]
        return datetime.strptime(day_part, '%Y-%m-%d')

    for column_name in get_date_features(df):
        df[column_name] = df[column_name].apply(_parse_day)
    
    

In [121]:
#Convert the string dates to datetime values (modifies date_features_df in place)
string_to_datetime(df=date_features_df)

In [122]:
def differentiate_dates(df, columns_to_differentiate=[]):
    '''Create a new column in which we store the distance between two dates'''
    #Days scale
    name_new_column = 'diff_'+columns_to_differentiate[0]+'_'+columns_to_differentiate[1]
    sec_in_day = 60*60*24
    #
    df[name_new_column] = df[columns_to_differentiate[0]] - df[columns_to_differentiate[1]]
    df[name_new_column] = df[name_new_column].apply(lambda x: round((x.total_seconds()/sec_in_day), 2))


In [123]:
#Create two features that measure the 'speed' of repayment, in days:
#first due date minus first repaid date, and closing date minus creation date
differentiate_dates(date_features_df, columns_to_differentiate=['firstduedate', 'firstrepaiddate'])
differentiate_dates(date_features_df, columns_to_differentiate=['closeddate', 'creationdate'])

In [124]:
def date_to_day(df):
    '''Add the day of the week of the first repayment and of the first due date, in place.

    ``repaid_dayofweek`` is computed from ``firstrepaiddate`` and
    ``due_dayofweek`` from ``firstduedate`` using ``datetime.weekday``
    (Monday is 0, Sunday is 6).
    '''
    def _weekday_of(moment):
        return datetime.weekday(moment)

    column_pairs = (
        ('firstrepaiddate', 'repaid_dayofweek'),
        ('firstduedate', 'due_dayofweek'),
    )
    for source_column, target_column in column_pairs:
        df[target_column] = df[source_column].apply(_weekday_of)
        
def is_month_start(df):
    '''Flag whether the first repayment / first due date is the first day of a month, in place.

    ``is_month_start_repaid`` is computed from ``firstrepaiddate`` and
    ``is_month_start_due`` from ``firstduedate``; each holds 1 when the day
    of the month is 1 and 0 otherwise.
    '''
    def _first_of_month_flag(moment):
        if moment.day == 1:
            return 1
        return 0

    column_pairs = (
        ('firstrepaiddate', 'is_month_start_repaid'),
        ('firstduedate', 'is_month_start_due'),
    )
    for source_column, target_column in column_pairs:
        df[target_column] = df[source_column].apply(_first_of_month_flag)
    

In [125]:
#Day of the week on which the first due date and the first repayment took place
#(adds 'repaid_dayofweek' and 'due_dayofweek' to date_features_df in place)
date_to_day(df=date_features_df)

In [126]:
#is_month_start(date_features_df)

In [127]:
#Append the derived date features (without the id and the raw date columns)
#to the prevloans data, side by side
train_prevloans_t = pd.concat(
    [
        train_prevloans,
        date_features_df.drop(columns=get_date_features(train_prevloans, keep_customer_id=True)),
    ],
    axis='columns',
)

In [128]:
# Preview the first two rows of the enriched previous-loans table
train_prevloans_t.head(2)

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate,interest,interest_rate,diff_firstduedate_firstrepaiddate,diff_closeddate_creationdate,repaid_dayofweek,due_dayofweek
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15 18:22:40.000000,2016-08-15 17:22:32.000000,10000.0,13000.0,0.08,2016-09-01 16:06:48.000000,,2016-09-14 00:00:00.000000,2016-09-01 15:51:43.000000,3000.0,30.0,13.0,17.0,3,2
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28 18:39:07.000000,2017-04-28 17:38:53.000000,10000.0,13000.0,0.08,2017-05-28 14:44:49.000000,,2017-05-30 00:00:00.000000,2017-05-26 00:00:00.000000,3000.0,30.0,4.0,30.0,4,1


In [129]:
#Identifiers and raw dates are no longer needed once the features are derived
columns_to_drop = [
    'systemloanid',
    'approveddate',
    'creationdate',
    'closeddate',
    'referredby',
    'firstduedate',
    'firstrepaiddate',
]
train_prevloans_t = train_prevloans_t.drop(columns_to_drop, axis='columns')


In [130]:
# Preview the first ten rows after dropping the raw columns
train_prevloans_t.head(10)

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,interest,interest_rate,diff_firstduedate_firstrepaiddate,diff_closeddate_creationdate,repaid_dayofweek,due_dayofweek
0,8a2a81a74ce8c05d014cfb32a0da1049,2,10000.0,13000.0,0.08,3000.0,30.0,13.0,17.0,3,2
1,8a2a81a74ce8c05d014cfb32a0da1049,9,10000.0,13000.0,0.08,3000.0,30.0,4.0,30.0,4,1
2,8a2a81a74ce8c05d014cfb32a0da1049,8,20000.0,23800.0,0.08,3800.0,19.0,-22.0,52.0,2,1
3,8a8588f35438fe12015444567666018e,5,10000.0,11500.0,0.04,1500.0,15.0,0.0,15.0,0,0
4,8a85890754145ace015429211b513e16,2,10000.0,11500.0,0.04,1500.0,15.0,-11.0,27.0,4,0
5,8a858970548359cc0154883481981866,5,20000.0,23800.0,0.08,3800.0,19.0,1.0,29.0,1,2
6,8a858970548359cc0154883481981866,8,30000.0,39000.0,0.16,9000.0,30.0,1.0,29.0,0,1
7,8a8589f35451855401546b0738c42524,6,20000.0,24500.0,0.08,4500.0,22.5,0.0,32.0,0,0
8,8a858e095c59b91b015c5e5cea3719bc,2,10000.0,13000.0,0.08,3000.0,30.0,3.0,27.0,0,3
9,8a858e1158dc4d830158f7bde4f47ea7,3,10000.0,11500.0,0.04,1500.0,15.0,0.0,15.0,1,1


-----

In [131]:
def get_statistics_by_customer(df):
    '''Return a dataframe with the loan features aggregated by customer.

    ``df`` is grouped by ``customerid`` and the max, min and mean of every
    feature are computed. The result is indexed by ``customerid`` and holds
    the columns ``<feature>_max`` for all features, then ``<feature>_min``,
    then ``<feature>_mean``. ``loannumber`` is not part of the result.

    Inputs:
        df: dataframe with the columns ``customerid`` and ``loannumber`` plus
            the features to aggregate. The feature columns are identified by
            name, so the position of the two id columns does not matter.
    '''
    stats = ('max', 'min', 'mean')
    id_columns = ('customerid', 'loannumber')
    feature_columns = [column for column in df.columns if column not in id_columns]

    grouped = df.groupby(['customerid'])
    aggregated_frames = []
    for stat in stats:
        # Same treatment for every statistic: aggregate, drop the loan counter, suffix the names
        aggregated = getattr(grouped, stat)()
        aggregated = aggregated.drop(columns=['loannumber'])
        rename_feature = {column: column + '_' + stat for column in feature_columns}
        aggregated_frames.append(aggregated.rename(columns=rename_feature))

    df_stat = pd.concat(aggregated_frames, axis=1)

    return df_stat
 
            

            

            
        
            
         

In [132]:
#Transformed prevloans data: one row per customer, with the customer id
#moved from the index to a regular 'customerid' column (placed last)
train_prevloans_t = get_statistics_by_customer(train_prevloans_t)
train_prevloans_t = (
    train_prevloans_t
    .assign(customerid=train_prevloans_t.index)
    .reset_index(drop=True)
)


In [133]:
# (rows, columns) of the per-customer aggregate
train_prevloans_t.shape

(4359, 28)

##### TODO: review why there are fewer rows in the transformed dataframe than in the original one (the aggregation above keeps one row per customer)

In [134]:
#Merge prevloans data with demographic data
#(inner join: only customers present in both tables are kept)
transformed_data = pd.merge(
    train_demographics,
    train_prevloans_t,
    how='inner',
    on='customerid',
)

In [135]:
# Join the performance table with the customer features (displayed only, not stored)
pd.merge(train_perf, transformed_data, how='inner', on='customerid')

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,loanamount_max,totaldue_max,termdays_max,interest_max,interest_rate_max,diff_firstduedate_firstrepaiddate_max,diff_closeddate_creationdate_max,repaid_dayofweek_max,due_dayofweek_max,loanamount_min,totaldue_min,termdays_min,interest_min,interest_rate_min,diff_firstduedate_firstrepaiddate_min,diff_closeddate_creationdate_min,repaid_dayofweek_min,due_dayofweek_min,loanamount_mean,totaldue_mean,termdays_mean,interest_mean,interest_rate_mean,diff_firstduedate_firstrepaiddate_mean,diff_closeddate_creationdate_mean,repaid_dayofweek_mean,due_dayofweek_mean
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good,46,Other,3.432010,6.433055,Diamond Bank,Permanent,30000.0,34500.0,0.08,4500.0,30.0,29.0,69.0,6,4,10000.0,13000.0,0.08,3000.0,15.0,-39.0,1.0,0,0,18181.818182,22081.818182,0.080000,3900.000000,22.909091,0.909091,30.000000,2.000000,1.636364
1,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,20000.0,22250.0,15,,Good,34,Other,11.139350,10.292041,EcoBank,Permanent,10000.0,13000.0,0.08,3000.0,30.0,10.0,45.0,4,4,10000.0,11500.0,0.04,1500.0,15.0,-30.0,6.0,0,0,10000.000000,11750.000000,0.046667,1750.000000,17.500000,-0.833333,18.833333,2.166667,1.333333
2,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,10000.0,11500.0,15,,Good,41,Savings,3.985770,7.491708,First Bank,Permanent,10000.0,13000.0,0.08,3000.0,30.0,-4.0,36.0,4,0,10000.0,11500.0,0.04,1500.0,15.0,-11.0,27.0,4,0,10000.000000,12250.000000,0.060000,2250.000000,22.500000,-7.500000,31.500000,4.000000,0.000000
3,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,40000.0,44000.0,30,,Good,32,Other,7.457913,9.076574,GT Bank,Permanent,30000.0,39000.0,0.16,9000.0,30.0,8.0,30.0,6,4,10000.0,13000.0,0.08,3000.0,19.0,1.0,23.0,0,0,18750.000000,23550.000000,0.100000,4800.000000,25.875000,3.125000,27.375000,3.375000,1.250000
4,8a858e095c59b91b015c5e5cea3719bc,301972027,3,2017-07-10 19:25:33.000000,2017-07-10 18:25:28.000000,10000.0,13000.0,30,,Good,29,Savings,3.259444,6.557011,GT Bank,Permanent,10000.0,13000.0,0.08,3000.0,30.0,3.0,27.0,1,3,10000.0,11500.0,0.04,1500.0,15.0,2.0,13.0,0,3,10000.000000,12250.000000,0.060000,2250.000000,22.500000,2.500000,20.000000,0.500000,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3267,8a858e395cb1d4d9015cb2115b1d13d7,301964335,2,2017-07-05 08:23:02.000000,2017-07-05 07:22:56.000000,10000.0,11500.0,15,8a858eb75c21a2b9015c29ebece12d01,Bad,39,Savings,3.302387,6.568690,GT Bank,non-defined,10000.0,11500.0,0.04,1500.0,15.0,0.0,17.0,0,0,10000.0,11500.0,0.04,1500.0,15.0,0.0,17.0,0,0,10000.000000,11500.000000,0.040000,1500.000000,15.000000,0.000000,17.000000,0.000000,0.000000
3268,8a858ee85cf400f5015cf44ab1c42d5c,301998967,2,2017-07-27 15:35:47.000000,2017-07-27 14:35:40.000000,10000.0,13000.0,30,,Bad,29,Savings,4.607358,8.460608,FCMB,Permanent,10000.0,13000.0,0.08,3000.0,30.0,4.0,27.0,3,0,10000.0,13000.0,0.08,3000.0,30.0,4.0,27.0,3,0,10000.000000,13000.000000,0.080000,3000.000000,30.000000,4.000000,27.000000,3.000000,0.000000
3269,8a858f365b2547f3015b284597147c94,301995576,3,2017-07-25 16:25:57.000000,2017-07-25 15:24:47.000000,10000.0,11500.0,15,,Bad,29,Savings,3.976842,7.409129,First Bank,Permanent,10000.0,11500.0,0.04,1500.0,15.0,0.0,32.0,4,4,10000.0,11500.0,0.04,1500.0,15.0,-16.0,15.0,2,0,10000.000000,11500.000000,0.040000,1500.000000,15.000000,-8.000000,23.500000,3.000000,2.000000
3270,8a858f935ca09667015ca0ee3bc63f51,301977679,2,2017-07-14 13:50:27.000000,2017-07-14 12:50:21.000000,10000.0,13000.0,30,8a858eda5c8863ff015c9dead65807bb,Bad,28,Savings,3.986089,7.386796,Skye Bank,Permanent,10000.0,13000.0,0.08,3000.0,30.0,0.0,30.0,3,3,10000.0,13000.0,0.08,3000.0,30.0,0.0,30.0,3,3,10000.000000,13000.000000,0.080000,3000.000000,30.000000,0.000000,30.000000,3.000000,3.000000


## Functions 

In [10]:
def counter_nan(series):
    '''Return how many entries of ``series`` are missing (as reported by ``isna``).'''
    missing_mask = series.isna()
    return int(missing_mask.sum())


def get_percentage_missing_data(df, threshold, show_percentage=False):
    '''Return the columns of ``df`` whose share of missing values exceeds ``threshold``.

    Inputs:
        df: dataframe to inspect.
        threshold: fraction of missing rows; a column is reported when its
            missing fraction is strictly greater than this value.
        show_percentage: when True, print a dict mapping every column to its
            percentage of missing values (rounded to two decimals).

    Returns:
        List of column names, in the order they appear in ``df``. A dataframe
        without rows reports no column instead of raising ZeroDivisionError.
    '''
    rows = df.shape[0]
    missing_features = []
    perc_missing_features = {}
    for column in df.columns:
        # With zero rows there is nothing to measure: treat the column as complete
        missing_fraction = int(df[column].isna().sum()) / rows if rows else 0.0
        perc_missing_features[column] = round(missing_fraction * 100, 2)
        if missing_fraction > threshold:
            missing_features.append(column)

    if show_percentage:
        print(perc_missing_features)
    return missing_features
# Label customers without an employment status as 'non-defined'
def fill_nan_employment(df):
    '''Replace missing values of ``employment_status_clients`` with 'non-defined', in place.'''
    status = df['employment_status_clients']
    df['employment_status_clients'] = status.fillna('non-defined')


    
    
def birthday_to_age(string_date, current_year=2018):
    '''Return the age in years implied by a birthdate string.

    Only the year is used (``current_year`` minus the birth year); the month
    and day of the birthdate are ignored.

    Inputs:
        string_date: date string that starts with the year followed by '-',
            e.g. '1973-10-10 00:00:00.000000'.
        current_year: reference year; defaults to 2018, the value that was
            previously hard-coded.
    '''
    year_birthday = int(string_date.split('-')[0])
    return current_year - year_birthday



def get_interest_data(previous_loan_df):
    '''Add the columns ``interest`` and ``interest_rate`` to the loans dataframe, in place.

    ``interest`` is ``totaldue - loanamount``; ``interest_rate`` is that amount
    expressed as a percentage of ``loanamount``, rounded to two decimals.
    Nothing is returned.
    '''
    principal = previous_loan_df['loanamount']
    interest_amount = previous_loan_df['totaldue'] - principal
    previous_loan_df['interest'] = interest_amount
    previous_loan_df['interest_rate'] = ((interest_amount / principal) * 100).round(2)
    
    

def scale_termdays(previous_loan_df):
    '''Overwrite ``termdays`` with the term expressed in years (days / 365, two decimals), in place.'''
    term_in_years = previous_loan_df['termdays'] / 365
    previous_loan_df['termdays'] = term_in_years.round(2)
    
    
    
def get_date_features(df, keep_customer_id=False):
    '''Return the names of the columns of ``df`` whose name contains 'date'.

    When ``keep_customer_id`` is True, the returned list starts with 'customerid'.
    '''
    leading = ['customerid'] if keep_customer_id else []
    return leading + [name for name in df.columns if 'date' in name]



def get_non_date_features(df, keep_customer_id=False):
    '''Return the names of the columns of ``df`` whose name does not contain 'date'.

    The first column of ``df`` is always skipped, whatever its name
    (presumably it is the customer id - verify against the caller).
    When ``keep_customer_id`` is True, the returned list starts with 'customerid'.
    '''
    leading = ['customerid'] if keep_customer_id else []
    candidate_columns = df.columns[1:]
    return leading + [name for name in candidate_columns if 'date' not in name]

    

def string_to_datetime(df):
    '''Parse every date column of ``df`` (as listed by ``get_date_features``), in place.

    Only the part before the first space ('YYYY-MM-DD') of each string is
    parsed, so the time of day is discarded.
    '''
    def _parse_day(raw_value):
        day_part = raw_value.split(' ')[0]
        return datetime.strptime(day_part, '%Y-%m-%d')

    for column_name in get_date_features(df):
        df[column_name] = df[column_name].apply(_parse_day)
    

    
def differentiate_dates(df, columns_to_differentiate=[]):
    '''Create a new column in which we store the distance between two dates'''
    #Days scale
    name_new_column = 'diff_'+columns_to_differentiate[0]+'_'+columns_to_differentiate[1]
    sec_in_day = 60*60*24
    #
    df[name_new_column] = df[columns_to_differentiate[0]] - df[columns_to_differentiate[1]]
    df[name_new_column] = df[name_new_column].apply(lambda x: round((x.total_seconds()/sec_in_day), 2))
    
    
def date_to_day(df):
    '''Add the day of the week of the first repayment and of the first due date, in place.

    ``repaid_dayofweek`` is computed from ``firstrepaiddate`` and
    ``due_dayofweek`` from ``firstduedate`` using ``datetime.weekday``
    (Monday is 0, Sunday is 6).
    '''
    def _weekday_of(moment):
        return datetime.weekday(moment)

    column_pairs = (
        ('firstrepaiddate', 'repaid_dayofweek'),
        ('firstduedate', 'due_dayofweek'),
    )
    for source_column, target_column in column_pairs:
        df[target_column] = df[source_column].apply(_weekday_of)
        
        
        
def is_month_start(df):
    '''Flag whether the first repayment / first due date is the first day of a month, in place.

    ``is_month_start_repaid`` is computed from ``firstrepaiddate`` and
    ``is_month_start_due`` from ``firstduedate``; each holds 1 when the day
    of the month is 1 and 0 otherwise.
    '''
    def _first_of_month_flag(moment):
        if moment.day == 1:
            return 1
        return 0

    column_pairs = (
        ('firstrepaiddate', 'is_month_start_repaid'),
        ('firstduedate', 'is_month_start_due'),
    )
    for source_column, target_column in column_pairs:
        df[target_column] = df[source_column].apply(_first_of_month_flag)
            

def get_statistics_by_customer(df):
    '''Return a dataframe with the loan features aggregated by customer.

    ``df`` is grouped by ``customerid`` and the max, min and mean of every
    feature are computed. The result is indexed by ``customerid`` and holds
    the columns ``<feature>_max`` for all features, then ``<feature>_min``,
    then ``<feature>_mean``. ``loannumber`` is not part of the result.

    Inputs:
        df: dataframe with the columns ``customerid`` and ``loannumber`` plus
            the features to aggregate. The feature columns are identified by
            name, so the position of the two id columns does not matter.
    '''
    stats = ('max', 'min', 'mean')
    id_columns = ('customerid', 'loannumber')
    feature_columns = [column for column in df.columns if column not in id_columns]

    grouped = df.groupby(['customerid'])
    aggregated_frames = []
    for stat in stats:
        # Same treatment for every statistic: aggregate, drop the loan counter, suffix the names
        aggregated = getattr(grouped, stat)()
        aggregated = aggregated.drop(columns=['loannumber'])
        rename_feature = {column: column + '_' + stat for column in feature_columns}
        aggregated_frames.append(aggregated.rename(columns=rename_feature))

    df_stat = pd.concat(aggregated_frames, axis=1)

    return df_stat
 
            

            

            



In [15]:
def get_final_df(demographics, prev_loans, perf):
    '''Build the final table from the demographics, previous-loans and performance frames.

    Steps: drop the demographic columns with more than 80% of missing values,
    label missing employment status, turn the birthdate into an age, derive
    the interest and date-based features of the previous loans, aggregate
    them by customer (max / min / mean), inner-join everything on
    ``customerid`` and one-hot encode the categorical demographic columns.

    ``demographics`` and ``prev_loans`` are copied first: the helpers used
    below modify their argument in place, and without the copies the caller's
    frames would be altered (a second call would rescale ``termdays`` again).

    Inputs:
        demographics: demographic data with string birthdates, one row per customer.
        prev_loans: previous loans with string dates and ``termdays`` in days.
        perf: performance data holding the ``customerid`` to join on.

    Returns:
        The merged, one-hot encoded dataframe.
    '''
    demographics = demographics.copy()
    prev_loans = prev_loans.copy()

    #Drop columns with high percentage of missing values
    columns_to_drop = get_percentage_missing_data(demographics, 0.8, show_percentage=False)
    fill_nan_employment(demographics)
    demographics = demographics.drop(columns=columns_to_drop)
    #Transform birthdate in current age of each customer
    demographics['birthdate'] = demographics['birthdate'].apply(birthday_to_age)

    #Transform prevloans: payment info
    get_interest_data(prev_loans)
    scale_termdays(prev_loans)

    #Date features treatment: string dates to datetime format
    date_features_df = prev_loans.drop(columns=get_non_date_features(prev_loans))
    string_to_datetime(date_features_df)
    #Create two features that compute the 'speed' of repayment
    differentiate_dates(date_features_df, ['firstduedate', 'firstrepaiddate'])
    differentiate_dates(date_features_df, ['closeddate', 'creationdate'])
    #Which day of the week the due and the repaid took place
    date_to_day(date_features_df)

    #Merge the transformed date features with the prevloans data.
    #The prev_loans argument is used here, not the notebook-level train_prevloans.
    derived_features = date_features_df.drop(columns=get_date_features(prev_loans, keep_customer_id=True))
    prev_loans = pd.concat([prev_loans, derived_features], axis=1)
    #Non relevant variables once transformed the data
    columns_to_drop = ['systemloanid', 'approveddate', 'creationdate', 'closeddate', 'referredby', 'firstduedate',
                  'firstrepaiddate']
    prev_loans = prev_loans.drop(columns=columns_to_drop)
    #Final prev loans data: one row per customer, id as a regular column
    prev_loans = get_statistics_by_customer(prev_loans)
    prev_loans['customerid'] = prev_loans.index
    prev_loans = prev_loans.reset_index(drop=True)

    #Merge prevloans data with demographic data
    transformed_data = demographics.merge(prev_loans, how='inner', on=['customerid'])
    final_data = perf.merge(transformed_data, how='inner', on=['customerid'])

    #Drop dates loans
    final_data = final_data.drop(columns=['approveddate', 'creationdate', 'referredby'])
    #one hot encoding
    final_data = pd.get_dummies(final_data, columns=['bank_account_type', 'bank_name_clients', 'employment_status_clients'])

    return final_data


In [16]:
# NOTE: pass the frames exactly as loaded from the CSV files. get_final_df
# repeats the transformations of the exploratory cells above (it parses
# 'birthdate' as a string and rescales 'termdays'), so frames already
# modified by those cells must be reloaded first.
final = get_final_df(demographics=train_demographics, prev_loans=train_prevloans, perf=train_perf)