In [9]:
#Import Packages
import pandas as pd
import numpy as np

In [2]:
# Load the three monthly snapshots (per the file name, month 3 carries the target).
monthly_csv_paths = (
    'data/train_month_1.csv',
    'data/train_month_2.csv',
    'data/train_month_3_with_target.csv',
)
month1, month2, month3 = (pd.read_csv(csv_path) for csv_path in monthly_csv_paths)

Between the first- and second-month data, we check which columns change. We store the results as booleans (True/False) in a Python list that indicates which features change and which remain constant. Note that we always set the entry for client_id to True, so that we can merge the final datasets on this column.

In [3]:
# One flag per month-1 column, in month1's column order: True when the column's
# values are not identical in month 2. The list is used later as a boolean column mask.
notequal = [not month1[name].equals(month2[name]) for name in month1.columns]
# Force the first column to be kept; it is expected to be client_id, the merge key.
notequal[0] = True
print(notequal)

[True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, True, False, True, True]


Repeat for the first and third month. Note that we append an extra True value at the end, because the third-month dataset has an additional column (target) that is not available in the other two datasets.

In [4]:
# Same comparison as for month 2, now month 1 against month 3.
notequal2 = [not month1[name].equals(month3[name]) for name in month1.columns]
# Force the first column to be kept; it is expected to be client_id, the merge key.
notequal2[0] = True
# month3 has one more column than month1 (the target), so the mask needs one more slot.
notequal2.append(True)

print(notequal2)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, True, False, True, True, True]


In [19]:
# Tag every month-3 column with "_3" so features do not overwrite each other when
# merging, then restore the plain names of the merge key and the label.
# NOTE(review): re-running this cell appends "_3" a second time; run it once per kernel.
month3 = month3.add_suffix("_3").rename(
    columns={"client_id_3": "client_id", "target_3": "target"}
)
month3.head()

Unnamed: 0,client_id,homebanking_active_3,has_homebanking_3,has_insurance_21_3,has_insurance_23_3,has_life_insurance_fixed_cap_3,has_life_insurance_decreasing_cap_3,has_fire_car_other_insurance_3,has_personal_loan_3,has_mortgage_loan_3,...,customer_since_bank_3,customer_gender_3,customer_birth_date_3,customer_postal_code_3,customer_occupation_code_3,customer_self_employed_3,customer_education_3,customer_children_3,customer_relationship_3,target
0,910df42ad36243aa4ce16324cd7b15b0,0,0,0,0,0,0,1,0,0,...,1994-08,1,1943-09,3630,9.0,0,0.0,,,0
1,4e19dc3a54323c5bbfc374664b950cd1,1,1,0,0,0,0,0,0,0,...,2017-01,1,1994-02,2460,9.0,0,,mature,couple,0
2,f5d08db1b86c0cb0f566bf446cff1fb4,1,1,0,0,0,0,1,0,0,...,1980-12,2,1936-10,2660,9.0,0,,,single,0
3,26170ecf63653e215c52f4262c1c4859,0,0,0,0,0,0,1,0,0,...,2013-10,1,1946-09,6600,9.0,0,,,,0
4,c078009957dffb64f20e61b41220a976,0,0,0,0,0,0,0,0,0,...,2012-11,2,1996-04,8550,9.0,0,,mature,couple,1


Next, we merge the three datasets, joining only the features that change from month to month. Anything constant, such as date of birth, appears only once in the final dataset. All features that are repeated from month to month get a numeric suffix indicating the month. For example, "has_homebanking_1" indicates whether a client has homebanking in month 1, while "has_homebanking_2" is the same feature for month 2, etc.

In [20]:
# Keep only the columns flagged as changing (plus client_id) from months 2 and 3.
month2_changed = month2.loc[:, notequal]
month3_changed = month3.loc[:, notequal2]

# Inner-join on client_id. Columns shared by months 1 and 2 get "_1"/"_2";
# month-3 columns already carry their "_3" suffix.
full_df = month1.merge(month2_changed, on="client_id", how="inner", suffixes=["_1", "_2"])
full_df = full_df.merge(month3_changed, on="client_id", how="inner")
full_df.head()

Unnamed: 0,client_id,homebanking_active_1,has_homebanking_1,has_insurance_21_1,has_insurance_23_1,has_life_insurance_fixed_cap_1,has_life_insurance_decreasing_cap_1,has_fire_car_other_insurance_1,has_personal_loan_1,has_mortgage_loan_1,...,bal_pension_saving_3,bal_savings_account_3,bal_savings_account_starter_3,bal_current_account_starter_3,visits_distinct_so_3,visits_distinct_so_areas_3,customer_self_employed_3,customer_children_3,customer_relationship_3,target
0,910df42ad36243aa4ce16324cd7b15b0,0,0,0,0,0,0,1,0,0,...,0,22000,0,0,1.0,1.0,0,,,0
1,4e19dc3a54323c5bbfc374664b950cd1,1,1,0,0,0,0,0,0,0,...,0,10570,0,0,1.0,1.0,0,mature,couple,0
2,f5d08db1b86c0cb0f566bf446cff1fb4,1,1,0,0,0,0,1,0,0,...,0,15200,0,0,1.0,1.0,0,,single,0
3,26170ecf63653e215c52f4262c1c4859,0,0,0,0,0,0,1,0,0,...,0,29020,0,0,1.0,1.0,0,,,0
4,c078009957dffb64f20e61b41220a976,0,0,0,0,0,0,0,0,0,...,0,13650,0,0,1.0,1.0,0,mature,couple,1


To avoid carrying several copies of the same feature for different months, we combine them either into a sequence of binary values or into an aggregate count.

In [21]:
# Binary presence/absence flags: the first 14 columns that changed between
# months 1 and 2 (client_id followed by 13 status flags).
changed_month1_columns = month1.loc[:, notequal].columns
status_features = changed_month1_columns[:14].tolist()
status_features

['client_id',
 'homebanking_active',
 'has_homebanking',
 'has_insurance_21',
 'has_insurance_23',
 'has_life_insurance_fixed_cap',
 'has_life_insurance_decreasing_cap',
 'has_fire_car_other_insurance',
 'has_personal_loan',
 'has_mortgage_loan',
 'has_current_account',
 'has_pension_saving',
 'has_savings_account',
 'has_current_account_starter']

For the first 13 features (excluding client_id), we can concat the binary features into a string sequence to indicate a 3 month combination pattern:

In [22]:
# For every status flag (client_id is skipped), concatenate its per-month values
# into one string per client and store it as "<feature>_seq".
for feature in (name for name in status_features if name != "client_id"):
    # Columns named "<feature>_<digit>...", i.e. the monthly copies of this flag.
    monthly_flags = full_df.filter(regex="^" + str(feature) + "_[0-9]")
    full_df[str(feature) + "_seq"] = monthly_flags.apply(
        lambda row: ''.join(row.astype(str)), axis=1
    )

In [23]:
# Preview only the columns whose name contains "_seq".
full_df.filter(regex=".*_seq").head()

Unnamed: 0,homebanking_active_seq,has_homebanking_seq,has_insurance_21_seq,has_insurance_23_seq,has_life_insurance_fixed_cap_seq,has_life_insurance_decreasing_cap_seq,has_fire_car_other_insurance_seq,has_personal_loan_seq,has_mortgage_loan_seq,has_current_account_seq,has_pension_saving_seq,has_savings_account_seq,has_current_account_starter_seq
0,0,0,0,0,0,0,111,0,0,111,0,111,0
1,111,111,0,0,0,0,0,0,0,111,0,111,0
2,111,111,0,0,0,0,111,0,0,111,0,111,0
3,0,0,0,0,0,0,111,0,0,0,0,111,0
4,0,0,0,0,0,0,0,0,0,0,0,111,0


We can also simply aggregate the binary values to count how many positive occurrences a particular client had over the 3-month span for these 13 features:

In [24]:
# For every status flag (client_id is skipped), sum its per-month values per client
# and store the total as "<feature>_count".
for feature in (name for name in status_features if name != "client_id"):
    # Columns named "<feature>_<digit>...", i.e. the monthly copies of this flag.
    monthly_flags = full_df.filter(regex="^" + str(feature) + "_[0-9]")
    full_df[str(feature) + "_count"] = monthly_flags.sum(axis=1)

In [35]:
# Preview the first rows of the merged frame after the feature-engineering steps above.
full_df.head()

Unnamed: 0,client_id,homebanking_active_1,has_homebanking_1,has_insurance_21_1,has_insurance_23_1,has_life_insurance_fixed_cap_1,has_life_insurance_decreasing_cap_1,has_fire_car_other_insurance_1,has_personal_loan_1,has_mortgage_loan_1,...,has_insurance_23_count,has_life_insurance_fixed_cap_count,has_life_insurance_decreasing_cap_count,has_fire_car_other_insurance_count,has_personal_loan_count,has_mortgage_loan_count,has_current_account_count,has_pension_saving_count,has_savings_account_count,has_current_account_starter_count
0,910df42ad36243aa4ce16324cd7b15b0,0,0,0,0,0,0,1,0,0,...,0,0,0,3,0,0,3,0,3,0
1,4e19dc3a54323c5bbfc374664b950cd1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,3,0
2,f5d08db1b86c0cb0f566bf446cff1fb4,1,1,0,0,0,0,1,0,0,...,0,0,0,3,0,0,3,0,3,0
3,26170ecf63653e215c52f4262c1c4859,0,0,0,0,0,0,1,0,0,...,0,0,0,3,0,0,0,0,3,0
4,c078009957dffb64f20e61b41220a976,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0


For balance features, we take the difference between the third and first month to track the change over time. Note that this also includes the visit counts.

In [31]:
# Positions 14..27 of the columns that changed between months 1 and 2:
# the balance/amount columns plus the two visit counters.
changed_month1_columns = month1.loc[:, notequal].columns
balance_features = changed_month1_columns[14:28].tolist()
balance_features

['bal_insurance_21',
 'bal_insurance_23',
 'cap_life_insurance_fixed_cap',
 'cap_life_insurance_decreasing_cap',
 'prem_fire_car_other_insurance',
 'bal_personal_loan',
 'bal_mortgage_loan',
 'bal_current_account',
 'bal_pension_saving',
 'bal_savings_account',
 'bal_savings_account_starter',
 'bal_current_account_starter',
 'visits_distinct_so',
 'visits_distinct_so_areas']

In [None]:
import numpy as np

In [68]:
# Month-3 minus month-1 change for every balance/visit feature, stored as "<feature>_diff".
# The two columns are selected by exact name: a prefix regex such as "^<feature>_1"
# can match more than one column (and yields a 2-D array), while an exact lookup
# either returns the intended column or fails loudly with a KeyError.
for col in balance_features:
    full_df[f"{col}_diff"] = full_df[f"{col}_3"] - full_df[f"{col}_1"]

In [70]:
# Checkpoint the merged, feature-engineered frame to disk (row index not written).
full_df.to_csv(path_or_buf="data_train_final.csv", index=False)

In [5]:
# Reload the checkpoint written by the previous step.
# The "<feature>_seq" columns hold digit strings such as "000" or "011". With default
# type inference read_csv would parse them as integers (0, 11), dropping the leading
# zeros, so they are read back explicitly as strings.
_checkpoint_path = "data_train_final.csv"
_checkpoint_header = pd.read_csv(_checkpoint_path, nrows=0)  # header only, no data rows
_seq_columns = [name for name in _checkpoint_header.columns if name.endswith("_seq")]
full_df = pd.read_csv(_checkpoint_path, dtype={name: str for name in _seq_columns})

Convert Column into datetime object

In [6]:
# Parse the year-month strings into timestamps (first day of the month).
# The date columns shown in the previews above are "YYYY-MM" (e.g. "1943-09"), so the
# format must be '%Y-%m'. The former '%Y-%m-%d' did not describe the data; it only
# worked where pandas matched ISO formats leniently, and stricter pandas releases
# are expected to reject it.
# NOTE(review): customer_since_all is assumed to use the same "YYYY-MM" layout as
# customer_since_bank and customer_birth_date - confirm against the raw file.
for _source_col in ('customer_since_all', 'customer_birth_date'):
    full_df[_source_col + '_d'] = pd.to_datetime(full_df[_source_col], format='%Y-%m')

Compute the age of the client

In [10]:
# Age in calendar years relative to 2018.
full_df['age'] = 2018 - full_df['customer_birth_date_d'].dt.year

# Whole months elapsed between the birth date and 2018-10-01.
# np.timedelta64(1, 'M') is an ambiguous unit that newer pandas releases are expected
# to refuse. numpy defines it as an average month of 2 629 746 s (365.2425 days / 12),
# so that exact length is used directly and the resulting values are unchanged.
# NOTE(review): "mob" is derived from the birth date, while customer_since_all_d is
# computed but never used. If "mob" is meant as months-on-book, the source column
# should probably be customer_since_all_d - confirm before changing.
_reference_date = pd.to_datetime('2018-10-01')
_average_month = pd.Timedelta(seconds=2629746)
full_df['mob'] = (
    (_reference_date - full_df['customer_birth_date_d']) / _average_month
).astype('int')

In [13]:
# Month-3 product groups. Row-wise max/sum use the vectorised DataFrame reductions
# instead of apply(lambda row: ...), which gives the same values without a Python-level
# loop over every row.
_insurance_flags = [
    'has_insurance_21_3',
    'has_insurance_23_3',
    'has_life_insurance_fixed_cap_3',
    'has_life_insurance_decreasing_cap_3',
    'has_fire_car_other_insurance_3',
]
_savings_flags = [
    'has_pension_saving_3',
    'has_savings_account_3',
    'has_savings_account_starter_3',
    'has_current_account_starter_3',
]
_loan_flags = ['has_personal_loan_3', 'has_mortgage_loan_3', 'has_current_account_3']

# 1 when the client holds at least one product of the group in month 3.
full_df['insurance'] = full_df[_insurance_flags].max(axis=1)
full_df['savings'] = full_df[_savings_flags].max(axis=1)
full_df['loan'] = full_df[_loan_flags].max(axis=1)
full_df['has_account'] = full_df[['insurance', 'savings', 'loan']].max(axis=1)

# Total month-3 balance per product group.
full_df['bal_insurance'] = full_df[['bal_insurance_21_3', 'bal_insurance_23_3']].sum(axis=1)
full_df['bal_savings'] = full_df[
    ['bal_pension_saving_3', 'bal_savings_account_3',
     'bal_savings_account_starter_3', 'bal_current_account_starter_3']
].sum(axis=1)
full_df['bal_loan'] = full_df[
    ['bal_personal_loan_3', 'bal_mortgage_loan_3', 'bal_current_account_3']
].sum(axis=1)

# Loan-to-savings ratio, computed once (the statement was previously duplicated).
# Negative and NaN ratios (e.g. 0/0) are replaced by 0.
# NOTE(review): a positive loan balance with zero savings yields +inf, which passes
# the ">= 0" test and is kept - decide whether it should be capped or zeroed.
_loan_to_savings = full_df['bal_loan'] / full_df['bal_savings']
full_df['debt_ratio'] = np.where(_loan_to_savings >= 0, _loan_to_savings, 0)

# 1 when customer_gender equals 1, otherwise 0 (missing values also map to 0).
full_df['gender'] = np.where(full_df['customer_gender'] == 1, 1, 0)


In [14]:
# Write the enriched frame back to the same file it was loaded from (row index not written).
full_df.to_csv(path_or_buf="data_train_final.csv", index=False)