In [1]:
import ast
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [98]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [99]:
# PaymentsHistory and TransactionDates hold Python list literals stored as strings.
# They are parsed with ast.literal_eval instead of eval: literal_eval only accepts
# literals, so a malformed or malicious CSV cell cannot execute arbitrary code.

# Collapse each row's payment list into a single total, TotalPaymentsHistory.
train['TotalPaymentsHistory'] = train['PaymentsHistory'].apply(
    lambda payments: sum(ast.literal_eval(payments))
)
train = train.drop(columns=['PaymentsHistory'])

# Parse every TransactionDates list once, then keep only its first and last entries
# as FirstTransactionDate and LastTransactionDate.
transaction_dates = train['TransactionDates'].apply(ast.literal_eval)
train['FirstTransactionDate'] = transaction_dates.apply(lambda dates: dates[0])
train['LastTransactionDate'] = transaction_dates.apply(lambda dates: dates[-1])
train = train.drop(columns=['TransactionDates'])

# The target is the row-wise sum of the m1..m6 columns, which are then dropped.
target_columns = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6']
train['Target'] = train[target_columns].sum(axis=1)
train = train.drop(columns=target_columns)


In [100]:
# Parse the first/last transaction dates ('%m-%Y' strings; unparseable values
# become NaT) and derive year and month columns from each of them.
for prefix in ('First', 'Last'):
    date_col = f'{prefix}TransactionDate'
    train[date_col] = pd.to_datetime(train[date_col], format='%m-%Y', errors='coerce')
    train[f'{prefix}TransactionYear'] = train[date_col].dt.year
    train[f'{prefix}TransactionMonth'] = train[date_col].dt.month

In [102]:
# Read the metadata file and preview its first five rows.
metadata = pd.read_csv('metadata.csv')
metadata.head(n=5)

Unnamed: 0,ID,RegistrationDate,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,Term,TotalContractValue,ExpectedTermDate,FirstPaymentDate,LastPaymentDate
0,ID_K00S4N4,2015-12-10 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Male,41.0,Mount Kenya Region,Embu,Other,d_light,364,14740.0,2016-12-08 00:00:00,2015-12-10 09:52:35,2016-10-23 04:52:30
1,ID_6L67PAA,2015-12-09 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Male,33.0,Coast Region,Kilifi,Other,d_light,364,14740.0,2016-12-07 00:00:00,2015-12-09 13:14:03,2020-05-24 15:32:18
2,ID_102CV85,2015-12-18 00:00:00,2000,2018-03-29 10:14:58,35.0,FINANCED,DAILY,35,7,Female,48.0,Nairobi Region,Makueni,Business,d_light,392,29480.0,2017-01-13 00:00:00,2015-12-18 06:22:34,2017-02-01 15:23:44
3,ID_HXBJFHB,2015-11-25 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Female,43.0,,UNKNOWN,Teacher,d_light,364,14740.0,2016-11-23 00:00:00,2015-11-25 13:25:57,2017-05-22 16:46:54
4,ID_3K9VZ5J,2015-12-02 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Female,56.0,Mount Kenya Region,Kirinyaga,Other,d_light,364,14740.0,2016-11-30 00:00:00,2015-12-05 10:34:32,2017-05-12 16:50:52


In [103]:
# Count the missing values in each metadata column.
metadata.isnull().sum()

ID                         0
RegistrationDate           0
Deposit                    0
UpsellDate             36370
AccessoryRate              0
PaymentMethod              0
rateTypeEntity             0
RatePerUnit                0
DaysOnDeposit              0
MainApplicantGender        0
Age                     6939
Region                  1934
Town                       0
Occupation                 0
SupplierName               0
Term                       0
TotalContractValue         0
ExpectedTermDate           0
FirstPaymentDate           0
LastPaymentDate            0
dtype: int64

In [104]:
# Remove columns that are mostly missing
def drop_missing_values(df, threshold=0.5):
    """Return ``df`` without the columns that have too few non-missing values.

    A column is kept when its number of non-NA entries is at least
    ``threshold * len(df)``. With the default of 0.5, columns that are more
    than half missing are removed.
    """
    min_non_na = threshold * len(df)
    return df.dropna(axis=1, thresh=min_non_na)

# Keep only the metadata columns that are at least half populated.
metadata = drop_missing_values(df=metadata, threshold=0.5)

# Impute the missing values with the mode
def impute_missing_values(df):
    """Fill missing values in each column of ``df`` with that column's mode.

    The frame is modified in place and also returned. When a column has
    several modes, the first one returned by ``Series.mode`` is used.
    Columns without missing values are left untouched, and so are columns
    whose values are all missing (they have no mode to fill with).
    """
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        modes = column.mode()
        if modes.empty:
            # Every value is missing, so there is nothing to impute from.
            continue
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that chained form operates on a
        # temporary object and does not update df under pandas Copy-on-Write.
        df[col] = column.fillna(modes.iloc[0])
    return df

metadata = impute_missing_values(df=metadata)

# Re-count missing values per column after imputation.
metadata.isnull().sum()

ID                     0
RegistrationDate       0
Deposit                0
AccessoryRate          0
PaymentMethod          0
rateTypeEntity         0
RatePerUnit            0
DaysOnDeposit          0
MainApplicantGender    0
Age                    0
Region                 0
Town                   0
Occupation             0
SupplierName           0
Term                   0
TotalContractValue     0
ExpectedTermDate       0
FirstPaymentDate       0
LastPaymentDate        0
dtype: int64

In [105]:
# Left-join the metadata onto the training rows via the shared ID column,
# then count missing values per column of the merged frame.
train = train.merge(metadata, how='left', on='ID')
train.isnull().sum()

ID                       0
TotalPaymentsHistory     0
FirstTransactionDate     0
LastTransactionDate      0
Target                   0
FirstTransactionYear     0
FirstTransactionMonth    0
LastTransactionYear      0
LastTransactionMonth     0
RegistrationDate         0
Deposit                  0
AccessoryRate            0
PaymentMethod            0
rateTypeEntity           0
RatePerUnit              0
DaysOnDeposit            0
MainApplicantGender      0
Age                      0
Region                   0
Town                     0
Occupation               0
SupplierName             0
Term                     0
TotalContractValue       0
ExpectedTermDate         0
FirstPaymentDate         0
LastPaymentDate          0
dtype: int64

In [107]:
# Show the (rows, columns) dimensions of the train and test frames.
(train.shape, test.shape)

((28007, 27), (9336, 3))