In [1]:
## importing libraries
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [4]:
## reading the files and loading them into dataframes.
## Every competition file lives in the same folder: edit DATA_DIR once to point at a local copy
## instead of touching each read_csv call.
DATA_DIR = 'C:/Users/Amine/Desktop/Standard Bank Tech Impact Challenge Xente credit scoring challenge'
train = pd.read_csv(DATA_DIR + '/Train.csv')
test = pd.read_csv(DATA_DIR + '/Test.csv')
## `sample` and `variabs` are loaded for reference only; the cells shown here never read them again.
sample = pd.read_csv(DATA_DIR + '/sample_submission.csv')
## `mask` supplies the per-customer Value statistics (the *_cus_transac features).
mask = pd.read_csv(DATA_DIR + '/unlinked_masked_final.csv')
variabs = pd.read_csv(DATA_DIR + '/VariableDefinitions.csv')

In [5]:
## Convert the timestamp columns from 'object' to 'datetime'.
## TransactionStartTime and IssuedDateLoan are converted on both frames;
## PaidOnDate and DueDate are converted on train only.
for date_col in ('TransactionStartTime', 'IssuedDateLoan'):
    train[date_col] = pd.to_datetime(train[date_col])
    test[date_col] = pd.to_datetime(test[date_col])
for date_col in ('PaidOnDate', 'DueDate'):
    train[date_col] = pd.to_datetime(train[date_col])

In [6]:
## Pairwise correlation of the numeric columns of train.
## The numeric/bool columns are selected explicitly: recent pandas no longer drops
## string/datetime columns silently inside corr() and raises instead.
train.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Value,Amount,CountryCode,TransactionStatus,AmountLoan,IsFinalPayBack,IsThirdPartyConfirmed,IsDefaulted
Value,1.0,-0.999995,,-0.008597,0.554949,-0.399866,-0.086357,0.33322
Amount,-0.999995,1.0,,0.008536,-0.554698,0.400056,0.086438,-0.333401
CountryCode,,,,,,,,
TransactionStatus,-0.008597,0.008536,,1.0,,,,
AmountLoan,0.554949,-0.554698,,,1.0,-0.04588,-0.195072,0.031049
IsFinalPayBack,-0.399866,0.400056,,,-0.04588,1.0,-0.069143,-0.594954
IsThirdPartyConfirmed,-0.086357,0.086438,,,-0.195072,-0.069143,1.0,0.017547
IsDefaulted,0.33322,-0.333401,,,0.031049,-0.594954,0.017547,1.0


In [7]:
## Placeholder column used to carry information shared by the rows of one transaction.
## It is initialised to 0 on both frames and later overwritten with a per-TransactionId row count.
## Idea not implemented: a 'Sum_Diff_Time_Payments' column summing the delays between
## all payments made on a loan.
for frame in (train, test):
    frame['Number_Of_Split_Payments'] = 0

In [8]:
## creating the feature : number of rows sharing the same TransactionId
## (1 for a transaction that appears once, 2+ when it is spread over several rows).
train_rows_per_transaction = train.groupby('TransactionId')['Number_Of_Split_Payments'].count()
train['Number_Of_Split_Payments'] = train['TransactionId'].map(train_rows_per_transaction)
test_rows_per_transaction = test.groupby('TransactionId')['Number_Of_Split_Payments'].count()
test['Number_Of_Split_Payments'] = test['TransactionId'].map(test_rows_per_transaction)

In [9]:
## Remove every train row belonging to these two transactions.
## NOTE(review): the reason for excluding them is not recorded in the notebook.
excluded_ids = ['TransactionId_703', 'TransactionId_927']
rows_to_remove = train.loc[train['TransactionId'].isin(excluded_ids)].index
train.drop(index=rows_to_remove, inplace=True)

In [10]:
## Keep a single row per TransactionId: the last one in file order
## (assumed to be the latest payment installment).
for frame in (train, test):
    frame.drop_duplicates(subset=['TransactionId'], keep='last', inplace=True)

In [11]:
## Columns removed from both frames before feature engineering.
unused_cols = ['CountryCode', 'CurrencyCode', 'SubscriptionId', 'ProviderId', 'ChannelId']
## 'Currency' is dropped from train only (presumably test has no such column - verify).
train.drop(columns=unused_cols + ['Currency'], inplace=True)
test.drop(columns=unused_cols, inplace=True)

### Feature engineering

In [12]:
## Number of train rows with TransactionStatus == 0 per customer.
## The lookup is built from train only and reused for test.
rejected_per_customer = train[train.TransactionStatus==0].groupby('CustomerId').LoanId.size()
## Customers absent from the lookup have never been rejected, so they get 0.
## The filled Series is assigned back explicitly: calling fillna(inplace=True) on the
## column accessor is a chained in-place update that pandas may apply to a temporary copy.
train['Count_Rejected_Loans'] = train['CustomerId'].map(rejected_per_customer).fillna(0)
test['Count_Rejected_Loans'] = test['CustomerId'].map(rejected_per_customer).fillna(0)

In [13]:
## group train/test together to perform cumulative count
## sort=False pins the column order and silences the FutureWarning pandas emits
## when the concatenated frames do not share identical columns.
all_data = pd.concat((train, test), sort=False).copy()
## Initialize and compute values for the new feature
all_data['Cumulative_Reject'] = 0
is_rejected = all_data.TransactionStatus == 0
## cumcount() numbers each customer's TransactionStatus == 0 rows 0, 1, 2, ... in
## concatenation order (train rows first, then test rows).
## NOTE(review): the first such row of a customer therefore gets 0, the same value as
## rows that are not rejected - confirm this is intended.
all_data.loc[is_rejected, 'Cumulative_Reject'] = all_data[is_rejected].groupby('CustomerId').cumcount()
## Separate all_data into train and test
train1 = all_data[:len(train)]
test1 = all_data[len(train):]
## Assignment aligns on the index, which each slice shares with its source frame.
train['Cumulative_Reject'] = train1['Cumulative_Reject']
test['Cumulative_Reject'] = test1['Cumulative_Reject']

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [14]:
## Per-customer statistics of Value over the train rows with TransactionStatus == 0,
## copied onto every train/test row of that customer.
## Customers without such rows get NaN.
purchasestats = (
    train.loc[train['TransactionStatus'] == 0]
    .groupby('CustomerId')['Value']
    .agg(['mean', 'std', 'min', 'max'])
)
for frame in (train, test):
    for stat in ('mean', 'std', 'max', 'min'):
        frame['prchs_' + stat] = frame['CustomerId'].map(purchasestats[stat])

In [15]:
## Per-customer statistics of Value in `mask`, mapped onto train and test by CustomerId.
## Customers that do not appear in `mask` get NaN in every *_cus_transac column.
valuegroups = mask.groupby('CustomerId').Value.agg(['mean', 'std', 'min', 'max', 'count'])
## 'count' was aggregated but never mapped, although the saved outputs of this notebook
## show a count_cus_transac column; it is mapped here alongside the other statistics.
for stat in ('mean', 'std', 'min', 'max', 'count'):
    stat_col = stat + '_cus_transac'
    train[stat_col] = train['CustomerId'].map(valuegroups[stat])
    test[stat_col] = test['CustomerId'].map(valuegroups[stat])

In [16]:
## Calendar features derived from the transaction timestamp:
## weekday number and day of the month.
for frame in (train, test):
    start_time = frame['TransactionStartTime']
    frame['Day_Of_Week'] = start_time.dt.weekday
    frame['Day_in_month'] = start_time.dt.day

In [17]:
from datetime import date, timedelta

## First and last calendar day covered by the day-number lookup.
datemin = date(2018, 9, 21)
datemax = date(2019, 7, 17)
## Number of days in [datemin, datemax], both ends included (300 for the dates above).
## Deriving it from the two bounds replaces the hard-coded loop limit.
n_days = (datemax - datemin).days + 1
## Lookup table built in one shot: inc_value is 1 for datemin, 2 for the next day, and so on.
datesinc = pd.DataFrame(
    {
        'date': [datemin + timedelta(days=offset) for offset in range(n_days)],
        'inc_value': range(1, n_days + 1),
    },
    columns=['date', 'inc_value'],
)
day_number = datesinc.set_index('date').inc_value
## Transactions dated outside [datemin, datemax] are absent from the lookup and map to NaN.
train['inc_value_date'] = train.TransactionStartTime.dt.date.map(day_number)
test['inc_value_date'] = test.TransactionStartTime.dt.date.map(day_number)

In [18]:
## Store the day number as a 64-bit integer on both frames.
## astype raises if any value is missing, i.e. if a transaction date fell outside the lookup.
for frame in (train, test):
    frame['inc_value_date'] = frame['inc_value_date'].astype(np.int64)

In [19]:
## Per-customer statistics of Value over train rows with TransactionStatus == 1 whose
## TransactionStartTime is earlier than DueDate.
before_due_rows = train[(train.TransactionStatus==1)&(train.TransactionStartTime<train.DueDate)]
## Value is selected BEFORE aggregating: aggregating every column and then picking Value
## is wasteful, and mean/std raise on string and datetime columns in recent pandas.
aa = before_due_rows.groupby('CustomerId').Value.agg(['count', 'mean', 'std', 'min', 'max'])
## aa['count'] is computed but not mapped (a 'number_transac_before_due' feature was left disabled).
## Customers without qualifying rows get NaN in the before_due_* columns.
for stat in ('mean', 'std', 'min', 'max'):
    train['before_due_' + stat] = train['CustomerId'].map(aa[stat])
for stat in ('mean', 'std', 'min', 'max'):
    test['before_due_' + stat] = test['CustomerId'].map(aa[stat])

In [20]:
## Running count, per customer, of train rows whose PaidOnDate is later than DueDate.
train['Cnt_missed_payment'] = 0
paid_late = train.DueDate < train.PaidOnDate
## cumcount() starts at 0, so a customer's first late row keeps the value 0.
## NOTE(review): that makes it indistinguishable from rows that are not late - confirm intended.
train.loc[paid_late, 'Cnt_missed_payment'] = train[paid_late].groupby('CustomerId').cumcount()
## test rows receive the customer's maximum from train. Only the needed column is
## aggregated instead of taking the max of every column of train.
## Customers absent from train get NaN here (train rows default to 0).
max_missed_per_customer = train.groupby('CustomerId').Cnt_missed_payment.max()
test['Cnt_missed_payment'] = test['CustomerId'].map(max_missed_per_customer)

In [21]:
#train['IssuedDateLoan']=pd.to_datetime(train.IssuedDateLoan)
#train['Time_till_first_transac_onloan'] = (train['TransactionStartTime']-train['IssuedDateLoan']).astype('timedelta64[h]')
#train[(train['TransactionStartTime']-train['IssuedDateLoan']).astype('timedelta64[h]')<0]
#train['Factor_Loan']= train

In [22]:
## Keep only the rows that carry a label. The explicit copy makes `train` an independent
## frame, so the column assignments made afterwards do not trigger SettingWithCopyWarning.
train = train[train.IsDefaulted.notnull()].copy()

In [23]:
## Display the column names of `mask` (the frame read from unlinked_masked_final.csv).
mask.columns

Index(['TransactionId', 'BatchId', 'CustomerId', 'CurrencyCode', 'CountryCode',
       'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount',
       'Value', 'TransactionStartTime'],
      dtype='object')

In [24]:
## Display the column names of `test` after the feature-engineering cells above.
test.columns

Index(['CustomerId', 'TransactionStartTime', 'Value', 'Amount',
       'TransactionId', 'BatchId', 'ProductId', 'ProductCategory',
       'TransactionStatus', 'IssuedDateLoan', 'LoanId', 'InvestorId',
       'LoanApplicationId', 'ThirdPartyId', 'Number_Of_Split_Payments',
       'Count_Rejected_Loans', 'Cumulative_Reject', 'prchs_mean', 'prchs_std',
       'prchs_max', 'prchs_min', 'mean_cus_transac', 'std_cus_transac',
       'min_cus_transac', 'max_cus_transac', 'count_cus_transac',
       'Day_Of_Week', 'Day_in_month', 'inc_value_date', 'before_due_mean',
       'before_due_std', 'before_due_min', 'before_due_max',
       'Cnt_missed_payment'],
      dtype='object')

In [25]:
## Display the train rows labelled as defaulted (IsDefaulted == 1).
train.loc[train['IsDefaulted'] == 1]

Unnamed: 0,CustomerId,TransactionStartTime,Value,Amount,TransactionId,BatchId,ProductId,ProductCategory,TransactionStatus,IssuedDateLoan,...,max_cus_transac,count_cus_transac,Day_Of_Week,Day_in_month,inc_value_date,before_due_mean,before_due_std,before_due_min,before_due_max,Cnt_missed_payment
26,CustomerId_474,2018-10-26 09:19:02,10000.0,-10000.0,TransactionId_2299,BatchId_500,ProductId_7,airtime,1,2018-10-26 09:19:00,...,28840.0,20.0,4,26,36,5166.666667,4752.192476,500.0,10000.0,0
37,CustomerId_474,2018-11-01 17:56:50,5000.0,-5000.0,TransactionId_1452,BatchId_728,ProductId_3,airtime,1,2018-11-01 17:56:48,...,28840.0,20.0,3,1,42,5166.666667,4752.192476,500.0,10000.0,1
202,CustomerId_429,2018-11-30 06:56:43,3000.0,-3000.0,TransactionId_358,BatchId_1911,ProductId_3,airtime,1,2018-11-30 06:56:40,...,2000.0,2.0,4,30,71,3000.000000,,3000.0,3000.0,0
221,CustomerId_434,2018-11-30 15:55:38,14000.0,-14000.0,TransactionId_1954,BatchId_74,ProductId_1,airtime,1,2018-11-30 15:55:36,...,10000.0,45.0,4,30,71,14000.000000,,14000.0,14000.0,0
504,CustomerId_503,2018-12-21 04:43:32,14999.0,-14999.0,TransactionId_1855,BatchId_387,ProductId_7,airtime,1,2018-12-21 04:43:30,...,10000.0,47.0,4,21,92,14999.000000,,14999.0,14999.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,CustomerId_92,2019-03-06 07:54:28,192000.0,-192000.0,TransactionId_1971,BatchId_569,ProductId_18,retail,1,2019-03-06 07:54:27,...,,,2,6,167,192000.000000,,192000.0,192000.0,0
1839,CustomerId_91,2019-03-07 08:13:38,192000.0,-192000.0,TransactionId_2330,BatchId_1957,ProductId_18,retail,1,2019-03-07 08:13:38,...,,,3,7,168,192000.000000,,192000.0,192000.0,0
1844,CustomerId_63,2019-03-07 08:13:54,192000.0,-192000.0,TransactionId_1710,BatchId_968,ProductId_18,retail,1,2019-03-07 08:13:52,...,,,3,7,168,192000.000000,,192000.0,192000.0,0
1846,CustomerId_414,2019-03-07 08:14:04,192000.0,-192000.0,TransactionId_2332,BatchId_203,ProductId_18,retail,1,2019-03-07 08:14:03,...,,,3,7,168,192000.000000,,192000.0,192000.0,0


In [26]:
## new_customer is 1 when mean_cus_transac is missing, i.e. the customer was not found
## in `mask` when the *_cus_transac statistics were mapped, and 0 otherwise.
for frame in (train, test):
    frame['new_customer'] = frame['mean_cus_transac'].isnull().astype('int64')

In [28]:
## Columns handed to the encoders and the model.
## A comma was missing after 'new_customer', so Python concatenated it with the next
## literal into the single bogus name 'new_customerinc_value_date'; both are listed now.
## NOTE(review): the list still contains raw id / datetime columns (e.g. TransactionId,
## TransactionStartTime) while the encoders defined below only cover ProductId and
## CustomerId - confirm these are intended model inputs.
features = [
    # raw transaction columns
    'CustomerId', 'TransactionStartTime',
    'Value', 'Amount',
    'TransactionId',
    'BatchId',
    'ProductId',
    # per-customer statistics from `mask`
    'mean_cus_transac', 'std_cus_transac', 'min_cus_transac', 'max_cus_transac',
    'ProductCategory', 'TransactionStatus',
    'IssuedDateLoan',
    'LoanId', 'LoanApplicationId', 'ThirdPartyId',
    # engineered features
    'Number_Of_Split_Payments',
    'Count_Rejected_Loans',
    'Cumulative_Reject',
    'prchs_mean', 'prchs_std',
    'prchs_max', 'prchs_min',
    'InvestorId',
    'Day_Of_Week',
    'Day_in_month', 'new_customer',
    'inc_value_date',
    'before_due_mean', 'before_due_std',
    'before_due_min', 'before_due_max',
    'Cnt_missed_payment',
]

In [61]:
## One-hot encoding for ProductId ('ProductCategory' and 'InvestorId' were considered
## as additional columns but are left out).
oce = ce.OneHotEncoder(cols=['ProductId'])
## Target encoding for CustomerId.
tce = ce.TargetEncoder(
    cols=['CustomerId'],
    min_samples_leaf=5,
    smoothing=40,
)

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
## Label vector (copied so that later edits to train do not affect it) and the
## feature matrices for the labelled and the unlabelled data.
y = train['IsDefaulted'].copy()
X = train[features]
X_test = test[features]

In [64]:
## 80/20 hold-out split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

In [65]:
## Fit both encoders on the training split only (one-hot first, then target encoding),
## then apply the fitted encoders to the validation split.
X_train = tce.fit_transform(oce.fit_transform(X_train), y_train)
X_val = tce.transform(oce.transform(X_val))

In [73]:
## Gradient-boosted classifier; validation quality is reported as ROC AUC on the
## predicted probability of the positive class.
xgb = XGBClassifier(
    objective="binary:logistic",
    n_estimators=100,
    learning_rate=0.25,
    max_depth=3,
    min_child_weight=10,
    colsample_bytree=0.6,
)
xgb.fit(X_train, y_train)
val_proba = xgb.predict_proba(X_val)
val_pred = val_proba[:, 1]
print(roc_auc_score(y_val, val_pred))

0.9894667544437129


In [74]:
## Refit both encoders on the FULL labelled set before the final model fit.
## NOTE: this cell overwrites X, so it must not be re-run without first rebuilding
## X from train[features].
X = oce.fit_transform(X)
## The target encoder must be fitted on (X, y). Fitting it on (X_train, y_train) replaced
## X with the already-encoded 80% split, leaving X and y with different lengths for the
## final xgb.fit(X, y).
X = tce.fit_transform(X, y)
## Apply the refitted encoders to the unlabelled data.
X_test = test[features]
X_test = oce.transform(X_test)
X_test = tce.transform(X_test)

In [75]:
## Refit the classifier on all labelled rows, then keep the predicted probability
## of the positive class for every test row.
xgb.fit(X, y)
test_proba = xgb.predict_proba(X_test)
test_pred = test_proba[:, 1]

In [76]:
## Submission frame: one row per test transaction with its predicted default probability.
## The frame takes its index from test['TransactionId']; test_pred is matched by position.
sample_submission = pd.DataFrame(
    {
        'TransactionId': test['TransactionId'],
        'IsDefaulted': test_pred,
    },
    columns=['TransactionId', 'IsDefaulted'],
)

In [77]:
## Write the submission file without the index column.
SUBMISSION_PATH = 'C:/Users/Amine/Desktop/STB_Submit.csv'
sample_submission.to_csv(SUBMISSION_PATH, index=False)