In [None]:
import os
import warnings
import numpy as np  
import seaborn as sns
import pandas as pd, os, gc
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, RobustScaler

%matplotlib inline
warnings.filterwarnings('ignore')
from typing import List

# HELPER FUNCTION

In [None]:
class DoubleValidationEncoderNumerical:
    """
    Encoder with validation within
    """
    def __init__(self, cols: List, encoder, folds):
        """
        :param cols: Categorical columns
        :param encoder: Encoder class
        :param folds: Folds to split the data
        """
        self.cols = cols
        self.encoder = encoder
        self.encoders_dict = {}
        self.folds = folds

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        for n_fold, (train_idx, val_idx) in enumerate(self.folds.split(X, y)):
            X_train, X_val = X.loc[train_idx].reset_index(drop=True), X.loc[val_idx].reset_index(drop=True)
            y_train, y_val = y[train_idx], y[val_idx]
            _ = self.encoder.fit_transform(X_train, y_train)

            # transform validation part and get all necessary cols
            val_t = self.encoder.transform(X_val)

            if n_fold == 0:
                cols_representation = np.zeros((X.shape[0], val_t.shape[1]))
            
            self.encoders_dict[n_fold] = self.encoder

            cols_representation[val_idx, :] += val_t.values

        cols_representation = pd.DataFrame(cols_representation, columns=X.columns)

        return cols_representation

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.reset_index(drop=True)

        cols_representation = None

        for encoder in self.encoders_dict.values():
            test_tr = encoder.transform(X)

            if cols_representation is None:
                cols_representation = np.zeros(test_tr.shape)

            cols_representation = cols_representation + test_tr / self.folds.n_splits

        cols_representation = pd.DataFrame(cols_representation, columns=X.columns)
        
        return cols_representation


class FrequencyEncoder:
    def __init__(self, cols):
        self.cols = cols
        self.counts_dict = None

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        counts_dict = {}
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict[col] = dict(zip(values, counts))
        self.counts_dict = counts_dict

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        counts_dict_test = {}
        res = []
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict_test[col] = dict(zip(values, counts))

            # if value is in "train" keys - replace "test" counts with "train" counts
            for k in [key for key in counts_dict_test[col].keys() if key in self.counts_dict[col].keys()]:
                counts_dict_test[col][k] = self.counts_dict[col][k]

            res.append(X[col].map(counts_dict_test[col]).values.reshape(-1, 1))
        res = np.hstack(res)

        X[self.cols] = res
        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        self.fit(X, y)
        X = self.transform(X)
        return X

## Reduce memory usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Parsing Date columns
train = pd.read_csv('Train.csv',parse_dates=['TransactionStartTime','PaidOnDate','DueDate','IssuedDateLoan'])
test = pd.read_csv('Test.csv',parse_dates=['TransactionStartTime','IssuedDateLoan'])
sample =  pd.read_csv('sample_submission.csv')
unlinked_masked_final = pd.read_csv('unlinked_masked_final.csv',parse_dates=['TransactionStartTime'])

In [None]:
print (train['Value'].min() , 'and' , train['Value'].max() ,'loan values')
print (test['Value'].min() , 'and' , test['Value'].max() ,'loan values')

In [5]:
print(train.shape, test.shape, unlinked_masked_final.shape)

(2100, 27) (905, 19) (16327, 12)


In [7]:
#categorical and numerical columns into variables
num_col = train.select_dtypes(include=np.number).columns
cat_col = train.select_dtypes(exclude=np.number).columns

In [8]:
#lets reduce memory usage
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to  0.35 Mb (19.9% reduction)
Mem. usage decreased to  0.11 Mb (13.8% reduction)


In [10]:
#checking for missing value
train.apply(lambda x: sum(x.isnull()), axis =0)

CustomerId                 0
TransactionStartTime       0
Value                      0
Amount                     0
TransactionId              0
BatchId                    0
SubscriptionId             0
CurrencyCode               0
CountryCode                0
ProviderId                 0
ProductId                  0
ProductCategory            0
ChannelId                  0
TransactionStatus          0
IssuedDateLoan           612
AmountLoan               612
Currency                 612
LoanId                   612
PaidOnDate               612
IsFinalPayBack           612
InvestorId               612
DueDate                  614
LoanApplicationId        617
PayBackId                612
ThirdPartyId             614
IsThirdPartyConfirmed    612
IsDefaulted              612
dtype: int64

In [11]:
#checking for missing value
test.apply(lambda x: sum(x.isnull()), axis =0)

CustomerId                0
TransactionStartTime      0
Value                     0
Amount                    0
TransactionId             0
BatchId                   0
SubscriptionId            0
CurrencyCode              0
CountryCode               0
ProviderId                0
ProductId                 0
ProductCategory           0
ChannelId                 0
TransactionStatus         0
IssuedDateLoan          427
LoanId                  427
InvestorId              427
LoanApplicationId       427
ThirdPartyId            427
dtype: int64

#After findings, missing values was as a result of rejected loan or no loan

# TransactionStartTime

In [12]:
#No transaction has a multiple Default state, its either a transaction was paid totally or defaulted totally

# Categorical Encoding

In [13]:
train[cat_col].nunique().sort_values().index

Index(['ChannelId', 'Currency', 'CurrencyCode', 'ProviderId', 'InvestorId',
       'SubscriptionId', 'ProductCategory', 'ProductId', 'CustomerId',
       'DueDate', 'LoanApplicationId', 'IssuedDateLoan', 'LoanId',
       'ThirdPartyId', 'PaidOnDate', 'PayBackId', 'BatchId', 'TransactionId',
       'TransactionStartTime'],
      dtype='object')

In [14]:
for col in ['ChannelId','ProviderId', 'InvestorId',
       'SubscriptionId', 'ProductCategory', 'ProductId', 'CustomerId']:
    train_col = train[col].unique()
    test_col = test[col].unique()
    
    print ('Not in Test ' + col + ' ',[i  for i in train_col if i not in test_col])
    print ('Not in Train ' + col + ' ',[i  for i in test_col  if i not in train_col])
    print('==' * 18)

Not in Test ChannelId  []
Not in Train ChannelId  []
Not in Test ProviderId  []
Not in Train ProviderId  []
Not in Test InvestorId  [nan, 'InvestorId_3']
Not in Train InvestorId  [nan]
Not in Test SubscriptionId  ['SubscriptionId_2', 'SubscriptionId_4', 'SubscriptionId_6']
Not in Train SubscriptionId  ['SubscriptionId_3']
Not in Test ProductCategory  []
Not in Train ProductCategory  ['ticket']
Not in Test ProductId  ['ProductId_16']
Not in Train ProductId  ['ProductId_14', 'ProductId_12', 'ProductId_11']
Not in Test CustomerId  ['CustomerId_305', 'CustomerId_433', 'CustomerId_329', 'CustomerId_405', 'CustomerId_266', 'CustomerId_303', 'CustomerId_144', 'CustomerId_136', 'CustomerId_492', 'CustomerId_125', 'CustomerId_71', 'CustomerId_431', 'CustomerId_1', 'CustomerId_425', 'CustomerId_249', 'CustomerId_493', 'CustomerId_119', 'CustomerId_31', 'CustomerId_453', 'CustomerId_351', 'CustomerId_114', 'CustomerId_357', 'CustomerId_501', 'CustomerId_339', 'CustomerId_82', 'CustomerId_255', 'C

# Frequency Encoding

In [15]:
def encode_FE(train, test, cols, normalize = True, ext_train = None, ext_test= None):
    norm = normalize
    for col in cols:
        if ext_train is None:
            df = pd.concat([train[col],test[col]])
            nm = col+'_FE'
        else:
            df = pd.concat([ext_train[col],ext_test[col]])
            nm = "rejected"+"_"+ col +"_FE" 
        vc = df.value_counts(dropna=True, normalize=norm).to_dict()
        vc[-1] = -1
        train[nm] = train[col].map(vc)
        train[nm] = train[nm].astype('float32')
        test[nm] = test[col].map(vc)
        test[nm] = test[nm].astype('float32')
        train[nm].fillna(0,inplace=True)
        test[nm].fillna(0,inplace=True)
                
        del df; x=gc.collect()
        print(nm,', ',end='')

# Label Encoding

In [16]:
# LABEL ENCODE
def encode_LE(train,test,cols,verbose=True):
    for col in cols:
        df_comb = pd.concat([train[col],test[col]],axis=0)
        df_comb,_ = df_comb.factorize(sort=True)
        nm = col
        if df_comb.max()>32000: 
            train[nm] = df_comb[:len(train)].astype('int32')
            test[nm] = df_comb[len(train):].astype('int32')
        else:
            train[nm] = df_comb[:len(train)].astype('int16')
            test[nm] = df_comb[len(train):].astype('int16')
        del df_comb; x=gc.collect()
        if verbose: print(nm,', ',end='')

# Advanced Encoding

In [17]:
def encode_AG__2(group ,main_columns, aggregations, train_df=train, test_df=test, ext_src=None,
              fillna=True, usena=False):
    # AGGREGATION OF MAIN WITH UID FOR GIVEN STATISTICS
    for main_column in main_columns:  
            for agg_type in aggregations:
                if ext_src is None: 
                    temp_df = pd.concat([train_df[group +[main_column]], test_df[group +[main_column]]])
                    new_col_name = group[0]+"_"+group[1]+"_"+main_column+'_'+agg_type
                                    
                else:
                    temp_df = ext_src.copy()
                    new_col_name = "ext_data"+ "_"+group[0]+"_"+group[1]+"_"+main_column+'_'+agg_type
                    
                if usena: temp_df.loc[temp_df[main_column]==-1,main_column] = np.nan
                temp_df = temp_df.groupby(group)[main_column].agg([agg_type]).reset_index(level=group).rename(
                                                        columns={agg_type: new_col_name})
                
                


                train_df[new_col_name] = pd.merge(train_df, temp_df, on=group, how='left')[new_col_name].astype('float32')
                test_df[new_col_name]  = pd.merge(test_df, temp_df, on=group, how='left')[new_col_name].astype('float32')
                
                if fillna:
                    train_df[new_col_name].fillna(-1,inplace=True)
                    test_df[new_col_name].fillna(-1,inplace=True)
                
                print("'"+new_col_name+"'",', ',end='')
                
# GROUP AGGREGATION MEAN AND STD
# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
def encode_AG(uids ,main_columns, aggregations, train_df=train, test_df=test, ext_src=None,
              fillna=True, usena=False):
    # AGGREGATION OF MAIN WITH UID FOR GIVEN STATISTICS
    for main_column in main_columns:  
        for col in uids:
            for agg_type in aggregations:
                if ext_src is None: 
                    temp_df = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
                    new_col_name = main_column+'_'+col+'_'+agg_type
                                    
                else:
                    temp_df = ext_src.copy()
                    new_col_name = "ext_data"+ "_"+main_column+'_'+col+'_'+agg_type

                if usena: temp_df.loc[temp_df[main_column]==-1,main_column] = np.nan
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_col_name].to_dict()   

                train_df[new_col_name] = train_df[col].map(temp_df).astype('float32')
                test_df[new_col_name]  = test_df[col].map(temp_df).astype('float32')
                
                if fillna:
                    train_df[new_col_name].fillna(-1,inplace=True)
                    test_df[new_col_name].fillna(-1,inplace=True)
                
                print("'"+new_col_name+"'",', ',end='')
                
                
# COMBINE FEATURES
def encode_CB(col1,col2,df1=train,df2=test):
    nm = col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str) 
#     encode_LE(nm,verbose=False)
    print(nm,', ',end='')
    
# GROUP AGGREGATION NUNIQUE
def encode_AG2( uids,main_columns, train_df=train, test_df=test):
    for main_column in main_columns:  
        for col in uids:
            comb = pd.concat([train_df[[col]+[main_column]],test_df[[col]+[main_column]]],axis=0)
            mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
            train_df[col+'_'+main_column+'_ct'] = train_df[col].map(mp).astype('float32')
            test_df[col+'_'+main_column+'_ct'] = test_df[col].map(mp).astype('float32')
            print(col+'_'+main_column+'_ct, ',end='')


In [18]:
# df rejected loan
df_rejected = train[train.IsDefaulted.isnull()]
# remove duplicate Transaction to avoid noisy parameters
df_rejected.drop_duplicates(subset=['CustomerId','TransactionId'], keep='first', inplace=True)

In [19]:
# Test
## Get number of rejected loan for each customer
df_rejected_test = test[test.LoanId.isnull()]
## remove multiple, duplicate Transaction
df_rejected_test.drop_duplicates( subset=['CustomerId','TransactionId'], keep='first', inplace=True)

In [20]:
# Remove rejected loan
cleaned_test = test.iloc[test.IssuedDateLoan[~test.LoanId.isnull()].index]
cleaned_train = train.iloc[train.IssuedDateLoan[~train.LoanId.isnull()].index]

In [21]:
# Get count of defaulters rejected loan
encode_FE(cleaned_train, cleaned_test, ['CustomerId'], normalize=False, ext_train=df_rejected, ext_test=df_rejected_test)
# count of accepted loan
encode_AG2(['CustomerId'] ,  ['TransactionId'], train_df=cleaned_train, test_df=cleaned_test)

rejected_CustomerId_FE , CustomerId_TransactionId_ct, 

In [22]:
# get loan ratio
cleaned_train["rejected_loan_ratio"] = cleaned_train.rejected_CustomerId_FE/(cleaned_train.rejected_CustomerId_FE + cleaned_train.CustomerId_TransactionId_ct)
cleaned_test["rejected_loan_ratio"] = cleaned_test.rejected_CustomerId_FE/(cleaned_test.rejected_CustomerId_FE + cleaned_test.CustomerId_TransactionId_ct)

In [23]:
cleaned_train["rejected_loan_ratio"]

9       0.105263
10      0.105263
11      0.105263
13      0.020000
14      0.020000
          ...   
2095    0.200000
2096    0.200000
2097    0.285714
2098    0.285714
2099    0.086957
Name: rejected_loan_ratio, Length: 1488, dtype: float32

In [24]:
encode_AG(['CustomerId'] ,  ['Value'], ['mean','min','max','std'], train_df=cleaned_train, test_df=cleaned_test, 
              fillna=True, usena=False)
# how many type of product this customers are used to
encode_AG2(['CustomerId'] ,  ['ProductId', 'ProductCategory'], train_df=cleaned_train, test_df=cleaned_test)

'Value_CustomerId_mean' , 'Value_CustomerId_min' , 'Value_CustomerId_max' , 'Value_CustomerId_std' , CustomerId_ProductId_ct, CustomerId_ProductCategory_ct, 

In [25]:
# Get mean/median number of times a customer pays a loan, this will help to know the customers paying ability
#Get count of customer
encode_FE(cleaned_train, cleaned_test, ['CustomerId', 'TransactionId'], normalize=False)
# Get unique number of transaction of customers
encode_AG2(['CustomerId'] ,  ['TransactionId'], train_df=cleaned_train, test_df=cleaned_test)

cleaned_train["meanTransactionPerLoan"] = cleaned_train["CustomerId_FE"]/cleaned_train["CustomerId_TransactionId_ct"]
cleaned_test["meanTransactionPerLoan"] = cleaned_test["CustomerId_FE"]/cleaned_test["CustomerId_TransactionId_ct"]


CustomerId_FE , TransactionId_FE , CustomerId_TransactionId_ct, 

In [26]:
## Divide meanTransac by the current trans
cleaned_train["Value_Mean_Ratio"] = cleaned_train["Value"]/cleaned_train["Value_CustomerId_mean"]
cleaned_test["Value_Mean_Ratio"] = cleaned_test["Value"]/cleaned_test["Value_CustomerId_mean"]

In [27]:
## Subtract meanTransac by the current trans
cleaned_train["Value_Mean_Minus"] = cleaned_train["Value"]-cleaned_train["Value_CustomerId_mean"]
cleaned_test["Value_Mean_Minus"] = cleaned_test["Value"]-cleaned_test["Value_CustomerId_mean"]

In [29]:
#combine product and ca`btegorical(product category)
encode_CB("ProductId","ProductCategory",df1=cleaned_train,df2=cleaned_test)

ProductId_ProductCategory , 

In [30]:
#Get max and min of customer expense per product using previous transaction data
encode_AG(['CustomerId'] ,  ['Value'], ['mean','min','max','std'], train_df=cleaned_train, test_df=cleaned_test, 
              fillna=True, usena=False, ext_src=unlinked_masked_final)

'ext_data_Value_CustomerId_mean' , 'ext_data_Value_CustomerId_min' , 'ext_data_Value_CustomerId_max' , 'ext_data_Value_CustomerId_std' , 

In [31]:
# mean,max,min,std of cost of previous loan per productId
encode_AG__2(['CustomerId',"ProductId"] ,  ['Value'], ['mean','min','max','std'], train_df=cleaned_train, test_df=cleaned_test, 
              fillna=True, usena=False)

# mean,max,min,std of cost of previous loan per productCategory
encode_AG__2(['CustomerId',"ProductCategory"] ,  ['Value'], ['mean','min','max','std'], train_df=cleaned_train, test_df=cleaned_test, 
              fillna=True, usena=False)

'CustomerId_ProductId_Value_mean' , 'CustomerId_ProductId_Value_min' , 'CustomerId_ProductId_Value_max' , 'CustomerId_ProductId_Value_std' , 'CustomerId_ProductCategory_Value_mean' , 'CustomerId_ProductCategory_Value_min' , 'CustomerId_ProductCategory_Value_max' , 'CustomerId_ProductCategory_Value_std' , 

In [32]:
# mean,max,min,std of cost of previous loan per productId
encode_AG__2(['CustomerId',"ProductId"] ,  ['Value'], ['mean','min','max','std'], train_df=cleaned_train, test_df=cleaned_test, 
              fillna=True, usena=False, ext_src=unlinked_masked_final)

# mean,max,min,std of cost of previous loan per productCategory
encode_AG__2(['CustomerId',"ProductCategory"] ,  ['Value'], ['mean','min','max','std'], train_df=cleaned_train, test_df=cleaned_test, 
              fillna=True, usena=False, ext_src=unlinked_masked_final)

'ext_data_CustomerId_ProductId_Value_mean' , 'ext_data_CustomerId_ProductId_Value_min' , 'ext_data_CustomerId_ProductId_Value_max' , 'ext_data_CustomerId_ProductId_Value_std' , 'ext_data_CustomerId_ProductCategory_Value_mean' , 'ext_data_CustomerId_ProductCategory_Value_min' , 'ext_data_CustomerId_ProductCategory_Value_max' , 'ext_data_CustomerId_ProductCategory_Value_std' , 

In [33]:
# Ratio of product_value mean to current value mean
cleaned_train["CustomerId_ProductCategory_Value_Ratio"] = cleaned_train["Value"]/cleaned_train["CustomerId_ProductCategory_Value_mean"]
cleaned_test["CustomerId_ProductCategory_Value_Ratio"] = cleaned_test["Value"]/cleaned_test["CustomerId_ProductCategory_Value_mean"]

# Ratio of product_value mean to current value mean
cleaned_train["CustomerId_ProductId_Value_Ratio"] = cleaned_train["Value"]/cleaned_train["CustomerId_ProductId_Value_mean"]
cleaned_test["CustomerId_ProductId_Value_Ratio"] = cleaned_test["Value"]/cleaned_test["CustomerId_ProductId_Value_mean"]

In [34]:
# Ratio of product_value mean to current value mean
cleaned_train["ext_data_CustomerId_ProductCategory_Value_Ratio"] = cleaned_train["Value"]/cleaned_train["ext_data_CustomerId_ProductCategory_Value_mean"]
cleaned_test["ext_data_CustomerId_ProductCategory_Value_Ratio"] = cleaned_test["Value"]/cleaned_test["ext_data_CustomerId_ProductCategory_Value_mean"]

# Ratio of product_value mean to current value mean
cleaned_train["ext_data_CustomerId_ProductId_Value_Ratio"] = cleaned_train["Value"]/cleaned_train["ext_data_CustomerId_ProductId_Value_mean"]
cleaned_test["ext_data_CustomerId_ProductId_Value_Ratio"] = cleaned_test["Value"]/cleaned_test["ext_data_CustomerId_ProductId_Value_mean"]

In [35]:
#Remove duplicate transaction because they have the same transaction id, and we will be able to track the timeframe for transactions
cleaned_train.drop_duplicates(subset=['CustomerId','TransactionId'], keep='first', inplace=True)

In [36]:
#frequency encode
encode_FE(cleaned_train, cleaned_test, ['CustomerId', 'TransactionId'])
#Label encode customer Id
encode_LE(cleaned_train, cleaned_test, ['CustomerId'])
# label Encode
encode_LE(cleaned_train, cleaned_test,['ProductCategory', 'ProductId','SubscriptionId','InvestorId',"ProductId_ProductCategory"])

CustomerId_FE , TransactionId_FE , CustomerId , ProductCategory , ProductId , SubscriptionId , InvestorId , ProductId_ProductCategory , 

# TIME PREPROCESSING

In [37]:
def date_trans(train, cols):
    for col in cols:
        
# Add features from date time
        attr = ['Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'] +['Hour', 'Minute', 'Second']
        for n in attr: train[col + n] = getattr(train[col].dt, n.lower())
        train[col + 'Elapsed'] = train[col].astype(np.int64) // 10 ** 9
        
def date_trans_due(train, cols):
    for col in cols:
        
# Add features from date time
        attr = ['Day', 'Dayofweek',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
        for n in attr: train[col + n] = getattr(train[col].dt, n.lower())
        train[col + 'Elapsed'] = train[col].astype(np.int64) // 10 ** 9

In [38]:
# fill missing dueDate
cleaned_train.DueDate.fillna(cleaned_train['IssuedDateLoan'] +  pd.to_timedelta(30, unit='d'),inplace=True)

In [39]:
# TransactionTime
date_trans(cleaned_train, ['TransactionStartTime'])
date_trans(cleaned_test, ['TransactionStartTime'])

In [40]:
# Issue Time
cleaned_train['Issue_Trans_Diff'] = (cleaned_train.IssuedDateLoan - cleaned_train.TransactionStartTime).dt.total_seconds()
cleaned_test['Issue_Trans_Diff'] = (cleaned_test.IssuedDateLoan - cleaned_test.TransactionStartTime).dt.total_seconds()

In [41]:
# all Due days are 30 Days, so calculate for test
# Calculate due date in test since not in train
# DueDate
cleaned_test['DueDate'] = cleaned_test['IssuedDateLoan'] +  pd.to_timedelta(30, unit='d')
date_trans_due(cleaned_train, ['DueDate'])
date_trans_due(cleaned_test, ['DueDate'])

In [42]:
#BOOLEAN TO INT
cleaned_train[cleaned_train.select_dtypes(include='bool').columns] = cleaned_train.select_dtypes(include='bool').astype(int)
cleaned_test[cleaned_test.select_dtypes(include='bool').columns] = cleaned_test.select_dtypes(include='bool').astype(int)

# PRINCIPAL COMPONENT ANALYSIS / CLUSTERING

In [43]:
col_start = ['Value', 'Amount', 'ProductId', 'ProductCategory', 'TransactionStatus',
       'InvestorId', 'rejected_CustomerId_FE', 'CustomerId_TransactionId_ct',
       'rejected_loan_ratio', 'Value_CustomerId_mean', 'Value_CustomerId_min',
       'Value_CustomerId_max', 'Value_CustomerId_std',
       'CustomerId_ProductId_ct', 'CustomerId_ProductCategory_ct',
       'CustomerId_FE', 'TransactionId_FE', 'meanTransactionPerLoan',
       'Value_Mean_Ratio', 'Value_Mean_Minus', 'ProductId_ProductCategory',
       'ext_data_Value_CustomerId_mean', 'ext_data_Value_CustomerId_min',
       'ext_data_Value_CustomerId_max', 'ext_data_Value_CustomerId_std',
       'CustomerId_ProductId_Value_mean', 'CustomerId_ProductId_Value_min',
       'CustomerId_ProductId_Value_max', 'CustomerId_ProductId_Value_std',
       'CustomerId_ProductCategory_Value_mean',
       'CustomerId_ProductCategory_Value_min',
       'CustomerId_ProductCategory_Value_max',
       'CustomerId_ProductCategory_Value_std',
       'ext_data_CustomerId_ProductId_Value_mean',
       'ext_data_CustomerId_ProductId_Value_min',
       'ext_data_CustomerId_ProductId_Value_max',
       'ext_data_CustomerId_ProductId_Value_std',
       'ext_data_CustomerId_ProductCategory_Value_mean',
       'ext_data_CustomerId_ProductCategory_Value_min',
       'ext_data_CustomerId_ProductCategory_Value_max',
       'ext_data_CustomerId_ProductCategory_Value_std',
       'CustomerId_ProductCategory_Value_Ratio',
       'CustomerId_ProductId_Value_Ratio',
       'ext_data_CustomerId_ProductCategory_Value_Ratio',
       'ext_data_CustomerId_ProductId_Value_Ratio',
       'TransactionStartTimeMonth', 'TransactionStartTimeWeek',
       'TransactionStartTimeDay', 'TransactionStartTimeDayofweek',
       'TransactionStartTimeDayofyear', 'TransactionStartTimeIs_month_end',
       'TransactionStartTimeIs_month_start',
       'TransactionStartTimeIs_quarter_end',
       'TransactionStartTimeIs_quarter_start',
       'TransactionStartTimeIs_year_end', 'TransactionStartTimeIs_year_start',
       'TransactionStartTimeHour', 'TransactionStartTimeMinute',
       'TransactionStartTimeSecond', 'TransactionStartTimeElapsed',
       'Issue_Trans_Diff', 'DueDateDay', 'DueDateDayofweek',
       'DueDateIs_month_end', 'DueDateIs_month_start', 'DueDateIs_quarter_end',
       'DueDateIs_quarter_start', 'DueDateIs_year_end', 'DueDateIs_year_start',
       'DueDateElapsed']

In [44]:
all_data = pd.concat([cleaned_train, cleaned_test])

In [45]:
#pca
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
# Create a scaler object
sc = StandardScaler()
# Fit the scaler to the features and transform
X_std = sc.fit_transform(all_data[col_start])
# Create a pca object with the 2 components as a parameter
pca = decomposition.PCA(n_components=3)
# Fit the PCA and transform the data
X_std_pca = pca.fit_transform(X_std)

In [46]:
all_data["pc1"] = 0
all_data["pc2"] = 0
all_data["pc3"] = 0
all_data[["pc1","pc2","pc3"]] = X_std_pca

In [47]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=1996).fit(all_data[col_start])
all_data["kmeans"] = kmeans.predict(all_data[col_start])

In [48]:
cleaned_train = all_data[:1153]
cleaned_test = all_data[1153:]

In [49]:
train_col = ['CustomerId', 
             'Value', 
            'SubscriptionId', 
            'ProductId', 
            'ProductCategory',
       
        'IsDefaulted', 
             'rejected_CustomerId_FE',
       'CustomerId_TransactionId_ct',
             'rejected_loan_ratio',
       'Value_CustomerId_mean', 'Value_CustomerId_min', 'Value_CustomerId_max',
       'Value_CustomerId_std', 'CustomerId_ProductId_ct',
       'CustomerId_ProductCategory_ct', 'CustomerId_FE', 'TransactionId_FE',
       'meanTransactionPerLoan', 'Value_Mean_Ratio', 'Value_Mean_Minus',
       'ProductId_ProductCategory', 'ext_data_Value_CustomerId_mean',
       'ext_data_Value_CustomerId_min', 'ext_data_Value_CustomerId_max',
       'ext_data_Value_CustomerId_std', 'CustomerId_ProductId_Value_mean',
       'CustomerId_ProductId_Value_min', 'CustomerId_ProductId_Value_max',
       'CustomerId_ProductId_Value_std',
       'CustomerId_ProductCategory_Value_mean',
       'CustomerId_ProductCategory_Value_min',
       'CustomerId_ProductCategory_Value_max',
       'CustomerId_ProductCategory_Value_std',
       'ext_data_CustomerId_ProductId_Value_mean',
       'ext_data_CustomerId_ProductId_Value_min',
       'ext_data_CustomerId_ProductId_Value_max',
       'ext_data_CustomerId_ProductId_Value_std',
       'ext_data_CustomerId_ProductCategory_Value_mean',
       'ext_data_CustomerId_ProductCategory_Value_min',
       'ext_data_CustomerId_ProductCategory_Value_max',
       'ext_data_CustomerId_ProductCategory_Value_std',
       'CustomerId_ProductCategory_Value_Ratio',
       'CustomerId_ProductId_Value_Ratio',
       'ext_data_CustomerId_ProductCategory_Value_Ratio',
       'ext_data_CustomerId_ProductId_Value_Ratio',
       'TransactionStartTimeMonth', 'TransactionStartTimeWeek',
       'TransactionStartTimeDay', 'TransactionStartTimeDayofweek',
       'TransactionStartTimeDayofyear', 'TransactionStartTimeIs_month_end',
       'TransactionStartTimeIs_month_start',
       'TransactionStartTimeIs_quarter_end',
       'TransactionStartTimeIs_quarter_start',
       'TransactionStartTimeHour', 'TransactionStartTimeMinute',
       'TransactionStartTimeSecond', 'TransactionStartTimeElapsed',
       'Issue_Trans_Diff', 'DueDateDay', 'DueDateDayofweek',
       'DueDateIs_month_end', 'DueDateIs_month_start', 'DueDateIs_quarter_end',
       'DueDateIs_quarter_start', 'DueDateIs_year_end', 'DueDateIs_year_start',
       'DueDateElapsed','pc1', 'pc2', 'pc3', 'kmeans'
            ]
Target_name="IsDefaulted"
not_used_cols=[Target_name,'TransactionStartTime',"TransactionStartTimeMonth"]
features_name=[ f for f in train_col if f not in not_used_cols]

In [50]:
cleaned_train.reset_index(inplace=True)

In [51]:
# There are 6 group fold, so do a train test on the 6 fold
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  auc, roc_auc_score, roc_curve

from sklearn.pipeline import make_pipeline

sc = StandardScaler()

In [52]:
cleaned_train.to_csv('cleaned_train.csv', index=False)
cleaned_test.to_csv('cleaned_test.csv', index=False)

In [None]:
#lets head to modelling