In [18]:
import boto3
import sagemaker

import pandas as pd
import numpy as np

import pickle

In [19]:
# A single boto3 session backs the region lookup and the SageMaker client below.
session = boto3.session.Session()
region_name = session.region_name
print(region_name)
sagemaker_session = sagemaker.Session()
# Name of the S3 bucket that the later cells read the dataset and pickled models from.
bucket = 'klarnadataset'
# Reuse `session` rather than constructing a second, identical default Session.
smclient = session.client('sagemaker')

us-east-2


In [20]:
# Fetch the raw dataset from the S3 bucket; the file is semicolon-separated.
data_key = 'dataset.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

df = pd.read_csv(data_location, sep=';')
# Keep the column labels of the frame exactly as it was loaded.
columns_original = df.columns

In [1]:
def preprocessing_data(df):
    """Impute, encode and scale a raw dataset frame.

    Steps, in order:
      1. drop four unused columns;
      2. fill / impute missing values column by column, using constants and
         pickled estimators fetched from the S3 bucket named by the
         module-level ``bucket``;
      3. derive ``worst_status`` as the row-wise maximum of ``account_status``
         and the four ``account_worst_status_*`` columns, then drop the latter;
      4. drop ``uuid``, one-hot encode ``merchant_group``, recode
         ``name_in_email`` and ``has_paid``;
      5. apply the pickled ``robust_scaler`` and discard the first column of
         its output.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with the columns referenced below. It is not modified: all
        work happens on a new frame.

    Returns
    -------
    The output of ``robust_scaler.transform`` with its first column removed
    (presumably the ``default`` target column -- TODO confirm against the
    notebook that fitted the scaler).

    Raises
    ------
    AssertionError
        If the KNN-based imputation of ``account_status`` / ``worst_status``
        yields 0.
    """
    # Predictor columns, in the order used when calling the scalers/estimators.
    column_predictors = [
        'account_amount_added_12_24m', 'account_days_in_dc_12_24m',
        'account_days_in_rem_12_24m', 'account_days_in_term_12_24m',
        'age', 'avg_payment_span_0_12m', 'avg_payment_span_0_3m',
        'has_paid', 'max_paid_inv_0_12m',
        'max_paid_inv_0_24m', 'num_active_inv',
        'num_arch_dc_0_12m', 'num_arch_dc_12_24m',
        'num_arch_ok_0_12m', 'num_arch_ok_12_24m',
        'num_arch_rem_0_12m', 'num_unpaid_bills',
        'status_last_archived_0_24m', 'status_2nd_last_archived_0_24m',
        'status_3rd_last_archived_0_24m', 'status_max_archived_0_6_months',
        'status_max_archived_0_12_months', 'status_max_archived_0_24_months',
        'recovery_debt', 'sum_capital_paid_account_0_12m',
        'sum_capital_paid_account_12_24m', 'sum_paid_inv_0_12m',
        'time_hours', 'worst_status_active_inv',
    ]
    # Same list plus the (by then imputed) ratio feature as last column.
    column_predictors_account_status = column_predictors + ['num_active_div_by_paid_inv_0_12m']
    # Same as the account_status list with 'account_status' inserted right
    # after 'max_paid_inv_0_12m' (position 9); the column order is significant.
    split_at = column_predictors.index('max_paid_inv_0_12m') + 1
    column_predictors_worst_status = (
        column_predictors[:split_at]
        + ['account_status']
        + column_predictors[split_at:]
        + ['num_active_div_by_paid_inv_0_12m']
    )

    # Fallback values for account_days_in_rem_12_24m, indexed by
    # account_worst_status_12_24m - 1 (see inputer_account_days_in_rem_12_24m).
    medians = [0.0, 23.0, 73.0, 62.0]
    # Factor applied to avg_payment_span_0_12m when avg_payment_span_0_3m is missing.
    slope = 1.2

    s3 = boto3.resource('s3')

    def load_pickle(key):
        """Download object `key` from the module-level `bucket` and unpickle it."""
        # NOTE(security): pickle.loads can execute arbitrary code while
        # deserialising; this is only acceptable if the bucket is trusted.
        return pickle.loads(s3.Bucket(bucket).Object(key).get()['Body'].read())

    def inputer_account_days_in_rem_12_24m(x, medians):
        """Fill a missing value from `medians`, keyed by account_worst_status_12_24m."""
        if not pd.isnull(x['account_days_in_rem_12_24m']):
            return x['account_days_in_rem_12_24m']
        worst = x['account_worst_status_12_24m']
        if worst == 1:
            return medians[0]
        if worst == 2:
            return medians[1]
        if worst in [3, 4]:
            return (medians[2] + medians[3]) / 2
        # Any other status (including a missing one) falls back to 0.
        return 0

    def inputer_avg_payment_span_0_12m(x, lr_avg_payment):
        """Fill a missing value with the model prediction, floored at 0."""
        if not pd.isnull(x['avg_payment_span_0_12m']):
            return x['avg_payment_span_0_12m']
        features = np.array(x[['status_last_archived_0_24m',
                               'status_max_archived_0_12_months',
                               'status_max_archived_0_24_months',
                               'num_arch_rem_0_12m']]).reshape(1, 4)
        return np.clip(lr_avg_payment.predict(features)[0], 0, None)

    def inputer_worst_status_active_inv(x, scaler, lr_worst_status):
        """Fill a missing value with the model prediction, clipped to [1, 3]."""
        if not pd.isnull(x['worst_status_active_inv']):
            return x['worst_status_active_inv']
        X_data = scaler.transform(np.array(x[['recovery_debt',
                                              'avg_payment_span_0_12m',
                                              'avg_payment_span_0_3m',
                                              'num_arch_rem_0_12m']]).reshape(1, 4))
        return np.clip(lr_worst_status.predict(X_data)[0], 1., 3.)

    def inputer_num_active_div_by_paid_inv_0_12m(x, scaler, knn_div_by_paid):
        """Fill a missing value with the KNN prediction, floored at 0."""
        if not pd.isnull(x['num_active_div_by_paid_inv_0_12m']):
            return x['num_active_div_by_paid_inv_0_12m']
        X_data_normalized = scaler.transform(
            np.array(x[column_predictors]).reshape(1, len(column_predictors)))
        return np.clip(knn_div_by_paid.predict(X_data_normalized)[0], 0., None)

    def inputer_account_status(x, feature, scaler_account, knn_account,
                               predictors=column_predictors_account_status):
        """Fill a missing `feature` with the KNN prediction built from `predictors`."""
        if pd.isnull(x[feature]):
            X_data_normalized = scaler_account.transform(
                np.array(x[predictors]).reshape(1, len(predictors)))
            a = knn_account.predict(X_data_normalized)[0]
        else:
            a = x[feature]
        assert a != 0, 'problem with inputer_account_status'
        return a

    def email_address_format(x):
        """Map the known name_in_email labels to 0/1; other values pass through."""
        if x in ['no_match', 'Nick']:
            return 0  # doesn't reveal personal information
        if x in ['F+L', 'L1+F', 'F', 'F1+L', 'L', 'Initials']:
            return 1  # provides personal information
        return x

    # drop columns -- drop() returns a new frame, so the caller's `df` stays intact
    concat_data = df.drop(columns=['num_arch_written_off_0_12m',
                                   'num_arch_written_off_12_24m',
                                   'account_incoming_debt_vs_paid_0_24m',
                                   'merchant_category'])

    # account_days_in_dc_12_24m & account_days_in_term_12_24m
    # Assign the result back: column-level fillna(inplace=True) is chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    for zero_filled in ('account_days_in_dc_12_24m', 'account_days_in_term_12_24m'):
        concat_data[zero_filled] = concat_data[zero_filled].fillna(value=0)

    # account_days_in_rem_12_24m
    concat_data['account_days_in_rem_12_24m'] = concat_data.apply(
        lambda x: inputer_account_days_in_rem_12_24m(x, medians), axis=1)

    # avg_payment_span_0_12m
    lr_avg_payment = load_pickle("lr_avg_payment.pkl")
    concat_data['avg_payment_span_0_12m'] = concat_data.apply(
        lambda x: inputer_avg_payment_span_0_12m(x, lr_avg_payment), axis=1)

    # avg_payment_span_0_3m
    concat_data['avg_payment_span_0_3m'] = concat_data.apply(
        lambda x: (slope * x['avg_payment_span_0_12m'])
        if pd.isnull(x['avg_payment_span_0_3m']) else x['avg_payment_span_0_3m'],
        axis=1)

    # worst_status_active_inv
    scaler_worst_status_active_inv = load_pickle("scaler_worst_status_active_inv.pkl")
    lr_worst_status_active_inv = load_pickle("lr_worst_status_active_inv.pkl")
    concat_data['worst_status_active_inv'] = concat_data.apply(
        lambda x: inputer_worst_status_active_inv(x,
                                                  scaler_worst_status_active_inv,
                                                  lr_worst_status_active_inv),
        axis=1)

    # num_active_div_by_paid_inv_0_12m
    num_active_dim_scaler = load_pickle("num_active_dim_scaler.pkl")
    knn_div_by_paid = load_pickle("knn_div_by_paid.pkl")
    concat_data['num_active_div_by_paid_inv_0_12m'] = concat_data.apply(
        lambda x: inputer_num_active_div_by_paid_inv_0_12m(x,
                                                           num_active_dim_scaler,
                                                           knn_div_by_paid),
        axis=1)

    # account_status
    feature_name = 'account_status'
    account_status_scaler = load_pickle('account_scaler.pkl')
    account_status_knn = load_pickle("account_status_knn.pkl")
    concat_data[feature_name] = concat_data.apply(
        lambda x: inputer_account_status(x, feature_name,
                                         account_status_scaler,
                                         account_status_knn),
        axis=1)

    # worst_status: new feature
    worst_status_sources = ['account_worst_status_0_3m',
                            'account_worst_status_12_24m',
                            'account_worst_status_3_6m',
                            'account_worst_status_6_12m']
    concat_data['worst_status'] = concat_data[['account_status'] + worst_status_sources].max(axis=1)
    # drop some features
    concat_data = concat_data.drop(columns=worst_status_sources)

    feature_name_ = 'worst_status'
    worst_status_scaler = load_pickle('worst_status_scaler.pkl')
    worst_status_knn = load_pickle("worst_status_knn.pkl")
    # Fix: the result used to be written into 'account_status', clobbering the
    # values imputed just above; it belongs in 'worst_status'.
    # The predictors are now the worst_status list (which contains
    # 'account_status'); presumably the worst_status scaler/KNN were fitted on
    # exactly these columns -- TODO confirm against the training notebook.
    # NOTE(review): if the downstream model was trained on frames produced by
    # the old, clobbering code, it should be retrained on the corrected output.
    concat_data[feature_name_] = concat_data.apply(
        lambda x: inputer_account_status(x, feature_name_,
                                         worst_status_scaler,
                                         worst_status_knn,
                                         predictors=column_predictors_worst_status),
        axis=1)

    # drop uuid
    concat_data = concat_data.drop(columns=['uuid'])

    # dummy variables for merchant_group
    # NOTE(review): the dummy columns depend on the levels present in `df`; a
    # frame lacking a level seen at fit time would yield fewer columns than
    # robust_scaler presumably expects -- verify.
    concat_data = pd.get_dummies(concat_data, columns=["merchant_group"], prefix=["merchant_group"])

    # name_in_email to categorical variables
    concat_data['name_in_email'] = concat_data['name_in_email'].apply(email_address_format)

    # 'has_paid' from boolean to categorical: True -> 0., anything else -> 1.
    # NOTE(review): this inverts the usual 0/1 reading of a boolean; kept as is
    # because the pickled robust_scaler was presumably fitted on this encoding.
    concat_data['has_paid'] = concat_data['has_paid'].apply(lambda paid: 0. if paid == True else 1.)

    # normalize the whole dataset
    robust_scaler = load_pickle('robust_scaler.pkl')
    concat_data = robust_scaler.transform(concat_data)
    # Discard the first column of the scaled output.
    concat_data = concat_data[:, 1:]
    return concat_data

In [81]:
# Rows with a missing 'default' value make up the test set.
test_set = df.loc[df['default'].isna()]

In [82]:
# Hand preprocessing_data a copy so that `test_set` itself is left untouched.
test_set_ = test_set.copy()
# Result of the preprocessing pipeline for the test rows.
clean_test = preprocessing_data(test_set_)