In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For statistics, preprocessing and ML
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Disabling warnings
import warnings
warnings.simplefilter("ignore")

In [2]:
def read_csv(path):
    """The function reads a csv file, converts it to a data frame and returns a copy of the data frame."""
    data = pd.read_csv(path, encoding = 'unicode_escape')
    return data.copy()

previous_application = read_csv("previous_application.csv")
application_train = read_csv("application_train.csv")
application_test = read_csv("application_test.csv")

In [3]:
df_previous_application = previous_application.copy()

In [4]:
df_application_train = application_train.copy()

In [5]:
df_application_test = application_test.copy()

In [6]:
df_previous_application.replace({'DAYS_FIRST_DRAWING':365243,'DAYS_FIRST_DUE':365243,'DAYS_LAST_DUE_1ST_VERSION':365243,
                                   'DAYS_LAST_DUE':365243,'DAYS_TERMINATION':365243}, np.nan, inplace=True)
df_previous_application.AMT_DOWN_PAYMENT.fillna(0, inplace=True)
df_previous_application.RATE_DOWN_PAYMENT.fillna(0, inplace=True)
df_previous_application.NAME_TYPE_SUITE.fillna('Unknown',inplace=True)

In [7]:
df_previous_application.NAME_CONTRACT_TYPE.replace('XNA','Unknown',inplace=True)
df_previous_application.replace({'NAME_CASH_LOAN_PURPOSE':{'XNA':'Unknown', 'XAP':'Unknown'}},inplace=True)
df_previous_application.NAME_PAYMENT_TYPE.replace('XNA','Unknown',inplace=True)
df_previous_application.replace({'CODE_REJECT_REASON':{'XAP':'Not refused', 'XNA':'Other'}},inplace=True)
df_previous_application.NAME_CLIENT_TYPE.replace('XNA','Unknown',inplace=True)
df_previous_application.NAME_GOODS_CATEGORY.replace('XNA','Unknown',inplace=True)
df_previous_application.NAME_PORTFOLIO.replace('XNA',np.nan,inplace=True)
df_previous_application.NAME_PRODUCT_TYPE.replace('XNA','Unknown',inplace=True)
df_previous_application.NAME_SELLER_INDUSTRY.replace('XNA','Unknown',inplace=True)
df_previous_application.replace({'NAME_YIELD_GROUP':{'XNA':0,'low_normal':1,'low_action':1,'middle':3,'high':4}},inplace=True)

In [8]:
total_nan = df_previous_application.isnull().sum().sort_values(ascending = False)
percent_nan = (df_previous_application.isnull().sum()/df_previous_application.isnull().count()*100).sort_values(ascending = False)
missing_previous_data = pd.concat([total_nan, percent_nan], axis=1, keys=['Total_nan', 'Percent_nan'])
missing_previous_data.head(17)

Unnamed: 0,Total_nan,Percent_nan
RATE_INTEREST_PRIVILEGED,1664263,99.643698
RATE_INTEREST_PRIMARY,1664263,99.643698
DAYS_FIRST_DRAWING,1607509,96.245691
DAYS_TERMINATION,898978,53.824121
DAYS_LAST_DUE,884286,52.944473
DAYS_LAST_DUE_1ST_VERSION,766929,45.918008
DAYS_FIRST_DUE,713710,42.73165
NFLAG_INSURED_ON_APPROVAL,673065,40.298129
AMT_GOODS_PRICE,385515,23.081773
AMT_ANNUITY,372235,22.286665


In [9]:
df_previous_application.drop(labels=['RATE_INTEREST_PRIVILEGED','RATE_INTEREST_PRIMARY','DAYS_FIRST_DRAWING'], 
                             axis=1, inplace=True)

In [10]:
df_previous_application['PREVIOUS_TERM'] = df_previous_application['AMT_CREDIT'] / df_previous_application['AMT_ANNUITY']

In [11]:
df_previous_application['INTEREST'] = df_previous_application['CNT_PAYMENT']*df_previous_application['AMT_ANNUITY'] - df_previous_application['AMT_CREDIT']

In [12]:
df_previous_application.replace([np.inf, -np.inf], np.nan,inplace=True)

### NaN imputation
- We need to fill NaN values to use LGBM Regressor. 
- We can use YCImpute package to impute NaN values. EM algorithm gives result in a shorter time. KNN is time consuming and Iterforest gives error. 
- A function is formed for imputation. We need to convert the dataframe into an array and the values become float. We add lines to keep the datatypes same as before the method.

In [13]:
from ycimpute.imputer import iterforest
from ycimpute.imputer import EM
from ycimpute.imputer import knnimput

def nan_imputer(df):
    int_columns = df.select_dtypes(include='int64').columns
    cat_columns = df.select_dtypes(include='uint8').columns
    var_names = df.columns
    np_df = np.array(df)
    df = EM().complete(np_df)
    df = pd.DataFrame(df, columns = var_names)
    df[int_columns] = df[int_columns].astype('int64')
    df[cat_columns] = df[cat_columns].astype('uint8')
    return df
num_columns = df_previous_application.select_dtypes(exclude='object').columns
df_previous_application[num_columns] = nan_imputer(df_previous_application[num_columns])

## LGBM Model
- We use AMT_CREDIT,AMT_ANNUITY, PREVIOUS_TERM features to predict CNT_PAYMENT.
- We build a model with LGBM in order to predict CNT_PAYMENT in application dataframes, because we do not have this feature in these dataframes. Then, we can create new features for interest rates. 
- This approach is taken from the kaggle write-up of @kingychiu. 
- https://www.kaggle.com/c/home-credit-default-risk/discussion/64598

In [14]:
X = df_previous_application[['AMT_CREDIT','AMT_ANNUITY','PREVIOUS_TERM']]
y = df_previous_application['CNT_PAYMENT']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42)

In [16]:
lgbm_model = LGBMRegressor().fit(X_train, y_train)

In [17]:
y_pred = lgbm_model.predict(X_test, 
                            num_iteration = lgbm_model.best_iteration_)

In [18]:
np.sqrt(mean_squared_error(y_test, y_pred))

2.3175438628412164

In [19]:
y_pred

array([ 6.00707997, 11.44419284, 12.08686038, ..., 12.1480679 ,
        0.02669369,  0.02669369])

### Predicting CNT_PAYMENT
- We use AMT_CREDIT, AMT_ANNUITY and AMT_CREDIT/AMT_ANNUITY in application train and test dataframes to predict CNT_PAYMENT through the model constructed with the use of previous application data set. 
- The interest calculations are taken from the kaggle write-up of @kingychiu. 
- https://www.kaggle.com/c/home-credit-default-risk/discussion/64598

In [20]:
df_application_train['CREDIT_TERM'] = df_application_train['AMT_CREDIT'] / df_application_train['AMT_ANNUITY']
df_application_test['CREDIT_TERM'] = df_application_test['AMT_CREDIT'] / df_application_test['AMT_ANNUITY']

In [21]:
df_application_train.replace([np.inf, -np.inf], np.nan,inplace=True)
df_application_test.replace([np.inf, -np.inf], np.nan,inplace=True)

In [22]:
X_app_train = df_application_train[['AMT_CREDIT','AMT_ANNUITY','CREDIT_TERM']]
y_pred_train = lgbm_model.predict(X_app_train, 
                            num_iteration = lgbm_model.best_iteration_)

In [23]:
X_app_test = df_application_test[['AMT_CREDIT','AMT_ANNUITY','CREDIT_TERM']]
y_pred_test = lgbm_model.predict(X_app_test, 
                            num_iteration = lgbm_model.best_iteration_)

In [24]:
cnt_payment_train = pd.DataFrame(y_pred_train)
cnt_payment_test = pd.DataFrame(y_pred_test)

In [25]:
# cnt_payment_train.to_csv('train_cnt_payment.csv', index=False)
# cnt_payment_test.to_csv('test_cnt_payment.csv', index=False)

In [None]:
# display(df_previous_application.head())
# display(df_previous_application.info())
# display(df_previous_application.isnull().sum())

In [None]:
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")
# pd.get_option("display.max_rows")