In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

from process_data import ProcessData, DataframeLabelEncoder

In [2]:
card_dev = pd.read_csv('data/card_dev.csv', sep=';')
loan_dev = pd.read_csv('data/loan_dev.csv', sep=';')
trans_dev = pd.read_csv('data/trans_dev.csv', sep=';')

card_comp = pd.read_csv('data/card.csv', sep=';')
loan_comp = pd.read_csv('data/loan_comp.csv', sep=';')
trans_comp = pd.read_csv('data/trans.csv', sep=';')

  trans_dev = pd.read_csv('data/trans_dev.csv', sep=';')


In [3]:
train_pd = ProcessData(card_dev, loan_dev, trans_dev)
train_df = train_pd.transform()


In [4]:
test_pd = ProcessData(card_comp, loan_comp, trans_comp)
test_df = test_pd.transform()

In [5]:
labelEncoder = DataframeLabelEncoder(train_df)
labelEncoder.fit()

In [6]:
train_df = labelEncoder.transform(train_df)
test_df = labelEncoder.transform(test_df)

In [7]:
train_df.to_csv('output/loan_train.csv', index=False)
test_df.to_csv('output/loan_test.csv', index=False)

In [8]:
# These columns will be used as the inputs of the models
input_cols = [
    #'duration_loan', 
    'payments_loan', 
    #'account_district', 
    #'account_frequency', 
    #'account_date', 
    'owner_male',
    #'owner_birthdate',
    #'owner_district', 
    #'owner_card_type', 
    #'owner_card_issued',
    #'disponent_male',
    #'disponent_birthdate',
    #'disponent_district',
    'disponent_card_type', 
    #'disponent_card_issued', 
    #'count_trans_credits',
    #'count_trans_withdrawals', 
    #'count_trans_credit_cash',
    #'count_trans_withdrawal_cash', 
    #'count_trans_withdrawal_card',
    'count_trans_collection_other_bank',
    #'count_trans_remittance_other_bank',
    'count_trans_ksymbol_interest_credited',
    #'count_trans_ksymbol_household',
    #'count_trans_ksymbol_payment_for_statement',
    'count_trans_ksymbol_insurance_payment',
    #'count_trans_ksymbol_sanction_interest_if_negative_balance',
    'count_trans_ksymbol_oldage_pension', 
    'last_trans_balance',
    'mean_trans_balance', 
    #'mean_trans_amount_absolute',
    'mean_trans_amount_credit', 
    #'mean_trans_amount_withdrawal',
    #'mean_trans_amount_signed'
    ]

# The output columns are the genres
output_cols = ['status']

In [9]:
test_df.head()

Unnamed: 0,loan_id,date_loan,duration_loan,payments_loan,status,account_district,account_frequency,account_date,owner_district,owner_card_type,...,last_trans_balance,mean_trans_balance,mean_trans_amount_absolute,mean_trans_amount_credit,mean_trans_amount_withdrawal,mean_trans_amount_signed,owner_male,owner_birthdate,disponent_male,disponent_birthdate
0,5895,970103,60,1566,1,45,1,951009,45,3,...,49548.5,54520.202247,8051.737079,12769.22,5653.016949,556.725843,1,620601,,
1,7122,970104,36,7240,1,22,1,950902,22,3,...,11565.4,31518.182051,6935.733333,9526.772414,5402.261224,148.276923,0,490121,,
2,6173,970108,48,4845,1,14,0,950419,14,3,...,45754.0,40175.6125,7217.385795,15302.486047,4603.406015,259.965341,0,390427,,
3,6142,970121,60,3698,1,9,1,960316,9,3,...,38913.4,44440.912676,6430.809859,11797.642857,4176.74,548.077465,0,430929,1.0,460604.0
4,5358,970121,12,3210,1,42,1,950604,42,3,...,18914.3,20231.313158,2373.266667,2736.026923,2069.016129,122.757895,1,391206,,


In [10]:
def calc_pred(model):
    print(input_cols)
    print(train_df.columns)
    inputs = train_df[input_cols].values
    classes = train_df[output_cols].values
    resclf = model.fit(inputs, classes)
    return resclf.predict_proba(test_df[input_cols].values)[:,1]

In [11]:
def save_model_results(name, results):
    pred = pd.DataFrame(results, columns=['status'])
    sub = pd.concat([test_df.loan_id,pred],axis=1)
    sub.rename(columns={"loan_id": "Id", "status": "Predicted"}, inplace=True)
    sub.set_index('Id',inplace=True)
    sub.to_csv(f'{name}.csv')

### Decision Tree

Best params for DecisionTreeClassifier: 
```json
{
    'splitter': 'best',
    'min_samples_split': 6,
    'min_samples_leaf': 5,
    'max_features': 6,
    'max_depth': 41,
    'criterion': 'gini'
}
```

In [12]:
model = DecisionTreeClassifier(splitter='best', min_samples_split=6, min_samples_leaf=5, max_features=6, max_depth=41, criterion='gini')

In [13]:
res = calc_pred(model)
save_model_results('decision_tree_pred', res)


['payments_loan', 'owner_male', 'disponent_card_type', 'count_trans_collection_other_bank', 'count_trans_ksymbol_interest_credited', 'count_trans_ksymbol_insurance_payment', 'count_trans_ksymbol_oldage_pension', 'last_trans_balance', 'mean_trans_balance', 'mean_trans_amount_credit']
Index(['loan_id', 'date_loan', 'duration_loan', 'payments_loan', 'status',
       'account_district', 'account_frequency', 'account_date',
       'owner_district', 'owner_card_type', 'owner_card_issued',
       'disponent_district', 'disponent_card_type', 'disponent_card_issued',
       'count_trans_credits', 'count_trans_withdrawals',
       'count_trans_credit_cash', 'count_trans_withdrawal_cash',
       'count_trans_withdrawal_card', 'count_trans_collection_other_bank',
       'count_trans_remittance_other_bank',
       'count_trans_ksymbol_interest_credited',
       'count_trans_ksymbol_household',
       'count_trans_ksymbol_payment_for_statement',
       'count_trans_ksymbol_insurance_payment',
      

### Stacking Classifier

No hyperparameters defined yet

In [14]:
#from catboost import CatBoostRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


level0 = list()
level0.append(('logreg', LogisticRegression() ))
level0.append(('forest', DecisionTreeClassifier()))
level0.append(('xgboost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')))
level0.append(('lgbm', LGBMClassifier()))
level0.append(('xtrees', ExtraTreesClassifier()))


level1 = RandomForestClassifier()
clf = StackingClassifier(estimators=level0, final_estimator=level1, cv=4)



In [15]:
res = calc_pred(model)
save_model_results('stack_classifier_pred', res)

['payments_loan', 'owner_male', 'disponent_card_type', 'count_trans_collection_other_bank', 'count_trans_ksymbol_interest_credited', 'count_trans_ksymbol_insurance_payment', 'count_trans_ksymbol_oldage_pension', 'last_trans_balance', 'mean_trans_balance', 'mean_trans_amount_credit']
Index(['loan_id', 'date_loan', 'duration_loan', 'payments_loan', 'status',
       'account_district', 'account_frequency', 'account_date',
       'owner_district', 'owner_card_type', 'owner_card_issued',
       'disponent_district', 'disponent_card_type', 'disponent_card_issued',
       'count_trans_credits', 'count_trans_withdrawals',
       'count_trans_credit_cash', 'count_trans_withdrawal_cash',
       'count_trans_withdrawal_card', 'count_trans_collection_other_bank',
       'count_trans_remittance_other_bank',
       'count_trans_ksymbol_interest_credited',
       'count_trans_ksymbol_household',
       'count_trans_ksymbol_payment_for_statement',
       'count_trans_ksymbol_insurance_payment',
      