# Load imports and data

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import seaborn as sns

In [None]:
# Read the transformed loan development set, then show which columns it provides.
data = pd.read_csv(filepath_or_buffer='output/loan_dev_transformed.csv')
data.columns

In [None]:
# Columns carried from the transformed CSV into the feature-selection steps.
# Entries that are commented out are excluded from the working set.
input_cols = [
    # 'Id',

    # Loan attributes
    'loan_date',
    'loan_duration',
    'loan_payments',
    'Predicted',  # target label; later read back as Y and removed from the model inputs

    # Account and the district the account belongs to
    'account_frequency',
    'account_district_region',
    'account_district_no_inhabitants',
    'account_district_no_municipalities_0_499',
    'account_district_no_municipalities_500_1999',
    'account_district_no_municipalities_2000_9999',
    'account_district_no_municipalities_10000_plus',
    'account_district_no_cities',
    'account_district_ratio_urban_inhabitants',
    'account_district_average_salary',
    'account_district_unemployment_rate_95',
    'account_district_unemployment_rate_96',
    'account_district_no_enterpreneurs_per_1000_inhabitants',
    'account_district_no_crimes_95',
    'account_district_no_crimes_96',

    # Owner card and owner district (district columns currently disabled)
    'owner_card_type',
    # 'owner_district_region',
    # 'owner_district_no_inhabitants',
    # 'owner_district_no_municipalities_0_499',
    # 'owner_district_no_municipalities_500_1999',
    # 'owner_district_no_municipalities_2000_9999',
    # 'owner_district_no_municipalities_10000_plus',
    # 'owner_district_no_cities',
    # 'owner_district_ratio_urban_inhabitants',
    # 'owner_district_average_salary',
    # 'owner_district_unemployment_rate_95',
    # 'owner_district_unemployment_rate_96',
    # 'owner_district_no_enterpreneurs_per_1000_inhabitants',
    # 'owner_district_no_crimes_95',
    # 'owner_district_no_crimes_96',

    # Transaction counts
    'count_trans_credits',
    'count_trans_withdrawals',
    'count_trans_credit_cash',
    'count_trans_withdrawal_cash',
    'count_trans_withdrawal_card',
    'count_trans_collection_other_bank',
    'count_trans_remittance_other_bank',
    'count_trans_ksymbol_interest_credited',
    'count_trans_ksymbol_household',
    'count_trans_ksymbol_payment_for_statement',
    # 'count_trans_ksymbol_insurance_payment',
    'count_trans_ksymbol_sanction_interest_if_negative_balance',
    # 'count_trans_ksymbol_oldage_pension',

    # Transaction balances and amounts
    'last_trans_balance',
    'mean_trans_balance',
    'mean_trans_amount_absolute',
    'mean_trans_amount_credit',
    'mean_trans_amount_withdrawal',
    'mean_trans_amount_signed',

    # Owner / account descriptors
    'owner_male',
    'owner_age',
    'account_age_months',
    'has_disponent',
]

In [None]:
# Restrict the frame to the columns listed in input_cols (in that order).
data = data.loc[:, input_cols]

# Feature selection

In [None]:
# Collect, and print, the name of every column that holds at least one missing value.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and is available in older versions as well.
with_nulls = []
for column_name, has_missing in data.isna().any().items():
    if has_missing:
        with_nulls.append(column_name)
        print(column_name)

In [None]:
def corr_plot(corr):
    """Draw a heatmap of ``corr`` showing only the cells below the diagonal.

    Parameters
    ----------
    corr : array-like
        Matrix to plot, e.g. the result of ``DataFrame.corr()``. The colour
        scale is fixed to the range [-1, 1] and centred on 0.

    A new 11x9 inch figure is created on every call. Nothing is returned.
    """
    # True marks cells to hide: the upper triangle, diagonal included.
    hidden_cells = np.triu(np.ones_like(corr, dtype=bool))

    # Only the axes object is needed; the figure handle is discarded.
    _, axes = plt.subplots(figsize=(11, 9))

    palette = sns.diverging_palette(230, 20, as_cmap=True)

    sns.heatmap(
        corr,
        mask=hidden_cells,
        cmap=palette,
        vmin=-1,
        vmax=1,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=axes,
    )

In [None]:
# Absolute pairwise correlations between the input columns, shown as a heatmap.
cor_matrix = data.loc[:, input_cols].corr().abs()
corr_plot(cor_matrix)

In [None]:
# Order the columns by their absolute correlation with the target, weakest first.
val = cor_matrix.loc[:, 'Predicted'].sort_values()
sorted_columns = val.index.values
sorted_columns

In [None]:
# Keep only the cells strictly above the diagonal; everything else becomes NaN.
# The mask is built with the builtin bool dtype: the np.bool alias was removed
# in NumPy 1.24 and raises AttributeError there.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1))
upper_tri

In [None]:
# Correlation filter: a column is discarded when its absolute correlation with
# any column positioned before it is at least 0.8.
corr = data.corr().abs()
# In the strict upper triangle, entry (i, j) with i < j compares column j with an
# earlier column i, so a column-wise any() flags exactly the columns to discard.
columns = ~(np.triu(corr.to_numpy(), k=1) >= 0.8).any(axis=0)
selected_columns = data.columns[columns]


In [None]:
# Convert the surviving column labels to a plain array and drop the first one.
# NOTE(review): the drop is positional. With 'Id' commented out of input_cols the
# first label is 'loan_date' - confirm that this is the column meant to go.
selected_columns = selected_columns.values[1:]
selected_columns.shape

In [None]:
# Show the column names that remain after the correlation filter and the positional drop.
selected_columns

In [None]:
# Difference between the current frame width and the surviving column count;
# this includes the column removed by position, not only the correlated ones.
removed_count = len(data.columns) - len(selected_columns)
print(f'Removed {removed_count} columns')

In [None]:
# Narrow the frame to the surviving columns and redraw the correlation heatmap.
data = data.loc[:, selected_columns]
corr_plot(data.corr().abs())

In [None]:
# For every remaining column, overlay its distribution among rows with
# Predicted == 0 (green) and rows with Predicted == 1 (red).
grid_cols = 4
# A fixed 6x4 grid makes plt.subplot raise ValueError as soon as more than 24
# columns remain, so the row count grows with the number of columns
# (ceiling division), with 6 rows as the minimum.
grid_rows = max(6, -(-len(data.columns) // grid_cols))

fig = plt.figure(figsize = (20, 25))
for position, column_name in enumerate(data.columns, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.distplot(data[column_name][data['Predicted']==0], color='g', label = 'approved')
    sns.distplot(data[column_name][data['Predicted']==1], color='r', label = 'rejected')
    plt.legend(loc='best')
fig.suptitle('Credit Card Approval Analysis')
fig.tight_layout()
# Leave room at the top so the suptitle does not overlap the first row of plots.
fig.subplots_adjust(top=0.95)
plt.show()

In [None]:
# Target vector and feature matrix as plain NumPy arrays.
Y = data["Predicted"].values
# 'Id' is commented out of input_cols and 'loan_date' is removed earlier by the
# positional drop on selected_columns, so a strict drop raises KeyError here.
# errors="ignore" removes whichever of these columns are still present.
X = data.drop(columns=["Predicted", "Id", "loan_date"], errors="ignore").values

In [None]:

# Recursive feature elimination with cross-validation, scored by ROC AUC.
# max_features='sqrt' makes the tree sample candidate features at random for each
# split, so random_state is fixed to keep the ranking reproducible between runs.
estimator = DecisionTreeClassifier(
    splitter='best',
    min_samples_split=6,
    min_samples_leaf=5,
    max_features='sqrt',
    max_depth=41,
    criterion='gini',
    random_state=42,
)
# NOTE(review): TimeSeriesSplit assumes the rows of X are in chronological order - confirm.
selector = RFECV(estimator, cv=TimeSeriesSplit(), scoring='roc_auc')
selector.fit(X, Y)


In [None]:
# Mean cross-validated score for each candidate number of features.
# RFECV.grid_scores_ was removed in scikit-learn 1.2; cv_results_ holds the same
# information, and 'mean_test_score' gives one value per feature count.
mean_scores = selector.cv_results_['mean_test_score']

plt.figure(figsize=(16, 6))
plt.title('Total features selected versus roc_auc score')
plt.xlabel('Total features selected')
# The selector is scored with roc_auc, not accuracy, so the axis is labelled to match.
plt.ylabel('Mean cross-validated roc_auc')
# x starts at 1 because RFECV's defaults (step=1, min_features_to_select=1) are in use.
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()


In [None]:
# One row per column of X: its position, whether RFECV kept it, and its rank
# (rank 1 marks the selected features).
# The frame is built in a single call: DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every iteration.
df_features = pd.DataFrame({
    'feature': range(X.shape[1]),
    'support': selector.support_,
    'ranking': selector.ranking_,
})

df_features.sort_values(by='ranking').head(10)


In [None]:
# Backward sequential feature selection, fitted on a DataFrame so that
# get_feature_names_out() returns real column names.
# The previous call referenced `clf` and `output_cols`, which are never defined,
# and indexed data[input_cols], which contains the target 'Predicted' and columns
# already removed from `data`. The feature frame is therefore built explicitly.
sfs_feature_cols = data.columns.drop(["Predicted", "Id", "loan_date"], errors="ignore")
# NOTE(review): RFECV above uses TimeSeriesSplit while this uses cv=5 - confirm that is intended.
sfs = SequentialFeatureSelector(estimator, scoring="roc_auc", direction="backward", n_features_to_select='auto', tol=0.05, cv=5, n_jobs=-1)
sfs.fit(data[sfs_feature_cols], data["Predicted"])
sfs.get_feature_names_out()

In [None]:
# Integer positions (within the fitted feature matrix) of the features RFECV kept.
selected_features = selector.get_support(indices=True)
selected_features

In [None]:
# selected_features holds positions within X, which was built from `data` without
# the target (and without 'Id' / 'loan_date' when present). Indexing data.columns
# directly would shift the names and could select 'Predicted' itself, so the
# positions are mapped through the same reduced column list.
rfecv_feature_cols = data.columns.drop(["Predicted", "Id", "loan_date"], errors="ignore")
new_inputs = data[rfecv_feature_cols[selected_features]]
new_inputs.columns