# Load imports and data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PowerTransformer, Binarizer


In [None]:
# Three data sets are loaded: the development set, the competition set and
# the full loan table.
# NOTE: the name `all` shadows the builtin all(); it is kept because the
# transform helper and the save cell refer to the frame by this name.
dev = pd.read_csv('output/loan_dev.csv')
# dev = pd.read_csv('output/loan_dev_without_outliers.csv')
comp = pd.read_csv('output/loan_comp.csv')
all = pd.read_csv('output/loan.csv')

# Cell output: the column names of the development set.
dev.columns.tolist()


# Transformation Utils

In [None]:
def show_hist(columns, trend=True):
    """Draw one histogram per column of the module-level ``dev`` frame.

    Args:
        columns: Iterable of column names to look up in ``dev``.
        trend: Forwarded to seaborn's ``histplot`` as its ``kde`` argument.

    ``plt.show()`` is called after every column, so each histogram is
    rendered as its own figure.
    """
    for name in columns:
        series = dev[name]
        sns.histplot(series, kde=trend)
        plt.show()


def transform(columns, scaler):
    """Fit ``scaler`` on ``dev`` and overwrite ``columns`` in every frame.

    Args:
        columns: List of column names to transform.
        scaler: Object exposing ``fit`` and ``transform`` (the notebook
            passes scikit-learn preprocessing transformers).

    The transformer is fitted on the ``dev`` columns only; the fitted
    object is then used to replace the same columns in the module-level
    ``dev``, ``comp`` and ``all`` frames in place.
    """
    fitted = scaler.fit(dev[columns])
    # `all` here is the module-level DataFrame, not the builtin.
    for frame in (dev, comp, all):
        frame[columns] = fitted.transform(frame[columns])


# Account & Owner
#### (most account and owner distributions are exactly the same)

In [None]:
# Cell output: every dev column whose name starts with 'account' or 'owner'.
[col for col in dev if col.startswith(('account', 'owner'))]


In [None]:
# Attribute suffixes (appended to 'account' / 'owner') that are later passed
# through PowerTransformer.
log_attrs = ['_district_no_inhabitants']


# Inspect the untransformed distribution; only the account variant is plotted.
show_hist([f'account{attr}' for attr in log_attrs])


In [None]:
# Expand every suffix into its account column first, then its owner column.
log_cols = [f'{subject}{attr}'
            for subject in ('account', 'owner')
            for attr in log_attrs]

transform(log_cols, PowerTransformer())

# Plot the account variant again to inspect the result of the transform.
show_hist([f'account{attr}' for attr in log_attrs])


In [None]:
# Attribute suffixes (appended to 'account' / 'owner') that are later passed
# through StandardScaler.
norm_attrs = [
    '_district_no_municipalities_0_499',
    '_district_no_municipalities_500_1999',
    '_district_no_municipalities_2000_9999',
    '_district_no_cities',
    '_district_ratio_urban_inhabitants',
    '_district_average_salary',
    '_district_unemployment_rate_95',
    '_district_unemployment_rate_96',
    '_district_no_enterpreneurs_per_1000_inhabitants',
    '_district_no_crimes_95',
    '_district_no_crimes_96',
]

# The age columns do not share a suffix ('account_age_months' vs 'owner_age'),
# so they are listed by their full names instead of going through norm_attrs.
show_hist([f'account{attr}' for attr in norm_attrs]
          + ['account_age_months', 'owner_age'])


In [None]:
# Expand every suffix into its account column first, then its owner column.
norm_cols = [f'{subject}{attr}'
             for subject in ('account', 'owner')
             for attr in norm_attrs]

# The two age columns have subject-specific names, so add them explicitly.
norm_cols += ['account_age_months', 'owner_age']

transform(norm_cols, StandardScaler())

# Plot the account variants and both age columns after standardization.
show_hist([f'account{attr}' for attr in norm_attrs]
          + ['account_age_months', 'owner_age'])


# Transactions

In [None]:
# Loan and transaction feature columns inspected in this section.
trans_cols = [
    'loan_payments',
    # Transaction counts.
    'count_trans_credits',
    'count_trans_withdrawals',
    'count_trans_credit_cash',
    'count_trans_withdrawal_cash',
    'count_trans_withdrawal_card',
    'count_trans_collection_other_bank',
    'count_trans_remittance_other_bank',
    'count_trans_ksymbol_interest_credited',
    'count_trans_ksymbol_household',
    'count_trans_ksymbol_payment_for_statement',
    'count_trans_ksymbol_sanction_interest_if_negative_balance',
    # Balances.
    'last_trans_balance',
    'mean_trans_balance',
    # Mean transaction amounts.
    'mean_trans_amount_absolute',
    'mean_trans_amount_credit',
    'mean_trans_amount_withdrawal',
    'mean_trans_amount_signed',
]

show_hist(trans_cols)


In [None]:
# Transaction columns passed through PowerTransformer. Despite the `log_`
# name this is PowerTransformer() with its default settings, not a plain
# logarithm.
log_cols = [
    'count_trans_credits',
    'count_trans_withdrawals',
    'count_trans_withdrawal_cash',
    'count_trans_ksymbol_interest_credited',
    'last_trans_balance',
    'mean_trans_amount_absolute',
    'mean_trans_amount_withdrawal',
    'mean_trans_amount_signed',
]

transform(log_cols, PowerTransformer())

# Inspect the distributions after the transform.
show_hist(log_cols)


In [None]:
# Count columns passed through Binarizer() with its default threshold.
bin_cols = [
    'count_trans_collection_other_bank',
    'count_trans_remittance_other_bank',
    'count_trans_ksymbol_household',
    'count_trans_ksymbol_sanction_interest_if_negative_balance',
]

transform(bin_cols, Binarizer())

# The density curve is switched off for these plots.
show_hist(bin_cols, trend=False)


In [None]:
# Standardize every remaining transaction column: anything already handled
# via log_cols or bin_cols is skipped, and 'count_trans_withdrawal_card' is
# excluded explicitly, so no transform in this section touches it.
norm_cols = [
    col for col in trans_cols
    if not (col in log_cols
            or col in bin_cols
            or col == 'count_trans_withdrawal_card')
]

transform(norm_cols, StandardScaler())

# Inspect the distributions after standardization.
show_hist(norm_cols)


# Save data

In [None]:
# Write the three transformed frames next to their inputs, without the index.
dev.to_csv('output/loan_dev_transformed.csv', index=False)
# Alternative target for a run where dev was loaded from
# 'output/loan_dev_without_outliers.csv'.
# NOTE(review): this commented-out line previously called pd.read_csv on the
# path below, which looks like a copy-paste slip from the load cell — confirm.
# dev.to_csv('output/loan_dev_without_outliers_transformed.csv', index=False)
comp.to_csv('output/loan_comp_transformed.csv', index=False)
all.to_csv('output/loan_transformed.csv', index=False)