## Preliminaries

Preliminaries for a Bayesian classification example that illustrates the **sheets/cards** family.

In [1]:
# Shell escape: deletes every *.sh file in the working directory.
# NOTE(review): presumably this clears a stale scripts.sh before it is re-downloaded in Colab - confirm.
!rm -rf *.sh

<br>

### Packages

In [2]:
import subprocess

In [3]:
# In Google Colab only: download the project's set-up script, make it executable, and run it.
if 'google.colab' in str(get_ipython()):
    bootstrap = (
        'wget -q https://raw.githubusercontent.com/briefings/credit/develop/scripts.sh',
        'chmod u+x scripts.sh',
        './scripts.sh',
    )
    for command in bootstrap:
        subprocess.run(command, shell=True)

<br>

### Paths

In [4]:
import os
import pathlib
import sys

In [5]:
# Resolve the project root, i.e., `parent`; later cells build the data & warehouse paths from it, hence
# it must be defined in every environment, not only outside Colab.
notebooks = os.getcwd()

if 'google.colab' in str(get_ipython()):
    # NOTE(review): in Colab the project modules are imported without any sys.path change, which
    # suggests that the project root is the working directory - confirm against scripts.sh.
    parent = notebooks
else:
    # Locally, the project root is taken to be the parent of the working directory; it is appended
    # to sys.path so that the project's modules are importable.
    parent = str(pathlib.Path(notebooks).parent)
    if parent not in sys.path:
        # The guard prevents duplicate entries if this cell is re-run
        sys.path.append(parent)

<br>

### Libraries

In [6]:
import logging
import collections

import pandas as pd
import numpy as np

import json

import sklearn.preprocessing

<br>

### Logging

In [7]:
# Root logging set-up: INFO level & above; each record is the message, followed on the next line by a
# timestamp that has millisecond precision.
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s\n%(asctime)s.%(msecs)03d',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# The logger of this notebook
logger = logging.getLogger(__name__)

<br>

### Custom

In [8]:
import config

import credit.src.archetype

<br>

Configurations

In [9]:
# An instance of the project's configuration class; a later cell reads its `numeric` attribute
# for the names of the numeric fields.
configurations = config.Config()

In [10]:
# The directory for the processed data sets, i.e., baseline.csv, natural.csv, modelling.csv.
# exist_ok=True creates the directory if it is missing, without a separate, race-prone, existence check.
warehouse_directory = os.path.join(parent, 'warehouse', 'data')
os.makedirs(warehouse_directory, exist_ok=True)

# The directory for the raw download, i.e., credit.csv
raw_data_directory = os.path.join(parent, 'data')
os.makedirs(raw_data_directory, exist_ok=True)

<br>
<br>

## Data

### Raw

Glossary:

> **e**: existing, **i**: installments, **n**: number, **acc**: account, **chq**: cheque, **emp**: employment, **inc**: income, **res**: residence, **curr**: current, **disp**: disposable



In [11]:
# The names assigned to the columns of the raw, header-less, file; in file order, the label is last.
fields = [
    'e_chq_acc_status',
    'duration_months',
    'credit_history',
    'purpose',
    'credit_amount',
    'savings_acc_class',
    'curr_emp_class',
    'i_rate_by_disp_inc',
    'sex_and_status',
    'other_debtors_class',
    'curr_res_since',
    'property',
    'age_years',
    'other_i_plans',
    'housing',
    'n_e_credits_this_bank',
    'job',
    'n_dependants',
    'telephone',
    'foreign_worker',
    'label',
]

<br>

URL

In [12]:
# The location of the raw data file; it is read as a space-delimited file that has no header row.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

<br>

Read

In [13]:
# Read the space-delimited, header-less, file straight from the URL.
try:
    data = pd.read_csv(filepath_or_buffer=url, sep=' ', header=None, encoding='utf-8')
except OSError as err:
    # Chain with `from` so that the original traceback is preserved; the former `... in err` evaluated a
    # membership test against the exception object, which fails with a TypeError.  An OSError's strerror
    # can be None, hence the fallback to the exception's text.
    raise Exception(err.strerror or str(err)) from err

# The file has no header row, therefore assign the field names.
data.columns = fields

In [14]:
# Log a preview, i.e., the first five records, of the raw data; the logger renders the frame as plain text.
logger.info(data.head())

  e_chq_acc_status  duration_months credit_history purpose  credit_amount  \
0              A11                6            A34     A43           1169   
1              A12               48            A32     A43           5951   
2              A14               12            A34     A46           2096   
3              A11               42            A32     A42           7882   
4              A11               24            A33     A40           4870   

  savings_acc_class curr_emp_class  i_rate_by_disp_inc sex_and_status  \
0               A65            A75                   4            A93   
1               A61            A73                   2            A92   
2               A61            A74                   2            A93   
3               A61            A74                   2            A93   
4               A61            A73                   3            A93   

  other_debtors_class  ...  property age_years  other_i_plans housing  \
0                A101  ..

<br>

Write

In [15]:
# Save a local copy of the raw data, with the field names as its header, and without the index.
data.to_csv(path_or_buf=os.path.join(raw_data_directory, 'credit.csv'), 
            header=True, index=False, encoding='utf-8')

<br>

### Numeric Data

In [16]:
# The numeric fields, as listed by the configurations object.
numeric = data[configurations.numeric]

# DataFrame.info() prints its summary and returns None; call it directly, because logging its
# return value only emits the text 'None'.
numeric.info()

None
2021-06-22 19:28:47.818


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   duration_months        1000 non-null   int64
 1   credit_amount          1000 non-null   int64
 2   i_rate_by_disp_inc     1000 non-null   int64
 3   curr_res_since         1000 non-null   int64
 4   age_years              1000 non-null   int64
 5   n_e_credits_this_bank  1000 non-null   int64
 6   n_dependants           1000 non-null   int64
dtypes: int64(7)
memory usage: 54.8 KB


<br>

### Labels

* 2: unreasonable $\rightarrow$ 0
* 1: reasonable $\rightarrow$ 1

In [17]:
# Modulo 2 maps the raw label 2 -> 0 and 1 -> 1; the resulting field is named 'reasonable'.
labels = data[['label']].mod(2).rename(columns={'label': 'reasonable'})

<br>
<br>

## Encoding

In [18]:
# The project's archetype object provides the description of the categorical fields.
archetype = credit.src.archetype.Archetype()
categories = archetype.categories()

# Log the names of the members of the named tuple
logger.info(str(categories._fields))

('fields', 'arrays', 'dictionary')
2021-06-22 19:28:47.838


In [19]:
# The categorical fields only; these are the inputs of the encoders below.
instances = data[categories.fields]

<br>

### Transformation Functions

In [20]:
def sex(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Derives a binary 'female' indicator from the field 'sex_and_status'.

    :param frame: A frame that includes the field 'sex_and_status'
    :return: A single-column frame named 'female'; 1 if the code is 'A92' or 'A95', 0 otherwise
    """

    # The codes that map to 1 (female); every other value maps to 0 (male)
    is_female = frame['sex_and_status'].isin(['A92', 'A95'])

    return is_female.astype('int64').rename('female').to_frame()
    

<br>

Aside

In [21]:
# The binary 'female' field, derived from the raw 'sex_and_status' codes.
sex_ = sex(frame=data[['sex_and_status']])

<br>
<br>

### Baseline

In [22]:
# The categorical fields with their original codes; the same selection as `instances`.
originals = data[categories.fields]

In [23]:
# Column-wise union: numeric fields, categorical fields (original codes), the female indicator, the label.
baseline = pd.concat((numeric, originals, sex_, labels), axis=1)

In [24]:
# Save the baseline data set in the warehouse directory.
baseline.to_csv(path_or_buf=os.path.join(warehouse_directory, 'baseline.csv'), header=True, index=False, encoding='utf-8')

In [25]:
# Preview: the first five records, transposed so that the fields run down the rows.
baseline.head().T

Unnamed: 0,0,1,2,3,4
duration_months,6,48,12,42,24
credit_amount,1169,5951,2096,7882,4870
i_rate_by_disp_inc,4,2,2,2,3
curr_res_since,4,2,3,4,4
age_years,67,22,49,45,53
n_e_credits_this_bank,2,1,1,1,2
n_dependants,1,1,2,2,2
e_chq_acc_status,A11,A12,A14,A11,A11
credit_history,A34,A32,A34,A32,A33
purpose,A43,A43,A46,A42,A40


<br>

### Natural Numbers

In [26]:
# Ordinal encoding: each categorical code is replaced by its position within the field's array of
# categories.  The builtin `int` is used for dtype because the alias np.int was deprecated in
# NumPy 1.20 and later removed; the alias was the builtin int, hence the outputs are unchanged.
enc = sklearn.preprocessing.OrdinalEncoder(categories=categories.arrays, dtype=int)
ordinals_ = enc.fit_transform(X=instances)

# The encoder returns an array, therefore re-attach the field names.
ordinals = pd.DataFrame(data=ordinals_, columns=categories.fields)

In [27]:
# Column-wise union: numeric fields, ordinal-encoded categorical fields, the female indicator, the label.
natural = pd.concat((numeric, ordinals, sex_, labels), axis=1)

In [28]:
# Save the ordinal-encoded data set in the warehouse directory.
natural.to_csv(path_or_buf=os.path.join(warehouse_directory, 'natural.csv'), header=True, index=False, encoding='utf-8')

In [29]:
# Preview: the first five records, transposed so that the fields run down the rows.
natural.head().T

Unnamed: 0,0,1,2,3,4
duration_months,6,48,12,42,24
credit_amount,1169,5951,2096,7882,4870
i_rate_by_disp_inc,4,2,2,2,3
curr_res_since,4,2,3,4,4
age_years,67,22,49,45,53
n_e_credits_this_bank,2,1,1,1,2
n_dependants,1,1,2,2,2
e_chq_acc_status,0,1,3,0,0
credit_history,4,2,4,2,3
purpose,3,3,6,2,0


<br>

### One Hot

In [30]:
# One-hot encoding of the categorical fields; drop='if_binary' keeps a single column for each
# two-category field.  The builtin `int` replaces the removed alias np.int.  The encoder's default,
# sparse, output is made dense via .toarray() because the name of the dense-output option differs
# between scikit-learn versions (sparse -> sparse_output).
enc = sklearn.preprocessing.OneHotEncoder(categories=categories.arrays, drop='if_binary', dtype=int)

bits_ = enc.fit_transform(X=instances).toarray()

# Newer scikit-learn versions provide get_feature_names_out(); older versions only get_feature_names().
if hasattr(enc, 'get_feature_names_out'):
    names = enc.get_feature_names_out()
else:
    names = enc.get_feature_names()

# Each generated name ends with '_<category code>'; only the code, e.g., A11, is kept as the column name.
columns = [name[(name.rindex('_') + 1):] for name in names]
bits = pd.DataFrame(data=bits_, columns=columns)

In [31]:
# Column-wise union: numeric fields, one-hot encoded categorical fields, the female indicator, the label.
modelling = pd.concat([numeric, bits, sex_, labels], axis=1)

In [32]:
# Save the one-hot encoded data set in the warehouse directory.
modelling.to_csv(path_or_buf=os.path.join(warehouse_directory, 'modelling.csv'), header=True, index=False, encoding='utf-8')

In [33]:
# Preview: the first five records, transposed so that the fields run down the rows.
modelling.head().T

Unnamed: 0,0,1,2,3,4
duration_months,6,48,12,42,24
credit_amount,1169,5951,2096,7882,4870
i_rate_by_disp_inc,4,2,2,2,3
curr_res_since,4,2,3,4,4
age_years,67,22,49,45,53
...,...,...,...,...,...
A174,0,0,0,0,0
A192,1,0,0,0,0
A202,0,0,0,0,0
female,0,1,0,0,0


<br>
<br>

## Fields Help

In [34]:
# A named tuple type that has two members: fields & pairs.
BinaryData = collections.namedtuple('BinaryData', ['fields', 'pairs'])

In [35]:
# The names of the categorical fields.
categories.fields

['e_chq_acc_status',
 'credit_history',
 'purpose',
 'savings_acc_class',
 'curr_emp_class',
 'sex_and_status',
 'other_debtors_class',
 'property',
 'other_i_plans',
 'housing',
 'job',
 'telephone',
 'foreign_worker']

In [36]:
# The category codes of each categorical field; one array per field.
# NOTE(review): the arrays appear to be listed in the order of categories.fields - confirm.
categories.arrays

[array(['A11', 'A12', 'A13', 'A14'], dtype='<U3'),
 array(['A30', 'A31', 'A32', 'A33', 'A34'], dtype='<U3'),
 array(['A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48',
        'A49', 'A410'], dtype='<U4'),
 array(['A61', 'A62', 'A63', 'A64', 'A65'], dtype='<U3'),
 array(['A71', 'A72', 'A73', 'A74', 'A75'], dtype='<U3'),
 array(['A91', 'A92', 'A93', 'A94', 'A95'], dtype='<U3'),
 array(['A101', 'A102', 'A103'], dtype='<U4'),
 array(['A121', 'A122', 'A123', 'A124'], dtype='<U4'),
 array(['A141', 'A142', 'A143'], dtype='<U4'),
 array(['A151', 'A152', 'A153'], dtype='<U4'),
 array(['A171', 'A172', 'A173', 'A174'], dtype='<U4'),
 array(['A191', 'A192'], dtype='<U4'),
 array(['A201', 'A202'], dtype='<U4')]

<br>
<br>

**Note**

All frames have the **7 numeric fields**

* ['duration_months', 'credit_amount', 'i_rate_by_disp_inc', 'curr_res_since', 'age_years', 'n_e_credits_this_bank', 'n_dependants']

In brief

* **baseline, 22 Fields**: the 7 numeric fields, the label field, 1 binary field $\rightarrow$ female, the 13 categorical fields $\rightarrow$<br> ['e_chq_acc_status',
 'credit_history', 'purpose', 'savings_acc_class', 'curr_emp_class', 'sex_and_status',<br> 'other_debtors_class', 'property', 'other_i_plans', 'housing', 'job', 'telephone', 'foreign_worker']<br>**However**, the values of the categorical fields are the archetype/original values; ref. variables zip(categories.fields, categories.arrays)
 


* **natural, 22 Fields**: the 7 numeric fields, the label field, 1 binary field $\rightarrow$ female, the 13 categorical fields $\rightarrow$<br> ['e_chq_acc_status',
 'credit_history', 'purpose', 'savings_acc_class', 'curr_emp_class', 'sex_and_status',<br> 'other_debtors_class', 'property', 'other_i_plans', 'housing', 'job', 'telephone', 'foreign_worker']<br>**However**, the values of the categorical fields are natural numbers that were assigned via Ordinal Encoding of the archetype/original values.



* **modelling, 63 Fields**: the 7 numeric fields, the label field, ...

In [37]:
# Print the column-level summary, i.e., names, non-null counts, data types, of the modelling data set.
modelling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 63 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   duration_months        1000 non-null   int64
 1   credit_amount          1000 non-null   int64
 2   i_rate_by_disp_inc     1000 non-null   int64
 3   curr_res_since         1000 non-null   int64
 4   age_years              1000 non-null   int64
 5   n_e_credits_this_bank  1000 non-null   int64
 6   n_dependants           1000 non-null   int64
 7   A11                    1000 non-null   int32
 8   A12                    1000 non-null   int32
 9   A13                    1000 non-null   int32
 10  A14                    1000 non-null   int32
 11  A30                    1000 non-null   int32
 12  A31                    1000 non-null   int32
 13  A32                    1000 non-null   int32
 14  A33                    1000 non-null   int32
 15  A34                    1000 non-null   

In [38]:
# Preview: the first five records of the modelling data set.
modelling.head()

Unnamed: 0,duration_months,credit_amount,i_rate_by_disp_inc,curr_res_since,age_years,n_e_credits_this_bank,n_dependants,A11,A12,A13,...,A152,A153,A171,A172,A173,A174,A192,A202,female,reasonable
0,6,1169,4,4,67,2,1,1,0,0,...,1,0,0,0,1,0,1,0,0,1
1,48,5951,2,2,22,1,1,0,1,0,...,1,0,0,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,42,7882,2,4,45,1,2,1,0,0,...,0,1,0,0,1,0,0,0,0,1
4,24,4870,3,4,53,2,2,1,0,0,...,0,1,0,0,1,0,0,0,0,0
