## Preliminaries

Data preparation for a Bayesian classification example that illustrates the **sheets/cards** family.

In [1]:
# Shell escape: force-removes every *.sh file in the working directory
# (presumably a stale scripts.sh from an earlier run - TODO confirm).
!rm -rf *.sh

<br>

### Packages

In [2]:
import subprocess

In [3]:
# Google Colab only: download the repository's set-up script, make it executable, and run it.
# check=True makes a failed step raise subprocess.CalledProcessError in this cell, instead
# of the failure surfacing later as an unrelated import error.
# NOTE(review): this executes a script fetched from a mutable branch (develop); consider
# pinning the URL to a commit.
if 'google.colab' in str(get_ipython()):
    subprocess.run('wget -q https://raw.githubusercontent.com/exhypotheses/risk/develop/scripts.sh', shell=True, check=True)
    subprocess.run('chmod u+x scripts.sh', shell=True, check=True)
    subprocess.run('./scripts.sh', shell=True, check=True)

<br>

### Paths

In [4]:
import os
import pathlib
import sys

In [5]:
# Outside Google Colab: `parent` is the directory above the working directory, and it is
# made importable.  Within Google Colab: `parent` is the working directory itself.
if 'google.colab' not in str(get_ipython()):
    notebooks = os.getcwd()
    parent = str(pathlib.Path(notebooks).parent)
    # Guarded, so that re-running this cell does not add duplicate entries to sys.path
    if parent not in sys.path:
        sys.path.append(parent)
else:
    parent = os.getcwd()

<br>

### Libraries

In [6]:
import collections
import io
import json
import logging

import numpy as np
import pandas as pd
import sklearn.preprocessing

<br>

### Logging

In [7]:
# Each record: the message, then a time stamp (millisecond precision) on the next line
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s\n%(asctime)s.%(msecs)03d',
    datefmt='%Y-%m-%d %H:%M:%S'
)

logger = logging.getLogger(__name__)

<br>

### Custom

In [8]:
import config

import risk.src.archetype

<br>

Configurations

In [9]:
# Project settings object; this notebook reads only `configurations.numeric`, which is
# used further down to select the numeric columns of the data.
configurations = config.Config()

In [10]:
# Directory that receives the prepared data sets and the fields dictionary.
# exist_ok=True: creating an already existing directory is not an error, and there
# is no gap between an existence check and the creation.
warehouse_directory = os.path.join(parent, 'warehouse', 'data')
os.makedirs(warehouse_directory, exist_ok=True)

# Directory that receives the raw download
raw_data_directory = os.path.join(parent, 'data')
os.makedirs(raw_data_directory, exist_ok=True)

<br>
<br>

## Data

### Raw

Glossary:

> **e**: existing, **i**: installments, **n**: number, **acc**: account, **chq**: cheque, **emp**: employment, **inc**: income, **res**: residence, **curr**: current, **disp**: disposable



In [11]:
# Column names for the header-less raw file, in file order; the last column is the label.
fields = [
    'e_chq_acc_status', 'duration_months', 'credit_history',
    'purpose', 'credit_amount', 'savings_acc_class',
    'curr_emp_class', 'i_rate_by_disp_inc', 'sex_and_status',
    'other_debtors_class', 'curr_res_since', 'property',
    'age_years', 'other_i_plans', 'housing',
    'n_e_credits_this_bank', 'job', 'n_dependants',
    'telephone', 'foreign_worker', 'label'
]

<br>

URL

In [12]:
# Location of the raw german.data file within the UCI machine learning repository
url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       'statlog/german/german.data')

<br>

Read

In [13]:
# Read the space-delimited, header-less file directly from the URL
try:
    data = pd.read_csv(filepath_or_buffer=url, sep=' ', header=None, encoding='utf-8')
except OSError as err:
    # `from err` chains the original error.  The former `... in err` was a membership
    # test on the exception object, which itself fails with a TypeError and hides the cause.
    # strerror can be None, hence the fall back to the error's text.
    raise Exception(err.strerror or str(err)) from err

# The file has no header row, hence the explicit column names
data.columns = fields

In [14]:
# A glance at the first few records, via the logger
logger.info('%s', data.head())

  e_chq_acc_status  duration_months  ... foreign_worker label
0              A11                6  ...           A201     1
1              A12               48  ...           A201     2
2              A14               12  ...           A201     1
3              A11               42  ...           A201     1
4              A11               24  ...           A201     2

[5 rows x 21 columns]
2021-07-09 18:37:09.100


<br>

Write

In [15]:
# Keep a local copy of the named, but otherwise untouched, raw data
raw_file = os.path.join(raw_data_directory, 'credit.csv')
data.to_csv(path_or_buf=raw_file, header=True, index=False, encoding='utf-8')

<br>

### Numeric Data

In [16]:
# The numeric fields only
numeric = data[configurations.numeric]

# DataFrame.info() prints its summary and returns None, therefore logging its return
# value records just "None".  Capture the summary in a text buffer, then log the text.
summary = io.StringIO()
numeric.info(buf=summary)
logger.info(summary.getvalue())

None
2021-07-09 18:37:09.186


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   duration_months        1000 non-null   int64
 1   credit_amount          1000 non-null   int64
 2   i_rate_by_disp_inc     1000 non-null   int64
 3   curr_res_since         1000 non-null   int64
 4   age_years              1000 non-null   int64
 5   n_e_credits_this_bank  1000 non-null   int64
 6   n_dependants           1000 non-null   int64
dtypes: int64(7)
memory usage: 54.8 KB


<br>

### Labels

* 2: unreasonable $\rightarrow$ 0
* 1: reasonable $\rightarrow$ 1

In [17]:
# label modulo 2: 1 (reasonable) -> 1, 2 (unreasonable) -> 0; the field is then named `reasonable`
labels = data[['label']].mod(2)
labels = labels.rename(columns={'label': 'reasonable'})

NumExpr defaulting to 2 threads.
2021-07-09 18:37:09.222


<br>
<br>

## Encoding

In [18]:
# The project's description of the categorical fields; the object returned by
# categories() exposes `_fields` (presumably a named tuple - TODO confirm), and
# its `fields` & `arrays` members are used below.
archetype = risk.src.archetype.Archetype()
categories = archetype.categories()

logger.info(f'{categories._fields}')

('fields', 'arrays', 'dictionary')
2021-07-09 18:37:09.235


In [19]:
# The categorical columns of the raw data, selected via categories.fields; this is the
# input of the ordinal and one-hot encoders further down.
instances = data[categories.fields]

<br>

### Transformation Functions

In [20]:
def sex(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Derives a binary sex indicator from the combined sex & status codes.

    :param frame: A data frame that includes the field `sex_and_status`
    :return: A single-column data frame, column name `female`, that shares the index
             of `frame`; 1 -> female (codes A92 or A95), 0 -> male (any other value)
    """

    # Vectorised membership test; replaces a row-wise lambda
    is_female = frame['sex_and_status'].isin(['A92', 'A95'])

    return is_female.astype('int64').rename('female').to_frame()
    

<br>

Aside

In [21]:
# The `female` indicator column, derived from the raw `sex_and_status` codes; it is
# appended to each of the three data sets written below.
sex_ = sex(frame=data[['sex_and_status']])

<br>
<br>

### Baseline

In [22]:
# Baseline: the numeric fields, the categorical fields with their original codes,
# the female indicator, and the target.
originals = data[categories.fields]
baseline = pd.concat([numeric, originals, sex_, labels], axis=1)

baseline_file = os.path.join(warehouse_directory, 'baseline.csv')
baseline.to_csv(path_or_buf=baseline_file, header=True, index=False, encoding='utf-8')

baseline.head().T

Unnamed: 0,0,1,2,3,4
duration_months,6,48,12,42,24
credit_amount,1169,5951,2096,7882,4870
i_rate_by_disp_inc,4,2,2,2,3
curr_res_since,4,2,3,4,4
age_years,67,22,49,45,53
n_e_credits_this_bank,2,1,1,1,2
n_dependants,1,1,2,2,2
e_chq_acc_status,A11,A12,A14,A11,A11
credit_history,A34,A32,A34,A32,A33
purpose,A43,A43,A46,A42,A40


<br>

### Natural Numbers

In [23]:
# Ordinal encoding: each category is replaced by its position within the field's array of
# categories.  dtype is the built-in int; the alias `np.int` was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
enc = sklearn.preprocessing.OrdinalEncoder(categories=categories.arrays, dtype=int)
ordinals_ = enc.fit_transform(X=instances)
ordinals = pd.DataFrame(data=ordinals_, columns=categories.fields)

In [24]:
# Natural: the same layout as the baseline, but with ordinal codes in place of the
# original category codes.
natural = pd.concat([numeric, ordinals, sex_, labels], axis=1)

natural_file = os.path.join(warehouse_directory, 'natural.csv')
natural.to_csv(path_or_buf=natural_file, header=True, index=False, encoding='utf-8')

natural.head().T

Unnamed: 0,0,1,2,3,4
duration_months,6,48,12,42,24
credit_amount,1169,5951,2096,7882,4870
i_rate_by_disp_inc,4,2,2,2,3
curr_res_since,4,2,3,4,4
age_years,67,22,49,45,53
n_e_credits_this_bank,2,1,1,1,2
n_dependants,1,1,2,2,2
e_chq_acc_status,0,1,3,0,0
credit_history,4,2,4,2,3
purpose,3,3,6,2,0


<br>

### One Hot

In [25]:
# One hot encoding; a binary field is reduced to a single column (drop='if_binary').
# dtype is the built-in int because the alias `np.int` no longer exists in NumPy >= 1.24.
# The `sparse` argument was renamed in later scikit-learn releases, therefore the
# encoder's default sparse output is kept and converted to a dense array instead.
enc = sklearn.preprocessing.OneHotEncoder(categories=categories.arrays, drop='if_binary', dtype=int)

bits_ = enc.fit_transform(X=instances).toarray()

# get_feature_names() was replaced by get_feature_names_out(); use whichever exists
if hasattr(enc, 'get_feature_names_out'):
    names = enc.get_feature_names_out()
else:
    names = enc.get_feature_names()

# Keep only the category code, i.e., the text after the last underscore
columns = [name[(name.rindex('_') + 1):] for name in names]
bits = pd.DataFrame(data=bits_, columns=columns)

In [26]:
# Modelling: the numeric fields, the one hot encoded categorical fields, the female
# indicator, and the target.
modelling = pd.concat((numeric, bits, sex_, labels), axis=1)

modelling_file = os.path.join(warehouse_directory, 'modelling.csv')
modelling.to_csv(path_or_buf=modelling_file, header=True, index=False, encoding='utf-8')

modelling.head().T

Unnamed: 0,0,1,2,3,4
duration_months,6,48,12,42,24
credit_amount,1169,5951,2096,7882,4870
i_rate_by_disp_inc,4,2,2,2,3
curr_res_since,4,2,3,4,4
age_years,67,22,49,45,53
...,...,...,...,...,...
A174,0,0,0,0,0
A192,1,0,0,0,0
A201,1,1,1,1,1
female,0,1,0,0,0


<br>
<br>

## Fields Help

**Note**

All frames have

* **7 numeric fields**: ['duration_months', 'credit_amount', 'i_rate_by_disp_inc', 'curr_res_since', 'age_years', 'n_e_credits_this_bank', 'n_dependants']
* **1 label field**: reasonable

Additionally

* **baseline**:  Has 1 binary field and 13 categorical fields.  The values of the categorical fields are the archetype/original values; ref. variables zip(categories.fields, categories.arrays).  [22 fields altogether]

* **natural**:  Has 1 binary field and 13 categorical fields.  The values of the categorical fields are natural numbers that were assigned via Ordinal Encoding of the archetype/original values.  [22 fields altogether]

* **modelling**:  Has 3 binary fields and 52 categorical fields.  The values of the categorical fields are the One Hot Encodings of the archetype/original values.  [63 fields altogether]

The archetype/original categorical fields, and their categories, are

* {'e_chq_acc_status': ['A11', 'A12', 'A13', 'A14'], 'credit_history': ['A30', 'A31', 'A32', 'A33', 'A34'],<br>'purpose': ['A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49', 'A410'],<br>'savings_acc_class': ['A61', 'A62', 'A63', 'A64', 'A65'], 'curr_emp_class': ['A71', 'A72', 'A73', 'A74', 'A75'],<br>'sex_and_status': ['A91', 'A92', 'A93', 'A94', 'A95'], 'other_debtors_class': ['A101', 'A102', 'A103'],<br>'property': ['A121', 'A122', 'A123', 'A124'], 'other_i_plans': ['A141', 'A142', 'A143'],<br>'housing': ['A151', 'A152', 'A153'], 'job': ['A171', 'A172', 'A173', 'A174'], 'telephone': ['A191', 'A192'], 'foreign_worker': ['A202', 'A201']}



Settings below.

<br>

### Setting-up

In [27]:
# Starts empty; one settings entry per data set is appended by the cells that follow
dictionary = list()

In [28]:
# Settings of the one hot encoded data set: its file, target, numeric fields,
# categorical fields, binary columns, and the categories of each polytomous field.
modelling_polytomous = {
    'e_chq_acc_status': ['A11', 'A12', 'A13', 'A14'],
    'credit_history': ['A30', 'A31', 'A32', 'A33', 'A34'],
    'purpose': ['A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49', 'A410'],
    'savings_acc_class': ['A61', 'A62', 'A63', 'A64', 'A65'],
    'curr_emp_class': ['A71', 'A72', 'A73', 'A74', 'A75'],
    'sex_and_status': ['A91', 'A92', 'A93', 'A94', 'A95'],
    'other_debtors_class': ['A101', 'A102', 'A103'],
    'property': ['A121', 'A122', 'A123', 'A124'],
    'other_i_plans': ['A141', 'A142', 'A143'],
    'housing': ['A151', 'A152', 'A153'],
    'job': ['A171', 'A172', 'A173', 'A174']
}

modelling_settings = {
    'source': 'modelling.csv',
    'target': ['reasonable'],
    'numeric': configurations.numeric,
    'categoricalFields': [
        'e_chq_acc_status', 'credit_history', 'purpose', 'savings_acc_class',
        'curr_emp_class', 'sex_and_status', 'other_debtors_class', 'property',
        'other_i_plans', 'housing', 'job', 'A192', 'A201', 'female'
    ],
    'binaryCF': ['A192', 'A201', 'female'],
    'polytomousCF': modelling_polytomous
}

dictionary.append({'modelling': modelling_settings})

In [29]:
# Settings of the baseline data set, i.e., the set that has the original category codes
baseline_settings = {
    'source': 'baseline.csv',
    'target': ['reasonable'],
    'numeric': configurations.numeric,
    'categoricalFields': [
        'e_chq_acc_status', 'credit_history', 'purpose', 'savings_acc_class',
        'curr_emp_class', 'sex_and_status', 'other_debtors_class', 'property',
        'other_i_plans', 'housing', 'job', 'telephone', 'foreign_worker', 'female'
    ],
    'binaryCF': [],
    'polytomousCF': {}
}

dictionary.append({'baseline': baseline_settings})

In [30]:
# Settings of the ordinal encoded data set.  The target is listed under the key 'target',
# in line with the 'modelling' and 'baseline' entries; it was formerly under 'labels',
# hence a reader that looks up 'target' for every data set would fail on this one.
dictionary.append({'natural':
    {'source': 'natural.csv', 
     'target': ['reasonable'],
     'numeric': configurations.numeric, 
     'categoricalFields': ['e_chq_acc_status', 'credit_history', 'purpose', 'savings_acc_class', 
                           'curr_emp_class', 'sex_and_status', 'other_debtors_class', 'property', 
                           'other_i_plans', 'housing', 'job', 'telephone', 'foreign_worker', 'female'],
     'binaryCF': [],      
     'polytomousCF': {}
    }})

In [31]:
# Save the settings of the data sets as JSON, alongside the data files
fields_file = os.path.join(warehouse_directory, 'fields.json')
with open(fields_file, 'w') as stream:
    json.dump(dictionary, stream)