## Prelimiaries

For a Bayesian classification example for illustrating the **sheets/cards** family.

In [1]:
!rm -rf *.sh

<br>

### Packages

In [2]:
import subprocess

In [3]:
if 'google.colab' in str(get_ipython()):
    subprocess.run('wget -q https://raw.githubusercontent.com/briefings/credit/develop/scripts.sh', shell=True)
    subprocess.run('chmod u+x scripts.sh', shell=True)
    subprocess.run('./scripts.sh', shell=True)

<br>

### Paths

In [4]:
import os
import pathlib
import sys

In [5]:
if not 'google.colab' in str(get_ipython()):
    
    notebooks = os.getcwd()
    parent = str(pathlib.Path(notebooks).parent)
    sys.path.append(parent)

<br>

### Libraries

In [6]:
import logging
import collections

import pandas as pd
import numpy as np

import sklearn.preprocessing

<br>

### Logging

In [7]:
logging.basicConfig(level=logging.INFO, format='%(message)s\n%(asctime)s.%(msecs)03d', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

<br>

### Custom

In [8]:
import config

import credit.src.archetype

<br>

Configurations

In [9]:
configurations = config.Config()

In [10]:
warehouse_directory = os.path.join(parent, 'warehouse', 'data')
if not os.path.exists(warehouse_directory):
    os.makedirs(warehouse_directory)
    
raw_data_directory = os.path.join(parent, 'data')
if not os.path.exists(raw_data_directory):
    os.makedirs(raw_data_directory)    

<br>
<br>

## Data

### Raw

Glossary:

> **e**: existing, **i**: installments, **n**: number, **acc**: account, **chq**: cheque, **emp**: employment, **inc**: income, **res**: residence, **curr**: current, **disp**: disposable



In [11]:
fields = ['e_chq_acc_status', 'duration_months', 'credit_history', 'purpose', 'credit_amount', 'savings_acc_class', 'curr_emp_class', 
 'i_rate_by_disp_inc', 'sex_and_status', 'other_debtors_class', 'curr_res_since', 'property', 'age_years', 'other_i_plans', 'housing',
 'n_e_credits_this_bank', 'job', 'n_dependants', 'telephone', 'foreign_worker', 'label']

<br>

URL

In [12]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

<br>

Read

In [13]:
try:
    data = pd.read_csv(filepath_or_buffer=url, sep=' ', header=None, encoding='utf-8')
except OSError as err:
    raise Exception(err.strerror) in err

data.columns = fields

In [14]:
logger.info(data.head())

  e_chq_acc_status  duration_months credit_history purpose  credit_amount  \
0              A11                6            A34     A43           1169   
1              A12               48            A32     A43           5951   
2              A14               12            A34     A46           2096   
3              A11               42            A32     A42           7882   
4              A11               24            A33     A40           4870   

  savings_acc_class curr_emp_class  i_rate_by_disp_inc sex_and_status  \
0               A65            A75                   4            A93   
1               A61            A73                   2            A92   
2               A61            A74                   2            A93   
3               A61            A74                   2            A93   
4               A61            A73                   3            A93   

  other_debtors_class  ...  property age_years  other_i_plans housing  \
0                A101  ..

<br>

Write

In [15]:
data.to_csv(path_or_buf=os.path.join(raw_data_directory, 'credit.csv'), 
            header=True, index=False, encoding='utf-8')

<br>

### Numeric Data

In [16]:
numeric = data[configurations.numeric]
logger.info(numeric.info())

None
2021-05-14 16:16:42.399


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   duration_months        1000 non-null   int64
 1   credit_amount          1000 non-null   int64
 2   i_rate_by_disp_inc     1000 non-null   int64
 3   curr_res_since         1000 non-null   int64
 4   age_years              1000 non-null   int64
 5   n_e_credits_this_bank  1000 non-null   int64
 6   n_dependants           1000 non-null   int64
dtypes: int64(7)
memory usage: 54.8 KB


<br>

### Labels

* 2: unreasonable $\rightarrow$ 0
* 1: reasonable $\rightarrow$ 1

In [17]:
labels = data[['label']].copy()
labels.loc[:, 'label'] = labels['label'].mod(2)
labels.columns = ['reasonable']

<br>
<br>

## Encoding

In [18]:
archetype = credit.src.archetype.Archetype()

categories = archetype.categories()
logger.info('{}'.format(categories._fields))

('fields', 'arrays', 'dictionary')
2021-05-14 16:16:42.417


In [19]:
instances = data[categories.fields]

<br>

### Transformation Functions

In [20]:
def sex(frame: pd.DataFrame):

    # sex: 0 -> female, 1 -> male
    values: pd.Series = frame['sex_and_status'].copy().apply(
        lambda x: 0 if (x == 'A92' or x == 'A95') else 1)
    values.rename('sex', inplace=True)

    return values.to_frame()
    

<br>

Aside

In [21]:
sex_ = sex(frame=data[['sex_and_status']])

<br>

### Ordinal

In [22]:
enc = sklearn.preprocessing.OrdinalEncoder(categories=categories.arrays, dtype=np.int)
ordinals_ = enc.fit_transform(X=instances)
ordinals = pd.DataFrame(data=ordinals_, columns=categories.fields)

In [28]:
originals = data[categories.fields]

<br>

### One Hot

In [23]:
enc = sklearn.preprocessing.OneHotEncoder(categories=categories.arrays, sparse=False, dtype=np.int)

bits_ = enc.fit_transform(X=instances)
columns = [column[(column.rindex('_') + 1):] for column in enc.get_feature_names()]
bits = pd.DataFrame(data=bits_, columns=columns)

<br>
<br>

## Features

For modelling

In [24]:
pd.concat([numeric, bits, sex_, labels], axis=1).\
    to_csv(path_or_buf=os.path.join(warehouse_directory, 'modelling.csv'), header=True, index=False, encoding='utf-8')

<br>

For statistics

In [30]:
pd.concat((numeric, ordinals, sex_, labels), axis=1).\
    to_csv(path_or_buf=os.path.join(warehouse_directory, 'numeric.csv'), header=True, index=False, encoding='utf-8')

In [31]:
pd.concat((numeric, originals, sex_, labels), axis=1).\
    to_csv(path_or_buf=os.path.join(warehouse_directory, 'alphabetic.csv'), header=True, index=False, encoding='utf-8')