## Preliminaries

In [1]:
# Delete every *.sh file in the working directory, e.g. a scripts.sh left behind by an earlier run.
!rm -rf *.sh

<br>

### Packages

In [2]:
import subprocess

In [3]:
# Colab only: download the repository's bootstrap script, make it executable and run it.
# What scripts.sh sets up is not visible from this notebook (presumably the project
# sources that the later `import config` / `import credit...` cells rely on) — confirm
# against the script itself.
if 'google.colab' in str(get_ipython()):
    bootstrap = (
        'wget -q https://raw.githubusercontent.com/briefings/credit/develop/scripts.sh',
        'chmod u+x scripts.sh',
        './scripts.sh',
    )
    for command in bootstrap:
        # check=True: stop with CalledProcessError at the first failing step instead of
        # silently carrying on with a missing or half-run script.
        subprocess.run(command, shell=True, check=True)

<br>

### Paths

In [4]:
import os
import pathlib
import sys

In [5]:
# Resolve the project root (`parent`).
# * On Colab the working directory itself is used as the root.
# * Elsewhere the notebook is taken to run from a directory one level below the root,
#   and the root is appended to sys.path.
if 'google.colab' in str(get_ipython()):
    parent = os.getcwd()
else:
    notebooks = os.getcwd()
    parent = str(pathlib.Path(notebooks).parent)
    sys.path.append(parent)

In [6]:
# Directory that receives this notebook's outputs (modelling.csv, modelling.json).
warehouse = os.path.join(parent, 'warehouse', 'representations')

# exist_ok=True makes the cell idempotent without a separate existence check, which
# could race with another process creating the directory in between. If the path
# exists but is not a directory this raises here, rather than failing later on save.
os.makedirs(warehouse, exist_ok=True)

<br>

### Libraries

In [7]:
import logging
import requests
import collections
import json

import os

import sklearn.manifold
import sklearn.preprocessing

import numpy as np
import pandas as pd
import dask

import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Display the version of dask in use.
dask.__version__

'2.30.0'

<br>

### Logging

In [9]:
# Logging set-up: INFO and above; each record prints its message first, then a
# millisecond-resolution timestamp on the following line.
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s\n%(asctime)s.%(msecs)03d',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Logger named after the running module.
logger = logging.getLogger(__name__)

<br>

### Custom

In [10]:
import config

import credit.graphics.relational
import credit.graphics.settings

import credit.src.modelling
import credit.src.archetype

import credit.embeddings.bijections
import credit.embeddings.reference
import credit.embeddings.representations

<br>

Configurations

In [11]:
# Project configuration object; what it exposes is defined in the local `config` module.
configurations = config.Config()

<br>

Graphs

In [12]:
# Instance of the project's Relational graphics class (its API is not visible here).
relational = credit.graphics.relational.Relational()

# Lightweight record holding the three text labels of a relational graph.
RelationalGraphLabels = collections.namedtuple(
    'RelationalGraphLabels', ['title', 'xlabel', 'ylabel'])

<br>

Settings

In [13]:
# Project graphics settings. layout() and aesthetics() are called for their side
# effects only (their return values are discarded); presumably they set global
# plotting defaults — confirm in credit.graphics.settings.
settings = credit.graphics.settings.Settings()

settings.layout()
settings.aesthetics()

<br>
<br>

## Data

Consider dropping these questionable/problematic fields:

* ***savings_acc_class*** $\; \Rightarrow \;$ (A61, A62, A63, A64, A65): <br>This might be biased towards wealthy individuals; may discount diligent but poor individuals.

* ***curr_emp_class*** $\; \Rightarrow \;$ (A71, A72, A73, A74, A75):<br>This may penalise individuals that have just started a role.

* ***curr_res_since***:<br>This may penalise individuals that have just moved into a new dwelling.

* ***n_e_credits_this_bank***:<br>This does not account for credits elsewhere.

* ***job*** $\; \Rightarrow \;$ (A171, A172, A173, A174):<br>Poorly demarcated.

Do not use:
* ***sex_and_status*** $\; \Rightarrow \;$ sex **&** A91, A92, A93, A94, A95

* ***age_years***

<br>

ref. http://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29

<br>

<br>

### Specifications

In [14]:
# Download the fields metadata (fields.json); its 'modelling' section supplies the
# field specifications used throughout the notebook.
#
# timeout: without one, requests waits indefinitely on an unresponsive host and the
# notebook hangs; 30 seconds is ample for a small JSON file.
#
# Any failure (connection error, timeout, or an HTTP 4xx/5xx via raise_for_status)
# still surfaces as a requests.exceptions.RequestException. The former try/except
# only re-raised that exception unchanged, so it has been removed.
req = requests.get(
    url='https://raw.githubusercontent.com/exhypotheses/credit/develop/warehouse/data/fields.json',
    timeout=30)
req.raise_for_status()

In [15]:
# Parse the downloaded JSON; the first entry's 'modelling' section holds the field
# specifications used below (binaryFields, categoryGroups, labels, ...).
metadata = json.loads(req.content)
specifications = metadata[0]['modelling']
specifications

{'binaryFields': ['A192', 'A201', 'female'],
 'categoricalFields': [],
 'categoryGroups': {'credit_history': ['A30', 'A31', 'A32', 'A33', 'A34'],
  'curr_emp_class': ['A71', 'A72', 'A73', 'A74', 'A75'],
  'e_chq_acc_status': ['A11', 'A12', 'A13', 'A14'],
  'housing': ['A151', 'A152', 'A153'],
  'job': ['A171', 'A172', 'A173', 'A174'],
  'other_debtors_class': ['A101', 'A102', 'A103'],
  'other_i_plans': ['A141', 'A142', 'A143'],
  'property': ['A121', 'A122', 'A123', 'A124'],
  'purpose': ['A40',
   'A41',
   'A42',
   'A43',
   'A44',
   'A45',
   'A46',
   'A47',
   'A48',
   'A49',
   'A410'],
  'savings_acc_class': ['A61', 'A62', 'A63', 'A64', 'A65'],
  'sex_and_status': ['A91', 'A92', 'A93', 'A94', 'A95']},
 'labels': ['reasonable'],
 'numeric': ['duration_months',
  'credit_amount',
  'i_rate_by_disp_inc',
  'curr_res_since',
  'age_years',
  'n_e_credits_this_bank',
  'n_dependants'],
 'source': 'modelling.csv'}

<br>
<br>

### Instances

In [16]:
# Project helper whose data() method supplies the modelling data.
modelling = credit.src.modelling.Modelling()

In [17]:
# The modelling data. It is indexed with lists of field names further down, so it is
# presumably a pandas DataFrame — confirm in credit.src.modelling.
data = modelling.data()

<br>
<br>

## Embeddings

In [18]:
# Category groups: group name -> list of the group's category codes.
# NOTE(review): this includes 'sex_and_status', which the Data section lists under
# "Do not use" — confirm whether it should be excluded before building representations.
groups = specifications['categoryGroups']
groups

{'credit_history': ['A30', 'A31', 'A32', 'A33', 'A34'],
 'curr_emp_class': ['A71', 'A72', 'A73', 'A74', 'A75'],
 'e_chq_acc_status': ['A11', 'A12', 'A13', 'A14'],
 'housing': ['A151', 'A152', 'A153'],
 'job': ['A171', 'A172', 'A173', 'A174'],
 'other_debtors_class': ['A101', 'A102', 'A103'],
 'other_i_plans': ['A141', 'A142', 'A143'],
 'property': ['A121', 'A122', 'A123', 'A124'],
 'purpose': ['A40',
  'A41',
  'A42',
  'A43',
  'A44',
  'A45',
  'A46',
  'A47',
  'A48',
  'A49',
  'A410'],
 'savings_acc_class': ['A61', 'A62', 'A63', 'A64', 'A65'],
 'sex_and_status': ['A91', 'A92', 'A93', 'A94', 'A95']}

<br>
<br>

### Representations

In [19]:
# Build the representations of every category group. In the cells that follow each
# element of the result is indexed as [0] DataFrame, [1] mapping dictionary, [2] group name.
representations = credit.embeddings.representations.Representations(blob=data).exc(groups=groups)

<br>

Each element of the `representations` variable is a 3-tuple consisting of

* a pandas.DataFrame of the t-SNE transformations of a category group field,
* the transformation dictionary of the category group in question, and
* the name of the category group.

<br>

Hence

In [20]:
# Split the tuples into their parts:
# * frames: the transformed DataFrame of each category group (element 0);
# * mappings: group name (element 2) -> transformation dictionary (element 1).
frames = [item[0] for item in representations]
mappings = {item[2]: item[1] for item in representations}

<br>
<br>

### Rebuild Data

In [21]:
# Place the per-group frames side by side (axis=1), keeping their index, then summarise.
frame = pd.concat(frames, axis=1, ignore_index=False)
frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   e_chq_acc_status_1     1000 non-null   float64
 1   e_chq_acc_status_2     1000 non-null   float64
 2   credit_history_1       1000 non-null   float64
 3   credit_history_2       1000 non-null   float64
 4   purpose_1              1000 non-null   float64
 5   purpose_2              1000 non-null   float64
 6   savings_acc_class_1    1000 non-null   float64
 7   savings_acc_class_2    1000 non-null   float64
 8   curr_emp_class_1       1000 non-null   float64
 9   curr_emp_class_2       1000 non-null   float64
 10  sex_and_status_1       1000 non-null   float64
 11  sex_and_status_2       1000 non-null   float64
 12  other_debtors_class_1  1000 non-null   float64
 13  other_debtors_class_2  1000 non-null   float64
 14  property_1             1000 non-null   float64
 15  prope

<br>

Hence, the modelling data

In [22]:
# Binary fields and the target label(s), selected from the original data by name.
binary = data[specifications['binaryFields']]
target = data[specifications['labels']]

# Modelling frame: embedded category groups + binary fields + target, joined column-wise.
# pd.concat aligns rows on the index, so this relies on `frame` sharing data's index.
X = pd.concat((frame, binary, target), axis=1, ignore_index=False)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   e_chq_acc_status_1     1000 non-null   float64
 1   e_chq_acc_status_2     1000 non-null   float64
 2   credit_history_1       1000 non-null   float64
 3   credit_history_2       1000 non-null   float64
 4   purpose_1              1000 non-null   float64
 5   purpose_2              1000 non-null   float64
 6   savings_acc_class_1    1000 non-null   float64
 7   savings_acc_class_2    1000 non-null   float64
 8   curr_emp_class_1       1000 non-null   float64
 9   curr_emp_class_2       1000 non-null   float64
 10  sex_and_status_1       1000 non-null   float64
 11  sex_and_status_2       1000 non-null   float64
 12  other_debtors_class_1  1000 non-null   float64
 13  other_debtors_class_2  1000 non-null   float64
 14  property_1             1000 non-null   float64
 15  prope

<br>

and the transformation dictionaries

In [23]:
# Display the per-group transformation dictionaries (category code -> coordinates).
mappings

{'credit_history': {'A30': [1.7317723035812378, 2.9328744411468506],
  'A31': [0.0941460132598877, -10.245094299316406],
  'A32': [-15.04787826538086, 3.530170202255249],
  'A33': [9.147918701171875, 28.008813858032227],
  'A34': [21.327930450439453, -4.306393623352051]},
 'curr_emp_class': {'A71': [0.4919017553329468, -11.116412162780762],
  'A72': [-32.85271453857422, 17.10158920288086],
  'A73': [25.878013610839844, -2.6521103382110596],
  'A74': [-0.16405132412910461, 26.08067512512207],
  'A75': [-22.781719207763672, -20.637964248657227]},
 'e_chq_acc_status': {'A11': [-11.912431716918945, -23.267934799194336],
  'A12': [-12.372608184814453, 22.762001037597656],
  'A13': [-37.72511672973633, -0.169875830411911],
  'A14': [19.981571197509766, 0.11874765902757645]},
 'housing': {'A151': [26.078109741210938, -6.6943888664245605],
  'A152': [-8.295867919921875, -1.7279752492904663],
  'A153': [11.389869689941406, 25.550336837768555]},
 'job': {'A171': [-0.8940067291259766, 0.516858458

<br>
<br>

### Save

In [24]:
# Write the modelling frame to the warehouse directory: header row, no index column, UTF-8.
X.to_csv(path_or_buf=os.path.join(warehouse, 'modelling.csv'), 
         header=True, index=False, encoding='UTF-8')

In [25]:
# Persist the transformation dictionaries as JSON in the warehouse directory.
with open(os.path.join(warehouse, 'modelling.json'), 'w') as disk:
    json.dump(mappings, disk)