# The Splitting Phase

This notebook splits the data into training & testing sets, and saves each set as a CSV file.

## Preliminaries

In [None]:
%%bash

# Start from a clean working directory: delete any shell scripts and the
# `warehouse` directory found here (presumably leftovers of an earlier run -- confirm).
rm -rf *.sh
rm -rf warehouse

<br>

### Packages & Libraries

**Packages**

In [None]:
import subprocess

In [None]:
# Google Colaboratory only: download the project's set-up script, make it executable, and run it.
# What scripts.sh does is not visible from this notebook -- presumably it retrieves the project's
# modules (e.g., `config`, `risk`); verify against the script.
if 'google.colab' in str(get_ipython()):
    setup_commands = (
        'wget -q https://raw.githubusercontent.com/exhypotheses/risk/develop/scripts.sh',
        'chmod u+x scripts.sh',
        './scripts.sh',
    )
    for command in setup_commands:
        # check=True raises CalledProcessError at the first failing command; otherwise a failed
        # download or set-up would pass silently and only surface later, as an import error.
        subprocess.run(command, shell=True, check=True)

<br>

**Libraries**

In [None]:
import collections
import logging
import os
import pathlib
import sys
import zipfile

import dask
import numpy as np
import pandas as pd


<br>

### Logging

In [None]:
# Root logging set-up: INFO & above; each record prints its message, followed on the
# next line by a timestamp with millisecond resolution.
logging.basicConfig(
    format='%(message)s\n%(asctime)s.%(msecs)03d',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# The notebook's own logger
logger = logging.getLogger(name=__name__)

<br>

### Custom

In [None]:
import config

import risk.functions.split

<br>

This notebook uses the version of the data set in which the polytomous categorical variables have undergone t-SNE embedding.

In [None]:
import risk.src.representations

<br>

Configurations

In [None]:
# The project's settings object; a later cell reads `configurations.warehouse` to build the output path.
configurations = config.Config()

<br>

### Paths

In [None]:
# Determine `parent`, the directory that anchors the output paths built further down.
if 'google.colab' not in str(get_ipython()):
    # Outside Colab: the working directory is taken to be a notebooks directory that sits
    # one level below the project's root (assumption -- confirm against the repository layout).
    notebooks = os.getcwd()
    parent = str(pathlib.Path(notebooks).parent)

    # Make the root importable.  The membership check keeps the cell idempotent: re-running
    # it does not append a duplicate entry to sys.path.
    if parent not in sys.path:
        sys.path.append(parent)
else:
    # Colab: the working directory itself serves as the root.
    parent = os.getcwd()

In [None]:
# Destination of the training & testing files: <parent>/<warehouse>/splits/scikit
directory = os.path.join(parent, configurations.warehouse, 'splits', 'scikit')

# Create the directory tree if it is missing.  exist_ok=True replaces the separate
# existence check, hence no check-then-create race, and re-running the cell is harmless.
os.makedirs(directory, exist_ok=True)

<br>
<br>

## For YAML

### Seed

In [None]:
# A single seed: it seeds NumPy's global random number generator here, and it is
# re-used as the splitter's `random_state` further down.
SEED = 5
np.random.seed(SEED)

<br>

### Splitting Function Arguments

In [None]:
# Becomes the `test_size` field of the splitting parameters; presumably the fraction of
# the instances held out for testing -- confirm in risk.functions.split.
test_size = 0.35
# Passed as `strata` to the splitting function; presumably the fields that the split is
# stratified by -- confirm in risk.functions.split.
strata = ['reasonable', 'female']

<br>
<br>

## Data

### Read

Reads in the data wherein the polytomous categorical fields have undergone t-SNE embedding.

In [None]:
# The project's interface to the data set & its attributes
representations = risk.src.representations.Representations()

# The data frame to be split; .info() lists its fields, non-null counts, and data types
data = representations.data()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   duration_months        1000 non-null   int64  
 1   credit_amount          1000 non-null   int64  
 2   i_rate_by_disp_inc     1000 non-null   int64  
 3   curr_res_since         1000 non-null   int64  
 4   age_years              1000 non-null   int64  
 5   n_e_credits_this_bank  1000 non-null   int64  
 6   n_dependants           1000 non-null   int64  
 7   e_chq_acc_status_1     1000 non-null   float64
 8   e_chq_acc_status_2     1000 non-null   float64
 9   credit_history_1       1000 non-null   float64
 10  credit_history_2       1000 non-null   float64
 11  purpose_1              1000 non-null   float64
 12  purpose_2              1000 non-null   float64
 13  savings_acc_class_1    1000 non-null   float64
 14  savings_acc_class_2    1000 non-null   float64
 15  curr_

In [None]:
# `target` is handed to the splitting function below; presumably it names the dependent
# variable -- confirm in risk.src.representations.
attributes = representations.attributes()
target = attributes.target

<br>

### Structure

**Split**

* Split the data, then save the training & testing frames; in future, this will become a step that runs before the notebooks.

In [None]:
# The record of settings that the splitter receives: the test size & the random state
SplittingParameters = collections.namedtuple('SplittingParameters', ['test_size', 'random_state'])
parameters = SplittingParameters(test_size=test_size, random_state=SEED)

# Split the data into training & testing frames
split = risk.functions.split.Split(splitting=parameters)
training, testing = split.exc(data=data, target=target, strata=strata)

In [None]:
# Write each frame to the splits directory: with a header row, without the index, UTF-8 encoded.
for filename, frame in (('training.csv', training), ('testing.csv', testing)):
    frame.to_csv(path_or_buf=os.path.join(directory, filename),
                 header=True, index=False, encoding='UTF-8')