# Overview 

## Objective

This notebook only sets up the data for the demo.

It covers the following tasks:

    1. Make train, test, and val directories
    2. Split raw data into train, test and val data

## Imports and setup

In [1]:
# Imports

## General
import os
import shutil
import tempfile
import pprint

# Preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Tensorflow
import tensorflow as tf
import tensorflow_transform as tft

import apache_beam.io.iobase
import apache_beam as beam
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils, dataset_schema

In [6]:
# Variables

# Working directory of the kernel; kept for cells that build paths from it.
BASE_DIR_PATH = os.getcwd()
# The dataset lives on an absolute shared mount. It is spelled out directly
# because os.path.join(BASE_DIR_PATH, '/bd-fs-mnt/...') discards BASE_DIR_PATH
# whenever a later component is absolute, which made the path look relative
# to the working directory when it never was.
DATA_DIR_PATH = '/bd-fs-mnt/TenantShare/HMEQ'
# Raw HMEQ csv file that gets split into train and test sets below.
RAW_DATA_PATH = os.path.join(DATA_DIR_PATH, 'hmeq.csv')

## Task 1: Make train, test, and val directories

In [7]:
def make_data_directories(datapath):
    '''
    Given the root, create fresh, empty train and test folders.

    An existing train/test directory is removed together with everything in
    it before being recreated, so re-running this always yields empty folders.
    Missing parent directories (including the root itself) are created.

    Note: only train and test folders are made; no val folder is created.

    :param datapath: root directory that will contain the train/test folders
    :return: data_dir_paths, the list [train_dir, test_dir] in that order
    '''
    data_dir_paths = [os.path.join(datapath, name) for name in ('train', 'test')]

    for data_dir_path in data_dir_paths:
        # Drop the output of any previous run (isdir already implies exists).
        if os.path.isdir(data_dir_path):
            shutil.rmtree(data_dir_path)
        # makedirs instead of mkdir so a not-yet-existing root does not raise.
        os.makedirs(data_dir_path)

    return data_dir_paths

## Task 2: Split raw data into training, test and val data

In [8]:
def split_raw_train_test(raw_data_path, data_dir_paths, test_size=0.1, random_state=8):
    '''
    Given raw data path and data directories, split the raw data
    in train and test and store them as csv files.

    The train split is written to <first dir>/train.csv and the test split
    to <second dir>/test.csv, both without the dataframe index. Any
    directories beyond the first two are ignored.

    :param raw_data_path: path of the raw csv file to read
    :param data_dir_paths: iterable whose first two items are the train and
        test output directories, in that order
    :param test_size: forwarded to train_test_split (default 0.1)
    :param random_state: seed forwarded to train_test_split so the split is
        reproducible (default 8)
    :return: 1
    :raises ValueError: if fewer than two output directories are given
    '''
    data_dir_paths = list(data_dir_paths)
    # Fail loudly instead of silently writing only some of the splits.
    if len(data_dir_paths) < 2:
        raise ValueError(
            'data_dir_paths must contain a train and a test directory, got %d item(s)'
            % len(data_dir_paths)
        )
    train_dir, test_dir = data_dir_paths[:2]

    raw_df = pd.read_csv(raw_data_path)
    train, test = train_test_split(raw_df, test_size=test_size, random_state=random_state)

    train.to_csv(os.path.join(train_dir, 'train.csv'), index=False)
    test.to_csv(os.path.join(test_dir, 'test.csv'), index=False)

    return 1

In [9]:
data_dir_paths = make_data_directories(DATA_DIR_PATH)
split_raw_train_test(RAW_DATA_PATH, data_dir_paths)

1

<!-- ## Task 3: Define features and configures feature columns

In order to import our training data into TensorFlow, we need to specify what type of data each feature contains.  -->

<!-- # Define a dataframe for testing. 

# We have:

# 1. missing values
# 2. normalize data


data = pd.DataFrame(dict(
    a=[1.1, 2.2, 3.3, 4.4, np.NaN], # continuous
    b=[5.5, 6.6, 7.7, np.NaN, 0.0], # continous
    c=['a', 'b', '', 'a', ''], #categorical
    d=[1, 2, 2, 1, 2] #target
))

data.info()

data.to_csv(TEST_DATA_PATH, index=False) -->

<!-- TARGET = ['d']
CATEGORICAL_VARIABLES = ['c']
NUMERICAL_VARIABLES = ['a', 'b']

feature_columns = [] -->

<!-- ## Set Helpers -->

<!-- def _get_mean_parameter(train_data: pd.DataFrame, column: str) -> float:
    ''' Given a column, calculate mean'''
    mean = train_data[column].mean()
    return mean

def _get_impute_parameters(train_data: pd.DataFrame, numerical_features: list) -> dict:
    '''For each column in the numerical features, calculate mean.'''

    impute_parameters = {}
    
    for column in numerical_features:
        impute_parameters[column] = _get_mean_parameter(train_data, column)
        
    return impute_parameters

def _impute_missing(inputs: dict) -> dict:
    impute_parameters = _get_impute_parameters(data, NUMERICAL_VARIABLES)
    # Since we modify just some features, 
    # we need to start by setting `outputs` to a copy of `inputs.
    output = inputs.copy()
    for key, value in impute_parameters.items():
        is_miss = tf.math.is_nan(inputs[key])
        tf_mean = tf.constant(value, dtype=np.float64)
        output[key] = tf.where(is_miss, tf_mean, inputs[key])
    return output
     -->

<!-- ## Load CSV files as dataset and Impute missing -->

<!-- def input_fn(dataframe: pd.DataFrame) -> dict:
    dataset = tf.data.Dataset.from_tensor_slices(dict(dataframe))
    dataset = dataset.map(_impute_missing)
    dataset = dataset.repeat(3).shuffle(buffer_size=5, seed=8).batch(5).prefetch(1)
    return dataset -->

<!-- # Notice I use Dataset.from_tensor_slices method because data fits into memory. 
# Also it works on dictionaries, allowing this data to be easily imported.

# def input_fn(dataframe: pd.DataFrame) -> dict:
#     '''input_fn to read the data and impute missings'''
    
#     #Extract
#     dataframe = _set_categorical_type(dataframe)
#     dataframe = _set_categorical_empty(dataframe)
#     dataframe = _set_numerical_type(dataframe)
#     dataset = tf.data.Dataset.from_tensor_slices(dict(dataframe))
    
#     #Transform
#     dataset = dataset.map(_impute_missing_categorical)
#     dataset = dataset.map(_impute_missing_numerical)
#     batch_dataset = dataset.repeat(3).shuffle(buffer_size=480, seed=8).batch(5).prefetch(1)
    
#     #Load : to check
#     return dataset -->

<!-- for item in input_fn(data):
    print(item) -->

<!-- # for item in input_fn(data_train).take(1):
#     print(item) -->