# 2. Data preparation and pre-processing

### 2.0 Imports

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
import seaborn as sns

In [None]:
import boto3
from boto3 import session
import os

# Connection settings come from the environment; a variable that is not set
# is read back as None.
key_id = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
bucket_name = os.getenv("AWS_S3_BUCKET")
s3_endpoint = os.getenv("AWS_S3_ENDPOINT")

# S3 client bound to the configured endpoint and credentials.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=key_id,
    aws_secret_access_key=secret_key,
    endpoint_url=s3_endpoint,
)

### 2.1 Loading data

In [None]:

# Fetch each CSV from the bucket to its local path under /tmp.
for remote_key, local_path in (
    ("data/train.csv", '/tmp/train.csv'),
    ("data/test.csv", '/tmp/test.csv'),
):
    s3_client.download_file(bucket_name, remote_key, local_path)

In [None]:
# Read the downloaded training CSV into a DataFrame; the trailing expression
# lets the notebook render it.
df_train = pd.read_csv('/tmp/train.csv')
df_train

In [None]:
# Read the downloaded test CSV into a DataFrame; the trailing expression
# lets the notebook render it.
df_test = pd.read_csv('/tmp/test.csv')
df_test

### 2.2 Checking for missing values

In [None]:
# Per-column "contains any NaN" flags, summarised with describe(): a single
# unique value whose top is False means no column has missing data.
df_train.isna().any().describe()

In [None]:
# Same missing-value summary for the test frame: one unique value with
# top == False means no column contains NaN.
df_test.isna().any().describe()

In [None]:
# Print a concise summary of the training frame (index range, dtypes, memory usage).
df_train.info()

In [None]:
# Column 0 is taken as the target; every remaining column is a feature.
y = df_train.iloc[:, 0]
X = df_train.iloc[:, 1:]
print(X.shape, y.shape)

> <div class="alert alert-block alert-info">
    <b>No missing data, let's continue.</b></div>


### 2.3 Data and target class visualizations

In [None]:
# Count the occurrences of each target value once, then draw them as bars.
class_counts = y.value_counts()
sns.barplot(x=class_counts.index, y=class_counts.values)

> <div class="alert alert-block alert-info">
    <b>As you can see, there is a fairly even class distribution.</b></div>

In [None]:
# Take the row at position 8 and fold its flat pixel vector into a 28x28 grid.
z = X.iloc[8].values.reshape(28, 28)
print(z.shape)
plt.imshow(z, cmap='Greys')

> <div class="alert alert-block alert-info">
    <b>Here is an example of one of the digits. It is a 28 x 28 black and white image.</b></div>

### 2.4 Creating training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=15,
)
print(*(split.shape for split in (X_train, X_val, y_train, y_val)))

> <div class="alert alert-block alert-info">
<b>We now need to split the training data into two sets: training and validation. We will train on 90% of the data and use the remaining 10% to evaluate the model during training.</b></div>

### 2.5 Pre-processing pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class ReshapeFunc(BaseEstimator, TransformerMixin):
    """Stateless transformer that reshapes flat rows into image tensors.

    Parameters
    ----------
    image_shape : tuple of int, default=(28, 28, 1)
        Per-sample output shape (height, width, channels). The default
        reproduces the previously hard-coded 28x28 single-channel layout.
    """

    def __init__(self, image_shape=(28, 28, 1)):
        # Stored unmodified so get_params/set_params and clone keep working.
        self.image_shape = image_shape

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X reshaped to ``(n_samples, *image_shape)``.

        X is passed through ``np.asarray`` first, so array-likes without a
        compatible ``.reshape`` method (e.g. a pandas DataFrame) are accepted
        as well as NumPy arrays.
        """
        return np.asarray(X).reshape((-1, *self.image_shape))

> <div class="alert alert-block alert-info">
<b>A custom transformer was created to add a channel dimension to the input (X) data. As the image is black and white, we only have 1 channel. In the case of a coloured image, we would require 3 channels (red, green, blue).</b></div>

In [None]:
# Feature pre-processing: min-max scaling first, then the reshape step.
feature_steps = [
    ('Normalize', MinMaxScaler()),
    ('Reshape', ReshapeFunc()),
]
features_pipeline = Pipeline(feature_steps)

> <div class="alert alert-block alert-info">
    <b>Feature pipeline. Data is scaled between 0 and 1 and then reshaped into input format.</b></div>

In [None]:
# Fit the feature pipeline on the training features and transform them in one call.
X_train = features_pipeline.fit_transform(X_train)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Target pre-processing: a single one-hot encoding step.
target_steps = [
    ('OneHot', OneHotEncoder()),
]
target_pipeline = Pipeline(target_steps)

> <div class="alert alert-block alert-info">
<b>Target pipeline. One-hot encoding is used, as we will be using a softmax activation function in the output layer.</b></div>

In [None]:
# The label Series is reshaped into a single column (2-D) before the encoder
# is fitted on it and applied.
y_train = target_pipeline.fit_transform(y_train.values.reshape(-1,1))

In [None]:
# Convert the encoder's sparse output into a dense NumPy array.
y_train = y_train.toarray()

In [None]:
# Sanity check: show the shapes of the processed training features and labels.
print(X_train.shape, y_train.shape)

> <div class="alert alert-block alert-info">
    <b>Don't forget to apply the preprocessing pipeline to the validation data!</b></div>

In [None]:
# Apply the pipeline that was already fitted on the training features.
# fit_transform would re-fit MinMaxScaler on the validation set, scaling it
# with different min/max values than the training data.
X_val = features_pipeline.transform(X_val)

In [None]:
# Reuse the encoder fitted on the training labels so the validation columns
# follow the same category layout; re-fitting could produce a different
# number of columns if a class were absent from the validation split.
y_val = target_pipeline.transform(y_val.values.reshape(-1, 1))

In [None]:
# Convert the encoder's sparse output into a dense NumPy array.
y_val = y_val.toarray()

In [None]:
# Sanity check: show the shapes of the processed validation features and labels.
print(X_val.shape, y_val.shape)

> <div class="alert alert-block alert-info">
    <b>...and to the test data.</b></div>

In [None]:
# Use the already-fitted pipeline instead of re-fitting the scaler on the
# test set, so the test features are scaled with previously learned min/max
# values rather than their own.
X_test = features_pipeline.transform(df_test)

### 2.6 Export data

In [None]:
# Create the export directory (including parents); exist_ok=True makes this
# a no-op when it already exists.
os.makedirs('/tmp/ml-pipeline/data/', exist_ok=True)

In [None]:
# Persist every processed array as a .npy file in the export directory.
for out_path, array in (
    ('/tmp/ml-pipeline/data/X_train.npy', X_train),
    ('/tmp/ml-pipeline/data/y_train.npy', y_train),
    ('/tmp/ml-pipeline/data/X_val.npy', X_val),
    ('/tmp/ml-pipeline/data/y_val.npy', y_val),
    ('/tmp/ml-pipeline/data/X_test.npy', X_test),
):
    np.save(out_path, array)