# Initial Preparation

In [17]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.4 is required in this notebook
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.4"

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()
np.random.seed(42)
tf.random.set_seed(42)

# For changes
USER = "ageent"
REPO = "y-mlOnPhrases"
X_FILE = "X.npy"
Y1_FILE = "Y1.npy"
Y2_FILE = "Y2.npy"

In [4]:
# to save data or images
if "google.colab" in sys.modules:
    from google.colab import drive
    ROOT_GD = "/content/drive"
    STORAGE_PATH = ROOT_GD + "/My Drive/Colab Notebooks/" + REPO + "/"
    drive.mount(ROOT_GD)
else:   # local host
    STORAGE_PATH = "data/"

def save_data(prefix, file_name, data_frame):
    path = STORAGE_PATH + prefix + file_name
    data_frame.to_csv(path)

def save_pred(file_name, data_frame):
    save_data("predictions/", file_name, data_frame)
def save_trans_data(file_name, data_frame):
    save_data("transformed/", file_name, data_frame)

"""
fig, ax = plt.subplots()
ax.plot(data)
save_fig(fig, "fig_name")
"""
def save_fig(fig, fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = STORAGE_PATH + "img/" + fig_id + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        fig.tight_layout()
    fig.savefig(path, format=fig_extension, dpi=resolution)

In [5]:
# get the data
def get_github_data(path_to_file):
    """TODO: maybe need to use requests."""
    url = "https://raw.githubusercontent.com/{}/{}/{}"\
                            .format(USER, REPO, path_to_file)
    return np.load(url)

def get_localhost_data(file_name):
    path = "data/src/" + file_name
    return np.load(path)

if "google.colab" in sys.modules:
    PATH_TO_SRC = "main/data/src/"
    X_DATA = get_github_data(PATH_TO_SRC + X_FILE)
    Y1_DATA = get_github_data(PATH_TO_SRC + Y1_FILE)
    Y2_DATA = get_github_data(PATH_TO_SRC + Y2_FILE)
else:   # local host
    X_DATA = get_localhost_data(X_FILE)
    Y1_DATA = get_localhost_data(Y1_FILE)
    Y2_DATA = get_localhost_data(Y2_FILE)

In [6]:
print(X_DATA.shape)
print(X_DATA[0])

(800, 128)
[-0.98200285  5.3519163   0.6241017  -3.7863977  -2.0433748  -1.6433135
  5.0111694   0.11501709 -3.3202858   2.2631938   4.487829    3.1017983
  3.4887044  -4.942223    6.474518    0.77631605  5.3201113  -5.003155
 -6.0889516   3.9606059   4.9673815   0.5534823   2.3377123  -3.4211032
  5.278324    1.576092   -5.2838745   0.5925345  -1.2432728   1.5591371
 -1.0380139  -0.1521509   2.4624774   6.4752107  -4.399489   -2.6032155
  3.5712152   0.44489035  1.3303515   0.42398357  2.2737198   7.777598
 -3.3040464  -2.2658207  -7.7937617  -0.6868003   7.5321355   0.5417963
  2.423962    7.1077695  -0.6353128   3.4406264  -2.3372521   0.1237992
  1.9296596   4.452048    2.1478891  -2.770266   -9.235324   10.521325
 -8.574103   -3.127737   -5.1270823   5.001681   -2.710712    0.44150203
 -0.15498942  0.24662127 -0.21252623 -1.7166231  -1.0460446  -5.4344797
  2.56957    10.698443   -3.0771906   1.0651661  -0.32124305  5.725385
  7.167192    5.156452    5.6941953   6.4998055   6.7241

In [7]:
print(Y1_DATA.shape)
print(Y1_DATA[0:10])

(800,)
[1 2 2 0 1 0 2 2 1 0]


In [8]:
print(Y2_DATA.shape)
print(Y2_DATA[0:10])

(800,)
[ 6. 15. 34. 58. 27. 29. 50. 12.  7. 17.]


# Feature engineering
### Transforming

In [9]:
x = X_DATA.copy()
y2 = Y2_DATA.copy()

In [13]:
from sklearn.preprocessing import OneHotEncoder

transformer_y1 = OneHotEncoder()

In [14]:
y1 = transformer_y1.fit_transform(Y1_DATA[:, np.newaxis]).toarray()
y1

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

### Shuffle

In [15]:
t_ = np.hstack([x, y1, y2[:, np.newaxis]])
t_.shape

(800, 132)

In [16]:
np.random.shuffle(t_)
x = t_[:, :128].copy()
y1 = t_[:, 128:131].copy()
y2 = t_[:, 131].copy()
%reset_selective -f t_

# Model selection

In [19]:
model1 = keras.models.Sequential()
model1.add(keras.layers.Dense(3, activation="linear", use_bias=False, input_shape=[128]))

In [60]:
def classifier_metric():
    def cm(y_true, y_pred):
        tf.keras.metrics.sparse_categorical_accuracy(y_true[:, :3], y_pred)
    return cm

def regressor_metric():
    def rm(y_true, y_pred):
        w_reg = 1 / 3
        y_pred_w = y_pred * w_reg
        y_pred_reg = tf.reduce_sum(y_pred_w, axis=1)
        return tf.metrics.mean_squared_error(y_true[:, 3], y_pred_reg)
    return rm

In [61]:
# test for metrics
y_true_ = tf.constant([[1, 0, 0, 60],
                       [0 ,1, 0, 50],
                       [0, 0, 1, 20],
                       [0, 1, 0, 4]])
y_pred_ = tf.constant([[99, 0, 0],
                       [0, 99, 0],
                       [0, 0, 99],
                       [0, 99, 0]])
print(classifier_metric()(y_true_, y_pred_))
print(regressor_metric()(y_true_, y_pred_))

InvalidArgumentError: Can not squeeze dim[1], expected a dimension of 1, got 3 [Op:Squeeze]

In [57]:
def loss1(y_true, y_pred):
    """y_true have shape (n, 4) and t_pred have shape (n, 3)"""
    y_true_cl = tf.cast(y_true[:, :3], tf.float64)
    y_true_reg = tf.cast(y_true[:, 3], tf.float64)

    # classification
    y_pred_max = tf.reduce_max(y_pred, axis=1)[:, tf.newaxis]
    y_pred_norm = y_pred / y_pred_max
    ss_cl = tf.square(y_true_cl - y_pred_norm)

    # regression
    w_reg = 1 / 3
    y_pred_w = y_pred * w_reg
    y_pred_reg = tf.reduce_sum(y_pred_w, axis=1)
    ss_reg = tf.square(y_true_reg - y_pred_reg)

    summands = tf.concat([ss_cl, ss_reg[:, tf.newaxis]], 1)
    return tf.reduce_sum(summands, axis=1)

In [58]:
# test for loss1
y_true_ = tf.constant([[1, 0, 0, 60],
                       [0 ,1, 0, 50],
                       [0, 0, 1, 20],
                       [0, 1, 0, 4]])
y_pred_ = tf.constant([[99, 0, 0],
                       [0, 99, 0],
                       [0, 0, 99],
                       [0, 99, 0]])
loss1(y_true_, y_pred_)

<tf.Tensor: shape=(4,), dtype=float64, numpy=array([3560.11111111, 2466.77777778,  386.77777778,   13.44444444])>

In [None]:
# TODO: do y_true and y_pred with equal dimension
model1.compile(loss=loss1,
               optimizator="nadam",
               metrics=[classifier_metric, regressor_metric])