## Version 20240821

## 00 Importing Modules & Mounting Drive

In [None]:
!pip install ydata-profiling

import pathlib
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import pandas as pd
import sklearn
import pickle
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
import tensorflow as tf
import datetime as dt

from scipy import stats as sm
from IPython.display import Image
from graphviz import Source
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score, accuracy_score, average_precision_score, f1_score, precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
from ydata_profiling import ProfileReport
from tabulate import tabulate
from tensorflow.keras import layers, losses, initializers, Model, regularizers, activations

%matplotlib inline

In [None]:
'''from google.colab import drive
drive.mount('/content/drive') #'''

In [None]:
'''MAIN_PATH = str(pathlib.Path().resolve())
WORK_PATH = MAIN_PATH + '/drive/MyDrive/Workspace'
SOURCE_PATH = WORK_PATH + '/00_Data_Source'
CACHE_PATH = WORK_PATH + '/00_Cache_Data' #'''

## 01 Choosing & Importing Dataset

### 01.00 Importing Data from Zipfile

In [None]:
# Download the zipped dataset from the GitHub data repository and unpack it
# into the current working directory.
filename = 'Diseases_And_Symptoms.zip' # replace this

url = 'https://github.com/azzindani/00_Data_Source/raw/main/' + filename

# Context managers close the HTTP connection and the archive handle even when
# the download or the extraction fails.
with urlopen(url) as http_response:
  archive_bytes = BytesIO(http_response.read())

with ZipFile(archive_bytes) as zipfile:
  zipfile.extractall()

In [None]:
os.listdir()

In [None]:
# os.listdir() returns entries in arbitrary order, so a fixed position such as
# [1] may point at the zip archive, a folder, or another file depending on the
# environment. Pick the extracted CSV by extension instead.
csv_files = sorted(name for name in os.listdir() if name.lower().endswith('.csv'))
if not csv_files:
  raise FileNotFoundError('No CSV file found in ' + os.getcwd())

df = pd.read_csv(csv_files[0], encoding = 'ISO-8859-1')#, sep = ';')
df.shape

### 01.01 Importing Main Data

In [None]:
'''filename = 'Uber_Fares.csv' # replace this

url = 'https://github.com/azzindani/00_Data_Source/raw/main/'+ filename
df = pd.read_csv(url, encoding = 'ISO-8859-1')#, sep = ';')
df.shape #'''

In [None]:
df.nunique()

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# Print the distinct values of every text (object-dtype) column, framed by
# separator lines.
for name in df.columns:
  if df[name].dtypes != 'object':
    continue
  print(name, '-' * 100, df[name].unique(), '=' * 100, sep = '\n')
### 01.02 Importing Geo Data

In [None]:
'''geo_path = 'https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json' # replace this

gdf = gpd.read_file(geo_path)
gdf.head(2) #'''

In [None]:
'''gdf = gdf.rename(columns = {'name' : 'State'})
gdf = gdf[['State', 'geometry']]
gdf.head(2) #'''

### 01.03 Importing Additional Data (for enrichment)

### 01.04 Dataframe Back Up

In [None]:
# Keep an untouched copy of the freshly loaded frame; the following sections
# reassign and modify `df`.
df_bu = df.copy()

## 02 Data Structuring

### 02.01 Selecting & Dropping Variables

In [None]:
# Columns to remove from the working frame (an empty list keeps everything).
column_list = [] # fill this

df = df.drop(columns = column_list)
df.head(2)

### 02.02 Cleaning Text Object

#### 02.02.01 Convert Header to Proper Text

In [None]:
'''for x in df.columns:
  y = x.title()
  df = df.rename(columns = {x : y}) #'''

#### 02.02.02 Strip Abnormal Spaces

In [None]:
'''for x in df.columns:
  if df[x].dtypes == 'object':
    try:
      df[x] = df[x].str.strip()
    except:
      pass #'''

#### 02.02.03 Convert Object Content to Proper Text

In [None]:
'''for x in df.columns:
  if df[x].dtypes == 'object':
    for a in df[x].unique():
      b = a.title()
      df[x] = df[x].replace(a, b)
  else:
    pass #'''

### 02.03 Converting Data

#### 02.03.01 Convert to date

In [None]:
df.head(2)

In [None]:
# Columns that hold dates stored as text.
column_list = ['Activity Period Start Date'] # fill this

for x in column_list:
  # Report problems instead of hiding them behind a bare `except`: a column
  # name that does not exist in the loaded data would otherwise be skipped
  # without any trace.
  if x not in df.columns:
    print('Skipping', repr(x), ': column not found')
    continue
  try:
    df[x] = pd.to_datetime(df[x])
  except (ValueError, TypeError) as exc:
    print('Skipping', repr(x), ':', exc)

#### 02.03.02 Convert to integer

In [None]:
# Columns to cast to integer (an empty list leaves the frame unchanged).
column_list = [] # fill this

for name in column_list:
  as_int = df[name].astype('int')
  df[name] = as_int

#### 02.03.03 Convert to Object (if necessary)

In [None]:
# Columns to cast to text (an empty list leaves the frame unchanged).
column_list = [] # fill this

for name in column_list:
  as_text = df[name].astype('str')
  df[name] = as_text

#### 02.03.04 Replace 0 with NaN (if necessary)

In [None]:
#df = df.replace(0, np.nan)

#### 02.03.05 Filling NaN with 0

In [None]:
'''for column in df.columns:
  if df[column].dtype == 'float64' or df[column].dtype == 'int64':
    df[column] = df[column].fillna(0)
    print(column)
  else:
    pass #'''

#### 02.03.06 Dropping Nan

In [None]:
#df = df.dropna()

## 03 Data Cleaning

### 03.01 Replacing Variable Component

In [None]:
'''value_dict = {} # fill this

column_name = ''

df[column_name] = df[column_name].replace(value_dict) #'''

### 03.02 Add New Variable

#### 03.02.01 Add by Math Calculation

In [None]:
'''new_var = '' # fill this
obj_var1 = '' # fill this
obj_var2 = '' # fill this

df[new_var] = df[obj_var1] * df [obj_var2] #'''

#### 03.02.02 Add by Replacing "Other"

In [None]:
'''column_name = '' # fill this

df[column_name].value_counts() #'''

In [None]:
'''value_thres = 1300

replace_list = []

df_dict = df[column_name].value_counts().to_dict()
for i in df_dict:
  if df_dict[i] < value_thres:
    replace_list.append(i)

replace_list #'''

In [None]:
'''df[column_name] = df[column_name].copy().replace(to_replace = replace_list, value = 'Other') #'''

### 03.03 Inaccuracies

In [None]:
# Columns in which the text value '0' marks an invalid entry.
column_list = [] # fill this

for x in column_list:
  # Name the skipped column instead of swallowing every error with a bare
  # `except`, which also hid typos in column_list.
  if x not in df.columns:
    print('Skipping', repr(x), ': column not found')
    continue
  df[x] = df[x].replace('0', np.nan)
  # NOTE: dropna() without `subset` removes rows with a missing value in ANY
  # column, not only in the column that was just cleaned.
  df = df.dropna()

### 03.04 Handling Outliers

##### Data Distribution Check (Before)

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.histogram(
  df,
  x = x,
  y = y,
  color = color,
  marginal = 'box',
  hover_data = df.columns
)

fig.show() #'''

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.scatter(
  df,
  x = x,
  y = y,
  color = color,
  marginal_x = 'histogram',
  marginal_y = 'box',
  hover_data = df.columns
)

fig.show() #'''

#### 03.04.01 Using IQR (Interquartile Range)

In [None]:
def iqr_thres(dataframe, column, th1 = 0.25, th3 = 0.75):
  """Return the (lower, upper) outlier limits of a column using the 1.5 * IQR rule.

  Args:
    dataframe: pandas DataFrame holding the data.
    column: name of the numeric column to inspect.
    th1: quantile used as the lower quartile (default 0.25).
    th3: quantile used as the upper quartile (default 0.75).

  Returns:
    Tuple (lower_limit, upper_limit).
  """
  values = dataframe[column]
  q_low, q_high = values.quantile(th1), values.quantile(th3)
  spread = q_high - q_low

  return q_low - 1.5 * spread, q_high + 1.5 * spread

In [None]:
def check_outliers_iqr(dataframe, column, th1 = 0.25, th3 = 0.75):
  """Tell whether `column` holds at least one value outside its IQR limits.

  Args:
    dataframe: pandas DataFrame holding the data.
    column: name of the numeric column to inspect.
    th1: lower quantile forwarded to iqr_thres (default 0.25).
    th3: upper quantile forwarded to iqr_thres (default 0.75).

  Returns:
    True when at least one value lies above the upper or below the lower limit.
  """
  lower_limit, upper_limit = iqr_thres(dataframe, column, th1, th3)
  values = dataframe[column]
  # Test the outlier mask itself. Calling .any() on the filtered ROWS asks
  # whether any remaining cell is truthy, which misses outliers equal to 0
  # when the other cells of those rows are falsy as well.
  return bool(((values > upper_limit) | (values < lower_limit)).any())

In [None]:
def replace_iqr(dataframe, columns, th1 = 0.25, th3 = 0.75, replace = False):
  """Report IQR outliers per column and optionally overwrite them with NaN.

  Only int64 / float64 columns are examined, and a column named 'Outcome' is
  always skipped. A summary table is printed; nothing is returned.

  Args:
    dataframe: pandas DataFrame; modified in place when `replace` is True.
    columns: iterable of column names to examine.
    th1: lower quantile passed to iqr_thres.
    th3: upper quantile passed to iqr_thres.
    replace: when True, values outside the limits are set to NaN.
  """
  data = []

  for column in columns:
    dtype = dataframe[column].dtypes
    if (dtype != 'int64' and dtype != 'float64') or column == 'Outcome':
      continue

    lower_limit, upper_limit = iqr_thres(dataframe, column, th1, th3)
    # Build the mask from the same th1 / th3 limits that are reported, so the
    # detection flag, the count and the table always agree.
    mask = (dataframe[column] > upper_limit) | (dataframe[column] < lower_limit)
    outliers_ = bool(mask.any())
    count = None

    if outliers_:
      count = int(mask.sum())
      if replace:
        # NaN cannot be stored in an integer column; cast explicitly instead
        # of relying on implicit upcasting during the assignment.
        if dtype == 'int64':
          dataframe[column] = dataframe[column].astype('float64')
        # Blank out both tails. Skipping the lower tail whenever the lower
        # limit was negative left negative outliers in place.
        dataframe.loc[mask, column] = np.nan

    # Re-evaluate on the (possibly cleaned) column to report the final status.
    new_lower, new_upper = iqr_thres(dataframe, column, th1, th3)
    remaining = (dataframe[column] > new_upper) | (dataframe[column] < new_lower)
    outliers_status = bool(remaining.any())
    data.append([outliers_, outliers_status, count, column, lower_limit, upper_limit ])

  table = tabulate(data, headers = ['Outliers (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit'], tablefmt = 'rst', numalign = 'right')
  print('Removing Outliers using IQR')
  print(table)

In [None]:
# Columns whose IQR outliers are set to NaN; afterwards every row that
# contains a NaN in any column is dropped.
column_list = [] # fill this

replace_iqr(df, column_list, replace = True)
df = df.dropna()

#### 03.04.02 Using Standard Deviation

In [None]:
def std_thres(dataframe, column):
  """Return the (lower, upper) limits lying three standard deviations from the column mean.

  Args:
    dataframe: pandas DataFrame holding the data.
    column: name of the numeric column to inspect.

  Returns:
    Tuple (lower_limit, upper_limit).
  """
  values = dataframe[column]
  center = values.mean()
  margin = 3 * values.std()

  return center - margin, center + margin

In [None]:
def check_outliers_std(dataframe, column):
  """Tell whether `column` holds at least one value outside mean +/- 3 standard deviations.

  Args:
    dataframe: pandas DataFrame holding the data.
    column: name of the numeric column to inspect.

  Returns:
    True when at least one value lies above the upper or below the lower limit.
  """
  lower_limit, upper_limit = std_thres(dataframe, column)
  values = dataframe[column]
  # Test the outlier mask itself. Calling .any() on the filtered ROWS asks
  # whether any remaining cell is truthy, which misses outliers equal to 0
  # when the other cells of those rows are falsy as well.
  return bool(((values > upper_limit) | (values < lower_limit)).any())

In [None]:
def replace_std(dataframe, columns, replace = False):
  """Report 3-standard-deviation outliers per column and optionally overwrite them with NaN.

  Only int64 / float64 columns are examined, and a column named 'Outcome' is
  always skipped. A summary table is printed; nothing is returned.

  Args:
    dataframe: pandas DataFrame; modified in place when `replace` is True.
    columns: iterable of column names to examine.
    replace: when True, values outside the limits are set to NaN.
  """
  data = []

  for column in columns:
    dtype = dataframe[column].dtypes
    if (dtype != 'int64' and dtype != 'float64') or column == 'Outcome':
      continue

    lower_limit, upper_limit = std_thres(dataframe, column)
    # One mask drives the detection flag, the count and the replacement.
    mask = (dataframe[column] > upper_limit) | (dataframe[column] < lower_limit)
    outliers_ = bool(mask.any())
    count = None

    if outliers_:
      count = int(mask.sum())
      if replace:
        # NaN cannot be stored in an integer column; cast explicitly instead
        # of relying on implicit upcasting during the assignment.
        if dtype == 'int64':
          dataframe[column] = dataframe[column].astype('float64')
        # Blank out both tails. Skipping the lower tail whenever the lower
        # limit was negative left negative outliers in place.
        dataframe.loc[mask, column] = np.nan

    # Re-evaluate on the (possibly cleaned) column to report the final status.
    new_lower, new_upper = std_thres(dataframe, column)
    remaining = (dataframe[column] > new_upper) | (dataframe[column] < new_lower)
    outliers_status = bool(remaining.any())
    data.append([outliers_, outliers_status, count, column, lower_limit, upper_limit])

  table = tabulate(data, headers = ['Outlier (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit'], tablefmt = 'rst', numalign = 'right')
  print('Removing Outliers using 3 Standard Deviation')
  print(table)

In [None]:
# Columns whose 3-standard-deviation outliers are set to NaN; afterwards every
# row that contains a NaN in any column is dropped.
column_list = [] # fill this

replace_std(df, column_list, replace = True)
df = df.dropna()

### 03.05 Handling Missing / Zeros / Null
##### Filling missing values (numerical only) with the median is generally better than using the mean or mode

#### 03.05.01 Detecting Zero Value

In [None]:
# Collect the int64 / float64 columns that contain at least one zero and print
# how many zeros each of them holds.
zero_columns = [] # fill this

for name in df.columns:
  dtype = df[name].dtypes
  if dtype != 'int64' and dtype != 'float64':
    continue
  zero_count = (df[name] == 0).sum()
  if zero_count == 0:
    continue
  print(name, ':', str(zero_count))
  zero_columns.append(name)

zero_columns

#### 03.05.02 Detecting Nan / Non Available Value

In [None]:
# Collect the text (object-dtype) columns that contain missing entries, i.e.
# real NaN / None values or the '-' placeholder.
nan_columns = [] # fill this

for x in df.columns:
  if df[x].dtypes == 'object':
    # NaN never compares equal to anything (not even itself), so `== np.nan`
    # always counts zero; isna() is the reliable test.
    missing_count = df[x].isna().sum() + (df[x] == '-').sum()
    if missing_count != 0:
      print(x, ':', str(missing_count))
      nan_columns.append(x)

nan_columns

#### 03.05.03 Replacing Zero with Mean (for numerical value if median value == 0), if necessary

In [None]:
'''column_list = [] # fill this

df = df.replace(0, np.nan)

for x in column_list:
  df[x] = df[x].fillna(df[x].mean()) #'''

#### 03.05.04 Replacing Zero with Median (for numerical value if median value != 0), if necessary

In [None]:
'''column_list = [] # fill this

df = df.replace(0, np.nan)

for x in column_list:
  df[x] = df[x].fillna(df[x].median()) #'''

#### 03.05.05 Replacing Zero with Mode (for categorical / object value), if necessary

In [None]:
'''column_list = [] # fill this

df = df.replace(0, np.nan)

for x in column_list:
  # mode() returns a Series; passing it to fillna aligns on the index and only
  # fills rows whose label matches, so use its first entry as a scalar instead
  modes = df[x].mode()
  if not modes.empty:
    df[x] = df[x].fillna(modes.iloc[0]) #'''

### 03.06 Handling Incomplete Data

### 03.07 Handling Data Biases

### 03.08 Handling Duplicates

In [None]:
#df = df.drop_duplicates()

##### Data Distribution Check (After)

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.histogram(
  df,
  x = x,
  y = y,
  color = color,
  marginal = 'box',
  hover_data = df.columns
)

fig.show() #'''

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.scatter(
  df,
  x = x,
  y = y,
  color = color,
  marginal_x = 'histogram',
  marginal_y = 'box',
  hover_data = df.columns
)

fig.show() #'''

## 04 Enriching Data
#### take other dataset, inside or outside from related dataset / business

In [None]:
'''location = 'Country' # replace this

a = df[location].unique()
b = gdf[location].unique()

for i in a:
  if i not in b:
    print(i)

print(50 * '=')

for i in b:
  if i not in a:
    print(i) #'''

In [None]:
# Mapping of location names to their replacements (empty = no change).
value_dict = {}

# `location` is only assigned inside the disabled comparison cell of this
# section, so on a fresh kernel it does not exist. Handle that case, and a
# missing column, explicitly rather than hiding every error behind a bare
# `except`.
try:
  df[location] = df[location].replace(value_dict)
except NameError:
  print('Skipping location clean-up: `location` is not defined')
except KeyError as exc:
  print('Skipping location clean-up: column not found', exc)

## 05 Data Validation
#### Verifying consistency, quality, and security of data

## 06 Exploratory Data Analysis (Univariate)

In [None]:
'''data_profile = ProfileReport(
  df,
  correlations = {
    'pearson' : {'calculate' : True},
    'spearman' : {'calculate' : True},
    'kendall' : {'calculate' : True},
    'phi_k' : {'calculate' : True},
    'cramers': {'calculate' : True},
  },
)

data_profile #'''

## 07 Select Variable X & Y | Splitting Data

#### 07.01 Data Balancing

In [None]:
'''# Check Before Data Balancing
y_var = 'HeartDisease'

fig, ax = plt.subplots(figsize = (5, 5))
sizes = [count for count in df[y_var].value_counts()]
labels = list(df[y_var].value_counts().index)

ax.pie(
  x = sizes,
  labels = labels,
  autopct = '%1.1f%%',
)
plt.show() #'''

In [None]:
'''n = 20000
append_data = []

for i in df[y_var].unique():
  df_x = df[df[y_var] == i][:n]
  append_data.append(df_x)

append_df = pd.concat(append_data)
append_df.shape #'''

In [None]:
'''# Check After Data Balancing

fig, ax = plt.subplots(figsize = (5, 5))
sizes = [count for count in append_df[y_var].value_counts()]
labels = list(append_df[y_var].value_counts().index)

ax.pie(
  x = sizes,
  labels = labels,
  autopct = '%1.1f%%',
)
plt.show() #'''

In [None]:
#df = append_df

### 07.02 Label Encoding / One Hot Encoding

#### 07.02.01 First Method

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# Target column and its label -> integer id mapping; ids follow the order in
# which the labels first appear in the column.
y_var = 'diseases'

class_dict = {label : idx for idx, label in enumerate(df[y_var].unique())}

class_dict

In [None]:
# Inverse lookup: integer id -> original class label.
rev_class_dict = dict(zip(class_dict.values(), class_dict.keys()))

In [None]:
# Replace every class label in the target column with its integer id from class_dict.
df[y_var] = df[y_var].replace(class_dict)

#### 07.02.02 Second Method

In [None]:
# Columns to label-encode; each is cast to text first and its distinct values
# are printed.
columns = [] # fill selected columns

for name in columns:
  as_text = df[name].astype('str')
  df[name] = as_text
  print(as_text.unique())

In [None]:
# Fit one LabelEncoder per selected column and keep it so the codes can be
# mapped back to the original values later.
label_encoders = {}

for name in columns:
  encoder = LabelEncoder()
  df[name] = encoder.fit_transform(df[name])
  label_encoders[name] = encoder

# List the integer code assigned to each original value.
for name in columns:
  print(name)
  print('-' * 50)
  for code, label in enumerate(label_encoders[name].classes_):
    print(str(code), ':', label)
  print('=' * 50)

### 07.03 Define X & Y variables

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# Target column; every other column is used as a feature.
y_var = 'diseases'

# Exclude the target by name rather than by position, so the selection stays
# correct even when the target is not the first column of the frame.
#x_var = []
x_var = df.columns.drop(y_var)

x = df[x_var]
y = df[y_var]

x.head(2)

In [None]:
# Convert the feature frame and the target series to NumPy arrays.
x, y = np.array(x), np.array(y)

### 07.04 Split Data

In [None]:
# Scaler used by the next cell.
sc = StandardScaler()

# Hold out 30 % of the rows, then divide that hold-out 60 / 40 into the test
# and validation sets.
x_train, x_tosplit, y_train, y_tosplit = train_test_split(x, y, test_size = 0.3, random_state = 42)
x_test, x_val, y_test, y_val = train_test_split(x_tosplit, y_tosplit, test_size = 0.4, random_state = 42)

for label, part in (('x_train', x_train), ('x_test', x_test), ('x_val', x_val)):
  print(label + ' shape :', part.shape)
print('=' * 50)
for label, part in (('y_train', y_train), ('y_test', y_test), ('y_val', y_val)):
  print(label + ' shape :', part.shape)

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# scaling to the test and validation splits.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [None]:
# Number of distinct target values; used as the width of the softmax output layer.
y_classes = len(df[y_var].unique())
y_classes

## 08 Classification Model

### 08.01 Building Model

In [None]:
# Variant 1: Embedding -> Conv1D -> Flatten -> BatchNorm/ReLU -> softmax.
# `inp` and `x` are reassigned by the later model cells of this section, so
# only the last variant that was executed is wrapped into the Model.
# Note that `x` previously held the feature array and is reused here for
# Keras tensors.
inp = layers.Input(shape = (x_train.shape[1],))

# NOTE(review): Embedding looks up integer indices in [0, input_dim), while
# x_train is StandardScaler output (floats, possibly negative) — confirm this
# variant is meant for such input before training with it.
x = layers.Embedding(input_dim = 1024, output_dim = 64)(inp)
x = layers.Conv1D(128, 5, activation = 'relu')(x)
#x = layers.Dense(units = 1024, activation = 'relu')(x)
#x = residual_block(x, 1024)
#x = layers.Bidirectional(layers.LSTM(64,  return_sequences = True))(x)
#x = layers.Bidirectional(layers.LSTM(32))(x)
x = layers.Flatten()(x)
#x = layers.Dense(units = 1024, activation = 'relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
#x = layers.Dropout(0.05)(x)
# Output layer: one softmax unit per target class, named 'fc10'.
x = layers.Dense(units = y_classes, activation = 'softmax', name = 'fc' + str(10))(x)

In [None]:
def residual_block(inputs, units):
  """Dense -> BatchNorm -> ReLU branch added back onto its input (skip connection).

  Args:
    inputs: Keras tensor whose last axis is the feature axis.
    units: width of the Dense layer, and therefore of the block output.

  Returns:
    Keras tensor with `units` features on the last axis.
  """
  x = layers.Dense(units)(inputs)
  x = layers.BatchNormalization()(x)
  x = layers.Activation('relu')(x)

  # Add() needs both operands to have the same shape. When the incoming width
  # differs from `units`, project the shortcut with a linear Dense layer;
  # otherwise the input is added unchanged, as before.
  shortcut = inputs
  if inputs.shape[-1] != units:
    shortcut = layers.Dense(units)(inputs)

  x = layers.Add()([shortcut, x])
  x = layers.Activation('relu')(x)
  return x

In [None]:
def inception_block(inputs, filters):
    """Apply four parallel branches to `inputs` and concatenate them on the last axis.

    The branches are Conv1D layers with kernel sizes 1, 3 and 5, plus a
    max-pooling branch followed by a kernel-size-1 Conv1D. Every branch uses
    'same' padding, and every convolution uses `filters` filters with ReLU.

    Args:
        inputs: Keras tensor accepted by Conv1D / MaxPooling1D.
        filters: number of filters of each convolution.

    Returns:
        Keras tensor holding the concatenated branch outputs.
    """
    def conv(kernel_size, tensor):
        # Shared Conv1D configuration of all branches.
        return layers.Conv1D(filters, kernel_size = kernel_size, padding = 'same', activation = 'relu')(tensor)

    branches = [conv(size, inputs) for size in (1, 3, 5)]
    pooled = layers.MaxPooling1D(pool_size = 3, strides = 1, padding = 'same')(inputs)
    branches.append(conv(1, pooled))
    return layers.concatenate(branches, axis = -1)

In [None]:
# Variant 2: reshape each row to (features, 1), run one inception block, then
# Flatten -> Dense(1024) -> softmax. `inp` and `x` are reassigned by the next
# model cell, so this variant is only used if that cell is not executed.
inp = layers.Input(shape = (x_train.shape[1],))

# Add a trailing channel axis so the Conv1D layers can slide over the features.
x = layers.Reshape((x_train.shape[1], 1))(inp)
#x = inception_block(x, 16)
x = inception_block(x, 32)
#x = inception_block(x, 64)
#x = inception_block(x, 128)
#x = inception_block(x, 64)
#x = inception_block(x, 32)
#x = inception_block(x, 16)
x = layers.Flatten()(x)
x = layers.Dense(units = 1024, activation = 'relu')(x)
# Output layer: one softmax unit per target class, named 'fc10'.
x = layers.Dense(units = y_classes, activation = 'softmax', name = 'fc' + str(10))(x) #'''

In [None]:
# Variant 3: Dense -> residual block -> Dense / BatchNorm / ReLU -> softmax.
inp = layers.Input(shape = (x_train.shape[1],))

hidden = layers.Dense(units = 1024, activation = 'relu')(inp)
hidden = residual_block(hidden, 1024)
hidden = layers.Dense(units = 1024, activation = 'relu')(hidden)
hidden = layers.BatchNormalization()(hidden)
hidden = layers.Activation('relu')(hidden)

# One softmax unit per target class; `x` is the tensor the Model cell uses as
# its output.
x = layers.Dense(units = y_classes, activation = 'softmax', name = 'fc10')(hidden)

In [None]:
# Wrap the most recently defined input / output tensors into a Keras Model and
# print its layer summary.
model = Model(inp, x, name = 'Model_X')
model.summary()

In [None]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer = 'adam', loss = losses.sparse_categorical_crossentropy, metrics = ['accuracy'])

### 08.02 Set Up Threshold

In [None]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
  """Stop training once training and validation accuracy both reach `threshold`."""

  def __init__(self, threshold):
    """Store the accuracy level (same scale as the logged metric) that ends training."""
    super().__init__()
    self.threshold = threshold

  def on_epoch_end(self, epoch, logs=None):
    """Request a stop when both 'accuracy' and 'val_accuracy' are >= threshold."""
    # `logs` defaults to None and carries no 'val_accuracy' entry when fit()
    # runs without validation data; skip the check instead of raising.
    logs = logs or {}
    val_acc = logs.get('val_accuracy')
    acc = logs.get('accuracy')
    if val_acc is None or acc is None:
      return
    if val_acc >= self.threshold and acc >= self.threshold:
      self.model.stop_training = True

In [None]:
# Threshold callback instance configured with threshold 0.98.
es_callback = MyThresholdCallback(threshold = 0.98)

### 08.03 Model Training (Data Validation Included)

In [None]:
# Train with the validation split monitored after every epoch.
# Keras documents `callbacks` as a list of Callback instances, so the single
# callback is wrapped in a list.
history = model.fit(
  x_train,
  y_train,
  batch_size = 512,
  epochs = 5,
  validation_data = (x_val, y_val),
  callbacks = [es_callback]
)

In [None]:
# Training curves: accuracy and loss per epoch for the training and validation data.
val_loss = history.history['val_loss']
loss = history.history['loss']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

epochs = range(1, len(acc)+1, 1)

# Each chart gets its own figure, created BEFORE plotting. Calling plt.figure()
# after the plot calls left an extra empty figure at the end of the output.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r--', label = 'Training Accuracy')
ax.plot(epochs, val_acc, 'b', label = 'Validation Accuracy')
ax.set_title('Training and Validation Accuracy')
ax.set_ylabel('acc')
ax.set_xlabel('epochs')
ax.legend()

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'r--', label = 'Training Loss')
ax.plot(epochs, val_loss, 'b', label = 'Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epochs')
ax.legend()

plt.show()

### 08.04 Model Testing

In [None]:
model.evaluate(x_test, y_test)

### 08.05 Checking The Result

#### 08.05.01 Classification Report

In [None]:
# Predicted class = index of the largest softmax output for each test row.
class_probabilities = model.predict(x_test)
y_predicted = np.argmax(class_probabilities, axis = -1)
cr = classification_report(y_test, y_predicted)

print('Classification Report\n', cr)

#### 08.05.02 Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions (true labels first, predictions second).
cm = confusion_matrix(y_test, y_predicted)
cm

In [None]:
'''print('Confusion Matrix')
plt.figure(figsize = (5, 3))
sns.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show() #'''

#### 08.05.03 Prediction Table

In [None]:
# Per-sample comparison table for the first rows of the test set.
# One batched predict() call replaces a separate predict() call per row, and
# the sample size is capped so a test set with fewer than 100 rows no longer
# raises IndexError.
sample_size = min(100, x_test.shape[0])

probabilities = model.predict(x_test[:sample_size])

test_label = y_test[:sample_size].tolist()
# Predicted class = position of the highest probability; score = that probability.
predict_label = np.argmax(probabilities, axis = -1).tolist()
prediction_score = probabilities.max(axis = -1).tolist()

# Translate the integer ids back to class names.
label_name = [rev_class_dict[i] for i in test_label]
predict_name = [rev_class_dict[i] for i in predict_label]

prediction_result = pd.DataFrame({
  'label_num' : test_label,
  'label_name' : label_name,
  'predict_num' : predict_label,
  'predict_name' : predict_name,
  'predict_score' : prediction_score,
})

In [None]:
# Flag the rows where the predicted class id equals the true class id.
is_correct = prediction_result['label_num'].eq(prediction_result['predict_num'])
prediction_result['pred_check'] = is_correct
prediction_result.head()

#### 08.05.04 F1 Score

In [None]:
# Per-class F1 on the sampled predictions.
# The score is computed on class NAMES, so `labels` must hold names too: the
# distinct classes that occur in the table. df[y_var] holds the encoded
# integer id of every row (with repeats) and cannot be mixed with name strings.
labels = sorted(set(prediction_result['label_name']) | set(prediction_result['predict_name']))

# zero_division = 0 scores classes without true or predicted samples as 0
# without emitting a warning for each of them.
f1_scores = f1_score(prediction_result['label_name'], prediction_result['predict_name'], average = None, labels = labels, zero_division = 0)
f1_dict = {label : score for label, score in zip(labels, f1_scores)}

f1_df = pd.DataFrame(list(f1_dict.items()), columns = ['class', 'f1_score'])
f1_df = f1_df.sort_values(by = 'f1_score', ascending = False).reset_index(drop = True)
f1_df.head()

In [None]:
# Horizontal bar chart of the F1 score per class (classes on the y axis).
plt.figure(figsize = (15, 20), dpi = 100)
sns.barplot(x = f1_df['f1_score'], y = f1_df['class'])

#### 08.05.05 False Prediction

In [None]:
# Wrong predictions only, most confident mistakes first.
wrong_rows = prediction_result[prediction_result['pred_check'] == False]
false_pred = wrong_rows.sort_values('predict_score', ascending = False)
false_pred.head()

In [None]:
# false_pred is derived from prediction_result, which covers only a sample of
# the test set; the rate therefore has to be taken over that sample, not over
# all rows of x_test.
false = round(false_pred.shape[0] / prediction_result.shape[0] * 100, 2)
print('Amount of False Prediction :', false, '%')

## 09 Regression Model

### 09.01 Building Model

In [None]:
# Regression network: a stack of Dense -> BatchNorm -> ReLU blocks followed by
# a single linear output unit.
inp = layers.Input(shape = (x_train.shape[1],))

# Each block must consume the output of the previous one. Feeding `inp` to
# every Dense layer left all blocks but the last one disconnected from the
# output, so the model effectively had a single hidden block.
x = inp
for units in (128, 256, 512, 1024, 512):
  x = layers.Dense(units = units, kernel_initializer = 'normal', activation = 'relu')(x)
  x = layers.BatchNormalization()(x)
  x = layers.Activation('relu')(x)

x = layers.Dense(units = 1, kernel_initializer = 'normal', activation = 'linear')(x)

In [None]:
# Wrap the regression input / output tensors into a Keras Model and print its
# layer summary.
model = Model(inp, x, name = 'Model_X')
model.summary()

In [None]:
# Loss / metric name, and the matching key of its validation counterpart in
# the training history.
error = 'mse'
#error = 'mean_absolute_error'
error_val = 'val_' + error

# The threshold callback of this section reads 'mean_absolute_error' and
# 'val_mean_absolute_error' from the epoch logs, so that metric has to be
# tracked in addition to `error`. dict.fromkeys removes the duplicate when
# both names are the same.
model.compile(
  optimizer = 'adam',
  loss = error,
  metrics = list(dict.fromkeys([error, 'mean_absolute_error']))
)

### 09.02 Set Up Threshold

In [None]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
  """Stop training once training and validation error both drop to `threshold`.

  This definition replaces the accuracy-based class of the same name declared
  in the classification section.
  """

  def __init__(self, threshold, metric = 'mean_absolute_error'):
    """Store the error level that ends training and the logged metric to watch."""
    super().__init__()
    self.threshold = threshold
    self.metric = metric
    self._warned = False

  def on_epoch_end(self, epoch, logs=None):
    """Request a stop when the metric and its 'val_' counterpart are <= threshold."""
    logs = logs or {}
    val_acc = logs.get('val_' + self.metric)
    acc = logs.get(self.metric)
    # The watched metric is only logged when the model was compiled with it
    # and validation data was supplied. Report that once instead of raising
    # KeyError at the end of the first epoch.
    if val_acc is None or acc is None:
      if not self._warned:
        print('MyThresholdCallback: metric', repr(self.metric), 'not found in logs; threshold check skipped')
        self._warned = True
      return
    if val_acc <= self.threshold and acc <= self.threshold:
      self.model.stop_training = True

In [None]:
# Threshold callback instance configured with threshold 2500.
es_callback = MyThresholdCallback(threshold = 2500)

### 09.03 Model Training (Data Validation Included)

In [None]:
# Train with the validation split monitored after every epoch.
# Keras documents `callbacks` as a list of Callback instances, so the single
# callback is wrapped in a list.
history = model.fit(
  x_train,
  y_train,
  batch_size = 512,
  epochs = 5,
  validation_data = (x_val, y_val),
  callbacks = [es_callback]
)

In [None]:
# Training curves: tracked error metric and loss per epoch for the training
# and validation data.
val_loss = history.history['val_loss']
loss = history.history['loss']
acc = history.history[error]
val_acc = history.history[error_val]

epochs = range(1, len(acc)+1, 1)

# Each chart gets its own figure, created BEFORE plotting. Calling plt.figure()
# after the plot calls left an extra empty figure at the end of the output.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r--', label = 'Training Error')
ax.plot(epochs, val_acc, 'b', label = 'Validation Error')
ax.set_title('Training and Validation Error')
ax.set_ylabel(error)
ax.set_xlabel('epochs')
ax.legend()

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'r--', label = 'Training Loss')
ax.plot(epochs, val_loss, 'b', label = 'Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epochs')
ax.legend()

plt.show()

### 09.04 Model Testing

In [None]:
model.evaluate(x_test, y_test)

### 09.05 Checking The Result
#### Prediction Table

In [None]:
# Model output for every test row; the table cell below reads column 0 of each row.
y_predicted = model.predict(x_test)

In [None]:
# Per-sample comparison of true and predicted values for the first rows of the
# test set. The sample size is capped so a test set with fewer than 100 rows
# no longer raises IndexError.
sample_size = min(100, len(y_test))

true_y = np.asarray(y_test[:sample_size])
predicted_y = np.asarray(y_predicted[:sample_size])[:, 0]

# For a single row sqrt(mean((t - p) ** 2)) is simply |t - p|, so the column is
# an absolute error and is named accordingly instead of after `error`.
errors = np.abs(true_y - predicted_y)

prediction_result = pd.DataFrame({
  'true_y' : true_y,
  'predicted_y' : predicted_y,
  'abs_error' : errors,
})

# Best predictions first, worst last.
prediction_result = prediction_result.sort_values(by = 'abs_error', ascending = True).reset_index(drop = True)
display(prediction_result.head())
display(prediction_result.tail())

## 10 Saving Trained Model

In [None]:
'''filename = str(dt.datetime.today().strftime('%Y%m%d_%H%M'))
save_dir = '/content/' + filename

# Write the architecture JSON unchanged: prefixing the payload with '/content/'
# would make the saved file invalid JSON
model_json = model.to_json()
with open (save_dir + '.json', 'w') as json_file:
  json_file.write(model_json)

model.save(save_dir + '.h5') #'''