## Init libraries

In [2]:
from collections import defaultdict
from pathlib import Path
import pandas as pd
import numpy as np

import scipy.stats as distributions
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

from experiment import run_experiment, load_experiment,\
    plot_ranked_with_overfitting, plot_scores_splits, plot_predictions,\
    compare_models, build_compare_models_result,\
    plot_features_importance

import utils
import settings

# Single seed forwarded as `random_state` to the run_experiment(...) calls below.
global_random_state = 2020

## Preprocessing without data augmentation

In [None]:
dataset_filename = R"data/Dataset 17-02-20.csv"
df = pd.read_csv(dataset_filename, delimiter=";")

# Textual form of a float NaN ('nan'): what str() yields for a missing cell.
NAN_TEXT = str(float("nan"))


def _clean_cell(value):
    """Normalise one raw CSV cell.

    Strips the '<' sign, turns the decimal comma into a decimal point and maps
    the textual NaN back to a real ``np.nan``. Every non-missing cell is
    returned as ``str``.
    """
    text = str(value).replace('<', '').replace(',', '.')
    return np.nan if text == NAN_TEXT else text


# One sweep over the frame instead of three separate applymap passes.
# NOTE: pandas >= 2.1 deprecates DataFrame.applymap in favour of DataFrame.map.
df = df.applymap(_clean_cell)

df = df.drop(settings.drop_list, axis=1)
df = utils.encode_categorical(df, settings.categorical_list)

# Derive the two new columns:
#   'Equivalent age' - 'A AE G' when present, otherwise 'Tot AE W', otherwise NaN
#   'Ratio age'      - that value divided by 'Age (G+W)'
equivalent_ages = []
age_ratios = []
# Columns are read by label: positional `row[0]` on a label-indexed Series
# (as produced by iterrows) is deprecated in recent pandas.
for age_g, age_w, age_total in zip(df['A AE G'], df['Tot AE W'], df['Age (G+W)']):
    if str(age_g) != NAN_TEXT:
        source = age_g
    elif str(age_w) != NAN_TEXT:
        source = age_w
    else:
        # Neither measurement is available for this row.
        equivalent_ages.append(np.nan)
        age_ratios.append(np.nan)
        continue
    equivalent_ages.append(source)
    age_ratios.append(float(source) / float(age_total))
df['Equivalent age'] = equivalent_ages
df['Ratio age'] = age_ratios

# Drop the whole label range 'A AE G' .. 'Subtest AE  W DI' (both ends inclusive).
df = df.drop(columns=df.loc[:, 'A AE G':'Subtest AE  W DI'].columns)

# Keep only rows with a strictly positive equivalent age; rows where it is
# NaN fail the comparison and are dropped as well.
df = df[pd.to_numeric(df['Equivalent age']) > 0]

# Renumber the rows 0..n-1 without keeping the old index as a column.
df = df.reset_index(drop=True)

df = utils.treat_nan(df)

# Data augmentation is disabled for this run ("without data augmentation").
# df = utils.data_augmentation(df, 'Ratio age')

# Persist the preprocessed frame. The next cell reads
# 'data/aug_downsyndrom.csv', so utils.save presumably appends '.csv' - verify.
utils.save(df, './data/aug_downsyndrom')

# dividing training observation and target labels
X = df.loc[:, df.columns != 'Ratio age']
y = df['Ratio age']

# Age cancellation analysis (without data augmentation)

In [3]:
# Location of the preprocessed dataset read by this cell.
aug_dataset = Path(R"data/aug_downsyndrom.csv")

# Fail fast when the file is missing instead of nesting the happy path.
if not aug_dataset.exists():
    raise RuntimeError("Run the cell before.")

# Semicolon-separated CSV; this frame is the input of every experiment below.
df_to_transform = pd.read_csv(aug_dataset, delimiter=";")

In [4]:
# ---- Output location ----
# Root folder under which each experiment is stored.
root = "results"

# ---- Defaults shared by the experiment cells ----
mode = "ratio"
scoring = "neg_mean_squared_error"
input_attribute = "Age (G+W)"
cv = 5
n_iter = 5

# ---- Attributes considered important ----
attributes_to_highlight = [
    "Immunoglobuline A (mg/dl)",
    "Homocysteine (µmol/L)",
    "Creatinina (mg/dl)",
    "CD19+ (PANB) /mmc",
    "Sphincter Control at month",
    "alanine",
    "Eritrociti (10^6/mmc)",
    "Started Walking at month",
    "Tireotropina (microIU/ml)",
    "Apgar Score 1'",
    "Height cm",
    "Weight Kg",
]

# ---- Experiment registry ----
# model name -> list of `changes` lists, one entry per experiment that was run;
# later handed to build_compare_models_result.
groups_created = defaultdict(list)

# Experiments
- RandomForest
- ExtraTrees
- NeuralNetwork

## Random forest

### Baseline

In [4]:
# Experiment identity: model family and the variant label.
name = "RandomForest"
changes = ["baseline"]

base_estimator = RandomForestRegressor(n_jobs=-1)

# Baseline search space. scipy's randint(low, high) draws integers in [low, high).
params_distribution = {
    "max_depth": distributions.randint(10, 20),
    "n_estimators": distributions.randint(30, 50),
    "min_samples_leaf": distributions.randint(20, 30),
    "min_samples_split": distributions.randint(20, 30),
}

# Keyword arguments forwarded verbatim to run_experiment.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": n_iter,
    "cv": cv,
    "shuffle": True,
    "scoring": scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)

[INFO] experiment: Experiment RandomForest ( results\RandomForest\baseline )
Changes:
	- baseline            
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	RandomForestRegressor(n_jobs=-1)
	- params_distribution :	{'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960941F4F0>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960EACFD00>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960E9A0FA0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960EACF0D0>}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_squared_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transformer...
[INFO] experiment: Savin

  0%|          | 0/200 [00:00<?, ?attribute/s]

### Changing parameters
- Decreased max depth
- Decreased number of decision trees

In [5]:
# Experiment identity: model family and the hyper-parameters that were changed.
name = "RandomForest"
changes = ["max_depth", "n_estimators"]

base_estimator = RandomForestRegressor(n_jobs=-1)

# Reduced search space: shallower trees and fewer estimators than the baseline.
# scipy's randint(low, high) draws integers in [low, high).
params_distribution = {
    "max_depth": distributions.randint(5, 10),
    "n_estimators": distributions.randint(20, 30),
    "min_samples_leaf": distributions.randint(20, 30),
    "min_samples_split": distributions.randint(20, 30),
}

# Keyword arguments forwarded verbatim to run_experiment.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": n_iter,
    "cv": cv,
    "shuffle": True,
    "scoring": scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)

[INFO] experiment: Experiment RandomForest ( results\RandomForest\max_depth_n_estimators )
Changes:
	- max_depth           
	- n_estimators        
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	RandomForestRegressor(n_jobs=-1)
	- params_distribution :	{'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961B577D90>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960E9DBEE0>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961B70C880>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960EA2C310>}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_squared_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving t

  0%|          | 0/200 [00:00<?, ?attribute/s]

### Changing score function
- Using MAE with Baseline

In [6]:
# Experiment identity: model family and the variant label.
name = "RandomForest"
changes = ["mae"]

# This variant is scored with MAE instead of the notebook-wide default `scoring`.
internal_scoring = "neg_mean_absolute_error"

base_estimator = RandomForestRegressor(n_jobs=-1)

# NOTE(review): these are the reduced ranges (max_depth 5-10, n_estimators
# 20-30), not the baseline ranges (10-20 / 30-50) - confirm this is intended.
# scipy's randint(low, high) draws integers in [low, high).
params_distribution = {
    "max_depth": distributions.randint(5, 10),
    "n_estimators": distributions.randint(20, 30),
    "min_samples_leaf": distributions.randint(20, 30),
    "min_samples_split": distributions.randint(20, 30),
}

# Keyword arguments forwarded verbatim to run_experiment.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": n_iter,
    "cv": cv,
    "shuffle": True,
    "scoring": internal_scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)


[INFO] experiment: Experiment RandomForest ( results\RandomForest\mae )
Changes:
	- mae                 
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	RandomForestRegressor(n_jobs=-1)
	- params_distribution :	{'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961B8B4A90>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019618889130>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961B8B4580>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019616815B80>}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_absolute_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transformer...
[INFO] experiment: Saving co

  0%|          | 0/200 [00:00<?, ?attribute/s]

## Extra trees

### Baseline

In [7]:
# Experiment identity: model family and the variant label.
name = "ExtraTrees"
changes = ["baseline"]

base_estimator = ExtraTreesRegressor(n_jobs=-1)

# Baseline search space. scipy's randint(low, high) draws integers in [low, high).
params_distribution = {
    "max_depth": distributions.randint(10, 20),
    "n_estimators": distributions.randint(30, 50),
    "min_samples_leaf": distributions.randint(20, 30),
    "min_samples_split": distributions.randint(20, 30),
}

# Keyword arguments forwarded verbatim to run_experiment.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": n_iter,
    "cv": cv,
    "shuffle": True,
    "scoring": scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)


[INFO] experiment: Experiment ExtraTrees ( results\ExtraTrees\baseline )
Changes:
	- baseline            
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	ExtraTreesRegressor(n_jobs=-1)
	- params_distribution :	{'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000196131E8430>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961D922850>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000196131E8250>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961D922C40>}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_squared_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transformer...
[INFO] experiment: Saving conf

  0%|          | 0/200 [00:00<?, ?attribute/s]

### Changing parameters
- Decreased max depth
- Decreased number of decision trees

In [8]:
# Experiment identity: model family and the hyper-parameters that were changed.
name = "ExtraTrees"
changes = ["max_depth", "n_estimators"]

base_estimator = ExtraTreesRegressor(n_jobs=-1)

# Reduced search space: shallower trees and fewer estimators than the baseline.
# scipy's randint(low, high) draws integers in [low, high).
params_distribution = {
    "max_depth": distributions.randint(5, 10),
    "n_estimators": distributions.randint(20, 30),
    "min_samples_leaf": distributions.randint(20, 30),
    "min_samples_split": distributions.randint(20, 30),
}

# Keyword arguments forwarded verbatim to run_experiment.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": n_iter,
    "cv": cv,
    "shuffle": True,
    "scoring": scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)

[INFO] experiment: Experiment ExtraTrees ( results\ExtraTrees\max_depth_n_estimators )
Changes:
	- max_depth           
	- n_estimators        
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	ExtraTreesRegressor(n_jobs=-1)
	- params_distribution :	{'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961695C310>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000196154F2F10>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961695C970>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000196131DE940>}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_squared_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transfo

  0%|          | 0/200 [00:00<?, ?attribute/s]

### Changing score function
- Using MAE

In [9]:
internal_scoring = "neg_mean_absolute_error"

base_estimator = ExtraTreesRegressor(n_jobs=-1)
params_distribution = dict(max_depth=distributions.randint(5,10),
                               n_estimators=distributions.randint(20,30),
                               min_samples_leaf=distributions.randint(20,30),
                               min_samples_split=distributions.randint(20,30))


config = dict(
    mode=mode,
    input_attribute=input_attribute,
    base_estimator=base_estimator,
    params_distribution=params_distribution,
    n_iter=n_iter,
    cv=cv,
    shuffle=True,
    scoring=internal_scoring,
    random_state=global_random_state,
    n_jobs=-1
)
name = "ExtraTrees"
changes = ["mae"]

run_experiment(root, name, changes, df_to_transform, **config)
groups_created[name].append(changes)

[INFO] experiment: Experiment ExtraTrees ( results\ExtraTrees\mae )
Changes:
	- mae                 
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	ExtraTreesRegressor(n_jobs=-1)
	- params_distribution :	{'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961DA49A60>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001960F2E0E50>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001961DA49130>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019615C84FD0>}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_absolute_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transformer...
[INFO] experiment: Saving configur

  0%|          | 0/200 [00:00<?, ?attribute/s]

## Neural network

### One hidden layer

In [10]:
# Experiment identity: model family and the variant label.
name = "NeuralNetwork"
changes = ["baseline"]

# No standardization applied: the estimator is used bare, without a scaler in front.
base_estimator = MLPRegressor(
    early_stopping=True,
    max_iter=5000,
    learning_rate_init=1e-3,
)

# One hidden layer whose width is an integer drawn from [30, 60);
# solver and activation are picked from the listed options.
params_distribution = {
    "hidden_layer_sizes": distributions.randint(30, 60),
    "solver": ["adam", "sgd"],
    "activation": ["relu", "tanh", "logistic"],
}

# Keyword arguments forwarded verbatim to run_experiment.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": n_iter,
    "cv": cv,
    "shuffle": True,
    "scoring": scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)

[INFO] experiment: Experiment NeuralNetwork ( results\NeuralNetwork\baseline )
Changes:
	- baseline            
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	MLPRegressor(early_stopping=True, max_iter=5000)
	- params_distribution :	{'hidden_layer_sizes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019614DE6AC0>, 'solver': ['adam', 'sgd'], 'activation': ['relu', 'tanh', 'logistic']}
	- n_iter              :	5
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_squared_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transformer...
[INFO] experiment: Saving configuration...
[INFO] experiment: Experiment complete.


  0%|          | 0/200 [00:00<?, ?attribute/s]

In [11]:
# Experiment identity: model family and the hyper-parameter that was changed.
name = "NeuralNetwork"
changes = ["hidden_layer_sizes"]

# No standardization applied: the estimator is used bare, without a scaler in front.
base_estimator = MLPRegressor(
    early_stopping=True,
    max_iter=5000,
    learning_rate_init=1e-3,
)

# Fixed architecture of two hidden layers with 10 units each, SGD only;
# just the activation function varies.
params_distribution = {
    "hidden_layer_sizes": [(10, 10)],
    "solver": ["sgd"],
    "activation": ["relu", "logistic"],
}

# Keyword arguments forwarded verbatim to run_experiment.
# NOTE(review): the space above has only 1 x 1 x 2 = 2 distinct combinations,
# fewer than n_iter=4. If run_experiment relies on scikit-learn's randomized
# search, only 2 candidates can be evaluated - confirm.
config = {
    "mode": mode,
    "input_attribute": input_attribute,
    "base_estimator": base_estimator,
    "params_distribution": params_distribution,
    "n_iter": 4,
    "cv": cv,
    "shuffle": True,
    "scoring": scoring,
    "random_state": global_random_state,
    "n_jobs": -1,
}

run_experiment(root, name, changes, df_to_transform, **config)

# Register the variant so build_compare_models_result(root, groups_created) sees it.
groups_created[name].append(changes)

[INFO] experiment: Experiment NeuralNetwork ( results\NeuralNetwork\hidden_layer_sizes )
Changes:
	- hidden_layer_sizes  
Configuration:	
	- mode                :	ratio
	- input_attribute     :	Age (G+W)
	- base_estimator      :	MLPRegressor(early_stopping=True, max_iter=5000)
	- params_distribution :	{'hidden_layer_sizes': [(10, 10)], 'solver': ['sgd'], 'activation': ['relu', 'logistic']}
	- n_iter              :	4
	- cv                  :	5
	- shuffle             :	True
	- scoring             :	neg_mean_squared_error
	- random_state        :	2020
	- n_jobs              :	-1
[INFO] experiment: Performing attribute cancellation...
[INFO] experiment: Saving dataframe...
[INFO] experiment: Saving transformer...
[INFO] experiment: Saving configuration...
[INFO] experiment: Experiment complete.


  0%|          | 0/200 [00:00<?, ?attribute/s]



Build results for all models

In [12]:
# Print the {model name: [changes, ...]} registry filled in by the experiment cells.
print(groups_created)
# Hand the registry and the results root to the project helper; as the last
# expression of the cell, whatever it returns is rendered as the cell output.
build_compare_models_result(root, groups_created)

defaultdict(<class 'list'>, {'RandomForest': [['baseline'], ['max_depth', 'n_estimators'], ['mae']], 'ExtraTrees': [['baseline'], ['max_depth', 'n_estimators'], ['mae']], 'NeuralNetwork': [['baseline'], ['hidden_layer_sizes']]})


Unnamed: 0,mean_train_score,mean_test_score,std_train_score,std_test_score,mean_diff_test_train,std_diff_test_train,model_name,attribute
Sesso,0.235270,0.242759,0.005556,0.020948,0.007489,0.026404,ExtraTrees_baseline,Sesso
Scheda colazione,0.076653,0.079785,0.020472,0.082631,0.003132,0.103100,ExtraTrees_baseline,Scheda colazione
Farmaci,217.697091,220.241207,5.919775,22.869952,2.544116,28.627285,ExtraTrees_baseline,Farmaci
Malattie metaboliche,12.557743,12.906820,0.705495,2.856481,0.349077,3.561330,ExtraTrees_baseline,Malattie metaboliche
Leucociti (10^3/mmc),4.410866,4.541330,0.466481,1.854120,0.130464,2.320125,ExtraTrees_baseline,Leucociti (10^3/mmc)
...,...,...,...,...,...,...,...,...
Masticatory Disfunction,0.477704,0.495554,0.033044,0.083101,0.017850,0.114927,RandomForest_mae,Masticatory Disfunction
Constipation,0.480762,0.498459,0.007844,0.029739,0.017697,0.032354,RandomForest_mae,Constipation
Diarrhea,0.163488,0.170571,0.022684,0.052696,0.007083,0.071127,RandomForest_mae,Diarrhea
Equivalent age,9.577180,10.210871,0.397998,1.547005,0.633690,1.896661,RandomForest_mae,Equivalent age
