In [None]:
import sys
import os
# Make the project's package directories importable. The entries are relative,
# so they resolve against the directory the notebook is run from.
sys.path += ['Fyler', 'EvalFyler', 'Lib', 'Codes']

In [None]:
import configparser
import pathlib

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from Fyler import fyler_data, fyler_bow
from EvalFyler import fyler_fextract, fyler_dataphenot
from Lib import utils

In [None]:
# Example configuration
example_cfg_path = 'example_cfgs/fyler_window_size-14_note_window_size-14_fyler_min_count-10_align-left.cfg'
# Echo the raw config text so the settings used by the following cells are visible.
print(pathlib.Path(example_cfg_path).read_text(encoding='utf8'))

In [None]:
# Set environment variables
# When True, variables that already exist in the environment are left untouched.
keep_defaults = False
def set_vars(key, value):
    """Set environment variable ``key`` and return the value now in effect.

    With the module-level ``keep_defaults`` flag off, ``value`` always
    overwrites whatever was there; with it on, ``value`` is only used when
    ``key`` is not already present in ``os.environ``.
    """
    if not keep_defaults:
        os.environ[key] = value
    else:
        os.environ.setdefault(key, value)
    return os.environ[key]

############################################################
# EDIT THIS STRING to point to your local ACHD mount path. #
############################################################
achd_mount_path = '/home/angus/mnt/ACHD'
print(set_vars('ACHD', achd_mount_path))

# DATA_ROOT is set to the current working directory of the notebook process.
data_root_path = os.getcwd()
print(set_vars('DATA_ROOT', data_root_path))

In [None]:
# Generate data
# Parse the example config. read_file() on an explicitly opened handle is used
# instead of ConfigParser.read(), because read() silently skips a file it cannot
# open and decodes with the locale default; this notebook treats the config as UTF-8.
cfg = configparser.ConfigParser()
with open(example_cfg_path, 'r', encoding='utf8') as cfg_fd:
    cfg.read_file(cfg_fd)

# data::root may reference environment variables; expand them before use.
root = os.path.expandvars(cfg.get("data", "root"))

# Locations of the notes resource and the note text, both relative to the data root.
notes_path = os.path.join(root, fyler_data.NOTE)
text_path = os.path.join(root, fyler_data.TEXT)
notes = fyler_data.open_notes(notes_path)

# Tokenizer files go under models/<config file name without extension>/tokenizer.
cfg_name = os.path.splitext(os.path.basename(example_cfg_path))[0]
tok_dir = pathlib.Path('models', cfg_name, 'tokenizer')
tok_dir.mkdir(parents=True, exist_ok=True)
print(f'{tok_dir=}')

# NOTE(review): cfg.get returns strings, so both vocab sizes are passed as str;
# presumably FylerDatasetProvider interprets them itself — confirm before changing.
fdp = fyler_data.FylerDatasetProvider(
    conn=notes,
    note_dir=text_path,
    input_vocab_size=cfg.get("args", "cui_vocab_size"),
    code_vocab_size=cfg.get("args", "code_vocab_size"),
    cfg=cfg['data'],
    tokenizer_dir=tok_dir,
)

In [None]:
# Train model
## copied from fyler_bow.main(·)
## better to use fyler_bow.py directly in the command line but the process is reproduced here for clarity

model_dir = pathlib.Path('models', cfg_name)
device = None  # set this to the appropriate torch cuda device to use the GPU

# A second FylerDatasetProvider over the same paths shows that existing data is
#  loaded rather than regenerated (unless data::regenerate is set to true in the config file)
dp = fyler_data.FylerDatasetProvider(
    conn=notes,
    note_dir=text_path,
    input_vocab_size=cfg.get("args", "cui_vocab_size"),
    code_vocab_size=cfg.get("args", "code_vocab_size"),
    cfg=cfg['data'],
    tokenizer_dir=tok_dir,
)

in_seqs, out_seqs = dp.load_as_sequences()

# Hold out 10% of the samples for validation; the fixed seed keeps the split repeatable.
split = train_test_split(in_seqs, out_seqs, test_size=0.10, random_state=2020)
tr_in_seqs, val_in_seqs, tr_out_seqs, val_out_seqs = split

print(f"loaded {len(tr_in_seqs)} training and {len(val_in_seqs)} validation samples")

max_cui_seq_len = max(map(len, tr_in_seqs))
print("longest cui sequence:", max_cui_seq_len)

max_code_seq_len = max(map(len, tr_out_seqs))
print("longest code sequence:", max_code_seq_len)

# Values shared by both loaders and the model: tokenizer vocabulary lengths and batch size.
input_vocab_len = len(dp.input_tokenizer.stoi)
output_vocab_len = len(dp.output_tokenizer.stoi)
batch_size = cfg.getint("model", "batch")

train_loader = fyler_bow.make_data_loader(
    utils.sequences_to_matrix(tr_in_seqs, input_vocab_len),
    utils.sequences_to_matrix(tr_out_seqs, output_vocab_len),
    batch_size,
    "train",
)

val_loader = fyler_bow.make_data_loader(
    utils.sequences_to_matrix(val_in_seqs, input_vocab_len),
    utils.sequences_to_matrix(val_out_seqs, output_vocab_len),
    batch_size,
    "dev",
)

# Ensure the model directory exists before it is handed to the model and to fit().
model_dir.mkdir(parents=True, exist_ok=True)

model = fyler_bow.BagOfWords(
    input_vocab_size=input_vocab_len,
    output_vocab_size=output_vocab_len,
    hidden_units=cfg.getint("model", "hidden"),
    dropout_rate=cfg.getfloat("model", "dropout"),
    model_dir=model_dir,
)

n_epochs = cfg.getint("model", "epochs")
best_loss, optimal_epochs = fyler_bow.fit(
    model,
    cfg,
    train_loader,
    val_loader,
    n_epochs,
    model_dir=model_dir,
    device=device,
)
print("best loss %.4f after %d epochs" % (best_loss, optimal_epochs))

In [None]:
# Get downstream config
example_downstream_cfg_path = 'example_cfgs/experiment-cc-TGA_fyler_window_size-14_note_window_size-14_fyler_min_count-10_align-left.cfg'
# Echo the raw downstream config text so its settings are visible in the notebook.
print(pathlib.Path(example_downstream_cfg_path).read_text(encoding='utf8'))

In [None]:
# Downstream classifier training
# Arguments are collected first so each one can carry its own note.
downstream_args = dict(
    gpu=-1,  # presumably selects CPU — TODO confirm against fyler_fextract.train_model
    model_class="fyler",
    model_dir='models',  # parent of the per-config model directories created earlier in this notebook
    out_dir=None,
    cfg_path=example_downstream_cfg_path,
)
# The return value is not used; binding it to `_` keeps the cell from echoing it.
_ = fyler_fextract.train_model(**downstream_args)

# Clearly no learning is happening at the moment.

#### What this looks like at the batch level:

```sh
$ # generate the different configurations for the encoders and the downstream tasks using base config files
$ # gen_experiments.py currently has the phenotypes, window sizes, and minimum count parameter grid hard-coded
$ python Fyler/gen_experiments.py pretrain \
    path/to/base_config.cfg \
    path/to/cfgs
$ python Fyler/gen_experiments.py fextract \
    path/to/downstream/base_config.cfg \
    path/to/downstream/cfgs
$ # generate the data into the directory specified in the config file
$ python Fyler/experiments.py data \
    path/to/cfgs \
    --exclude path/to/excluded-cfgs.txt \
    -p num_cores \
    -o path/to/data/log/dir
$ # train Fyler encoders
$ python Fyler/fyler_bow.py batch \
    path/to/cfgs \
    path/to/data/dirs \
    path/to/fyler/encoders \
    --exclude path/to/excluded-cfgs.txt \
    -g num_gpus \
    -o path/to/fyler/log/dir
$ # train downstream classifiers
$ python EvalFyler/fyler_fextract.py batch \
    path/to/downstream/cfgs \
    path/to/fyler/encoders \
    --model_class fyler \
    --exclude path/to/excluded-downstream-cfgs.txt \
    -g num_gpus \
    -o path/to/downstream/log/dir
```