# Importing libraries

In [2]:
%reload_ext autoreload
%autoreload 2

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from utils import funcs, load_data
import tensorflow as tf
import mlflow
import subprocess
import git
import numpy as np
import pandas as pd

%reload_ext load_data
%reload_ext funcs

In [3]:
def running_evaluation(dataset='valid', pathologies=['pathologies'], Data='', model='', number_augmentation=3):
    """Run the aim-1/2 evaluation on one data split and log the result tables to MLflow.

    Calls `funcs.apply_technique_aim_1_2` on the split's plain and augmented
    generators, then writes five CSV files next to the project root
    (`../../<name>_<dataset>.csv`) and uploads each one as an MLflow artifact:
    original probabilities, augmentation-averaged probabilities, the
    per-sample std ("uncertainty"), and the per-pathology accuracy of both the
    original and the averaged probabilities.

    Args:
        dataset: Key of the split; `Data.generator[dataset]`,
            `Data.generator[dataset + '_aug']` and `Data.dataframe[dataset]`
            are read.
        pathologies: Column labels of the probability matrices. Used for every
            table this function writes (no notebook-level `Info` is needed).
        Data: Object exposing the `generator` and `dataframe` mappings above.
        model: Model forwarded unchanged to `funcs.apply_technique_aim_1_2`.
        number_augmentation: Forwarded to `funcs.apply_technique_aim_1_2`.

    Returns:
        None. The outputs are the CSV files and the logged MLflow artifacts.
    """

    def add_dataframe_info_columns(df_info, probs_2d, pathologies):
        """Return `df_info` with its pathology columns replaced by `probs_2d`."""
        df = df_info.drop(pathologies, axis=1)
        # Re-index the probabilities so the column assignment aligns row by row.
        df_temp = pd.DataFrame(probs_2d, columns=pathologies).set_index(df.index)
        df[pathologies] = df_temp[pathologies]
        return df

    def save_and_log(table, file_stem, artifact_dir):
        """Write `table` to `../../<file_stem>_<dataset>.csv` and log it as an artifact."""
        path = f'../../{file_stem}_{dataset}.csv'
        table.to_csv(path)
        mlflow.log_artifact(path, artifact_path=f'{artifact_dir}/{dataset}/')

    def accuracy_table(truth, probs_2d):
        """Per-pathology accuracy in percent, floored to one decimal place."""
        accuracy = np.floor(1000 * np.mean((truth > 0.5) == (probs_2d > 0.5), axis=0)) / 10
        return pd.DataFrame({'accuracy': accuracy, 'pathologies': pathologies}).set_index('pathologies')

    def log_results(dataframe, probs_2d_orig, pathologies, MA, dataset):
        """Persist every result table for one split; `MA` supplies truth/avg/std arrays."""
        save_and_log(add_dataframe_info_columns(df_info=dataframe, probs_2d=probs_2d_orig, pathologies=pathologies),
                     'prob', 'probabilities')
        save_and_log(pd.DataFrame(MA.probs_avg_2d, columns=pathologies), 'prob_aug_avg', 'probabilities')
        save_and_log(pd.DataFrame(MA.probs_std_2d, columns=pathologies), 'uncertainty', 'uncertainties')
        save_and_log(accuracy_table(MA.truth, probs_2d_orig), 'accuracy_orig', 'accuracies')
        save_and_log(accuracy_table(MA.truth, MA.probs_avg_2d), 'accuracy_aug', 'accuracies')

    # `final_results` is returned by the helper but not used here.
    probs_2d_orig, final_results, MA = funcs.apply_technique_aim_1_2(how_to_treat_nans   = 'ignore',
                                                                      data_generator      = Data.generator[dataset],
                                                                      data_generator_aug  = Data.generator[dataset + '_aug'],
                                                                      model               = model,
                                                                      uncertainty_type    = 'std',
                                                                      number_augmentation = number_augmentation)

    log_results(dataframe     = Data.dataframe[dataset],
                probs_2d_orig = probs_2d_orig,
                pathologies   = pathologies,
                MA            = MA,
                dataset       = dataset)


def setting_up_gpu():
    """Create a TF1-compat session capped at five inter-/intra-op threads.

    The session is registered as the Keras backend session and returned so the
    caller can keep a handle to it.
    """
    thread_limits = dict(inter_op_parallelism_threads=5, intra_op_parallelism_threads=5)
    session_config = tf.compat.v1.ConfigProto(**thread_limits)  # , device_count={"GPU":1, "CPU": 10})
    # session_config.gpu_options.allow_growth = True
    # session_config.log_device_placement = True

    session = tf.compat.v1.Session(config=session_config)
    tf.compat.v1.keras.backend.set_session(session)
    return session


def mlflow_setting_up(experiment_name='expanding_dataset_aim1_2', run_id='106f71e138174d8db44bc6c32f537066'):
    """Connect to the MLflow tracking server, select the experiment and start a run.

    Args:
        experiment_name: Experiment passed to `mlflow.set_experiment`.
            Defaults to the experiment this notebook was written for.
        run_id: Id of an existing run to resume, passed to `mlflow.start_run`.
            Defaults to the run this notebook was written for; pass `None`
            to let MLflow start a fresh run instead.

    Returns:
        The run object returned by `mlflow.start_run`.
    """
    # `artifact` is only needed by the (commented-out) create_experiment call below.
    server, artifact = funcs.mlflow_settings()
    mlflow.set_tracking_uri(server)

    # Creating/Setting the experiment:
    #   The create_experiment line below should stay commented if the experiment
    #   is already created. If kept commented during the first run of a new
    #   experiment, set_experiment will automatically create the new experiment
    #   with local artifact storage.
    # mlflow.create_experiment(name=experiment_name, artifact_location=artifact)
    mlflow.set_experiment(experiment_name=experiment_name)

    # Starting the MLflow run
    run = mlflow.start_run(run_id=run_id)  # run_name; run_id
    # mlflow.set_tag(f'mlflow.note.content',f'run_id: {run.info.run_id}')

    return run

  and should_run_async(code)


## GPU set up

In [4]:
# Create the TensorFlow session via setting_up_gpu() and keep a handle to it.
sess = setting_up_gpu()

## Creating an SSH tunnel to the server in the background

In [5]:
# Forward local port 5000 to localhost:5432 on the remote host (-N: no remote command).
# The command is run directly (no shell, no trailing '&') so that `ssh_session`
# is the ssh process itself. With `shell=True` and '&', the shell backgrounds ssh
# and exits, leaving a handle that cannot be used to stop the tunnel later.
command     = ['ssh', '-N', '-L', '5000:localhost:5432', 'artinmajdi@data7-db1.cyverse.org']
# Popen already runs the child without blocking the notebook; stdin is detached
# as it was for the backgrounded shell job.
ssh_session = subprocess.Popen(command, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE)

## MLflow set up

In [6]:
# MLflow set up: select the tracking server and experiment, then start the run
run = mlflow_setting_up()

# Enable MLflow's automatic logging for Keras
mlflow.keras.autolog()

## Saving the Git commit hash (only in Jupyter notebook)

In [7]:
# Locate the enclosing git repository by searching parent directories.
repo = git.Repo(search_parent_directories=True)
# Hash of the object HEAD currently points to.
git_commit_hash = repo.head.object.hexsha
print('git commit hash', git_commit_hash)
# Record the hash on the active MLflow run under the 'mlflow.source.git.commit' tag.
mlflow.set_tag('mlflow.source.git.commit', git_commit_hash)

git commit hash 98d373195612770089edd30079d26c26fa71e2a8


## Reading Terminal Inputs

In [8]:
getting_inputs_via_terminal = False

# The run settings come either from funcs.reading_terminal_inputs() or from the
# fixed notebook defaults in the tuple below.
epochs, batch_size, max_sample, architecture_name, number_augmentation = (
    funcs.reading_terminal_inputs()
    if getting_inputs_via_terminal
    else (3, 40, 1000000, 'DenseNet121', 3)
)

  and should_run_async(code)


## Loading the data

In [9]:
dataset = 'chexpert'  # nih chexpert
dir = '/groups/jjrodrig/projects/chest/dataset/' + dataset + '/'

running_new_run = False

# One loader call for both cases: a new run asks for mode='train_val',
# otherwise mode='test' is requested.
Data, Info = load_data.load_chest_xray(
    dir=dir,
    dataset=dataset,
    batch_size=batch_size,
    mode='train_val' if running_new_run else 'test',
    max_sample=max_sample,
)

# The data-set parameters are logged to MLflow only when a new run is started.
if running_new_run:
    mlflow.log_param('dataset', dataset)
    mlflow.log_param('max_sample', max_sample)
    mlflow.log_param('train count', len(Data.generator['train'].filenames))
    mlflow.log_param('valid count', len(Data.generator['valid'].filenames))
    mlflow.log_param('batch size', batch_size)

before sample-pruning
train: (223414, 20)
test: (234, 19)

after sample-pruning
train (certain): (124626, 21)
train (uncertain): (5807, 21)
valid: (31157, 21)
test: (169, 20) 

Found 169 validated image filenames.
Found 169 validated image filenames.


## Optimization

In [10]:
optimize_model = False

if not optimize_model:
    # Load the model stored under the active MLflow run (uncompiled), then
    # compile it here with the weighted BCE loss built from Info.class_weights.
    model = mlflow.keras.load_model(model_uri=f'runs:/{run.info.run_id}/model', compile=False)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=funcs.weighted_bce_loss(Info.class_weights),  # tf.keras.losses.binary_crossentropy #
        metrics=[tf.keras.metrics.binary_accuracy],
    )
else:
    # Build the model through funcs.optimize on the train/valid tf datasets.
    model = funcs.optimize(
        train_dataset=Data.data_tf['train'],
        valid_dataset=Data.data_tf['valid'],
        architecture_name=architecture_name,
        epochs=epochs,
        Info=Info,
        dir=dir,
    )

## Evaluation

In [11]:
EVALUATE = True

if EVALUATE:

    # validation dataset
    running_evaluation( dataset             = 'valid',
                        pathologies         = Info.pathologies,
                        Data                = Data,
                        model               = model,
                        number_augmentation = number_augmentation)

    # test dataset: reload the data in 'test' mode before evaluating it.
    # NOTE(review): this previously called `load_data.load(...)`; it now uses
    # `load_data.load_chest_xray(...)`, the loader used by the data-loading cell
    # with the same keyword arguments. Confirm `load` was not a distinct helper.
    Data, Info = load_data.load_chest_xray(dir=dir, dataset=dataset, batch_size=batch_size, mode='test', max_sample=max_sample)

    running_evaluation( dataset             = 'test',
                        pathologies         = Info.pathologies,
                        Data                = Data,
                        model               = model,
                        number_augmentation = number_augmentation)

  0%|          | 0/5 [00:00<?, ?it/s]running the evaluation on original non-augmented data
100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
  0%|          | 0/5 [00:00<?, ?it/s] running the evaluation on augmented data including the uncertainty measurement
augmentation 0/3
100%|██████████| 5/5 [00:04<00:00,  1.07it/s]
  0%|          | 0/5 [00:00<?, ?it/s]augmentation 1/3
100%|██████████| 5/5 [00:04<00:00,  1.09it/s]
  0%|          | 0/5 [00:00<?, ?it/s]augmentation 2/3
100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


In [12]:
# dataset = 'test'
# probs_2d_orig, final_results, MA = funcs.apply_technique_aim_1_2( how_to_treat_nans   = 'ignore', 
#                                                                     data_generator      = Data.generator[dataset], 
#                                                                     data_generator_aug  = Data.generator[dataset + '_aug'], 
#                                                                     model               = model, 
#                                                                     uncertainty_type    = 'std', 
#                                                                     number_augmentation = number_augmentation)

In [13]:
# def add_dataframe_info_columns(df_info, probs_2d, pathologies):

#     df              = df_info.drop(pathologies,axis=1)
#     df_temp         = pd.DataFrame(probs_2d_orig, columns=pathologies).set_index(df.index)
#     df[pathologies] = df_temp[pathologies]

#     return df


# pathologies = Info.pathologies

# path = f'../../prob_{dataset}.csv'
# df = add_dataframe_info_columns(df_info=Data.dataframe[dataset], probs_2d=probs_2d_orig, pathologies=pathologies)       
# df.to_csv(path)
# mlflow.log_artifact(path,artifact_path=f'probabilities/{dataset}/')

# path = f'../../prob_aug_avg_{dataset}.csv'
# pd.DataFrame( MA.probs_avg_2d, columns=Info.pathologies ).to_csv(path)
# mlflow.log_artifact(path,artifact_path=f'probabilities/{dataset}/')

# path = f'../../uncertainty_{dataset}.csv'
# pd.DataFrame( MA.probs_std_2d, columns=Info.pathologies ).to_csv(path)
# mlflow.log_artifact(path,artifact_path=f'uncertainties/{dataset}/')


# path = f'../../accuracy_orig_{dataset}.csv'
# accuracy = np.floor( 1000*np.mean((MA.truth > 0.5) == (probs_2d_orig > 0.5),axis=0) )/ 10
# pd.DataFrame( {'accuracy':accuracy, 'pathologies':Info.pathologies} ).set_index('pathologies').to_csv(path)
# mlflow.log_artifact(path,artifact_path=f'accuracies/{dataset}/')

# path = f'../../accuracy_aug_{dataset}.csv'
# accuracy = np.floor( 1000*np.mean((MA.truth > 0.5) == (MA.probs_avg_2d > 0.5),axis=0) )/ 10
# pd.DataFrame( {'accuracy':accuracy, 'pathologies':Info.pathologies} ).set_index('pathologies').to_csv(path)
# mlflow.log_artifact(path,artifact_path=f'accuracies/{dataset}/')

## Closing the session

In [14]:
# End mlflow session
mlflow.end_run()

# End the ssh session. If this failed, we can type 'pkill ssh' in the terminal
ssh_session.kill()
# Reap the killed child so it does not linger as a zombie process.
ssh_session.wait()

print('Optimization Complete')



Ending mlflow session
Ending ssh session
Optimization Complete
