In [1]:
import pandas as pd
import numpy as np
import os

In [4]:
def get_params_per_config(texts):
    """Extract the parameter count reported on every 'params' line of a log.

    Args:
        texts: iterable of log lines (strings).

    Returns:
        numpy.ndarray of integers with one entry per line that contains
        'params' (case-insensitive). Each value is parsed from the text
        following the last ':' on that line. An empty integer array is
        returned when no line matches.

    Raises:
        ValueError: if the text after the last ':' is not an integer literal.
    """
    # int() tolerates the surrounding whitespace / trailing newline left by split().
    params_counts = [
        int(line.split(":")[-1])
        for line in texts
        if 'params' in line.lower()
    ]
    # The np.int alias was removed in NumPy 1.24; the builtin int selects
    # the same default integer dtype.
    return np.array(params_counts, dtype=int)

def get_num_configs(texts):
    """Count the log lines that contain 'config:' (case-insensitive).

    Args:
        texts: iterable of log lines (strings).

    Returns:
        int: number of matching lines, 0 when there are none.
    """
    return sum(1 for entry in texts if 'config:' in entry.lower())
    
def get_telapsed_per_config(texts):
    """Extract the elapsed time reported on every 'time_elapsed' line of a log.

    Args:
        texts: iterable of log lines (strings).

    Returns:
        numpy.ndarray of floats with one entry per line that contains
        'time_elapsed' (case-insensitive). Each value is parsed from the
        field that follows the first ':' on that line. An empty float array
        is returned when no line matches.

    Raises:
        ValueError: if that field is not a valid float literal.
        IndexError: if a matching line contains no ':'.
    """
    elapsed = []
    for line in texts:
        lowered = line.lower()
        # Lines that announce a configuration are never treated as timing lines,
        # even if they also happen to mention 'time_elapsed'.
        if 'config:' in lowered:
            continue
        if 'time_elapsed' in lowered:
            elapsed.append(float(line.strip().split(":")[1]))

    # The np.float alias was removed in NumPy 1.24; the builtin float maps to float64.
    return np.array(elapsed, dtype=float)

In [5]:
def get_best_osize_df(df, group_by, sort_field, n_top):
    """Return, for every group, the rows with the smallest `sort_field` values.

    Args:
        df: pandas DataFrame to select from.
        group_by: column label(s) passed to `DataFrame.groupby`.
        sort_field: column used to rank the rows inside each group.
        n_top: number of leading rows kept per group.

    Returns:
        The result of `groupby(...).apply(...)`: per-group rows sorted in
        ascending order of `sort_field`, truncated to `n_top` rows, with the
        rows of each group re-numbered from 0.
    """
    def _leading_rows(group):
        # Ascending order puts the smallest sort_field values first;
        # ignore_index=True renumbers the rows of the group from 0.
        ranked = group.sort_values(by=sort_field,
                                   ascending=True,
                                   ignore_index=True)
        return ranked.head(n_top)

    return df.groupby(group_by).apply(_leading_rows)

In [6]:
def get_model_gsresults(folder, model_type, group_by=None, n_top=None):
    """Load the grid-search results of one model type and select its best rows.

    Args:
        folder: directory holding the grid-search log and JSON result files.
        model_type: model identifier inserted into both file names
            (e.g. 'gru').
        group_by: column label(s) to group by before ranking. When None, the
            ranking is done over the whole frame instead of per group.
        n_top: number of best rows kept (per group when `group_by` is given).
            None keeps every row.

    Returns:
        tuple (df, df_best):
            df      -- the frame read from the JSON results file.
            df_best -- rows ranked by ascending 'val_loss_best', limited to
                       `n_top` rows per group (or overall when `group_by`
                       is None).

    Raises:
        AssertionError: if the log file or the JSON results file is missing.
    """
    # Both file names follow the fixed naming scheme of the grid-search run.
    logfile = "gs_training_{}_M500_P50_N500.log".format(model_type)
    jsonfile = "grid_search_results_{}_M500_P50_N500.json".format(model_type)

    logfile_path = os.path.join(folder, logfile)
    jsonfile_path = os.path.join(folder, jsonfile)

    # Raised explicitly (rather than via `assert`) so the check survives
    # `python -O` and the exception actually carries the message.
    if not os.path.exists(logfile_path):
        raise AssertionError(
            "Log file not found!! ({})".format(logfile_path))
    if not os.path.exists(jsonfile_path):
        raise AssertionError(
            "Grid search related Json file not found!! ({})".format(jsonfile_path))

    df = pd.read_json(jsonfile_path)

    # NOTE: the log file is required to exist, but its contents (per-config
    # elapsed time / parameter counts) are currently not merged into `df`.

    if group_by is None:
        # No grouping column: rank all configurations together.
        df_by_osize = df.sort_values(by='val_loss_best',
                                     ascending=True,
                                     ignore_index=True)
        if n_top is not None:
            df_by_osize = df_by_osize.head(n_top)
    else:
        # Best validation-set performance within each group.
        df_by_osize = get_best_osize_df(df, group_by, 'val_loss_best', n_top)

    return df, df_by_osize

In [7]:
# Load the GRU grid-search results and keep, for each n_hidden value,
# up to 4 configurations with the lowest val_loss_best.
df_gru, df_gru_by_nhidden = get_model_gsresults(folder='./log/ce_drive/prbs/gru_L2_H60_results/',
                                              model_type='gru',
                                              group_by='n_hidden',
                                              n_top=4)

In [8]:
# Display the full grid-search results frame loaded from the JSON file
df_gru

Unnamed: 0,model_type,input_size,output_size,n_hidden,n_layers,lr,num_epochs,Config_no,tr_loss_end,val_loss_end,tr_loss_best,val_loss_best
0,gru,1,5,30,1,0.001,3000,1,0.005562,0.009726,7.7e-05,6e-05
1,gru,1,5,30,2,0.001,3000,2,1.9e-05,1.6e-05,1.9e-05,1e-05
2,gru,1,5,40,1,0.001,3000,3,0.000285,0.000278,0.000118,0.000119
3,gru,1,5,40,2,0.001,3000,4,1e-05,1e-05,8e-06,5e-06
4,gru,1,5,50,1,0.001,3000,5,0.000153,0.00015,8.7e-05,7.5e-05
5,gru,1,5,50,2,0.001,3000,6,1.4e-05,3.8e-05,1e-05,4e-06
6,gru,1,5,60,1,0.001,3000,7,0.000201,0.00021,4.3e-05,4e-05
7,gru,1,5,60,2,0.001,3000,8,1.4e-05,2.3e-05,1.7e-05,3e-06


In [9]:
# Display the selected rows per n_hidden group, ordered by ascending val_loss_best
df_gru_by_nhidden

Unnamed: 0_level_0,Unnamed: 1_level_0,model_type,input_size,output_size,n_hidden,n_layers,lr,num_epochs,Config_no,tr_loss_end,val_loss_end,tr_loss_best,val_loss_best
n_hidden,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
30,0,gru,1,5,30,2,0.001,3000,2,1.9e-05,1.6e-05,1.9e-05,1e-05
30,1,gru,1,5,30,1,0.001,3000,1,0.005562,0.009726,7.7e-05,6e-05
40,0,gru,1,5,40,2,0.001,3000,4,1e-05,1e-05,8e-06,5e-06
40,1,gru,1,5,40,1,0.001,3000,3,0.000285,0.000278,0.000118,0.000119
50,0,gru,1,5,50,2,0.001,3000,6,1.4e-05,3.8e-05,1e-05,4e-06
50,1,gru,1,5,50,1,0.001,3000,5,0.000153,0.00015,8.7e-05,7.5e-05
60,0,gru,1,5,60,2,0.001,3000,8,1.4e-05,2.3e-05,1.7e-05,3e-06
60,1,gru,1,5,60,1,0.001,3000,7,0.000201,0.00021,4.3e-05,4e-05


In [10]:
# Number of selected rows, then the row at position 2 as a plain dict
print(len(df_gru_by_nhidden))
df_gru_by_nhidden.iloc[2].to_dict()

8


{'model_type': 'gru',
 'input_size': 1,
 'output_size': 5,
 'n_hidden': 40,
 'n_layers': 2,
 'lr': 0.001,
 'num_epochs': 3000,
 'Config_no': 4,
 'tr_loss_end': 1.038149416765046e-05,
 'val_loss_end': 9.735508077331664e-06,
 'tr_loss_best': 7.967359560636248e-06,
 'val_loss_best': 4.596677748243868e-06}

In [11]:
# Rank the per-n_hidden selections together by best validation loss;
# ignore_index=True replaces the grouped index with a plain 0..N-1 index.
df_gru_by_nhidden.sort_values(by='val_loss_best',
                            ascending=True, 
                            ignore_index=True)                            

Unnamed: 0,model_type,input_size,output_size,n_hidden,n_layers,lr,num_epochs,Config_no,tr_loss_end,val_loss_end,tr_loss_best,val_loss_best
0,gru,1,5,60,2,0.001,3000,8,1.4e-05,2.3e-05,1.7e-05,3e-06
1,gru,1,5,50,2,0.001,3000,6,1.4e-05,3.8e-05,1e-05,4e-06
2,gru,1,5,40,2,0.001,3000,4,1e-05,1e-05,8e-06,5e-06
3,gru,1,5,30,2,0.001,3000,2,1.9e-05,1.6e-05,1.9e-05,1e-05
4,gru,1,5,60,1,0.001,3000,7,0.000201,0.00021,4.3e-05,4e-05
5,gru,1,5,30,1,0.001,3000,1,0.005562,0.009726,7.7e-05,6e-05
6,gru,1,5,50,1,0.001,3000,5,0.000153,0.00015,8.7e-05,7.5e-05
7,gru,1,5,40,1,0.001,3000,3,0.000285,0.000278,0.000118,0.000119


In [12]:
# Same results folder, now grouped by n_layers with up to 3 rows per group;
# the full frame returned first is discarded via `_`.
_, df_gru_by_nlayers = get_model_gsresults(folder='./log/ce_drive/prbs/gru_L2_H60_results/',
                                              model_type='gru',
                                              group_by='n_layers',
                                              n_top=3)

In [14]:
# Display the selected rows per n_layers group, ordered by ascending val_loss_best
df_gru_by_nlayers

Unnamed: 0_level_0,Unnamed: 1_level_0,model_type,input_size,output_size,n_hidden,n_layers,lr,num_epochs,Config_no,tr_loss_end,val_loss_end,tr_loss_best,val_loss_best
n_layers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,gru,1,5,60,1,0.001,3000,7,0.000201,0.00021,4.3e-05,4e-05
1,1,gru,1,5,30,1,0.001,3000,1,0.005562,0.009726,7.7e-05,6e-05
1,2,gru,1,5,50,1,0.001,3000,5,0.000153,0.00015,8.7e-05,7.5e-05
2,0,gru,1,5,60,2,0.001,3000,8,1.4e-05,2.3e-05,1.7e-05,3e-06
2,1,gru,1,5,50,2,0.001,3000,6,1.4e-05,3.8e-05,1e-05,4e-06
2,2,gru,1,5,40,2,0.001,3000,4,1e-05,1e-05,8e-06,5e-06


In [15]:
# Rank the per-n_layers selections together by best validation loss;
# ignore_index=True replaces the grouped index with a plain 0..N-1 index.
df_gru_by_nlayers.sort_values(by='val_loss_best',
                            ascending=True, 
                            ignore_index=True)    

Unnamed: 0,model_type,input_size,output_size,n_hidden,n_layers,lr,num_epochs,Config_no,tr_loss_end,val_loss_end,tr_loss_best,val_loss_best
0,gru,1,5,60,2,0.001,3000,8,1.4e-05,2.3e-05,1.7e-05,3e-06
1,gru,1,5,50,2,0.001,3000,6,1.4e-05,3.8e-05,1e-05,4e-06
2,gru,1,5,40,2,0.001,3000,4,1e-05,1e-05,8e-06,5e-06
3,gru,1,5,60,1,0.001,3000,7,0.000201,0.00021,4.3e-05,4e-05
4,gru,1,5,30,1,0.001,3000,1,0.005562,0.009726,7.7e-05,6e-05
5,gru,1,5,50,1,0.001,3000,5,0.000153,0.00015,8.7e-05,7.5e-05
