In [5]:
import os
import glob
import re

import pandas as pd


MODELS_PATH = './models/java-small-model'
results = []

In [25]:
models = {
        '2020_04_05__0.0001_0.0001_0.001__0026d470': 32,
        '2020_04_05__0.0001_0.0001_0.001__2ecc05a5': 22,
        '2020_04_05__0.0001_0.0001_0.001__e152caf0': 28,
        '2020_04_05__0_0_-1__2167eb3d': 37,
        '2020_04_05__0_0_-1__3cff1009': 19,
        '2020_04_05__0_0_-1__4990422c': 19}

In [12]:
models = {'2020_04_07__0_0_-2.0__subtoken_restricted': 24}

In [17]:
models = ['2020_04_09__0_0_-3.0__subtoken_nodes_restricted']

In [2]:
models = ['2020_04_05__0.0001_0.0001_0.001__e152caf0']

In [50]:
coeff_select = ['2020_04_07__0.1_0_-1__b718b076',
                '2020_04_07__0.01_0_-1__53cfd74e',
                '2020_04_07__0.001_0_-1__0f88a7ef',
                '2020_04_07__0.0001_0_-1__92371c27', 
                '2020_04_07__1e-05_0_-1__81cbb810',
                '2020_04_07__1e-06_0_-1__3866b871',
                '2020_04_07__0_0.1_-1__5810e829',
                '2020_04_07__0_0.01_-1__b303bfa9',
                '2020_04_07__0_0.001_-1__facca85d',
                '2020_04_07__0_0.0001_-1__35a3d3de',
                '2020_04_07__0_1e-05_-1__bfb47368',
                '2020_04_07__0_1e-06_-1__be492856',
                '2020_04_05__0_0_-1__2167eb3d']

In [23]:
vocab_words = ['2020_04_11__0_0_-1__632771e7',
               '2020_04_11__0_0_-1__fa0572aa',
               '2020_04_11__0_0_-1__dc65356a',
               '2020_04_05__0_0_-1__2167eb3d']

In [24]:
models = vocab_words

# Best epoch and validation

In [25]:
non_decimal = re.compile(r'[^\d.]+')

results = []
for model in models:
    out_files = glob.glob(MODELS_PATH + '/' + model + "/*.out")
    if out_files == []:
        raise "No .out files"
    train_out = out_files[0]
    
    with open(train_out) as f:
        _, _, _, Lasso, GroupLasso, Threshold = list(filter(None, model.split('_')))[:6]
        text = f.read().split()
        
        best_epoch = int(non_decimal.sub('', text[-7]))
        val_prec, val_rec, val_F = map(lambda x: float(non_decimal.sub('', x)), [text[-5], text[-3], text[-1]])
        
        subtoken_words, nodes_words = map(lambda x: int(non_decimal.sub('', x)), [text[6], text[17]])
        
        results.append([model, subtoken_words, nodes_words, Lasso, GroupLasso, Threshold, 
                        best_epoch, val_prec, val_rec, val_F])

In [26]:
results = pd.DataFrame(results, 
     columns=['folder', 'SUBTOKEN_words', 'NODES_words', 
              'Lasso', 'GroupLasso', 'Threshold', 'best_epoch', 'Val_Prec', 'Val_Rec', 'Val_F1'])

In [27]:
results

Unnamed: 0,folder,SUBTOKEN_words,NODES_words,Lasso,GroupLasso,Threshold,best_epoch,Val_Prec,Val_Rec,Val_F1
0,2020_04_11__0_0_-1__632771e7,73906,12,0,0,-1,26,0.4554,0.34614,0.39332
1,2020_04_11__0_0_-1__fa0572aa,12,12,0,0,-1,3,0.18812,0.13437,0.15677
2,2020_04_11__0_0_-1__dc65356a,12,323,0,0,-1,2,0.20896,0.16248,0.18281
3,2020_04_05__0_0_-1__2167eb3d,73906,323,0,0,-1,37,0.4747,0.36856,0.41495


# Release

In [12]:
for index, row in results.iterrows():
    folder = results.iloc[index]['folder']
    epoch = results.iloc[index]['best_epoch']
    command = f'python code2seq.py --load models/java-small-model/{folder}/model_iter{epoch} --release'
    os.system(f'sbatch --error=./slurm/%j.err --output=./slurm/%j.out -J c2s_release --gres=gpu:1 -c 4 --wrap=\"{command}\"')

# Test

In [13]:
for index, row in results.iterrows():
    folder = results.iloc[index]['folder']
    epoch = results.iloc[index]['best_epoch']
    command = f'python3 code2seq.py --load {MODELS_PATH}/{folder}/model_iter{epoch}.release --test data/java-small/java-small.test.c2s'
    os.system(f'sbatch --error=./slurm/{folder}.err --output=./slurm/{folder}.out -J c2s_test --gres=gpu:1 -c 4 --wrap=\"{command}\"')


# Get test results

In [28]:
import pandas as pd

results['Test_Prec'] = None
results['Test_Rec'] = None
results['Test_F1'] = None

for index, row in results.iterrows():
    folder = results.iloc[index]['folder']
    epoch = results.iloc[index]['best_epoch']
    
    with open(f'./slurm/{folder}.out') as f:
        _, _, _, Lasso, GroupLasso, Threshold, _ = list(filter(None, folder.split('_')))
        text = f.read().split()
        prec_i = text.index('Precision:') + 1
        rec_i = text.index('recall:') + 1   
        f_i = text.index('F1:') + 1
        
        prec, rec, f = list(map(lambda a: float(a.rstrip(',')), [text[prec_i], text[rec_i], text[f_i]]))
        results['Test_Prec'][index] = prec
        results['Test_Rec'][index] = rec
        results['Test_F1'][index] = f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
results[[ 'SUBTOKEN_words', 'NODES_words', 'best_epoch', 'Val_Prec', 'Val_Rec', 'Val_F1', 'Test_Prec',
       'Test_Rec', 'Test_F1']]

Unnamed: 0,SUBTOKEN_words,NODES_words,best_epoch,Val_Prec,Val_Rec,Val_F1,Test_Prec,Test_Rec,Test_F1
0,73906,12,26,0.4554,0.34614,0.39332,0.486531,0.368711,0.419506
1,12,12,3,0.18812,0.13437,0.15677,0.253408,0.184787,0.213724
2,12,323,2,0.20896,0.16248,0.18281,0.256083,0.185483,0.215139
3,73906,323,37,0.4747,0.36856,0.41495,0.489211,0.37998,0.427732


In [16]:
results.columns

Index(['folder', 'SUBTOKEN_words', 'NODES_words', 'Lasso', 'GroupLasso',
       'Threshold', 'best_epoch', 'Val_Prec', 'Val_Rec', 'Val_F1', 'Test_Prec',
       'Test_Rec', 'Test_F1'],
      dtype='object')

In [3]:
def get_logs(job, epoch):
    OUTPUT_FOLDER = './SLURM'
    
    
    
    with open(OUTPUT_FOLDER + f'{job}.out','r') as f:
        line = f.readline()
        while line != 'i'
        
    with open(OUTPUT_FOLDER + f'errors_{job}','r') as f:
        errors_file = f.read()




In [6]:
results

Unnamed: 0,jobID,lasso,group_lasso,threshold,epochs,Precision,recall,F1,NODES_VOCAB nonzeros,SUBTOKEN_VOCAB nonzeros,Lasso Reg,Group Lasso Reg
0,34508,0.0001,0.0001,0.001,24,0.4744,0.35912,0.40879,1236,30258,0.225216,0.065769
1,34510,0.0001,0.0001,1e-05,64,0.46884,0.36772,0.41217,1486,74339,0.271612,0.061684
2,34511,0.0001,0.0001,1e-06,75,0.46578,0.3726,0.41401,3986,148216,0.301428,0.062066
3,34512,0.0001,0.0001,1e-07,24,0.46324,0.36765,0.40995,18028,6730523,0.408273,0.073424
4,34513,0.0,0.0001,1e-05,37,0.46248,0.36882,0.41037,40333,9413252,0.0,4.115718
5,34514,0.0001,0.0,1e-05,60,0.47707,0.38251,0.42459,1434,73891,0.316947,0.0


In [None]:
python3 code2seq.py --load models/java-large-model/model_iter60.release --test data/java-small/java-small.test.c2s