# Analyze Latency in Log Files

In [1]:
import re
import os
import numpy as np
import pandas as pd
from collections import defaultdict

#from human_eval.data import stream_jsonl, write_jsonl
#from helpers import get_code_in_fences, clean_code, clean_code_light, read_problems

# Show up to 100 columns / rows when a DataFrame is rendered in a cell.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 100)
%load_ext autoreload
%autoreload 2

In [2]:
def extract_elapsed_times_from_file(file_to_parse):
    """Collect every elapsed-time value reported in a log file.

    A line contributes a value when it contains either
    ``Time elapsed: <N> seconds`` or
    ``Time elapsed including backoff: <N> seconds``. ``<N>`` may be a whole
    number (``3``) or a decimal (``3.25``). At most one value is taken per
    line (the first match on that line).

    Parameters
    ----------
    file_to_parse : str or path-like
        Path of the log file to scan.

    Returns
    -------
    list of float
        The matched durations, in the order they appear in the file.
        Empty when no line matches.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The fractional part is optional so durations logged without decimals
    # are not silently dropped.
    elapsed_pattern = re.compile(
        r'Time elapsed(?: including backoff)?: (\d+(?:\.\d+)?) seconds'
    )
    elapsed_times = []
    # errors='replace' keeps a single undecodable byte somewhere in the log
    # from aborting the whole parse; the text we match on is plain ASCII.
    with open(file_to_parse, encoding='utf-8', errors='replace') as log_file:
        for line in log_file:
            match = elapsed_pattern.search(line)
            if match:
                elapsed_times.append(float(match.group(1)))
    return elapsed_times

## Rank Datasets and Models

In [3]:
# Model identifiers. A later cell attributes each log file to a model by
# checking which of these strings occurs in the file name, so every entry
# must be a substring of that model's log file names.
models = [
    'phixtral-2x2_8',
    'Nous-Hermes-2-Solar-10.7B',
    'Meta-Llama-3.1-8B-Instruct',
    'codegemma-7b-it',
    'deepseek-coder-6.7b-instruct',
    'OpenCodeInterpreter-DS-6.7B',
    'Artigenz-Coder-DS-6.7B',
    'CodeQwen1.5-7B-Chat',
    'Nxcode-CQ-7B-orpo',
    'phixtral-4x2_8',
    'mistral_3b',
    'mistral_8B',
    'nemo',
    'codestral_mamba',
    'mistral_7b',
    ]
print(len(models))

# Each dataset name is also the name of a sub-directory of `wdir`.
dataset_names = ['HumanEval', 'MBPP', 'LBPP', 'BigCode']

# Root directory of the log files. Set the LATENCY_LOGS_DIR environment
# variable to run the notebook on another machine; the original location is
# kept as the default. Joining with '' guarantees a trailing path separator,
# which matters because other cells build paths as `wdir + dataset_name`.
wdir = os.path.join(
    os.environ.get(
        'LATENCY_LOGS_DIR',
        '/Users/andrew/Documents/01_documents/GWU_DENG/SEAS_8588_Praxis/2_Code/logs_results/final-round-one-model/',
    ),
    '',
)

15


In [4]:
# Group elapsed times by dataset, then by model.
# NOTE: despite the "filepaths_*" names (kept because other cells read
# `filepaths_per_dataset`), the innermost values are lists of elapsed
# seconds extracted from the logs, not file paths.
filepaths_per_dataset = dict()
for dataset_name in dataset_names:
    filepaths_per_model = defaultdict(list)
    # os.path.join works whether or not `wdir` ends with a separator.
    wdir_current = os.path.join(wdir, dataset_name)
    # os.walk ignores listing errors by default, so a missing dataset
    # directory simply contributes no models (visible in the counts below).
    for root, dirs, files in os.walk(wdir_current):
        for filename in files:
            if not filename.endswith('.log'):
                continue

            # A log file must be attributable to exactly one known model;
            # zero matches and several matches are both errors.
            current_models = [model for model in models if model in filename]
            assert len(current_models) == 1, (
                f'Expected exactly 1 known model name in {filename!r}, '
                f'found {len(current_models)}: {current_models}'
            )
            current_model = current_models[0]

            # Extract the elapsed times and pool them per model.
            filepath = os.path.join(root, filename)
            elapsed_times = extract_elapsed_times_from_file(filepath)
            filepaths_per_model[current_model].extend(elapsed_times)
    filepaths_per_dataset[dataset_name] = filepaths_per_model

# Print the number of models found for each dataset.
for dataset_name in filepaths_per_dataset:
    print(dataset_name)
    print(len(filepaths_per_dataset[dataset_name]))
    
#filepaths_per_dataset['HumanEval']

HumanEval
15
MBPP
15
LBPP
15
BigCode
15


In [5]:
# Spot-check one (dataset, model) entry: total and mean of its elapsed times.
temp = filepaths_per_dataset['HumanEval']['CodeQwen1.5-7B-Chat']
total_elapsed = sum(temp)
(total_elapsed, np.mean(temp))

(2850.3186, 5.793330487804878)

In [6]:
# One record per (dataset, model) pair with the mean and the sum of its
# elapsed times. A pair with no recorded times gets a NaN mean and a sum of 0.
rows = [
    {
        'dataset': dataset_label,
        'model': model_label,
        'mean': (sum(durations) / len(durations)) if durations else float('nan'),
        'sum': sum(durations),
    }
    for dataset_label, per_model in filepaths_per_dataset.items()
    for model_label, durations in per_model.items()
]

In [7]:
# Tabulate the per-(dataset, model) records, report the shape, then preview.
df = pd.DataFrame(data=rows)
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(25)

(60, 4)


Unnamed: 0,dataset,model,mean,sum
0,HumanEval,codestral_mamba,2.216548,1090.5416
1,HumanEval,phixtral-2x2_8,19.982338,9831.3105
2,HumanEval,phixtral-4x2_8,24.086714,11850.6634
3,HumanEval,Nxcode-CQ-7B-orpo,6.797984,3344.6079
4,HumanEval,Artigenz-Coder-DS-6.7B,7.251943,3567.9562
5,HumanEval,mistral_8B,2.686335,1321.6767
6,HumanEval,mistral_3b,2.640248,1299.002
7,HumanEval,codegemma-7b-it,7.418158,3649.7338
8,HumanEval,Meta-Llama-3.1-8B-Instruct,10.18503,5011.0347
9,HumanEval,CodeQwen1.5-7B-Chat,5.79333,2850.3186


In [8]:
# All rows (one per dataset) for a single model.
df.loc[df['model'].eq('Nxcode-CQ-7B-orpo')]

Unnamed: 0,dataset,model,mean,sum
3,HumanEval,Nxcode-CQ-7B-orpo,6.797984,3344.6079
18,MBPP,Nxcode-CQ-7B-orpo,5.69204,8538.0597
33,LBPP,Nxcode-CQ-7B-orpo,12.517915,6083.7067
48,BigCode,Nxcode-CQ-7B-orpo,12.070811,18106.2162


In [9]:
# Average latency by dataset: the unweighted mean of the per-model 'mean'
# values in `df`, sorted ascending and given a dense rank (1 = lowest).
dset_distrib = [
    {
        'dataset': dataset,
        'average_latency': np.mean(df.loc[df['dataset'] == dataset, 'mean'].to_numpy()),
    }
    for dataset in dataset_names
]
df_dsets = pd.DataFrame(dset_distrib)
df_dsets = df_dsets.sort_values(by='average_latency').reset_index(drop=True)
latency_rank = df_dsets['average_latency'].rank(method='dense', ascending=True)
df_dsets['rank'] = latency_rank.astype(int)
df_dsets

Unnamed: 0,dataset,average_latency,rank
0,MBPP,7.587069,1
1,HumanEval,7.742248,2
2,BigCode,14.475852,3
3,LBPP,18.40398,4


In [10]:
# Average latency by model: the unweighted mean of the per-dataset 'mean'
# values in `df`, sorted ascending and given a dense rank (1 = lowest).
model_distrib = [
    {
        'model': model,
        'average_latency': np.mean(df.loc[df['model'] == model, 'mean'].to_numpy()),
    }
    for model in models
]
df_models = pd.DataFrame(model_distrib)
df_models = df_models.sort_values(by='average_latency').reset_index(drop=True)
latency_rank = df_models['average_latency'].rank(method='dense', ascending=True)
df_models['rank'] = latency_rank.astype(int)
df_models

Unnamed: 0,model,average_latency,rank
0,nemo,2.629084,1
1,mistral_7b,3.231326,2
2,mistral_8B,3.52505,3
3,codestral_mamba,3.829256,4
4,mistral_3b,4.322546,5
5,CodeQwen1.5-7B-Chat,8.196508,6
6,deepseek-coder-6.7b-instruct,8.835947,7
7,Nxcode-CQ-7B-orpo,9.269687,8
8,codegemma-7b-it,9.761839,9
9,OpenCodeInterpreter-DS-6.7B,11.123243,10


## Rank Prompts

In [11]:
# Prompt identifiers looked up in log file names.
# ORDER MATTERS: 'complete_code_prompt' and 'complete_task_prompt' are
# substrings of their '_basic' / '_full' variants, and the grouping cell
# takes the first entry that occurs in a file name, so the longer variants
# must be listed before the bare names.
prompts = [
    'complete_code_prompt_basic',
    'complete_code_prompt_full',
    'complete_code_prompt',
    'complete_task_prompt_basic',
    'complete_task_prompt_full',
    'complete_task_prompt',
]

# Root directory of the log files. Set the LATENCY_LOGS_DIR environment
# variable to run the notebook on another machine; the original location is
# kept as the default. Joining with '' guarantees a trailing path separator,
# which matters because other cells build paths as `wdir + dataset_name`.
wdir = os.path.join(
    os.environ.get(
        'LATENCY_LOGS_DIR',
        '/Users/andrew/Documents/01_documents/GWU_DENG/SEAS_8588_Praxis/2_Code/logs_results/final-round-one-model/',
    ),
    '',
)

In [12]:
# Group elapsed times by prompt, pooled over every dataset and model.
# NOTE: despite its name, `filepaths_per_prompt` maps each prompt to a list
# of elapsed seconds extracted from the logs, not to file paths.
filepaths_per_prompt = defaultdict(list)
unmatched_logs = []  # .log files whose name contains none of the known prompts
for dataset_name in dataset_names:
    # os.path.join works whether or not `wdir` ends with a separator.
    wdir_current = os.path.join(wdir, dataset_name)
    for root, dirs, files in os.walk(wdir_current):
        for filename in files:
            if not filename.endswith('.log'):
                continue
            filepath = os.path.join(root, filename)

            # First entry of `prompts` that occurs in the file name wins, so
            # the '_basic' / '_full' variants must precede the bare names
            # there. The default of None makes a non-matching file explicit
            # instead of silently reusing the previous file's prompt (or
            # raising NameError when it is the very first file).
            current_prompt = next(
                (prompt for prompt in prompts if prompt in filename),
                None,
            )
            if current_prompt is None:
                unmatched_logs.append(filepath)
                continue

            # Extract the elapsed times and pool them per prompt.
            elapsed_times = extract_elapsed_times_from_file(filepath)
            filepaths_per_prompt[current_prompt].extend(elapsed_times)

# Surface skipped files so gaps in the counts below can be explained.
if unmatched_logs:
    print(f'WARNING: skipped {len(unmatched_logs)} log file(s) with no known prompt in the name')

# Print the number of elapsed-time samples collected for each prompt.
for prompt in filepaths_per_prompt:
    print(prompt)
    print(len(filepaths_per_prompt[prompt]))

complete_code_prompt_full
9960
complete_code_prompt
9960
complete_code_prompt_basic
9463
complete_task_prompt_full
9930
complete_task_prompt
9930
complete_task_prompt_basic
9930


In [14]:
# Average latency by prompt: the mean over all elapsed times pooled for that
# prompt, sorted ascending and given a dense rank (1 = lowest).
prompt_distrib = [
    {
        'prompt': prompt,
        'average_latency': np.mean(durations),
    }
    for prompt, durations in filepaths_per_prompt.items()
]
df_pprompts = pd.DataFrame(prompt_distrib)
df_pprompts = df_pprompts.sort_values(by='average_latency').reset_index(drop=True)
latency_rank = df_pprompts['average_latency'].rank(method='dense', ascending=True)
df_pprompts['rank'] = latency_rank.astype(int)
df_pprompts

Unnamed: 0,prompt,average_latency,rank
0,complete_task_prompt_full,6.787204,1
1,complete_code_prompt_full,10.64445,2
2,complete_task_prompt,10.763853,3
3,complete_task_prompt_basic,13.151265,4
4,complete_code_prompt_basic,13.410396,5
5,complete_code_prompt,13.792369,6
