# Calculate CodeBLEU

In [7]:
from codebleu import calc_codebleu
from javalang import tokenizer

prediction = "def add ( a , b ) :\n return a + b"
reference = "def sum ( first , second ) :\n return second + first"

result = calc_codebleu([reference], [prediction], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)
print(result)

{'codebleu': 0.5537842627792564, 'ngram_match_score': 0.10419044640136393, 'weighted_ngram_match_score': 0.11094660471566163, 'syntax_match_score': 1.0, 'dataflow_match_score': 1.0}


In [10]:
with open('../data/kestra_evaluation_examples/reference/KestraApplicationContext_reference.java') as file:
    reference = ''.join(file.readlines())
print(reference)

with open('../data/kestra_evaluation_examples/candidate/KestraApplicationContext_candidate.java') as file:
    prediction = ''.join(file.readlines())
print(prediction)

package io.kestra.core.contexts;

import io.kestra.core.plugins.PluginRegistry;
import io.micronaut.context.ApplicationContextConfiguration;
import io.micronaut.context.DefaultApplicationContext;
import io.micronaut.core.io.service.SoftServiceLoader;
import io.micronaut.core.annotation.NonNull;
import io.micronaut.inject.BeanDefinitionReference;

import java.util.List;
package io.kestra.core.contexts;

import io.kestra.core.plugins.PluginRegistry;
import io.micronaut.context.ApplicationContextConfiguration;
import io.micronaut.context.DefaultApplicationContext;
import io.micronaut.core.annotation.NonNull;
import io.micronaut.core.io.service.SoftServiceLoader;
import io.micronaut.inject.BeanDefinitionReference;

import java.util.List;



In [13]:
def javalang_codebleu_tokenizer_adapter(tokens):
    tokens = list(tokenizer.tokenize(tokens))
    return [token.value for token in tokens]

reference= """a=1
"""
prediction= """a=1
"""
result = calc_codebleu([reference], [prediction], lang="java", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=javalang_codebleu_tokenizer_adapter)
print(result)

{'codebleu': 0.7643161313935813, 'ngram_match_score': 0.5623413251903491, 'weighted_ngram_match_score': 0.4949232003839766, 'syntax_match_score': 1.0, 'dataflow_match_score': 1.0}


# Run SnapLogic Code

In [4]:
import os
import subprocess
import re

from config import config_dict, logger

In [5]:
result = subprocess.run(['ls', '-lh', config_dict['code_base_directory']], capture_output=True, text=True)
print(result.stdout)
result




CompletedProcess(args=['ls', '-lh', '../snap_v4_clone/'], returncode=1, stdout='', stderr='ls: ../snap_v4_clone/: No such file or directory\n')

In [6]:
os.chdir('/home/ubuntu/capstone-code-generation')
os.chdir(config_dict['code_base_directory'])

# mvn install -DskipTests to build without running tests

module = 'transform'
command_output = subprocess.run([f'/opt/apache-maven-3.6.3/bin/mvn clean install -pl :{module} -am'], capture_output=True, text=True, shell=True)
print(command_output.stdout)
command_output

FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/capstone-code-generation'

In [7]:
result = {
    'pass': command_output.returncode == 0,
    'failure_count': None,
    'failed_tests': []
}
indicator = False
for line in command_output.stdout.split('\n'):

    if indicator and "[ERROR] Tests run:" in line:
        print(line)
        failures_match = re.search(r"Failures: (\d+)", line)
        result['failure_count'] = int(failures_match.group(1))
        indicator = False
    
    if indicator and  "[ERROR]   " in line:
        print(line)
        failed_test = line.split(' ')[3]
        result['failed_tests'].append(failed_test)
        
    if not indicator and line == "[ERROR] Failures: ":
        indicator = True
        print(line)

print(result)


{'pass': True, 'failure_count': None, 'failed_tests': []}


# Calculate Pass@k

In [1]:
from collections import ChainMap
from datetime import datetime
import logging
import json
from multiprocessing import Pool
import os
from pathlib import Path
import re
from typing import List, Tuple, Dict
import subprocess

from codebleu import calc_codebleu
from javalang import tokenizer
import numpy as np
import pandas as pd
from tqdm import tqdm

from config import config_dict, logger

In [2]:
def evaluate_responses(generated_responses_directory: str, output_directory: str, evaluation_metrics: List=config_dict["evaluation_metrics"]):
    """
    overwrite the real code with the generated code to create the set of generated code files
    Calculate the perplexity, compilation rate, CodeBLEU, and Pass@k metrics for the generated responses
    """
    logger.info("Evaluate responses")

    for file in os.listdir(generated_responses_directory):
        if file.endswith('.csv'):
            file_path = os.path.join(generated_responses_directory, file)

            logger.info(f"Evaluating {file_path}")
            responses_df = pd.read_csv(file_path)
            responses_df['completion'] = responses_df['completion'].str.replace('\n <EOT>', '')

            # probably not possible
            if "perplexity" in evaluation_metrics:
                _calculate_perplexity(responses_df)

            if "CodeBLEU" in evaluation_metrics:
                responses_df = _calculate_code_bleu(responses_df)

            if "pass@k" in evaluation_metrics:
                responses_df = _calculate_pass_at_k(responses_df.iloc[:50])

            file = file.replace("finetune_inference_output", "evaluated_responses")
            evaluated_responses_path = os.path.join(output_directory, file)
            os.makedirs(output_directory, exist_ok=True)
            responses_df.to_csv(evaluated_responses_path, index=False)

In [3]:
def _calculate_pass_at_k(responses_df: pd.DataFrame, code_base_directory: str = config_dict['code_base_directory'], evaluation_directory: str = config_dict['evaluation_directory']):
    """
    find all of the modules
    for each module
        run maven clean install and capture the results
    for each file in the test set
        replace the code in the file
        maven install the file and capture the results
        compare the results to the module baseline
    """
    logger.info("Calculating Pass@k score")

    # we only want to run the tests in the same module as the generated code to save time
    # first find all the modules and get a baseline of which tests pass
    # this can take a very long time so it will only run if the codebase commit hash changed
    base_module_results_path = _get_base_module_results_path(code_base_directory, evaluation_directory)
    if os.path.exists(base_module_results_path):
        logger.debug("Base modules have already been built")
        with open(base_module_results_path, "r") as outfile: 
            module_results = json.load(outfile)
    else:
        module_results = _build_base_modules()
        with open(base_module_results_path, "w") as outfile: 
            json.dump(module_results, outfile, indent=4)
    

    # test each piece of generated code 
    logger.info("Building generated code and capturing results")
    
    # this is very slow to run this, so it is done in parallel
    cpu_count = os.cpu_count()
    responses_df_split = np.array_split(responses_df, cpu_count-1)
    with Pool(cpu_count-1) as p:
        generated_code_results = p.map(_build_generated_code, responses_df_split)

    # make a big dictionary of the results from each process, convert to a dataframe, and then append horizontally to the responses
    generated_code_results = dict(ChainMap(*generated_code_results))
    generated_code_results = pd.DataFrame(generated_code_results).transpose().sort_index()
    generated_code_results = pd.concat([responses_df.reset_index(drop=True), generated_code_results.reset_index(drop=True)], axis=1)

    return generated_code_results


def _get_base_module_results_path(code_base_directory, evaluation_directory) -> str:
    """
    The build results of the unaltered base modules are stored in a json dictionary.
    The name of the file has the git hash of the commi of the code base directory for versioning
    """
    current_working_directory = os.getcwd()
    os.chdir(code_base_directory)
    git_hash = subprocess.run([f'git rev-parse --short HEAD'], capture_output=True, text=True, shell=True)
    git_hash = git_hash.stdout.strip()
    os.chdir(current_working_directory)

    base_module_results_file = f"base_module_build_results_{git_hash}.json"
    base_module_results_path = os.path.join(evaluation_directory, base_module_results_file)

    return base_module_results_path

In [4]:
def _build_base_modules(code_base_directory: str = config_dict['code_base_directory']) -> Dict:

    logger.info("Finding and building base (unaltered) modules")
    modules = _find_modules(code_base_directory)
    module_results = {}
    build_from_scratch = True #  build the first module from scratch
    modules = modules[49:] #FIXME
    for module in tqdm(modules):
        logger.info(module == 'salesforce')
        module_results[module] = _capture_maven_build_results(module, code_base_directory, build_from_scratch=build_from_scratch)
        build_from_scratch = False

    return module_results

In [5]:
def _find_modules(directory: str) -> List[str]:
	modules = []
	with open(directory + "/pom.xml", 'r') as f:
		while (f.readline().strip() != "<modules>"): continue
		while ((line := f.readline().strip()) != "</modules>"):
			if (line.startswith("<module>") and line.endswith("</module>")):
				modules.append(line[8:-9])
	return modules

In [16]:
def _capture_maven_build_results(module: str, code_base_directory: str=config_dict['code_base_directory'], build_from_scratch: bool=False) -> Dict:
    
    logger.info(f"Building {module}")

    # run the maven command to build the module
    current_working_directory = os.getcwd()
    os.chdir(code_base_directory)
    logger.info(os.getcwd())
    # FIXME
    # if build_from_scratch:
    #     build_from_scratch = " -am"
    build_from_scratch = ""#" -am"
    command = f'/opt/apache-maven-3.6.3/bin/mvn clean install -pl :{module}{build_from_scratch}'
    logger.info(command)
    # subprocess.run(['source /home/ubuntu/.bashrc'], shell=True)
    command_output = subprocess.run([command], capture_output=True, text=True, shell=True)
    logger.info(command_output.returncode)

    # record whether or not the entire build passed and what the failures were
    result = {
    'pass': command_output.returncode == 0,
    'failure_count': 0,
    'failed_tests': [],
    'error_count': 0
    }

    # parse the standard out to count the number failures and record the names of the failed tests
    indicator = False
    for line in command_output.stdout.split('\n'):

        # record the number of failures and errors
        if indicator and "[ERROR] Tests run:" in line:
            failures_match = re.search(r"Failures: (\d+)", line)
            result['failure_count'] = int(failures_match.group(1))

            errors_match = re.search(r"Errors: (\d+)", line)
            result['error_count'] = int(errors_match.group(1))
            indicator = False
        
        # record the name of each failed test or error
        if indicator and  "[ERROR]   " in line:
            failed_test = line.split(' ')[3]
            result['failed_tests'].append(failed_test)

        # once we find the line "[ERROR] Failures: " or "[ERROR] Errors: " we can begin parsing the information related to the failed tests (aka indicator=True)
        if not indicator and (line == "[ERROR] Failures: " or line == "[ERROR] Errors: "):
            indicator = True

    logger.info("Changing working directory back")
    os.chdir(current_working_directory)

    return result

In [24]:
config_dict['run_ts']

'pre_20231127'

In [26]:
os.chdir('/home/ubuntu/capstone-code-generation')
if not config_dict["run_ts"]:
    run_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
else:
    run_ts = config_dict["run_ts"]

run_folder_path = 'data/run_' + str(run_ts) + '/'
config_dict["output_directory"] = run_folder_path
# config_dict["prompt_directory"] = config_dict["output_directory"] + "/prompts"
config_dict["generated_responses_directory"] = config_dict["output_directory"] + "/generated_responses"
# config_dict["evaluation_directory"] = config_dict["output_directory"] + "/evaluated_responses"

# _capture_maven_build_results('salesforce', config_dict['code_base_directory'], False)
# _build_base_modules(config_dict['code_base_directory'])
# _calculate_pass_at_k(pd.DataFrame(),config_dict['code_base_directory'], config_dict['evaluation_directory'])
evaluate_responses(
            config_dict["generated_responses_directory"],
            config_dict["evaluation_directory"]
        )

2023-11-29 14:13:20,244 - INFO - Evaluate responses
2023-11-29 14:13:20,245 - INFO - Evaluating data/run_pre_20231127//generated_responses/20231105_lora-20231026-161421_finetune_inference_output.csv
2023-11-29 14:13:20,287 - INFO - Calculating Pass@k score
2023-11-29 14:13:20,290 - INFO - Finding and building base (unaltered) modules
  0%|          | 0/19 [00:00<?, ?it/s]2023-11-29 14:13:20,291 - INFO - True
2023-11-29 14:13:20,292 - INFO - Building salesforce
2023-11-29 14:13:20,293 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 14:13:20,293 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :salesforce
2023-11-29 14:14:25,344 - INFO - 0
2023-11-29 14:14:25,345 - INFO - Changing working directory back
  5%|▌         | 1/19 [01:05<19:30, 65.05s/it]2023-11-29 14:14:25,346 - INFO - False
2023-11-29 14:14:25,346 - INFO - Building sap
2023-11-29 14:14:25,347 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 14:14:25,348 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :sap
 

KeyboardInterrupt: 

In [46]:
def get_base_module_results_path():
    """
    The build results of the unaltered base modules are stored in a json dictionary.
    The name of the file has the git hash of the commi of the code base directory for versioning
    """
    current_working_directory = os.getcwd()
    os.chdir(config_dict['code_base_directory'])
    git_hash = subprocess.run([f'git rev-parse --short HEAD'], capture_output=True, text=True, shell=True)
    git_hash = git_hash.stdout.strip()
    os.chdir(current_working_directory)

    base_module_results_file = f"base_module_build_results_{git_hash}.json"
    base_module_results_path = os.path.join(config_dict['evaluation_directory'], base_module_results_file)

    return base_module_results_path


In [47]:
def build_generated_code(responses_df: pd.DataFrame) -> Dict:
    """
    Build and analyze generated code based on responses DataFrame.

    This function takes a DataFrame of responses, replaces code in the original file, builds the modified module,
    and performs analysis on the generated code. It returns a dictionary containing build results and additional
    failure information.

    :param pd.DataFrame responses_df: DataFrame containing responses with 'original_file_path', 'label',
                                      and 'completion' columns.

    :return dict: A dictionary containing build results and additional failure information for each generated code entry.
                 The dictionary has the following structure:
                 {
                     '[index]': {
                         'additional_failure_count': int,
                         'additional_failed_tests': List[str],
                         'additional_pass': bool,
                         # other build results...
                     },
                     # other entries...
                 }

    :raises AssertionError: If there are mismatches in failure counts or failed tests during the analysis,
                            an AssertionError is raised with details.
    """

    # sometimes the parallelization will send a dataframe with no rows
    if len(responses_df) > 0:
        
        # need build results from base modules for comparison
        base_module_results_path = get_base_module_results_path()
        with open(base_module_results_path, "r") as outfile: 
            module_results = json.load(outfile)
        
        generated_code_results = {}
        for index, row in tqdm(responses_df.iterrows()):

            # replace the original code in the original file in the code base
            original_file_path = row['original_file_path']
            _replace_text_in_file(original_file_path, row['label'], row['completion'])

            # build the module again
            module = re.search(r"snap_v4_clone/(.+)/src/", original_file_path).group(1)
            logger.info(f"Building {module}")
            generated_code_results[str(index)] = _capture_maven_build_results(module)

            # check if there are any additional failed tests beyond what failed in the unaltered base module
            generated_code_results[str(index)]['additional_failure_count'] = generated_code_results[str(index)]['failure_count'] - module_results[module]['failure_count']
            generated_code_results[str(index)]['additional_failed_tests'] = list(set(generated_code_results[str(index)]['failed_tests']) - set(module_results[module]['failed_tests']))
            generated_code_results[str(index)]['additional_pass'] = generated_code_results[str(index)]['additional_failure_count'] == 0 and generated_code_results[str(index)]['pass']

            # sanity check that the number of additional failures == the number of additional failed tests
            # and if that passes test that the names of the tests are the same
            try:
                assert generated_code_results[str(index)]['additional_failure_count'] == len(generated_code_results[str(index)]['additional_failed_tests']), f"Mismatch in failure count and number of failed tests for index {index}"
                assert sorted(generated_code_results[str(index)]['failed_tests']) == sorted(module_results[module]['failed_tests']), f"Mismatch in failed tests for index {index}"
            except AssertionError as e:
                print(f"Assertion Error: {e}")

            # put the original code back in the module and continue with the next iteration of generated code
            _replace_text_in_file(original_file_path, row['completion'], row['label'])

        return generated_code_results

In [48]:
def _replace_text_in_file(file_path: str, search_text: str, replace_text: str) -> None:
    """
    Replace occurrences of a specified text in a file.

    :param str file_path: The path to the file to be processed.
    :param str search_text: The text to be replaced.
    :param str replace_text: The text to replace the occurrences of `search_text`.

    :return: None. The function modifies the specified file in-place.
    :rtype: None

    :raises Exception: If an error occurs during the file processing, an exception is caught,
                      and an error message is printed, indicating the file and the nature of the error.
    """
    try:
        with open(file_path, 'r') as file:
            file_content = file.read()
        
        # Use regular expressions to replace the search_text with replace_text
        modified_content = file_content.replace(search_text, replace_text)

        with open(file_path, 'w') as file:
            file.write(modified_content)

    except Exception as e:
        print(f'Error while processing {file_path}: {e}')

In [49]:
os.chdir('/home/ubuntu/capstone-code-generation/')
logger = logging.getLogger(__name__)

generated_responses_directory = config_dict['generated_responses_directory']
file = '20231112_lora-20231107-201630_finetune_inference_output.csv'
file_path = os.path.join(generated_responses_directory, file)
responses_df = pd.read_csv(file_path, nrows=12)
responses_df['completion'] = responses_df['completion'].str.replace('\n <EOT>', '')

generated_code_results = _calculate_pass_at_k(responses_df)
print(generated_code_results)


  return bound(*args, **kwds)
1it [00:14, 14.99s/it]
1it [00:15, 15.28s/it]
2it [00:21, 10.87s/it]
2it [00:26, 13.07s/it]
2it [00:35, 17.52s/it]
2it [00:39, 19.67s/it]
2it [01:40, 50.28s/it]


                                                 path  \
0   /Users/ChaliniA/Documents/capstone-code-genera...   
1   /Users/ChaliniA/Documents/capstone-code-genera...   
2   /Users/ChaliniA/Documents/capstone-code-genera...   
3   /Users/ChaliniA/Documents/capstone-code-genera...   
4   /Users/ChaliniA/Documents/capstone-code-genera...   
5   /Users/ChaliniA/Documents/capstone-code-genera...   
6   /Users/ChaliniA/Documents/capstone-code-genera...   
7   /Users/ChaliniA/Documents/capstone-code-genera...   
8   /Users/ChaliniA/Documents/capstone-code-genera...   
9   /Users/ChaliniA/Documents/capstone-code-genera...   
10  /Users/ChaliniA/Documents/capstone-code-genera...   
11  /Users/ChaliniA/Documents/capstone-code-genera...   

                                               prompt  \
0   <PRE> import org.apache.commons.lang3.StringUt...   
1   <PRE>     private static final String PRODUCT_...   
2   <PRE> /*\n * SnapLogic - Data Integration\n *\...   
3   <PRE> public abstract clas

In [50]:
generated_code_results

Unnamed: 0,path,prompt,label,line_to_remove_index,last_line_to_remove_index,completion,file_name,original_file_path,pass,failure_count,failed_tests,additional_failure_count,additional_failed_tests,additional_pass
0,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> import org.apache.commons.lang3.StringUt...,"propertyBuilder.describe(FOLDER_ID, FO...",87,119,defineRetryProperties(propertyBuilder);...,BoxPermissions.java,../snap_v4_clone/box/src/main/java/com/snaplog...,False,0,[],0,[],True
1,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> private static final String PRODUCT_...,"propertyBuilder.describe(ZOBJECT, OBJE...",207,223,// define additional properties\n ...,Read.java,../snap_v4_clone/exchange/src/main/java/com/sn...,False,0,[],0,[],True
2,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> /*\n * SnapLogic - Data Integration\n *\...,"// Target Type (Public Channel, Privat...",46,62,super.defineProperties(propertyBuilder)...,UpdateMessage.java,../snap_v4_clone/slack/src/main/java/com/snapl...,False,0,[],0,[],True
3,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> public abstract class AsyncJobIdCommon e...,"propertyBuilder.describe(JOB_ID, JOB_I...",89,96,"// By default, there's no additional pr...",AsyncJobIdCommon.java,../snap_v4_clone/netsuite/src/main/java/com/sn...,False,0,[],0,[],True
4,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> private String xmatch;\n private ...,propertyBuilder.describe(PROP_EXCHANGE...,182,285,propertyBuilder.describe(PROP_QUEUE_PRO...,MessageConsumer.java,../snap_v4_clone/rabbitmq/src/main/java/com/sn...,False,0,[],0,[],True
5,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> import org.apache.commons.lang3.builder....,ServiceNowHelper.defineInstancePropert...,78,126,defineClientId(propertyBuilder);\n ...,ServiceNowOAuth2Account.java,../snap_v4_clone/servicenow/src/main/java/com/...,False,0,[],0,[],True
6,/Users/ChaliniA/Documents/capstone-code-genera...,"<PRE> * Copyright (C) 2023, SnapLogic, Inc. ...",propertyBuilder\n .desc...,53,59,propertyBuilder.describe(ACCOUNT_KEY_LA...,GoogleServiceAccount.java,../snap_v4_clone/google/analytics4/src/main/ja...,False,0,[],0,[],True
7,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> import com.snaplogic.snap.api.PropertyVa...,super.defineProperties(propertyBuilder...,72,147,super.defineProperties(propertyBuilder)...,ConfluentKafkaSSLAccount.java,../snap_v4_clone/confluentkafka/src/main/java/...,False,0,[],0,[],True
8,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> private static final String ATTACHME...,"builder.describe(TO_PROP, TO_LABEL, TO...",231,284,builder.describe(TO_PROP)\n ...,EmailSender.java,../snap_v4_clone/email/src/main/java/com/snapl...,False,0,[],0,[],True
9,/Users/ChaliniA/Documents/capstone-code-genera...,<PRE> import com.snaplogic.snap.api.capabiliti...,}\n,79,80,propertyBuilder.describe(ViewType.DOCUM...,SchemaSnap.java,../snap_v4_clone/japis/jtest/src/test/java/com...,False,0,[],0,[],True


In [7]:
import os
import re

process_code_base_directory = f'../temp_code_base_directory_2_1/'
original_file_path = '../snap_v4_clone/db/elt/src/main/java/com/snaplogic/snaps/elt/EltSCD2.java'
original_file_path = os.path.join(process_code_base_directory, original_file_path.replace('../snap_v4_clone/', ''))
print(original_file_path)
print(fr"{process_code_base_directory}/([^/]+)/")
module = re.search(fr"{process_code_base_directory}([^/]+)/", original_file_path).group(1)
print(module)

../temp_code_base_directory_2_1/db/elt/src/main/java/com/snaplogic/snaps/elt/EltSCD2.java
../temp_code_base_directory_2_1//([^/]+)/
db


# Debugging Salesforce Module

In [1]:
from config import config_dict
import evaluate_responses as ef
import os

2023-11-29 17:26:28,161 - INFO - NumExpr defaulting to 8 threads.


In [28]:
ef._capture_maven_build_results('salesforce')

2023-11-29 14:51:56,779 - INFO - Building salesforce
2023-11-29 14:51:56,780 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 14:51:56,780 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :salesforce
2023-11-29 14:53:02,057 - INFO - 0
2023-11-29 14:53:02,058 - INFO - Changing working directory back


{'pass': True, 'failure_count': 0, 'failed_tests': [], 'error_count': 0}

In [29]:
ef._build_base_modules()

2023-11-29 14:53:41,950 - INFO - Finding and building base (unaltered) modules
  0%|          | 0/19 [00:00<?, ?it/s]2023-11-29 14:53:41,952 - INFO - True
2023-11-29 14:53:41,952 - INFO - Building salesforce
2023-11-29 14:53:41,953 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 14:53:41,953 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :salesforce
2023-11-29 14:54:47,581 - INFO - 0
2023-11-29 14:54:47,582 - INFO - Changing working directory back
  5%|▌         | 1/19 [01:05<19:41, 65.63s/it]2023-11-29 14:54:47,583 - INFO - False
2023-11-29 14:54:47,583 - INFO - Building salesforce
2023-11-29 14:54:47,584 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 14:54:47,584 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :salesforce
2023-11-29 14:55:52,808 - INFO - 0
2023-11-29 14:55:52,809 - INFO - Changing working directory back
 11%|█         | 2/19 [02:10<18:31, 65.39s/it]2023-11-29 14:55:52,810 - INFO - False
2023-11-29 14:55:52,810 - INFO - Building salesforce
2023-1

KeyboardInterrupt: 

In [33]:
!python --version

Python 3.10.9


In [34]:
ef._calculate_pass_at_k(pd.DataFrame())

2023-11-29 16:27:20,935 - INFO - Calculating Pass@k score
2023-11-29 16:27:20,939 - INFO - Finding and building base (unaltered) modules
  0%|          | 0/19 [00:00<?, ?it/s]2023-11-29 16:27:20,940 - INFO - True
2023-11-29 16:27:20,941 - INFO - Building salesforce
2023-11-29 16:27:20,941 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 16:27:20,942 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :salesforce
2023-11-29 16:28:50,479 - INFO - 1
2023-11-29 16:28:50,480 - INFO - Changing working directory back
  5%|▌         | 1/19 [01:29<26:51, 89.54s/it]2023-11-29 16:28:50,481 - INFO - False
2023-11-29 16:28:50,481 - INFO - Building salesforce
2023-11-29 16:28:50,482 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 16:28:50,482 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :salesforce
  5%|▌         | 1/19 [01:55<34:30, 115.03s/it]


KeyboardInterrupt: 

In [2]:
if not config_dict["run_ts"]:
    run_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
else:
    run_ts = config_dict["run_ts"]

run_folder_path = 'data/run_' + str(run_ts) + '/'
config_dict["output_directory"] = run_folder_path
config_dict["prompt_directory"] = config_dict["output_directory"] + "prompts/"
config_dict["generated_responses_directory"] = config_dict["output_directory"] + "generated_responses/"
config_dict["evaluation_directory"] = config_dict["output_directory"] + "evaluated_responses/"

print(config_dict["generated_responses_directory"],
        config_dict["evaluation_directory"])

data/run_pre_20231127/generated_responses/ data/run_pre_20231127/evaluated_responses/


In [3]:
os.chdir('/home/ubuntu/capstone-code-generation')
ef.evaluate_responses(config_dict["generated_responses_directory"],
            config_dict["evaluation_directory"], 
            config_dict["code_base_directory"],
            config_dict["evaluation_metrics"])

2023-11-29 17:26:38,176 - INFO - Evaluate responses
2023-11-29 17:26:38,177 - INFO - Evaluating data/run_pre_20231127/generated_responses/20231105_lora-20231026-161421_finetune_inference_output.csv
2023-11-29 17:26:38,220 - INFO - Calculating Pass@k score
2023-11-29 17:26:38,224 - INFO - Finding and building base (unaltered) modules
  0%|          | 0/68 [00:00<?, ?it/s]2023-11-29 17:26:38,227 - INFO - False
2023-11-29 17:26:38,227 - INFO - Building japis
2023-11-29 17:26:38,227 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 17:26:38,228 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :japis -am
2023-11-29 17:26:41,091 - INFO - 0
2023-11-29 17:26:41,092 - INFO - Changing working directory back
  1%|▏         | 1/68 [00:02<03:12,  2.87s/it]2023-11-29 17:26:41,093 - INFO - False
2023-11-29 17:26:41,093 - INFO - Building jcores
2023-11-29 17:26:41,094 - INFO - /home/ubuntu/snap_v4_clone
2023-11-29 17:26:41,094 - INFO - /opt/apache-maven-3.6.3/bin/mvn clean install -pl :jcores
2

FileExistsError: [Errno 17] File exists: '../temp_code_base_directory_2_1/'