In [None]:
import ast
import json
import torch
import re
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Predictions

In [None]:
# Make "cuda" the default torch device before anything is loaded, so the model
# and the tensors produced by the tokenizer below are created on that device.
# NOTE(review): presumably this fails on a machine without a CUDA GPU — confirm.
torch.set_default_device("cuda")

# Load the "microsoft/phi-2" checkpoint and its matching tokenizer. Both are used
# as notebook-level globals by the prediction cells that follow.
# NOTE(review): trust_remote_code=True lets the checkpoint repository supply its
# own Python code — only keep it for sources you trust.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

In [None]:
# Matches the keyword ``def`` only as a whole word, so identifiers such as
# ``default`` or ``undefined`` are not mistaken for the start of a new function.
_NEXT_DEF_PATTERN = re.compile(r'\bdef\b')


def _extract_function_body(generated_code, prompt):
    """Return the single-line function body that follows the prompt's docstring.

    Args:
        generated_code: Full decoded model output (prompt plus completion).
        prompt: The exact prompt text that was fed to the model.

    Returns:
        The text between the end of the docstring and the next ``def`` keyword
        (or the end of the output), with newlines replaced by spaces and runs
        of spaces collapsed to one.
    """
    if generated_code.startswith(prompt):
        # Usual case: the decoded output echoes the prompt verbatim, so the
        # completion starts right after it.
        body_start = len(prompt)
    else:
        # Fallback when decoding did not reproduce the prompt exactly: skip
        # past the *closing* docstring delimiter (the second '"""').
        opening = generated_code.find('"""')
        closing = generated_code.find('"""', opening + 3) if opening != -1 else -1
        if closing != -1:
            body_start = closing + 3
        elif opening != -1:
            body_start = opening + 3
        else:
            body_start = 0

    # Cut at the next function definition, searching only after the docstring
    # so that words inside the docstring can never end the body early.
    next_def = _NEXT_DEF_PATTERN.search(generated_code, body_start)
    body_end = next_def.start() if next_def else len(generated_code)

    function_body = generated_code[body_start:body_end].strip()
    function_body = function_body.replace('\n', ' ')
    # If the completion itself contains a '"""', keep only what follows the last one.
    function_body = function_body.split('"""')[-1].strip()
    return re.sub(' +', ' ', function_body)


def generate_prediction_from_row(row):
    """Generate a one-line function body for a dataset row.

    Builds a prompt from the row's signature followed by its docstring wrapped
    in triple quotes, lets the notebook-level ``model`` complete it, and
    returns the completion flattened to a single line.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            ``'signature'`` and ``'docstring'`` keys.

    Returns:
        The generated function body as a single-line string.
    """
    prompt = f"{row['signature']}\n\"\"\"\n{row['docstring']}\n\"\"\""

    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
    outputs = model.generate(**inputs, max_length=1000)
    generated_code = tokenizer.batch_decode(outputs)[0]

    return _extract_function_body(generated_code, prompt)

def predictions(file_path, output_path='predictions.txt', max_rows=100):
    """Generate predictions for a dataset file and write them one per line.

    Args:
        file_path: Path to the dataset; must end with ``.csv`` or ``.json``.
            Each row is passed to ``generate_prediction_from_row``.
        output_path: Text file that receives one prediction per line. An
            existing file at this path is overwritten.
        max_rows: Maximum number of rows to process, counted from the start
            of the dataset (non-negative). Pass ``None`` to process every row.

    Raises:
        ValueError: If ``file_path`` has neither a ``.csv`` nor a ``.json``
            extension.
    """
    if file_path.endswith('.csv'):
        data = pd.read_csv(file_path)
    elif file_path.endswith('.json'):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        data = pd.DataFrame(data)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .json file.")

    if max_rows is not None:
        data = data.head(max_rows)

    # Explicit UTF-8 so generated code with non-ASCII characters is written
    # the same way regardless of the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        for count, (_, row) in enumerate(data.iterrows(), start=1):
            function_body = generate_prediction_from_row(row)
            f.write(function_body + '\n')
            print(count)  # progress indicator

    # Report the real destination, also when the dataset had fewer rows than max_rows.
    print(f"All generated codes have been written to {output_path}")

In [None]:
# Generate predictions for the Kotlin dataset. No output_path is given, so the
# results go to the default 'predictions.txt'.
predictions('./data/datasets/kotlin_full.csv')

In [None]:
# Generate predictions for the Python dataset.
# NOTE(review): no output_path is passed, so this writes to the same default
# 'predictions.txt' as the Kotlin run in the previous cell and replaces its contents.
predictions('./data/datasets/python_full.json')

In [None]:
# Write one flattened prediction per line for the first 100 rows of df_python_test.
# NOTE(review): df_python_test is not defined in any cell visible above — it has
# to come from another cell; confirm it exists before running on a fresh kernel.
with open('predictions.txt', 'w') as f:
    for i, (index, row) in enumerate(df_python_test.iterrows(), start=1):
        # Reuse the shared helper instead of repeating its prompt, generation
        # and post-processing steps inline, so both code paths stay in sync.
        f.write(generate_prediction_from_row(row) + '\n')
        print(i)  # progress indicator
        if i == 100:
            print("All generated codes have been written to predictions.txt")
            break
    

```
Evaluator
We provide a script that evaluates the predictions for this task and reports the accuracy score. You can run the script like this:

python evaluator.py -a=evaluator/answers.txt -p=evaluator/predictions.txt
Each line in the *.txt file is an output.
```

# Evaluations