In [624]:
from typing import Union, List
import argparse
import glob
import os
from datetime import datetime
from pathlib import Path
import sys
import json
import time
import logging
import random
import re
import math
from itertools import chain
from string import punctuation
import tokenize
from nltk.translate import bleu_score

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import textwrap
from sklearn import metrics
import statistics

from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    TFGPT2Model,
    GPT2Tokenizer,
    OpenAIGPTTokenizer,
    RobertaTokenizer,
    get_linear_schedule_with_warmup
)
from data.dataset import ComplexUtteranceCodeDataset

torch.manual_seed(42)

[nltk_data] Downloading package punkt to /Users/asaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x15bedeb70>

In [619]:
import sys
import os 

WORK_AREA = '..'
os.chdir(WORK_AREA)

paths = ['./src/', './src/api/v6', './notebooks/src']
for path in paths:
    path = os.path.normcase(path)
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [614]:
from enum import Enum

class ModelFlavour(Enum):
    Text2Code = "text2code"
    Text2Rep = "text2rep"
    Rep2Code = "rep2code"
    Rep2Rep = "rep2rep"
    TextRep2Rep = "textrep2rep"
    TextRep2Code = "textrep2code"


class Model(Enum):
    T5Base = "t5-base"
    CodeT5Small = "codet5-small"
    CodeT5Base = "codet5-base"
    CodeT5P220m = "codet5p-220m"
    GPT2Small = "gpt2-small"
    GPT2Medium = "gpt2-medium"


model_flavour_params = {
    ModelFlavour.Text2Code: dict(
        slug = "text2code",
        input_prefix = "text to code: ",
        input_label = "text",
        target_label = "code",
    ),
    ModelFlavour.Text2Rep: dict(
        slug = "text2rep",
        input_prefix = "text to rep: ",
        input_label = "text",
        target_label = "code_rep",
    ),
    ModelFlavour.Rep2Code: dict(
        slug = "rep2code",
        input_prefix = "rep to code: ",
        input_label = "lang_rep",
        target_label = "code",
    ),
    ModelFlavour.Rep2Rep: dict(
        slug = "rep2rep",
        input_prefix = "rep to rep: ",
        input_label = "lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Rep: dict(
        slug = "text_rep2rep",
        input_prefix = "text and rep to rep: ",
        input_label = "text_lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Code: dict(
        slug = "textrep2code",
        input_prefix = "text and rep to code: ",
        input_label = "text_lang_rep",
        target_label = "code",
    ),
}

pretrained_model_names_mapping = {
    Model.T5Base: "t5-base",
    Model.CodeT5Small: "Salesforce/codet5-small",
    Model.CodeT5Base: "Salesforce/codet5-base",
    Model.CodeT5P220m: "Salesforce/codet5p-220m",
    Model.GPT2Small: "gpt2",
    Model.GPT2Medium: "gpt2-medium",
}

In [615]:
from enum import Enum

class ModelFlavour(Enum):
    Text2Code = "text2code"
    Text2Rep = "text2rep"
    Rep2Code = "rep2code"
    Rep2Rep = "rep2rep"
    TextRep2Rep = "textrep2rep"
    TextRep2Code = "textrep2code"


class Model(Enum):
    T5Base = "t5-base"
    CodeT5Small = "codet5-small"
    CodeT5Base = "codet5-base"
    CodeT5P220m = "codet5p-220m"
    GPT2Small = "gpt2-small"
    GPT2Medium = "gpt2-medium"


model_flavour_params = {
    ModelFlavour.Text2Code: dict(
        slug = "text2code",
        input_prefix = "text to code: ",
        input_label = "text",
        target_label = "code",
    ),
    ModelFlavour.Text2Rep: dict(
        slug = "text2rep",
        input_prefix = "text to rep: ",
        input_label = "text",
        target_label = "code_rep",
    ),
    ModelFlavour.Rep2Code: dict(
        slug = "rep2code",
        input_prefix = "rep to code: ",
        input_label = "lang_rep",
        target_label = "code",
    ),
    ModelFlavour.Rep2Rep: dict(
        slug = "rep2rep",
        input_prefix = "rep to rep: ",
        input_label = "lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Rep: dict(
        slug = "text_rep2rep",
        input_prefix = "text and rep to rep: ",
        input_label = "text_lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Code: dict(
        slug = "textrep2code",
        input_prefix = "text and rep to code: ",
        input_label = "text_lang_rep",
        target_label = "code",
    ),
}


pretrained_model_names_mapping = {
    Model.T5Base: "t5-base",
    Model.CodeT5Small: "Salesforce/codet5-small",
    Model.CodeT5Base: "Salesforce/codet5-base",
    Model.CodeT5P220m: "Salesforce/codet5p-220m",
    Model.GPT2Small: "gpt2",
    Model.GPT2Medium: "gpt2-medium",
}


def load_tokenizer(model_flavour: ModelFlavour, pretrained_model_name_or_path: str):
  if model_flavour in [Model.T5Base]:
    return T5Tokenizer.from_pretrained(pretrained_model_name_or_path)
  elif model_flavour in [Model.CodeT5Small, Model.CodeT5Base, Model.CodeT5P220m]:
    return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path)
  elif model_flavour in [Model.GPT2Small]:
    return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path)
  elif model_flavour in [Model.GPT2Medium]:
    return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
  else:
    raise ValueError(f"No such model flavour {model_flavour}")


def load_model(model_flavour: ModelFlavour, pretrained_model_name_or_path: str):
  if model_flavour in [Model.T5Base, Model.CodeT5Small, Model.CodeT5Base, Model.CodeT5P220m]:
    return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
  elif model_flavour in [Model.GPT2Small, Model.GPT2Medium]:
    return TFGPT2Model.from_pretrained(pretrained_model_name_or_path)
  else:
    raise ValueError(f"No such model flavour {model_flavour}")

In [616]:
from typing import List, Union, Optional, TypeVar, Generic
import os
import pandas as pd
import ast
import math
import glob
from representations.tree.tree import Tree
from representations.builders.ast.tearers.tearer_factory import TearerFactory
import tokenize
from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
from sklearn import metrics
from tqdm.auto import tqdm


def parse_code_rep_to_code(code_rep: str, verbose: str = "Fatal") -> str:
    try:
        tree = Tree.unparse(code_rep)
        tearer = TearerFactory().get_tearer(tree.root_node)
        asdl = tearer.tear(tree.root_node)
        code = ast.unparse(asdl)
    except Exception as e:
        if verbose == "Error":
            print(f"[Error] failed to prase code rep to code:\n", e)
        code = ""
    finally:
        return code


def build_test_code(
    code: str,
    imports: str,
    test: str,
    code_embed_str: str = "# end code block to test",
    fail_on_error: bool = False,
    verbose: str = "Fatal",
):
    try:
        code_insert_idx = test.find(code_embed_str)
        program_code = imports
        program_code += "\n"
        program_code += test[:code_insert_idx]
        program_code += code
        program_code += "\n"
        program_code += test[code_insert_idx:]
    except Exception as e:
        if verbose == "Error":
            print("[ERROR] Failed to unparse code rep to code\n", e)
        if fail_on_error:
            raise e
        program_code = ""
    finally:
        return program_code


def tokenize_source(code):
    file_path = "/tmp/example.py"

    with open(file_path, "w") as text_file:
        text_file.write(code)

    with open(file_path, "rb") as f:
        tokens_gen = tokenize.tokenize(f.readline)

        tokens = [token.string for token in tokens_gen]

    os.remove(file_path)
    return tokens


def eval_code(code: str):
    test_results = {}
    try:
        context = {}
        exec(code, context)
        test_results = context.get("test_results", {})
        test_results["execution_success"] = test_results.get("execution_success", 0) + 1
    except AssertionError as e:
        test_results["assertion_failure"] = test_results.get("assertion_failure", 0) + 1
    except Exception as e:
        test_results["execution_failure"] = test_results.get("execution_failure", 0) + 1

    code_failure = test_results.get("code_failure", 0)
    assertion_failure = test_results.get("assertion_failure", 0)
    execution_failure = test_results.get("execution_failure", 0)
    execution_success = test_results.get("execution_success", 0)
    correct = test_results.get("correct", 0)
    incorrect = test_results.get("incorrect", 0)
    total = (correct + incorrect) or math.inf
    accuracy = (1 - code_failure) * (correct / total)

    results = dict(
        code_failure=code_failure,
        execution_success=execution_success,
        execution_failure=execution_failure,
        assertion_failure=assertion_failure,
        correct=correct,
        incorrect=incorrect,
        accuracy=accuracy,
    )

    return results


def generate_predictions(
    model, tokenizer, dataloader, gold_column, id_labels, max_length, k
):
    model.eval()
    outputs = []
    targets = []
    ks = []
    ids = {}
    for id_label in id_labels:
        ids[id_label] = []

    for batch in tqdm(dataloader):
        outs = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=max_length,
            do_sample=k>1,
            num_return_sequences=k
        )

        output = [tokenizer.decode(out, skip_special_tokens=True) for out in outs]
        target = [t.strip() for t in list(np.repeat(batch[gold_column], k))]

        outputs.extend(output)
        targets.extend(target)
        ks.extend(list(np.arange(k)) * (batch["input_ids"].shape[0]))
        for id_label in id_labels:
            ids[id_label].extend(list(np.repeat(batch[id_label], k)))

    data = pd.DataFrame({
        **{
            "output": outputs,
            "target": targets,
            "k": ks,
        },
        **ids
    })

    return data


def eval_model(data: pd.DataFrame):
    results = dict(
        exact=metrics.accuracy_score(data["target"], data["output"]),
        bleu=None,
        humaneval=eval_model_humaneval(data["target"], data["output"]),
    )

    return results


def humaneval_accuracy_score(
    data: pd.DataFrame,
    code_column_name: str = "pred_code",
    score_id_labels1: Union[str, List[str]] = ["sample_id", "k"],
    score_id_labels2: Union[str, List[str]] = "sample_id",
    score_column_name: str = "accuracy",
):
    test_codes = data.apply(
        lambda x: build_test_code(
            code=x[code_column_name], imports=x["imports"], test=x["test"]
        ),
        axis=1,
    )
    test_results = test_codes.apply(lambda test_code: eval_code(test_code))
    test_results_df = pd.DataFrame.from_records(
        test_results.values, index=test_results.index
    )
    test_scores = (
        test_results_df.reset_index(drop=False)
        .groupby(score_id_labels1)[score_column_name]
        .mean()
    )
    score = (
        test_scores.reset_index(drop=False)
        .groupby(score_id_labels2)[score_column_name]
        .max()
        .mean()
    )
    return dict(score=score, results=test_results_df)


def bleu_accuracy_score(
    data: pd.DataFrame,
    generated_column="output",
    gold_column="code",
    score_id_labels1: Union[str, List[str]] = ["sample_id", "k"],
    score_id_labels2: Union[str, List[str]] = "sample_id",
    score_column_name: str = "bleu_score",
):
    eval_results = data.apply(
        lambda x: eval_bleu(x[gold_column], x[generated_column]), axis=1
    )
    eval_results_df = eval_results.to_frame("bleu_score")
    test_scores = (
        eval_results_df.reset_index(drop=False)
        .groupby(score_id_labels1)[score_column_name]
        .mean()
    )
    score = (
        test_scores.reset_index(drop=False)
        .groupby(score_id_labels2)[score_column_name]
        .max()
        .mean()
    )
    return dict(score=score, results=eval_results_df)


def model_eval(
    results_df=None,
    results_file_path=None,
    output_column="output",
    gold_column="code",
    parse_to_code=False,
    compute_humanval=True,
    compute_bleu=True,
):
    results_df = (
        pd.read_csv(results_file_path) if results_file_path else results_df.copy()
    )
    results_df["sample_id"] = results_df["sample_id"].astype(int)
    results_df.set_index(["sample_id", "sample_minor_id", "k"], inplace=True)
    results_df.sort_index(inplace=True)

    code_column = "generated_code"
    if parse_to_code:
        results_df[code_column] = results_df[output_column].apply(
            lambda x: parse_code_rep_to_code(x)
        )
    else:
        results_df[code_column] = results_df[output_column]

    results_df["test"] = results_df["test"].str.replace(
        "= next(iterator)", "= next(iterator, None)"
    )
    results_df[code_column] = results_df[code_column].str.replace(
        " = ContentType.", " = MessageContentType."
    )
    results_df[code_column] = results_df[code_column].str.replace(
        "Message.", "Messages."
    )

    humaneval_results = (
        humaneval_accuracy_score(data=results_df, code_column_name=code_column)
        if compute_humanval
        else {}
    )

    bleu_results = (
        bleu_accuracy_score(
            data=results_df, generated_column=code_column, gold_column=gold_column
        )
        if compute_bleu
        else {}
    )

    results = dict(humaneval=humaneval_results, bleu=bleu_results)
    return results


def eval_model_humaneval(
    data: pd.DataFrame,
    code_column_name: str = "pred_code",
    score_id_labels: Union[str, List[str]] = "sample_id",
    score_column_name: str = "accuracy",
):
    test_codes = data.apply(
        lambda x: build_test_code(
            code=x[code_column_name], imports=x["imports"], test=x["test"]
        ),
        axis=1,
    )
    test_results = test_codes.apply(lambda test_code: eval_code(test_code))

    test_results_df = pd.DataFrame.from_records(
        test_results.values, index=test_results.index
    )
    score = (
        test_results_df.reset_index(drop=False)
        .groupby(score_id_labels)[score_column_name]
        .mean()
        .mean()
    )
    return score, test_results_df


def eval_bleu(code, generated_code):
    if not code or not generated_code:
        return 0

    hypothesis = tokenize_source(code)

    try:
        reference = tokenize_source(generated_code)
    except:
        return 0

    n = max(min(len(hypothesis), 4), 1)
    weight = 1 / n
    weights = (weight,) * n
    smoothing_function = SmoothingFunction().method4
    score = bleu_score.sentence_bleu(
        [reference], hypothesis, weights=weights, smoothing_function=smoothing_function
    )
    return score


def eval_generated_code(
    df,
    model,
    tokenizer,
    k,
    dataloader,
    target_label,
    id_labels,
    max_length,
    output_column="output",
    gold_column="code",
    parse_code=False,
    file_path=None,
    should_generate_predictions=True,
    should_model_eval=True,
):
    file_exists = file_path and os.path.exists(file_path)
    if should_generate_predictions and not file_exists:
        preds_df = generate_predictions(
            model,
            tokenizer,
            k=k,
            dataloader=dataloader,
            gold_column=target_label,
            id_labels=id_labels,
            max_length=max_length,
        )

        if file_path:
            results_df = df.join(preds_df.set_index(df.index.names))
            results_df.to_csv(file_path)
            print(f"Results were saved to {file_path}")
    else:
        print(f"Loading results from {file_path}")
        results_df = pd.read_csv(file_path)

    results = None
    if should_model_eval:
        results = model_eval(
            results_df=results_df,
            parse_to_code=parse_code,
            compute_humanval=True,
            compute_bleu=True,
            output_column=output_column,
            gold_column=gold_column,
        )
        print(f"humaneval = {results['humaneval']['score']}")
        print(f"bleu = {results['bleu']['score']}")

    return results


In [621]:
from data.utils import (
    get_dataset_args,
    load_test_data,
)

test_file_path = 'build/eval_complex_utterance_to_code_with_intermediate_152_20230525.csv.gz'
test_df = load_test_data(test_file_path=test_file_path, id_labels=None)
print("test_df", test_df.shape)

shape =  (152, 12)
test_df (152, 12)
test_df (152, 12)


In [638]:
k = 10
# args = dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Text2Code, pretrained_model_path='./experiments/codet5-base-text2code-2023-05-25_125337')
args = dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='/Users/asaf/Downloads/codet5-base-rep2rep-2023-05-24_122620')
# args = dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Text2Rep, pretrained_model_path='/Users/asaf/Downloads/codet5-base-text2rep-2023-05-24_143609')
pretrained_model_path = args.get('pretrained_model_path')
selected_model_type = args.get('selected_model_type')
model_name = args.get('model_name')
print(pretrained_model_path)

/Users/asaf/Downloads/codet5-base-rep2rep-2023-05-24_122620


In [639]:
# create a tokenizer and load the model
tokenizer = load_tokenizer(
  model_flavour=model_name,
  pretrained_model_name_or_path=pretrained_model_names_mapping[model_name]
)
model = load_model(
  model_flavour=model_name,
  pretrained_model_name_or_path=pretrained_model_path
)

# selected model params
selected_model_flavour_params = model_flavour_params[selected_model_type]
target_label = selected_model_flavour_params.get('target_label')
parse_code = (target_label == 'code_rep')
slug = selected_model_flavour_params.get('slug')

# load the dataset
dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
max_length = dataset_args['max_target_length']

test_dataset = ComplexUtteranceCodeDataset(data=test_df, **dataset_args)
test_dataloader = DataLoader(test_dataset, batch_size=4, num_workers=2)

model_id = model_name.value
pretrained_model_file = [x for x in pretrained_model_path.split('/') if x][-1]
test_results_file_path = f"/Users/asaf/Downloads/results/test-{str(test_df.shape[0])}-{pretrained_model_file}-k{k}.csv.gz"
id_labels = ['test_id', 'sample_id', 'sample_minor_id']

print(f"model_id = {model_id}")
print(f"slug = {slug}")
print(f"k = {k}")

model_id = codet5-base
slug = rep2rep
k = 10


In [640]:
dataloader = test_dataloader
gold_column = target_label

model.eval()
outputs = []
targets = []
ks = []
ids = {}
for id_label in id_labels:
    ids[id_label] = []

for batch in tqdm(dataloader):
    outs = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_length=max_length,
        do_sample=k>1,
        num_return_sequences=k
    )

    output = [tokenizer.decode(out, skip_special_tokens=True) for out in outs]
    target = [t.strip() for t in list(np.repeat(batch[gold_column], k))]

    outputs.extend(output)
    targets.extend(target)
    ks.extend(list(np.arange(k)) * (batch["input_ids"].shape[0]))
    for id_label in id_labels:
        ids[id_label].extend(list(np.repeat(batch[id_label], k)))

  0%|          | 0/38 [00:02<?, ?it/s]

In [641]:
preds_df = pd.DataFrame({
    **{
        "output": outputs,
        "target": targets,
        "k": ks,
    },
    **ids
})

In [643]:
test_results_file_path

'/Users/asaf/Downloads/results/test-152-codet5-base-rep2rep-2023-05-24_122620-k10.csv.gz'

In [644]:
df = test_df.set_index(id_labels)
file_path=test_results_file_path

results_df = df.join(preds_df.set_index(df.index.names))
results_df.to_csv(file_path)
print(f"Results were saved to {file_path}")

Results were saved to /Users/asaf/Downloads/results/test-152-codet5-base-rep2rep-2023-05-24_122620-k10.csv.gz


In [632]:
results_df2 = pd.read_csv(file_path)

In [633]:
output_column = 'output'
results = model_eval(
    results_df=results_df2,
    parse_to_code=parse_code,
    compute_humanval=True,
    compute_bleu=True,
    output_column=output_column,
    gold_column=gold_column,
)
print(f"humaneval = {results['humaneval']['score']}")
print(f"bleu = {results['bleu']['score']}")

humaneval = 0.08196721311475409
bleu = 0.34636317125504384
