# Eval Models


Based on https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb#scrollTo=wvRHDkCIS91f and https://colab.research.google.com/drive/1d4xNsZbDSZ5ZqXgZjy7HyTVRLBJBVsh6#scrollTo=SDVQ04fGRb1v

## Set-up environment

Let's first install the required libraries:
* HuggingFace Transformers (for the CodeT5 model)
* HuggingFace Datasets (for loading the dataset + preprocessing it)
* PyTorch Lightning (for training)
* Weights and Biases (for logging training metrics)
* Project code from a GitHub repo

In [1]:
# !pip install -q transformers sentencepiece pytorch-lightning

In [2]:
%%bash
# Start from a clean checkout. -f keeps the cell from erroring on a fresh
# machine where there is no previous checkout to remove.
rm -rf ~/tmp/complex-utterance-to-code
git clone https://github.com/asafam/novicode.git ~/tmp/complex-utterance-to-code
ls ~/tmp/

Cloning into '/Users/asaf/tmp/complex-utterance-to-code'...


[1m[36mcomplex-utterance-to-code[m[m


In [3]:
import os
import sys

# Directories inside the cloned repository that must be importable.
paths = [
  '~/tmp/complex-utterance-to-code',
  '~/tmp/complex-utterance-to-code/notebooks/src',
  '~/tmp/complex-utterance-to-code/src',
  '~/tmp/complex-utterance-to-code/src/api/v6',
]
for path in paths:
  # The import system uses sys.path entries verbatim and never expands "~",
  # so an unexpanded "~/..." entry would not point at a real directory.
  path = os.path.normcase(os.path.expanduser(path))
  if not any(os.path.normcase(sp) == path for sp in sys.path):
      sys.path.append(path)

In [4]:
# from google.colab import drive

# Resolve the work area from the current user's home directory so it matches
# the location the clone cell writes to (~/tmp) on any machine; for the
# original author this still evaluates to '/Users/asaf/tmp'.
WORK_DRIVE = os.path.expanduser('~/tmp')
WORK_AREA = WORK_DRIVE + '/complex-utterance-to-code'

# drive.mount(WORK_DRIVE)
# Relative paths used further down (e.g. 'results/...') resolve against the repository root.
os.chdir(WORK_AREA)

In [5]:
import os
import sys

# Make the project sources importable. The home directory is expanded instead
# of hard-coded, and each entry is added only once so re-running the cell does
# not pile up duplicates in sys.path.
for _src_dir in ('src', 'notebooks/src'):
    _src_path = os.path.expanduser('~/tmp/complex-utterance-to-code/' + _src_dir)
    if _src_path not in sys.path:
        sys.path.append(_src_path)

In [6]:
from typing import Union, List
import argparse
import glob
import os
from datetime import datetime
from pathlib import Path
import sys
import json
import time
import logging
import random
import re
import math
from itertools import chain
from string import punctuation
import tokenize
from nltk.translate import bleu_score

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import textwrap
from sklearn import metrics
import statistics

from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    TFGPT2Model,
    GPT2Tokenizer,
    OpenAIGPTTokenizer,
    RobertaTokenizer,
    get_linear_schedule_with_warmup
)

from data.dataset import ComplexUtteranceCodeDataset
from data.utils import (
    get_dataset_args,
    load_test_data,
)
from eval.utils import (
    eval_generated_code,
    model_eval
)

torch.manual_seed(42)

[nltk_data] Downloading package punkt to /Users/asaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x29132be30>

### Model configuration code

In [7]:
from enum import Enum

class ModelFlavour(Enum):
    """Task variants, named ``<input>2<target>``.

    Each member is a key of ``model_flavour_params``, which supplies the input
    prefix and the dataset columns used as model input and as target.
    """

    Text2Code = "text2code"
    Text2Rep = "text2rep"
    Rep2Code = "rep2code"
    Rep2Rep = "rep2rep"
    RepRaw2RepRaw = "repraw2repraw"
    Text2RepRaw = "text2repraw"
    RepRaw2Code = "repraw2code"
    TextRep2Rep = "textrep2rep"
    TextRep2Code = "textrep2code"


class Model(Enum):
    """Supported base architectures.

    Each member is a key of ``pretrained_model_names_mapping`` and selects the
    tokenizer/model classes in ``load_tokenizer`` and ``load_model``.
    """

    T5Base = "t5-base"
    CodeT5Small = "codet5-small"
    CodeT5Base = "codet5-base"
    CodeT5P220m = "codet5p-220m"
    GPT2Small = "gpt2-small"
    GPT2Medium = "gpt2-medium"


# Per-flavour configuration:
#   slug         - short identifier; used in results file names
#   input_prefix - task prefix string for the model input (consumed by
#                  get_dataset_args; presumably prepended to the input - TODO confirm)
#   input_label  - dataset column used as the model input
#   target_label - dataset column used as the gold target
model_flavour_params = {
    ModelFlavour.Text2Code: dict(
        slug = "text2code",
        input_prefix = "text to code: ",
        input_label = "text",
        target_label = "code",
    ),
    ModelFlavour.Text2Rep: dict(
        slug = "text2rep",
        input_prefix = "text to rep: ",
        input_label = "text",
        target_label = "code_rep",
    ),
    ModelFlavour.Rep2Code: dict(
        slug = "rep2code",
        input_prefix = "rep to code: ",
        input_label = "lang_rep",
        target_label = "code",
    ),
    ModelFlavour.Rep2Rep: dict(
        slug = "rep2rep",
        input_prefix = "rep to rep: ",
        input_label = "lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Rep: dict(
        # NOTE(review): the only slug with an underscore, and it differs from
        # the enum value "textrep2rep". It matches the "...-text_rep2rep-..."
        # experiment directory names, so renaming it would change file names.
        slug = "text_rep2rep",
        input_prefix = "text and rep to rep: ",
        input_label = "text_lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Code: dict(
        slug = "textrep2code",
        input_prefix = "text and rep to code: ",
        input_label = "text_lang_rep",
        target_label = "code",
    ),
    ModelFlavour.RepRaw2RepRaw: dict(
        slug = "repraw2repraw",
        input_prefix = "raw rep to raw rep: ",
        input_label = "lang_rep_raw",
        target_label = "code_rep_raw",
    ),
    ModelFlavour.Text2RepRaw: dict(
        slug = "text2repraw",
        input_prefix = "text to raw rep: ",
        input_label = "text",
        target_label = "code_rep_raw",
    ),
    ModelFlavour.RepRaw2Code: dict(
        slug = "repraw2code",
        input_prefix = "raw rep to code: ",
        input_label = "lang_rep_raw",
        target_label = "code",
    ),
}


# Hub identifier of the base checkpoint for each architecture. eval_test_data
# loads the tokenizer from this name, while the model weights come from the
# fine-tuned experiment directory.
pretrained_model_names_mapping = {
    Model.T5Base: "t5-base",
    Model.CodeT5Small: "Salesforce/codet5-small",
    Model.CodeT5Base: "Salesforce/codet5-base",
    Model.CodeT5P220m: "Salesforce/codet5p-220m",
    Model.GPT2Small: "gpt2",
    Model.GPT2Medium: "gpt2-medium",
}


def load_tokenizer(model_flavour: "Model", pretrained_model_name_or_path: str):
    """Load the tokenizer that matches the given base architecture.

    :param model_flavour: a ``Model`` member. The parameter keeps its
        historical name for keyword callers, but it is the architecture, not a
        ``ModelFlavour``.
    :param pretrained_model_name_or_path: hub name or local path forwarded to
        ``from_pretrained``.
    :raises ValueError: if the architecture is not supported.
    """
    if model_flavour in [Model.T5Base]:
        return T5Tokenizer.from_pretrained(pretrained_model_name_or_path)
    elif model_flavour in [Model.CodeT5Small, Model.CodeT5Base, Model.CodeT5P220m]:
        return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path)
    elif model_flavour in [Model.GPT2Small, Model.GPT2Medium]:
        # Both sizes are GPT-2 checkpoints ("gpt2", "gpt2-medium").
        # OpenAIGPTTokenizer is the tokenizer of the original GPT (openai-gpt)
        # and does not match the GPT-2 vocabulary.
        return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
    else:
        raise ValueError(f"No such model flavour {model_flavour}")


def load_model(model_flavour: "Model", pretrained_model_name_or_path: str):
    """Load the PyTorch generation model for the given base architecture.

    :param model_flavour: a ``Model`` member. The parameter keeps its
        historical name for keyword callers, but it is the architecture, not a
        ``ModelFlavour``.
    :param pretrained_model_name_or_path: hub name or local checkpoint
        directory forwarded to ``from_pretrained``.
    :raises ValueError: if the architecture is not supported.
    """
    if model_flavour in [Model.T5Base, Model.CodeT5Small, Model.CodeT5Base, Model.CodeT5P220m]:
        return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
    elif model_flavour in [Model.GPT2Small, Model.GPT2Medium]:
        # The evaluation code calls model.eval() and model.generate() on torch
        # tensors. TFGPT2Model is a TensorFlow model without a language-model
        # head, so it cannot serve here; use the PyTorch LM-head model.
        from transformers import GPT2LMHeadModel

        return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path)
    else:
        raise ValueError(f"No such model flavour {model_flavour}")

### Utils

In [8]:
from typing import List, Union, Optional, TypeVar, Generic
import os
import pandas as pd
import ast
import math
import glob
from representations.tree.tree import Tree
from representations.builders.ast.tearers.tearer_factory import TearerFactory
import tokenize
from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
from sklearn import metrics
from tqdm.auto import tqdm


def parse_code_rep_to_code(code_rep: str, rules_enabled: bool = False, verbose: str = "Fatal") -> str:
    """Convert a serialized code representation back into Python source.

    The representation is parsed into a tree, torn into a Python AST by the
    tearer selected for the root node, and unparsed into source text.

    :param code_rep: serialized code representation.
    :param rules_enabled: forwarded to ``TearerFactory.get_tearer``.
    :param verbose: when equal to ``"Error"``, conversion failures are printed.
    :return: the generated source, or ``""`` if the conversion failed.
    """
    try:
        tree = Tree.unparse(code_rep)
        tearer = TearerFactory().get_tearer(tree.root_node, rules_enabled=rules_enabled)
        asdl = tearer.tear(tree.root_node)
        return ast.unparse(asdl)
    except Exception as e:
        # Best effort: a malformed model output must not abort the evaluation.
        # Returning here rather than from a ``finally`` lets KeyboardInterrupt
        # and SystemExit propagate instead of being swallowed.
        if verbose == "Error":
            print("[Error] failed to parse code rep to code:\n", e)
        return ""


def build_test_code(
    code: str,
    imports: str,
    test: str,
    code_embed_str: str = "# end code block to test",
    fail_on_error: bool = False,
    verbose: str = "Fatal",
):
    """Assemble an executable test program around a generated code snippet.

    The result is ``imports``, a newline, then ``test`` with ``code`` (plus a
    newline) inserted immediately before the ``code_embed_str`` marker.

    :param code: generated code to embed.
    :param imports: import statements placed at the top of the program.
    :param test: test harness containing the ``code_embed_str`` marker.
    :param code_embed_str: marker in ``test`` before which ``code`` is inserted.
    :param fail_on_error: re-raise assembly errors instead of returning ``""``.
    :param verbose: when equal to ``"Error"``, assembly failures are printed.
    :return: the assembled program, or ``""`` if assembly failed and
        ``fail_on_error`` is false.
    :raises Exception: the original assembly error when ``fail_on_error`` is true.
    """
    try:
        code_insert_idx = test.find(code_embed_str)
        if code_insert_idx < 0:
            # str.find returns -1 when the marker is missing; slicing with -1
            # would splice the code in front of the last character of the test.
            raise ValueError(f"marker {code_embed_str!r} not found in test")
        program_code = "".join(
            [imports, "\n", test[:code_insert_idx], code, "\n", test[code_insert_idx:]]
        )
    except Exception as e:
        if verbose == "Error":
            print("[ERROR] Failed to build test code\n", e)
        if fail_on_error:
            # No ``return`` inside ``finally`` any more, so this actually
            # reaches the caller.
            raise
        program_code = ""
    return program_code


def tokenize_source(code):
    """Split Python source text into its lexical token strings.

    Tokenizing happens in memory. The earlier round-trip through a fixed
    ``/tmp/example.py`` was not portable, could collide between concurrent
    runs, and left the file behind whenever tokenizing raised.

    :param code: Python source text.
    :return: the string of every token produced by ``tokenize.tokenize``,
        starting with the encoding token (``"utf-8"``).
    :raises tokenize.TokenError, SyntaxError: if the source cannot be tokenized.
    """
    import io

    source = io.BytesIO(code.encode("utf-8"))
    return [token.string for token in tokenize.tokenize(source.readline)]


def eval_code(code: str):
    """Execute a test program and summarize its outcome.

    The program runs with ``exec`` in a fresh namespace and may publish a
    ``test_results`` dict there; its counters are read back with 0 as the
    default. A clean run bumps ``execution_success``, an ``AssertionError``
    bumps ``assertion_failure`` and any other ``Exception`` bumps
    ``execution_failure``. When the program raises, counters it had published
    are not read back.

    NOTE(review): ``exec`` runs model-generated code with full privileges;
    only run this in an environment where that is acceptable.

    :param code: source of the program to execute.
    :return: dict with ``code_failure``, ``execution_success``,
        ``execution_failure``, ``assertion_failure``, ``correct``,
        ``incorrect`` and ``accuracy``.
    """
    outcome = {}
    try:
        namespace = {}
        exec(code, namespace)
        outcome = namespace.get("test_results", {})
        outcome["execution_success"] = outcome.get("execution_success", 0) + 1
    except AssertionError:
        outcome["assertion_failure"] = outcome.get("assertion_failure", 0) + 1
    except Exception:
        outcome["execution_failure"] = outcome.get("execution_failure", 0) + 1

    counter_names = (
        "code_failure",
        "execution_success",
        "execution_failure",
        "assertion_failure",
        "correct",
        "incorrect",
    )
    summary = {name: outcome.get(name, 0) for name in counter_names}

    # With no recorded checks the denominator becomes infinity, so the ratio is
    # 0.0 rather than a division by zero.
    checks = summary["correct"] + summary["incorrect"]
    denominator = checks if checks else math.inf
    summary["accuracy"] = (1 - summary["code_failure"]) * (summary["correct"] / denominator)
    return summary


def _merge_predictions(df, preds_df, id_labels, suffix="_preds"):
    """Left-join new predictions onto ``df``, filling only cells that are still empty."""
    base_df, new_df = df, preds_df
    if "sample_id" in base_df.columns and "sample_id" in new_df.columns:
        # Align the key dtype on copies so the merge matches rows without
        # mutating the caller's frame.
        base_df = base_df.assign(sample_id=base_df["sample_id"].astype("int64"))
        new_df = new_df.assign(sample_id=new_df["sample_id"].astype("int64"))

    merged_df = pd.merge(base_df, new_df, on=id_labels, how="left", suffixes=("", suffix))

    # Columns present on both sides come back twice; keep existing values and
    # fill the gaps from the new predictions.
    for column in new_df.columns:
        merged_column = f"{column}{suffix}"
        if merged_column in merged_df:
            merged_df[column] = merged_df[column].combine_first(merged_df[merged_column])
            merged_df = merged_df.drop(columns=merged_column)
    return merged_df


def generate_predictions(
    df,
    model,
    tokenizer,
    gold_column,
    id_labels,
    max_length,
    dataset_args,
    file_path=None,
    n=1,
    batch_size=4,
    num_workers=8,
    output_column="output",
):
    """Generate ``n`` predictions for every sample that has none yet.

    Rows of ``df`` whose ``output_column`` is missing (all rows if the column
    does not exist) are de-duplicated on ``id_labels`` and fed to
    ``model.generate``. The predictions are merged back onto ``df``. With a
    ``file_path`` the merged frame is also written there after every batch, so
    an interrupted run keeps its progress.

    :param df: samples to predict for, optionally holding earlier predictions.
    :param model: model exposing ``eval()`` and ``generate()``.
    :param tokenizer: tokenizer used to decode the generated ids.
    :param gold_column: batch key holding the gold target strings.
    :param id_labels: columns identifying a sample; used as merge keys.
    :param max_length: maximum generation length.
    :param dataset_args: keyword arguments for ``ComplexUtteranceCodeDataset``.
    :param file_path: optional CSV path for per-batch checkpoints.
    :param n: number of sequences generated per sample; sampling is on when ``n > 1``.
    :param batch_size: DataLoader batch size.
    :param num_workers: DataLoader worker count.
    :param output_column: column receiving the decoded predictions.
    :return: ``df`` itself when nothing is pending, otherwise ``df`` joined
        with the new ``output_column``, ``target`` and ``n`` columns.
    """
    model.eval()

    # Generate predictions only for rows that do not have an output yet.
    filtered_df = df[df[output_column].isna()] if output_column in df else df
    unique_df = filtered_df.drop_duplicates(subset=id_labels)

    if unique_df.empty:
        return df

    outputs = []
    targets = []
    ns = []
    ids = {id_label: [] for id_label in id_labels}

    def _collected_frame():
        return pd.DataFrame({
            **{
                output_column: outputs,
                "target": targets,
                "n": ns,
            },
            **ids
        })

    dataset = ComplexUtteranceCodeDataset(data=unique_df, **dataset_args)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    merged_df = df
    for batch in tqdm(dataloader):
        outs = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=max_length,
            do_sample=n > 1,
            num_return_sequences=n
        )

        # generate() returns n consecutive sequences per input, so the targets
        # and ids are repeated n times to stay aligned with the outputs.
        outputs.extend(tokenizer.decode(out, skip_special_tokens=True) for out in outs)
        targets.extend(t.strip() for t in np.repeat(batch[gold_column], n))
        ns.extend(list(np.arange(n)) * (batch["input_ids"].shape[0]))
        for id_label in id_labels:
            ids[id_label].extend(list(np.repeat(batch[id_label], n)))

        if file_path:
            # Checkpoint everything generated so far.
            merged_df = _merge_predictions(df, _collected_frame(), id_labels)
            merged_df.to_csv(file_path)

    if not file_path:
        # Without a checkpoint file the merge still has to happen once, so the
        # result carries the source columns that evaluation needs.
        merged_df = _merge_predictions(df, _collected_frame(), id_labels)

    return merged_df


def humaneval_accuracy_score(
    n,
    k,
    data: pd.DataFrame,
    code_column_name: str = "pred_code",
    score_id_labels1: Union[str, List[str]] = ["sample_id", "n"],
    score_id_labels2: Union[str, List[str]] = "sample_id",
    score_column_name: str = "accuracy",
):
    """Execute the generated programs and compute pass@k over all problems.

    Every row's code is embedded in its test harness and executed. Scores are
    averaged per generation (``score_id_labels1``), and a generation passes
    when that mean equals 1. For each problem (``score_id_labels2``) the
    number of passing generations ``c_i`` goes into the unbiased estimator
    ``pass_at_k(n, c_i, k)``; the reported score is the mean over problems.

    The previous version passed the number of *problems* with a passing
    generation as ``c``, mixing a count over problems with ``n``, a count of
    samples per problem.

    :param n: number of generations per problem.
    :param k: the k of pass@k.
    :param data: frame with ``code_column_name``, ``imports`` and ``test``
        columns, indexed so that the id labels become columns on ``reset_index``.
    :param code_column_name: column holding the code to execute.
    :param score_id_labels1: labels identifying one generation (not mutated).
    :param score_id_labels2: labels identifying one problem.
    :param score_column_name: per-row score column produced by ``eval_code``.
    :return: ``dict(score=<mean pass@k>, results=<per-row results frame>)``.
    """
    test_codes = data.apply(
        lambda row: build_test_code(
            code=row[code_column_name], imports=row["imports"], test=row["test"]
        ),
        axis=1,
    )
    test_results = test_codes.apply(eval_code)
    test_results_df = pd.DataFrame.from_records(
        test_results.values, index=test_results.index
    )

    # Mean score of each generation across its rows.
    per_generation = (
        test_results_df.reset_index(drop=False)
        .groupby(score_id_labels1)[score_column_name]
        .mean()
        .reset_index(drop=False)
    )
    passed_column = "_passed"
    per_generation[passed_column] = per_generation[score_column_name] == 1

    # Number of fully passing generations per problem.
    correct_counts = per_generation.groupby(score_id_labels2)[passed_column].sum()

    # Logged as before: how many problems have at least one passing generation.
    c = int((correct_counts > 0).sum())
    print(f"c = {c}, n= {n}, k = {k}")

    if len(correct_counts) == 0:
        score = 0.0
    else:
        score = float(
            np.mean([pass_at_k(n, int(count), k) for count in correct_counts])
        )
    return dict(score=score, results=test_results_df)


def bleu_accuracy_score(
    data: pd.DataFrame,
    generated_column="output",
    gold_column="code",
    score_id_labels1: Union[str, List[str]] = ["sample_id", "n"],
    score_id_labels2: Union[str, List[str]] = "sample_id",
    score_column_name: str = "bleu_score",
):
    """Compute the best-of-n BLEU score averaged over problems.

    BLEU is computed per row, averaged per generation (``score_id_labels1``),
    reduced to the best generation of each problem (``score_id_labels2``) and
    finally averaged over problems.

    :param data: frame with the generated and gold code columns, indexed so
        that the id labels become columns on ``reset_index``.
    :param generated_column: column holding the generated code.
    :param gold_column: column holding the gold code.
    :param score_id_labels1: labels identifying one generation (not mutated).
    :param score_id_labels2: labels identifying one problem.
    :param score_column_name: name given to the per-row score column.
    :return: ``dict(score=<mean best BLEU>, results=<per-row score frame>)``.
    """
    eval_results = data.apply(
        lambda row: eval_bleu(row[gold_column], row[generated_column]), axis=1
    )
    # Name the column after score_column_name; the previously hard-coded
    # "bleu_score" made any other value fail in the groupby below.
    eval_results_df = eval_results.to_frame(score_column_name)
    test_scores = (
        eval_results_df.reset_index(drop=False)
        .groupby(score_id_labels1)[score_column_name]
        .mean()
    )
    score = (
        test_scores.reset_index(drop=False)
        .groupby(score_id_labels2)[score_column_name]
        .max()
        .mean()
    )
    return dict(score=score, results=eval_results_df)


def model_eval(
    n,
    k,
    results_df=None,
    results_file_path=None,
    output_column="output",
    gold_column="code",
    parse_to_code=False,
    parse_rules_enabled=False,
    compute_humanval=True,
    compute_bleu=True,
):
    """Score generated outputs with pass@k and/or BLEU.

    :param n: number of generations per problem.
    :param k: the k of pass@k.
    :param results_df: predictions frame; copied, never modified. Used when
        ``results_file_path`` is not given.
    :param results_file_path: CSV with predictions; takes precedence over
        ``results_df``.
    :param output_column: column holding the raw model output.
    :param gold_column: column holding the gold code, used for BLEU.
    :param parse_to_code: treat outputs as code representations and convert
        them to code first.
    :param parse_rules_enabled: forwarded to ``parse_code_rep_to_code``.
    :param compute_humanval: compute the execution-based pass@k score.
    :param compute_bleu: compute the BLEU score.
    :return: ``dict(humaneval=..., bleu=...)``; a metric that was not
        requested is an empty dict.
    :raises ValueError: if neither ``results_df`` nor ``results_file_path`` is given.
    """
    if results_file_path:
        results_df = pd.read_csv(results_file_path)
    elif results_df is not None:
        results_df = results_df.copy()
    else:
        raise ValueError("model_eval requires either results_df or results_file_path")

    results_df["sample_id"] = results_df["sample_id"].astype(int)
    results_df.set_index(["sample_id", "sample_minor_id", "n"], inplace=True)
    results_df.sort_index(inplace=True)

    code_column = "generated_code"
    if parse_to_code:
        results_df[code_column] = results_df[output_column].apply(
            lambda x: parse_code_rep_to_code(x, rules_enabled=parse_rules_enabled)
        )
    else:
        results_df[code_column] = results_df[output_column]

    # All patterns below are literal text. As regular expressions the
    # parentheses form a group, so "= next(iterator)" never matched, and "."
    # matches any character, so "Message." also rewrote "Messages" and
    # "MessageContentType".
    results_df["test"] = results_df["test"].str.replace(
        "= next(iterator)", "= next(iterator, None)", regex=False
    )
    results_df[code_column] = results_df[code_column].str.replace(
        " = ContentType.", " = MessageContentType.", regex=False
    )
    results_df[code_column] = results_df[code_column].str.replace(
        "Message.", "Messages.", regex=False
    )

    humaneval_results = (
        humaneval_accuracy_score(n=n, k=k, data=results_df, code_column_name=code_column)
        if compute_humanval
        else {}
    )

    bleu_results = (
        bleu_accuracy_score(
            data=results_df, generated_column=code_column, gold_column=gold_column
        )
        if compute_bleu
        else {}
    )

    return dict(
        humaneval=humaneval_results,
        bleu=bleu_results
    )


def pass_at_k(n, c, k):
    """
    Estimate pass@k as 1 - C(n - c, k) / C(n, k), evaluated as a running
    product instead of with binomial coefficients.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: k in pass@k
    :return: 1.0 when fewer than k samples are incorrect (any k-subset then
        contains a correct one), otherwise the estimate
    """
    incorrect = n - c
    if incorrect < k:
        return 1.0
    terms = 1.0 - k / np.arange(incorrect + 1, n + 1)
    return 1.0 - np.prod(terms)


def eval_bleu(code, generated_code):
    """Sentence-level BLEU of generated code against the gold code.

    Both sources are split into Python tokens. The gold code is the reference
    and the generated code is the hypothesis; the previous version passed them
    to ``sentence_bleu`` the other way round, and BLEU is not symmetric.

    :param code: gold source code.
    :param generated_code: model-generated source code.
    :return: the BLEU score, or ``0`` when either input is missing, empty or
        not a string (e.g. NaN from a data frame), or when the generated code
        cannot be tokenized.
    """
    if not isinstance(code, str) or not isinstance(generated_code, str):
        return 0
    if not code or not generated_code:
        return 0

    reference = tokenize_source(code)

    try:
        hypothesis = tokenize_source(generated_code)
    except Exception:
        # Generated code is frequently malformed; score it 0 rather than abort.
        return 0

    # Use n-grams up to order 4, capped by the hypothesis length, with uniform weights.
    n = max(min(len(hypothesis), 4), 1)
    weights = (1 / n,) * n
    smoothing_function = SmoothingFunction().method4
    return bleu_score.sentence_bleu(
        [reference], hypothesis, weights=weights, smoothing_function=smoothing_function
    )


def eval_generated_code(
    df,
    model,
    tokenizer,
    dataset_args,
    target_label,
    id_labels,
    max_length,
    n,
    file_path=None,
    output_column="output",
    gold_column="code",
    force_generate_predictions=True,
    should_model_eval=True,
    batch_size=4
):
    """Load cached predictions, generate the missing ones, return the results.

    If ``file_path`` exists its content replaces ``df`` as the starting point.
    Progress is printed, pending samples are generated (always attempted when
    ``force_generate_predictions`` is true), and the results are read back
    from ``file_path`` when that file exists.

    :param df: samples to evaluate; used when no results file exists yet.
    :param model: generation model.
    :param tokenizer: tokenizer used to decode generations.
    :param dataset_args: keyword arguments for the dataset built in
        ``generate_predictions``.
    :param target_label: batch key holding the gold target strings.
    :param id_labels: columns identifying a sample.
    :param max_length: maximum generation length.
    :param n: number of generations per sample.
    :param file_path: optional CSV used as cache and checkpoint.
    :param output_column: column holding the predictions.
    :param gold_column: unused; kept for interface compatibility.
    :param force_generate_predictions: call the generator even when no sample
        is pending.
    :param should_model_eval: unused; kept for interface compatibility.
    :param batch_size: generation batch size.
    :return: frame with the predictions.
    """
    file_exists = bool(file_path) and os.path.exists(file_path)
    if file_exists:
        print(f"Loading results from {file_path}")
    preds_df = pd.read_csv(file_path) if file_exists else df

    total_preds_count = preds_df['sample_id'].nunique()
    if output_column in preds_df:
        has_output = preds_df[output_column].notna()
        pending_preds_count = preds_df[~has_output]['sample_id'].nunique()
        generated_preds_count = preds_df[has_output]['sample_id'].nunique()
    else:
        pending_preds_count = total_preds_count
        generated_preds_count = 0
    # An empty input would otherwise raise ZeroDivisionError.
    generated_pct = 100. * generated_preds_count / total_preds_count if total_preds_count else 0.
    print(f"Generated {generated_preds_count} / {total_preds_count} ({generated_pct:.0f}%)")

    should_generate_predictions = pending_preds_count > 0
    if force_generate_predictions or should_generate_predictions:
        print(f"Generating {pending_preds_count} results...")
        preds_df = generate_predictions(
            df=preds_df,
            model=model,
            tokenizer=tokenizer,
            n=n,
            dataset_args=dataset_args,
            file_path=file_path,
            gold_column=target_label,
            id_labels=id_labels,
            max_length=max_length,
            batch_size=batch_size,
            # Keep the generator's output column in sync with the one counted above.
            output_column=output_column,
        )

    # The file only exists if a checkpoint was written now or on an earlier
    # run; otherwise fall back to the in-memory frame instead of failing in
    # read_csv.
    if file_path and os.path.exists(file_path):
        print(f"Loading results from {file_path}")
        results_df = pd.read_csv(file_path)
    else:
        results_df = preds_df

    return results_df


## Evaluation

### Loading the dataset

In [9]:
# Load the evaluation set once; every model below is scored on this same frame.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
test_file_path = '/Users/asaf/Workspace/biu/complex-utterance-to-code/build/eval_complex_utterance_to_code_with_intermediate_152_20231112.csv.gz'
test_df = load_test_data(test_file_path=test_file_path, id_labels=None)
print("test_df", test_df.shape)

shape =  (152, 14)
test_df (152, 14)
test_df (152, 14)


In [10]:
def eval_test_data(
    pretrained_model_path,
    test_df,
    model_name,
    selected_model_type,
    n,
    ks=(1, 10),
    results_file_path=None,
    output_column="output",
    gold_column="code",
    force_generate_predictions=True,
    should_model_eval=True,
    batch_size=4,
    force=False
):
    """Generate predictions with one fine-tuned model and report pass@k.

    The tokenizer comes from the base checkpoint of ``model_name`` and the
    weights from ``pretrained_model_path``. Predictions are generated (or
    resumed from ``results_file_path``) and, if requested, scored for each k.

    :param pretrained_model_path: directory of the fine-tuned checkpoint.
    :param test_df: evaluation samples.
    :param model_name: ``Model`` member selecting the architecture.
    :param selected_model_type: ``ModelFlavour`` member selecting the task.
    :param n: number of generations per sample.
    :param ks: values of k to report pass@k for.
    :param results_file_path: optional CSV used as prediction cache.
    :param output_column: column holding the predictions.
    :param gold_column: column holding the gold code.
    :param force_generate_predictions: forwarded to ``eval_generated_code``.
    :param should_model_eval: score the predictions after generating them.
    :param batch_size: generation batch size.
    :param force: unused; kept for interface compatibility.
    :return: the ``model_eval`` result for the last k in ``ks``, or ``None``
        when no evaluation ran.
    """
    # create a tokenizer and load the model
    tokenizer = load_tokenizer(
        model_flavour=model_name,
        pretrained_model_name_or_path=pretrained_model_names_mapping[model_name]
    )
    model = load_model(
        model_flavour=model_name,
        pretrained_model_name_or_path=pretrained_model_path
    )

    # selected model params
    selected_model_flavour_params = model_flavour_params[selected_model_type]
    target_label = selected_model_flavour_params.get('target_label')
    # Representation targets must be converted back to code before execution.
    parse_code = (target_label in ['code_rep', 'code_rep_raw'])
    parse_rules_enabled = (target_label == 'code_rep_raw')
    slug = selected_model_flavour_params.get('slug')

    # load the dataset
    dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
    max_length = dataset_args['max_target_length']

    id_labels = ['sample_id']

    # Derived locally; this used to be read from a global that only existed
    # when the calling cell happened to define it.
    model_id = model_name.value

    print(f"model_id = {model_id}")
    print(f"slug = {slug}")
    print(f"n = {n}")
    print("")

    results_df = eval_generated_code(
        df=test_df,
        model=model,
        tokenizer=tokenizer,
        dataset_args=dataset_args,
        n=n,
        target_label=target_label,
        id_labels=id_labels,
        max_length=max_length,
        gold_column=gold_column,
        file_path=results_file_path,
        force_generate_predictions=force_generate_predictions,
        should_model_eval=should_model_eval,
        batch_size=batch_size
    )

    results = None
    if should_model_eval:
        for k in ks:
            results = model_eval(
                n=n,
                k=k,
                results_df=results_df,
                parse_to_code=parse_code,
                parse_rules_enabled=parse_rules_enabled,
                compute_humanval=True,
                compute_bleu=False,
                output_column=output_column,
                gold_column=gold_column,
            )
            print(f"n = {n}, k = {k}")
            if results['humaneval']:
                print(f"humaneval = {results['humaneval']['score']}")
            else:
                print("no pass@k evaluation")
            if results['bleu']:
                print(f"bleu = {results['bleu']['score']}")
            else:
                print("no BLEU evaluation")
            print("")
    return results

In [11]:
# Fine-tuned checkpoints to evaluate: one dict per run with the base
# architecture (model_name), the task flavour (selected_model_type) and the
# checkpoint directory (pretrained_model_path). Un-comment entries to include
# them in the evaluation loop.
models_args = [
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/codet5-small-rep2rep-2023-05-23_140209'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.Rep2Code, pretrained_model_path='./experiments/codet5-small-rep2code-2023-05-24_122619'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.Text2Rep, pretrained_model_path='./experiments/codet5-small-text2rep-2023-05-24_135817'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.Text2Code, pretrained_model_path='./experiments/codet5-small-text2code-2023-05-24_164951'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.RepRaw2RepRaw, pretrained_model_path='./experiments/codet5-small-repraw2repraw-2023-11-08_161837'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.Text2RepRaw, pretrained_model_path='./experiments/codet5-small-text2repraw-2023-11-08_214411'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.RepRaw2Code, pretrained_model_path='./experiments/codet5-small-repraw2code-2023-11-08_085740'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.TextRep2Rep, pretrained_model_path='./experiments/codet5-small-text_rep2rep-2023-05-24_171242'),
    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.TextRep2Code, pretrained_model_path='./experiments/codet5-small-textrep2code-2023-05-25_045545'),

    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/codet5-base-rep2rep-2023-05-24_122620'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Text2Code, pretrained_model_path='./experiments/codet5-base-text2code-2023-05-25_125337'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Text2Rep, pretrained_model_path='./experiments/codet5-base-text2rep-2023-05-24_143609'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Rep2Code, pretrained_model_path='./experiments/codet5-base-rep2code-2023-05-25_131404'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.RepRaw2RepRaw, pretrained_model_path='./experiments/codet5-base-repraw2repraw-2023-11-08_145043'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Text2RepRaw, pretrained_model_path='./experiments/codet5-base-text2repraw-2023-11-12_112626'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.RepRaw2Code, pretrained_model_path='./experiments/codet5-base-repraw2code-2023-11-08_214552'),
    # dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.TextRep2Rep, pretrained_model_path='./experiments/codet5-base-text_rep2rep-2023-05-26_072118'),

    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/codet5p-220m-rep2rep-2023-05-24_122638'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.Rep2Code, pretrained_model_path='./experiments/codet5p-220m-rep2code-2023-05-25_110453'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.Text2Rep, pretrained_model_path='./experiments/codet5p-220m-text2rep-2023-05-24_143618'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.Text2Code, pretrained_model_path='./experiments/codet5p-220m-text2code-2023-05-25_111249'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.RepRaw2RepRaw, pretrained_model_path='./experiments/codet5p-220m-repraw2repraw-2023-11-08_132428'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.Text2RepRaw, pretrained_model_path='./experiments/codet5p-220m-text2repraw-2023-11-08_161700'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.RepRaw2Code, pretrained_model_path='./experiments/codet5p-220m-repraw2code-2023-11-08_090909'),
    # dict(model_name=Model.CodeT5P220m, selected_model_type=ModelFlavour.TextRep2Rep, pretrained_model_path='./experiments/codet5p-220m-text_rep2rep-2023-05-25_110437'),

    dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='/Users/asaf/Downloads/experiments/t5-base-rep2rep-2023-05-25_161415'),
    dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.Rep2Code, pretrained_model_path='/Users/asaf/Downloads/experiments/t5-base-rep2code-2023-05-25_160900'),
    dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.Text2Rep, pretrained_model_path='/Users/asaf/Downloads/experiments/t5-base-text2rep-2023-05-25_161606'),
    dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.Text2Code, pretrained_model_path='/Users/asaf/Downloads/experiments/t5-base-text2code-2023-05-27_171606'),
    # dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.RepRaw2RepRaw, pretrained_model_path='./experiments/t5-base-repraw2repraw-2023-11-08_212729'),
    # dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.Text2RepRaw, pretrained_model_path='./experiments/t5-base-text2repraw-2023-11-09_092207'),
    # dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.RepRaw2Code, pretrained_model_path='./experiments/t5-base-repraw2code-2023-11-12_112659'),
    # dict(model_name=Model.T5Base, selected_model_type=ModelFlavour.TextRep2Rep, pretrained_model_path='./experiments/t5-base-textrep2code-2023-05-27_171555'),

    # dict(model_name=Model.CodeT5Small, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926'),
]

In [12]:
# test_df = pd.DataFrame({
#         "text": ["Check that I received a mail from my advisors or cancel my first meeting with them on Sunday and Monday."],
#         "code": [""],
#         "test_id": [0],
#         "sample_id": [0],
#         "sample_minor_id": [None]
# })
# test_df

In [13]:
# n = 100
# args = models_args[0]
# # args = dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Text2Code, pretrained_model_path='./experiments/codet5-base-text2code-2023-05-25_125337')
# # args = dict(model_name=Model.CodeT5Base, selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/codet5-base-rep2rep-2023-05-24_122620')
# pretrained_model_path = args.get('pretrained_model_path')
# selected_model_type = args.get('selected_model_type')
# model_name = args.get('model_name')
# print(pretrained_model_path)

In [14]:
# # create a tokenizer and load the model
# tokenizer = load_tokenizer(
#   model_flavour=model_name,
#   pretrained_model_name_or_path=pretrained_model_names_mapping[model_name]
# )
# model = load_model(
#   model_flavour=model_name,
#   pretrained_model_name_or_path=pretrained_model_path
# )

# # selected model params
# selected_model_flavour_params = model_flavour_params[selected_model_type]
# target_label = selected_model_flavour_params.get('target_label')
# parse_code = (target_label == 'code_rep')
# slug = selected_model_flavour_params.get('slug')

# # load the dataset
# dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
# max_length = dataset_args['max_target_length']

# BATCH_SIZE = 4
# test_dataset = ComplexUtteranceCodeDataset(data=test_df[:BATCH_SIZE], **dataset_args) # remove temp size
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=8)

# model_id = model_name.value
# pretrained_model_file = [x for x in pretrained_model_path.split('/') if x][-1]
# test_results_file_path = f"results/test-{str(test_df.shape[0])}-{pretrained_model_file}-n{n}.csv.gz"
# id_labels = ['test_id', 'sample_id', 'sample_minor_id']

# print(f"model_id = {model_id}")
# print(f"slug = {slug}")
# print(f"n = {n}")

In [15]:
# Evaluate every configured checkpoint in turn. The names assigned here live at
# notebook scope; the scratch cells further down reuse model_name,
# pretrained_model_path and selected_model_type from the last iteration.
for args in models_args:
  pretrained_model_path = args.get('pretrained_model_path')
  selected_model_type = args.get('selected_model_type')
  model_name = args.get('model_name')
  print(f"Loading model from {pretrained_model_path}")

  # Number of generations sampled per test sample.
  n = 100
  selected_model_flavour_params = model_flavour_params[selected_model_type]
  slug = selected_model_flavour_params.get('slug')
  model_id = model_name.value
  # One results file per (test-set size, architecture, flavour, n); it doubles
  # as a cache, so an interrupted run resumes from it.
  test_results_file_path = f"/Users/asaf/Workspace/biu/complex-utterance-to-code/dist/experiments_results/test-{str(test_df.shape[0])}-{model_id}-{slug}-n{n}.csv.gz"

  results = eval_test_data(
      pretrained_model_path=pretrained_model_path,
      test_df=test_df,
      model_name=model_name,
      selected_model_type=selected_model_type,
      n=n,
      ks=[1, 10],
      results_file_path=test_results_file_path,
      force_generate_predictions=False,
      should_model_eval=True,
      batch_size=1
  )

  # print(results['humaneval'])
  # NOTE(review): this runs once per model, inside the loop, although the
  # message reads as if all runs were finished.
  print(f"All runs ended succesfully!\n")

Loading model from /Users/asaf/Downloads/experiments/t5-base-rep2rep-2023-05-25_161415


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model_id = t5-base
slug = rep2rep
n = 100

Loading results from /Users/asaf/Workspace/biu/complex-utterance-to-code/dist/experiments_results/test-152-t5-base-rep2rep-n100.csv.gz
Generated 79 / 122 (65%)
Generating 43 results...


  0%|          | 0/43 [00:00<?, ?it/s]

Loading results from /Users/asaf/Workspace/biu/complex-utterance-to-code/dist/experiments_results/test-152-t5-base-rep2rep-n100.csv.gz
c = 0, n= 100, k = 1
n = 100, k = 1
humaneval = 0.0
no BLEU evaluation

c = 0, n= 100, k = 10
n = 100, k = 10
humaneval = 0.0
no BLEU evaluation

All runs ended succesfully!

Loading model from /Users/asaf/Downloads/experiments/t5-base-rep2code-2023-05-25_160900


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model_id = t5-base
slug = rep2code
n = 100

Generated 0 / 122 (0%)
Generating 122 results...


  0%|          | 0/122 [00:08<?, ?it/s]

Loading results from /Users/asaf/Workspace/biu/complex-utterance-to-code/dist/experiments_results/test-152-t5-base-rep2code-n100.csv.gz
c = 0, n= 100, k = 1
n = 100, k = 1
humaneval = 0.0
no BLEU evaluation

c = 0, n= 100, k = 10
n = 100, k = 10
humaneval = 0.0
no BLEU evaluation

All runs ended succesfully!

Loading model from /Users/asaf/Downloads/experiments/t5-base-text2rep-2023-05-25_161606


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model_id = t5-base
slug = text2rep
n = 100

Generated 0 / 122 (0%)
Generating 122 results...


  0%|          | 0/122 [00:08<?, ?it/s]

Loading results from /Users/asaf/Workspace/biu/complex-utterance-to-code/dist/experiments_results/test-152-t5-base-text2rep-n100.csv.gz
c = 0, n= 100, k = 1
n = 100, k = 1
humaneval = 0.0
no BLEU evaluation

c = 0, n= 100, k = 10
n = 100, k = 10
humaneval = 0.0
no BLEU evaluation

All runs ended succesfully!

Loading model from /Users/asaf/Downloads/experiments/t5-base-text2code-2023-05-27_171606


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model_id = t5-base
slug = text2code
n = 100

Generated 0 / 122 (0%)
Generating 122 results...


  0%|          | 0/122 [00:08<?, ?it/s]

Loading results from /Users/asaf/Workspace/biu/complex-utterance-to-code/dist/experiments_results/test-152-t5-base-text2code-n100.csv.gz
c = 0, n= 100, k = 1
n = 100, k = 1
humaneval = 0.0
no BLEU evaluation

c = 0, n= 100, k = 10
n = 100, k = 10
humaneval = 0.0
no BLEU evaluation

All runs ended succesfully!



In [None]:
import pandas as pd

# Peek at one stored results file. The read is the cell's last expression,
# so the notebook renders the resulting DataFrame.
pd.read_csv(
    "results/test-152-codet5-small-repraw2code-n100.csv.gz",
    compression="gzip",
)

In [None]:
# Colab-only cell: `google.colab` is imported here, so this fails outside Colab.
# NOTE(review): presumably releases the hosted runtime once the run is done —
# confirm against the Colab documentation before relying on it.
from google.colab import runtime
runtime.unassign()

In [None]:
# Debug configuration: `n` generations per utterance over a tiny slice of the test set.
# Reads `model_name`, `selected_model_type`, `pretrained_model_path` and `test_df`,
# none of which are assigned in this cell.
n = 2

# The tokenizer is looked up through the name mapping, while the model is
# loaded from `pretrained_model_path`.
tokenizer = load_tokenizer(
    model_flavour=model_name,
    pretrained_model_name_or_path=pretrained_model_names_mapping[model_name],
)
model = load_model(
    model_flavour=model_name,
    pretrained_model_name_or_path=pretrained_model_path,
)

# Settings attached to the selected model flavour.
selected_model_flavour_params = model_flavour_params[selected_model_type]
target_label = selected_model_flavour_params.get("target_label")
slug = selected_model_flavour_params.get("slug")
# Only 'code_rep' targets are flagged for parsing.
parse_code = target_label == "code_rep"

# Dataset arguments; the generation length limit is taken from them.
dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
max_length = dataset_args["max_target_length"]

BATCH_SIZE = 1
# Temporary: only the first BATCH_SIZE rows of the test set are used here.
test_dataset = ComplexUtteranceCodeDataset(data=test_df[:BATCH_SIZE], **dataset_args)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=2)

model_id = model_name.value
# Last non-empty path component of the model directory.
pretrained_model_file = [part for part in pretrained_model_path.split("/") if part][-1]
test_results_file_path = (
    f"results/test-{test_df.shape[0]}-{pretrained_model_file}-n{n}.csv.gz"
)
id_labels = ["test_id", "sample_id", "sample_minor_id"]

print(f"model_id = {model_id}")
print(f"slug = {slug}")
print(f"n = {n}")

In [None]:
# Generate `n` candidate outputs for every test example and collect them in flat,
# parallel lists: decoded outputs, gold targets, the candidate index within each
# example, and the example id fields.
dataloader = test_dataloader
gold_column = target_label

model.eval()
outputs = []
targets = []
ns = []
ids = {}
for id_label in id_labels:
    ids[id_label] = []

for batch in tqdm(dataloader):
    outs = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_length=max_length,
        # Sampling is only needed when several distinct candidates are requested.
        do_sample=(n > 1),
        # Must equal `n`: targets, `ns` and ids below are all repeated `n` times per
        # example, so a hard-coded value misaligns the lists whenever n != 2.
        num_return_sequences=n,
    )

    output = [tokenizer.decode(out, skip_special_tokens=True) for out in outs]
    # NOTE(review): assumes `generate` returns the candidates of each input
    # contiguously, matching np.repeat's ordering — confirm.
    target = [t.strip() for t in list(np.repeat(batch[gold_column], n))]

    outputs.extend(output)
    targets.extend(target)
    # Candidate index 0..n-1, repeated once per example in the batch.
    ns.extend(list(np.arange(n)) * (batch["input_ids"].shape[0]))
    for id_label in id_labels:
        ids[id_label].extend(list(np.repeat(batch[id_label], n)))

In [None]:
# Evaluation configuration for one fine-tuned checkpoint.
model_name = Model.CodeT5P220m
selected_model_type = ModelFlavour.Rep2Rep
# selected_model_type = ModelFlavour.Text2Code
pretrained_model_path = "./experiments/codet5p-220m-rep2rep-2023-05-24_122638"
# pretrained_model_path = './experiments/codet5p-220m-text2code-2023-05-25_111249'
k = 10
# Self-assignment kept from the original cell; it only requires that `test_df` exists.
test_df = test_df

# The tokenizer is looked up through the name mapping, while the model is
# loaded from `pretrained_model_path`.
tokenizer = load_tokenizer(
    model_flavour=model_name,
    pretrained_model_name_or_path=pretrained_model_names_mapping[model_name],
)
model = load_model(
    model_flavour=model_name,
    pretrained_model_name_or_path=pretrained_model_path,
)

# Settings attached to the selected model flavour.
selected_model_flavour_params = model_flavour_params[selected_model_type]
target_label = selected_model_flavour_params.get("target_label")
slug = selected_model_flavour_params.get("slug")
# Only 'code_rep' targets are flagged for parsing.
parse_code = target_label == "code_rep"

# Dataset arguments; the generation length limit is taken from them.
dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
max_length = dataset_args["max_target_length"]

test_dataset = ComplexUtteranceCodeDataset(data=test_df, **dataset_args)
test_dataloader = DataLoader(test_dataset, batch_size=4, num_workers=2)

model_id = model_name.value
# Last non-empty path component of the model directory.
pretrained_model_file = [part for part in pretrained_model_path.split("/") if part][-1]
test_results_file_path = (
    f"results/test-{test_df.shape[0]}-{pretrained_model_file}-k{k}.csv.gz"
)
id_labels = ["test_id", "sample_id", "sample_minor_id"]

print(f"model_id = {model_id}")
print(f"slug = {slug}")
print(f"k = {k}")

In [None]:
# Load the stored generations and derive the code that will be executed for each row.
results_file_path = test_results_file_path
compute_humanval = True
compute_bleu = False
output_column = "output"
gold_column = target_label

results_df = pd.read_csv(results_file_path)
results_df["sample_id"] = results_df["sample_id"].astype(int)
results_df.set_index(["sample_id", "sample_minor_id", "n"], inplace=True)
results_df.sort_index(inplace=True)

# When the model emits a code representation it is converted to code first;
# otherwise the raw output is already the code.
code_column = "generated_code"
if parse_code:
    results_df[code_column] = results_df[output_column].apply(parse_code_rep_to_code)
else:
    results_df[code_column] = results_df[output_column]

# All three patches below are literal substitutions, so regex matching is disabled:
# as regexes, "(iterator)" is a capture group (the pattern would never match the
# literal parentheses) and "." matches any character (so "Message." would also
# rewrite "MessageContentType." and "Messages.").
results_df["test"] = results_df["test"].str.replace(
    "= next(iterator)", "= next(iterator, None)", regex=False
)
results_df[code_column] = results_df[code_column].str.replace(
    " = ContentType.", " = MessageContentType.", regex=False
)
results_df[code_column] = results_df[code_column].str.replace(
    "Message.", "Messages.", regex=False
)

In [None]:
# Build an executable test program per result row, run it, and tabulate the outcomes.
data = results_df
code_column_nam = "pred_code"
score_id_labels = ["sample_id"]
score_column_name = "accuracy"
code_column_name = code_column

# One test program per row: generated code + its imports + its test body.
test_codes = data.apply(
    lambda row: build_test_code(
        code=row[code_column_name],
        imports=row["imports"],
        test=row["test"],
    ),
    axis=1,
)

# Evaluate each program; the records are expanded into columns, keeping the row index.
test_results = test_codes.apply(eval_code)
test_results_df = pd.DataFrame.from_records(
    test_results.values,
    index=test_results.index,
)

In [None]:
# Mean score per (sample, generation index).
test_results_df2 = (
    test_results_df
    .reset_index()
    .groupby(["sample_id", "n"])[score_column_name]
    .mean()
)
test_results_df2

In [None]:
# For every sample keep its highest per-generation score, then average over samples.
(
    test_results_df2
    .reset_index()
    .groupby(["sample_id"])[score_column_name]
    .max()
    .mean()
)

In [None]:
# Turn the index levels back into ordinary columns.
test_results_df.reset_index(inplace=True)
# Missing minor ids get the placeholder 'a'. The filled column is assigned back
# rather than filled in place on `df[col]`, because in-place calls on a column
# selection act on a temporary object and are a no-op under pandas copy-on-write.
test_results_df['sample_minor_id'] = test_results_df['sample_minor_id'].fillna('a')

In [None]:
# Show the first five rows as a quick sanity check.
test_results_df.head(5)

In [None]:
# Keep, for every (sample, minor id) pair, the single row with the highest score.
test_results_df3 = test_results_df.loc[
    test_results_df
    .groupby(["sample_id", "sample_minor_id"])[score_column_name]
    .idxmax()
]
test_results_df3

In [None]:
# Column totals of the outcome counters over the selected rows.
test_results_df3[
    [
        "code_failure",
        "execution_success",
        "execution_failure",
        "assertion_failure",
        "correct",
        "incorrect",
    ]
].sum()

In [None]:
# Shape (rows, columns) of the subset whose accuracy equals 1.
test_results_df3.loc[test_results_df3["accuracy"] == 1].shape

In [None]:
# Load the task table; the unnamed first CSV column becomes the sorted `ID` index.
task_df = (
    pd.read_csv("./data/task_oriented_complex_utterances.csv")
    .rename(columns={"Unnamed: 0": "ID"})
    .set_index(["ID"])
    .sort_index()
)
task_df.head()

In [None]:
# Control-flow indicator columns per task; missing entries count as 0.
cflow_df = task_df[['Sequence', 'Condition', 'Loop', 'Composition']].fillna(0)

# Collapse every positive value to 1.0. A single `.loc[rows, column]` assignment is
# used because `cflow_df[mask][column] = ...` writes into a temporary copy and
# leaves `cflow_df` unchanged.
cflow_df.loc[cflow_df['Sequence'] > 0, 'Sequence'] = 1.0
cflow_df.loc[cflow_df['Condition'] > 0, 'Condition'] = 1.0
cflow_df.loc[cflow_df['Loop'] > 0, 'Loop'] = 1.0
cflow_df.loc[cflow_df['Composition'] > 0, 'Composition'] = 1.0

# Attach the indicators to the per-sample results; the join matches
# `sample_id` against the index of `cflow_df`.
test_results_df4 = test_results_df3.set_index(['sample_id']).join(cflow_df)
test_results_df4.head()

In [None]:
# Scale each control-flow column by the row's accuracy, then total each column.
(
    test_results_df4[["Sequence", "Condition", "Loop", "Composition"]]
    .multiply(test_results_df4["accuracy"], axis=0)
    .sum()
)

In [None]:
# Accuracy-weighted column totals divided by the plain column totals.
(
    test_results_df4[["Sequence", "Condition", "Loop", "Composition"]]
    .multiply(test_results_df4["accuracy"], axis=0)
    .sum()
    / test_results_df4[["Sequence", "Condition", "Loop", "Composition"]].sum()
)

In [None]:
# Display the accuracy column of the joined results.
test_results_df4.loc[:, "accuracy"]

In [None]:
# Toy example: scale every row of a DataFrame by the matching Series entry.
df = pd.DataFrame(
    dict(
        A=[1, 2, 3],
        B=[4, 5, 6],
        C=[7, 8, 9],
    )
)

# One multiplier per row.
s = pd.Series([10, 20, 30])

# axis="index" aligns the Series with the row labels, so row i is multiplied by s[i].
result = df.multiply(s, axis="index")

print(result)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Ablation scores: one column per model, one row per method; `x` holds the method labels.
models = ['CodeT5Small', 'CodeT5Base', 'CodeT5+', 'T5']
methods = ['Baseline', '+langRep', '+codeRep', '+lang+codeRep']
df_k1 = pd.DataFrame({
    'x': methods,
    'CodeT5Small': [22.92, 19.70, 16.45, 19.46],
    'CodeT5Base': [16.31, 19.73, 18.16, 20.25],
    'CodeT5+': [21.76, 18.21, 21.47, 19.40],
    'T5': [0, 0, 15.79, 22.94],
})
df_k10 = pd.DataFrame({
    'x': methods,
    'CodeT5Small': [21.63, 19.27, 21.07, 24.54],
    'CodeT5Base': [16.17, 18.15, 23.95, 23.61],
    'CodeT5+': [22.87, 19.89, 18.54, 29.82],
    'T5': [0, 0, 28.08, 26.86],
})

# Create a figure with a single set of axes.
fig, ax = plt.subplots()

# All pass@1 lines are drawn first, then all pass@10 lines.
# The loop variable is deliberately NOT called `model_name`: that name is a
# notebook-level variable read by the model-loading cells, and reusing it here
# would leave it bound to the string 'T5' after this cell runs.
for frame, metric in ((df_k1, 'pass@1'), (df_k10, 'pass@10')):
    for plotted_model in models:
        ax.plot(frame['x'], frame[plotted_model], label=f'{plotted_model} ({metric})')

# Add labels and title
ax.set_ylabel('Pass Rate')
ax.set_title('Ablations Results')

# Add a legend
ax.legend()

# Display the plot
plt.show()

In [None]:
# Score lists, one entry per model in `x_values` order.
# NOTE(review): np.nan presumably marks a model/method pair that was not run — confirm.
x_values = [
    "CodeT5Small",
    "CodeT5Base",
    "CodeT5+",
    "T5",
    "GPT3.5",
    "GPT4",
]

# Baseline
baslines_k1 = [22.92, 16.31, 21.76, 0, 4.91, 5.74]
baslines_k10 = [21.63, 16.17, 22.87, 0, 17.62, 0]

# Language representation
lang_rep_k1 = [19.7, 19.73, 18.21, 0, np.nan, np.nan]
lang_rep_k10 = [19.27, 18.15, 19.89, 0, np.nan, np.nan]

# Code representation
code_rep_k1 = [16.45, 18.16, 21.47, 15.79, np.nan, np.nan]
code_rep_k10 = [21.07, 23.95, 18.54, 28.08, np.nan, np.nan]

# Intermediate representation
intermediate_k1 = [19.46, 20.25, 19.4, 22.94, np.nan, np.nan]
intermediate_k10 = [24.54, 23.61, 29.82, 26.86, np.nan, np.nan]

In [None]:
# One row per model, one column per method.
df_k1 = pd.DataFrame(
    dict(
        Models=x_values,
        Baseline=baslines_k1,
        LangRep=lang_rep_k1,
        CodeRep=code_rep_k1,
        Rep2Rep=intermediate_k1,
    )
)

# Same layout, built from the *_k10 lists.
df_k10 = pd.DataFrame(
    dict(
        Models=x_values,
        Baseline=baslines_k10,
        LangRep=lang_rep_k10,
        CodeRep=code_rep_k10,
        Rep2Rep=intermediate_k10,
    )
)

In [None]:
# Per-method mean, with missing entries replaced by that method's own mean.
# numeric_only=True skips the string `Models` column, which would otherwise make
# `mean()` raise on pandas >= 2.0.
df_k1.fillna(df_k1.mean(numeric_only=True)).mean(numeric_only=True)

In [None]:
# Per-method mean, with missing entries replaced by that method's own mean.
# numeric_only=True skips the string `Models` column, which would otherwise make
# `mean()` raise on pandas >= 2.0.
df_k10.fillna(df_k10.mean(numeric_only=True)).mean(numeric_only=True)

In [None]:
# Same per-method means as a plain Python list.
# numeric_only=True skips the string `Models` column, which would otherwise make
# `mean()` raise on pandas >= 2.0.
df_k10.fillna(df_k10.mean(numeric_only=True)).mean(numeric_only=True).tolist()

In [None]:
# Transpose so that methods are rows and models are columns.
# The string `Models` column is dropped before transposing: this keeps the result
# numeric (a transpose that includes it yields an object-dtype frame) and does not
# depend on `Models` being the first column.
dfk1 = df_k1.drop(columns='Models').T
dfk1.columns = df_k1['Models'].to_list()
# Keep the four fine-tuned models and relabel the methods for the plot. `rename`
# returns a new frame, so nothing is mutated in place on a frame derived by selection.
dfk1_ = dfk1[['CodeT5Small', 'CodeT5Base', 'CodeT5+', 'T5']].rename(
    index={'Baseline': 'Baseline', 'LangRep': '+langRep', 'CodeRep': '+codeRep', 'Rep2Rep': '+lang+codeRep'}
)
dfk1_.plot(legend=True, ylabel="Pass Rate")

In [None]:
# Transpose so that methods are rows and models are columns.
# The string `Models` column is dropped before transposing: this keeps the result
# numeric (a transpose that includes it yields an object-dtype frame) and does not
# depend on `Models` being the first column.
dfk10 = df_k10.drop(columns='Models').T
dfk10.columns = df_k10['Models'].to_list()
# Keep the four fine-tuned models and relabel the methods for the plot. `rename`
# returns a new frame, so nothing is mutated in place on a frame derived by selection.
dfk10_ = dfk10[['CodeT5Small', 'CodeT5Base', 'CodeT5+', 'T5']].rename(
    index={'Baseline': 'Baseline', 'LangRep': '+langRep', 'CodeRep': '+codeRep', 'Rep2Rep': '+lang+codeRep'}
)
dfk10_.plot(legend=True, ylabel="Pass Rate")

In [None]:
# Put the two method-by-model tables side by side and draw every column as a line.
pd.concat(
    [dfk1_, dfk10_],
    axis=1,
).plot(ylabel="Pass Rate", legend=True)

In [None]:
import matplotlib.pyplot as plt

# Plot, for each of the first four models, its score across the four methods.
# NOTE(review): this cell rebinds the notebook-level `x_values` (previously the
# model names) to the method labels; the name is kept because other cells may read it.
x_values = ['Baseline', '+langRep', '+codeRep', '+langRep+codeRep']
# Frame columns that correspond, in order, to the labels in `x_values`.
method_columns = ['Baseline', 'LangRep', 'CodeRep', 'Rep2Rep']
y1_values = df_k1[:4]
y2_values = df_k10[:4]

fig, ax = plt.subplots()

# One pair of lines per model (row). Iterating over the frame's columns instead
# would include the non-numeric `Models` column and plot each method's values
# against the method labels, i.e. with the axes transposed.
for (_, k1_row), (_, k10_row) in zip(y1_values.iterrows(), y2_values.iterrows()):
    ax.plot(
        x_values,
        k1_row[method_columns].astype(float),
        label=f"{k1_row['Models']} (pass@1)",
    )
    ax.plot(
        x_values,
        k10_row[method_columns].astype(float),
        label=f"{k10_row['Models']} (pass@10)",
    )

# Add some space around the plot within the bounding box
ax.margins(0.15)

ax.set_title('Ablation Results')
ax.set_ylabel('Pass rate')
ax.legend(loc='best')

plt.show()

# plt.savefig('./reports/ablation-results.png')

In [None]:
# Fill missing values using linear interpolation.
# The result is assigned back instead of calling `interpolate(inplace=True)` on
# `df['Baseline']`: in-place calls on a column selection act on a temporary object
# and are a no-op under pandas copy-on-write.
# NOTE(review): the original cell repeated this exact statement four times, which
# has the same effect as running it once. It looks like an unfinished copy-paste —
# confirm which frame and which columns were meant to be interpolated.
df['Baseline'] = df['Baseline'].interpolate(method='linear')

In [None]:
import matplotlib.pyplot as plt


# Draw one line per pass@1 score list, each sorted ascending, against `x_values`.
# NOTE(review): sorting detaches the values from the labels in `x_values`, and some
# of these lists contain NaN, whose position after `sorted` is not meaningful —
# confirm this is the intended chart.
for pass_rates in (baslines_k1, lang_rep_k1, code_rep_k1, intermediate_k1):
    plt.plot(x_values, sorted(pass_rates))

# plt.xlabel('Index')  # Labels x-axis as 'Index'
plt.ylabel('Y Values')  # y-axis label
plt.title('Line Chart')  # chart title
plt.show()