# Train Models


Based on https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb#scrollTo=wvRHDkCIS91f and https://colab.research.google.com/drive/1d4xNsZbDSZ5ZqXgZjy7HyTVRLBJBVsh6#scrollTo=SDVQ04fGRb1v

## Set-up environment

Let's first install the required libraries:
* HuggingFace Transformers (for the CodeT5 model)
* HuggingFace Datasets (for loading the dataset + preprocessing it)
* PyTorch Lightning (for training)
* Weights and Biases (for logging training metrics).
* Project code from a GitHub repo

In [18]:
!pip install -q transformers sentencepiece pytorch-lightning

shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spi

In [19]:
%%bash
rm -r /content/complex-utterance-to-code
git clone https://github.com/asafam/complex-utterance-to-code.git /content/complex-utterance-to-code
ls /content/

drive
sample_data


shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
rm: cannot remove '/content/complex-utterance-to-code': No such file or directory
Cloning into '/content/complex-utterance-to-code'...
fatal: Unable to read current working directory: Transport endpoint is not connected


In [1]:
import sys
import os 

WORK_AREA = "/Users/asaf/Workspace/biu/complex-utterance-to-code"
os.chdir(WORK_AREA)

paths = ['./src/', './src/api/v6', './notebooks/src/']
for path in paths:
    path = os.path.normcase(path)
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [20]:
import os
import sys

paths = [
  '/content/complex-utterance-to-code', 
  '/content/complex-utterance-to-code/notebooks/src',
  '/content/complex-utterance-to-code/src', 
  '/content/complex-utterance-to-code/src/api/v6', 
]
for path in paths:
  path = os.path.normcase(path)
  if not any(os.path.normcase(sp) == path for sp in sys.path):
      sys.path.append(path)

In [5]:
from typing import Union, List
import argparse
import glob
import os
from datetime import datetime
from pathlib import Path
import sys
import json
import time
import logging
import random
import re
import math
from itertools import chain
from string import punctuation
import tokenize
from nltk.translate import bleu_score

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import textwrap
from sklearn import metrics
import statistics

from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    RobertaTokenizer,
    get_linear_schedule_with_warmup
)

from data.dataset import ComplexUtteranceCodeDataset
from data.utils import (
    get_dataset_args,
    load_test_data,
)
from eval.utils import (
    eval_generated_code,
    model_eval,
)

torch.manual_seed(42)

[nltk_data] Downloading package punkt to /Users/asaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x2a1b45ed0>

In [22]:
from google.colab import drive

WORK_DRIVE = '/content/drive'
WORK_AREA = WORK_DRIVE + '/MyDrive/university/masters/complex_utterances_semantic_parsing/notebooks'

drive.mount(WORK_DRIVE)
os.chdir(WORK_AREA)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Model configuration code

In [4]:
def load_tokenizer(pretrained_model_name_or_path):
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path)
    return tokenizer


def load_model(pretrained_model_name_or_path):
    print(f"Loading model from {pretrained_model_name_or_path}")
    model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
    return model

In [5]:
from enum import Enum

class ModelFlavour(Enum):
    Text2Code = "text2code"
    Text2Rep = "text2rep"
    Rep2Code = "rep2code"
    Rep2Rep = "rep2rep"
    TextRep2Rep = "textrep2rep"
    TextRep2Code = "textrep2code"


class Model(Enum):
    T5Base = "t5-base"
    CodeT5Small = "codet5-small"
    CodeT5Base = "codet5-base"
    CodeT5P220m = "codet5p-220m"


model_flavour_params = {
    ModelFlavour.Text2Code: dict(
        slug = "text2code",
        input_prefix = "text to code: ",
        input_label = "text",
        target_label = "code",
    ),
    ModelFlavour.Text2Rep: dict(
        slug = "text2rep",
        input_prefix = "text to rep: ",
        input_label = "text",
        target_label = "code_rep",
    ),
    ModelFlavour.Rep2Code: dict(
        slug = "rep2code",
        input_prefix = "rep to code: ",
        input_label = "lang_rep",
        target_label = "code",
    ),
    ModelFlavour.Rep2Rep: dict(
        slug = "rep2rep",
        input_prefix = "rep to rep: ",
        input_label = "lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Rep: dict(
        slug = "text_rep2rep",
        input_prefix = "text and rep to rep: ",
        input_label = "text_lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Code: dict(
        slug = "textrep2code",
        input_prefix = "text and rep to code: ",
        input_label = "text_lang_rep",
        target_label = "code",
    ),
}

pretrained_model_names_mapping = {
    Model.T5Base: "t5-base",
    Model.CodeT5Small: "Salesforce/codet5-small",
    Model.CodeT5Base: "Salesforce/codet5-base",
    Model.CodeT5P220m: "Salesforce/codet5p-220m",
}

In [8]:
from typing import List, Union, Optional, TypeVar, Generic
import os
import pandas as pd
import ast
import math
import glob
from representations.tree.tree import Tree
from representations.builders.ast.tearers.tearer_factory import TearerFactory
import tokenize
from nltk.translate import bleu_score
from sklearn import metrics
from tqdm.auto import tqdm


def parse_code_rep_to_code(code_rep: str, verbose: str = "Fatal") -> str:
    try:
        tree = Tree.unparse(code_rep)
        tearer = TearerFactory().get_tearer(tree.root_node)
        asdl = tearer.tear(tree.root_node)
        code = ast.unparse(asdl)
    except Exception as e:
        if verbose == "Error":
            print(f"[Error] failed to prase code rep to code:\n", e)
        code = ""
    finally:
        return code


def build_test_code(
    code: str,
    imports: str,
    test: str,
    code_embed_str: str = "# end code block to test",
    fail_on_error: bool = False,
    verbose: str = "Fatal",
):
    try:
        code_insert_idx = test.find(code_embed_str)
        program_code = imports
        program_code += "\n"
        program_code += test[:code_insert_idx]
        program_code += code
        program_code += "\n"
        program_code += test[code_insert_idx:]
    except Exception as e:
        if verbose == "Error":
            print("[ERROR] Failed to unparse code rep to code\n", e)
        if fail_on_error:
            raise e
        program_code = ""
    finally:
        return program_code


def tokenize_source(code):
    file_path = "/tmp/example.py"

    with open(file_path, "w") as text_file:
        text_file.write(code)

    with open(file_path, "rb") as f:
        tokens_gen = tokenize.tokenize(f.readline)

        tokens = [token.string for token in tokens_gen]

    os.remove(file_path)
    return tokens


def eval_code(code: str):
    test_results = {}
    try:
        context = {}
        exec(code, context)
        test_results = context.get("test_results", {})
    except AssertionError as e:
        test_results["test_failuers"] = test_results.get("test_failuers", 0) + 1
    except Exception as e:
        test_results["code_failure"] = test_results.get("code_failure", 0) + 1

    code_failure = test_results.get("code_failure", 0)
    correct = test_results.get("correct", 0)
    incorrect = test_results.get("incorrect", 0)
    total = (correct + incorrect) or math.inf
    accuracy = (1 - code_failure) * (correct / total)

    results = dict(
        code_failure=code_failure,
        correct=correct,
        incorrect=incorrect,
        accuracy=accuracy,
    )

    return results


def eval_bleu(code, generated_code):
    hypothesis = tokenize_source(code)
    reference = tokenize_source(generated_code)
    weights = (0.25, 0.25, 0.25, 0.25)
    score = bleu_score.sentence_bleu([reference], hypothesis, weights=weights)
    return score


def generate_code(
    model, tokenizer, dataloader, gold_column, id_labels, max_length
):
    model.eval()
    outputs = []
    targets = []
    ids = {}
    for id_label in id_labels:
        ids[id_label] = []

    for batch in tqdm(dataloader):
        outs = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=max_length,
        )

        output = [tokenizer.decode(out, skip_special_tokens=True) for out in outs]
        target = [t.strip() for t in batch[gold_column]]

        outputs.extend(output)
        targets.extend(target)
        for id_label in id_labels:
            ids[id_label].extend(batch[id_label])

    data = pd.DataFrame(
        {
            "output": outputs,
            "target": targets,
        }
    )

    return data


def eval_model(data: pd.DataFrame):
    results = dict(
        exact=metrics.accuracy_score(data["target"], data["output"]),
        bleu=None,
        humaneval=eval_model_humaneval(data["target"], data["output"]),
    )

    return results


def humaneval_accuracy_score(
    data: pd.DataFrame,
    code_column_name: str = "pred_code",
    score_id_labels: Union[str, List[str]] = "sample_id",
    score_column_name: str = "accuracy",
):
    test_codes = data.apply(
        lambda x: build_test_code(
            code=x[code_column_name], imports=x["imports"], test=x["test"]
        ),
        axis=1,
    )
    test_results = test_codes.apply(lambda test_code: eval_code(test_code))
    test_results_df = pd.DataFrame.from_records(
        test_results.values, index=test_results.index
    )
    score = (
        test_results_df.reset_index(drop=False)
        .groupby(score_id_labels)[score_column_name]
        .mean()
        .mean()
    )
    return dict(score=score, results=test_results_df)


def bleu_accuracy_score(
    data: pd.DataFrame,
    generated_column="output",
    gold_column="code",
    score_id_labels: Union[str, List[str]] = "sample_id",
    score_column_name: str = "bleu_score",
):
    eval_results = data.apply(
        lambda x: eval_bleu(x[gold_column], x[generated_column]), axis=1
    )
    eval_results_df = eval_results.to_frame("bleu_score")
    score = (
        eval_results_df.reset_index(drop=False)
        .groupby(score_id_labels)[score_column_name]
        .mean()
        .mean()
    )
    return dict(score=score, results=eval_results_df)


def model_eval(
    results_df,
    output_column="output",
    gold_column="code",
    parse_to_code=False,
    compute_humanval=True,
    compute_bleu=True,
):
    results_df["sample_id"] = results_df["sample_id"].astype(int)
    results_df.set_index(["sample_id", "sample_minor_id"], inplace=True)
    results_df.sort_index(inplace=True)

    code_column = "generated_code"
    if parse_to_code:
        results_df[code_column] = results_df[output_column].apply(
            lambda x: parse_code_rep_to_code(x)
        )
    else:
        results_df[code_column] = results_df[output_column]

    results_df["test"] = results_df["test"].str.replace(
        "= next(iterator)", "= next(iterator, None)"
    )
    results_df[code_column] = results_df[code_column].str.replace(
        " = ContentType.", " = MessageContentType."
    )
    results_df[code_column] = results_df[code_column].str.replace(
        "Message.", "Messages."
    )

    humaneval_results = (
        humaneval_accuracy_score(data=results_df, code_column_name=code_column)
        if compute_humanval
        else {}
    )

    bleu_results = (
        bleu_accuracy_score(
            data=results_df, generated_column=code_column, gold_column=gold_column
        )
        if compute_bleu
        else {}
    )

    results = dict(humaneval=humaneval_results, bleu=bleu_results)
    return results


def eval_model_humaneval(
    data: pd.DataFrame,
    code_column_name: str = "pred_code",
    score_id_labels: Union[str, List[str]] = "sample_id",
    score_column_name: str = "accuracy",
):
    test_codes = data.apply(
        lambda x: build_test_code(
            code=x[code_column_name], imports=x["imports"], test=x["test"]
        ),
        axis=1,
    )
    test_results = test_codes.apply(lambda test_code: eval_code(test_code))

    test_results_df = pd.DataFrame.from_records(
        test_results.values, index=test_results.index
    )
    score = (
        test_results_df.reset_index(drop=False)
        .groupby(score_id_labels)[score_column_name]
        .mean()
        .mean()
    )
    return score, test_results_df


def eval_bleu(code, generated_code):
    hypothesis = tokenize_source(code)
    reference = tokenize_source(generated_code)
    n = max(min(len(hypothesis), 4), 1)
    weight = 1 / n
    weights = (weight,) * n
    score = bleu_score.sentence_bleu([reference], hypothesis, weights=weights)
    return score


def eval_generated_code(
    df,
    model,
    tokenizer,
    dataloader,
    target_label,
    id_labels,
    max_length,
    output_column="output",
    gold_column="code",
    parse_code=False,
    file_path=None,
):
    eval_df = generate_code(
        model,
        tokenizer,
        dataloader=dataloader,
        gold_column=target_label,
        id_labels=id_labels,
        max_length=max_length,
    )

    if file_path:
        df2 = df.join(eval_df.set_index(df.index))
        df2.to_csv(file_path)
        print(f"Results were saved to {file_path}")

    results = model_eval(
        results_file_path=file_path,
        parse_to_code=parse_code,
        compute_humanval=True,
        compute_bleu=True,
        output_column=output_column,
        gold_column=gold_column,
    )
    print(f"humaneval = {results['humaneval']['score']}")
    print(f"bleu = {results['bleu']['score']}")
    
    return results


## Evaluation

### Loading the dataset

In [None]:
test_file_path = 'data/eval_complex_utterance_to_code_with_intermediate_82_20230519.csv.gz'
test_df = load_test_data(test_file_path=test_file_path, id_labels=None)
print("test_df", test_df.shape)

shape =  (92, 12)
test_df (92, 12)
test_df (92, 12)


In [None]:
def eval_test_data(pretrained_model_path, test_df, model_architecture, selected_model_type):
    # create a tokenizer and load the model
    pretrained_model_name_or_path = pretrained_model_names_mapping[model_architecture]
    tokenizer = load_tokenizer(pretrained_model_name_or_path)
    model = load_model(pretrained_model_path)

    # selected model params    
    selected_model_flavour_params = model_flavour_params[selected_model_type]
    target_label = selected_model_flavour_params.get('target_label')
    slug = selected_model_flavour_params.get('slug')
    parse_code = selected_model_flavour_params.get(target_label) == 'code_rep'

    # load the dataset
    dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
    max_length = dataset_args['max_target_length']
    
    test_dataset = ComplexUtteranceCodeDataset(data=test_df, **dataset_args)
    test_dataloader = DataLoader(test_dataset, batch_size=4, num_workers=12)
    
    model_id = model_architecture.value
    pretrained_model_file = [x for x in pretrained_model_path.split('/') if x][-1]
    test_results_file_path = f"results/test-{str(test_df.shape[0])}-{pretrained_model_file}.csv.gz"
    id_labels = ['test_id', 'sample_id', 'sample_minor_id']

    print(f"model_id = {model_id}")
    print(f"slug = {slug}")

    results = eval_generated_code(
        df=test_df, 
        model=model,
        tokenizer=tokenizer,
        dataloader=test_dataloader, 
        target_label=target_label,
        id_labels=id_labels,
        max_length=max_length,
        gold_column='code', 
        parse_code=parse_code,
        file_path=test_results_file_path,
    )
    return results

In [None]:
!ls -ltra ./experiments | grep -i codet5p

drwx------ 2 root root 4096 May 18 20:13 codet5p-220m-rep2rep-2023-05-18_181859
drwx------ 2 root root 4096 May 18 22:32 codet5p-220m-text2code-2023-05-18_202622
drwx------ 2 root root 4096 May 19 12:31 codet5p-220m-textrep2rep-2023-05-19_102443
drwx------ 2 root root 4096 May 19 15:16 codet5p-220m-rep2code-2023-05-19_130954
drwx------ 2 root root 4096 May 19 16:54 codet5p-220m-text2rep-2023-05-19_151621
drwx------ 2 root root 4096 May 19 23:10 codet5p-220m-textrep2code-2023-05-19_205000
drwx------ 2 root root 4096 May 20 12:52 codet5p-220m-text2code-2023-05-20_103245
drwx------ 2 root root 4096 May 20 15:03 codet5p-220m-rep2rep-2023-05-20_125703
drwx------ 2 root root 4096 May 21 01:50 codet5p-220m-text2code-2023-05-20_233005


In [None]:
models_args = [
    dict(selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926/'),
]

for args in models_args:
  pretrained_model_path = args.get('pretrained_model_path')
  selected_model_type = args.get('selected_model_type')
  print(pretrained_model_path)
  
  results = eval_test_data(pretrained_model_path, test_df, Model.CodeT5Small, selected_model_type)
  print(results)

./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926/
Loading model from ./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926/
model_id = codet5-small
slug = rep2rep


  0%|          | 0/23 [00:00<?, ?it/s]

Results were saved to results/codet5-small-rep2rep-test-92-.csv.gz


TokenError: ignored

In [3]:
results_df = pd.read_csv('./dist/experiments_results/codet5-small-rep2rep-test-92-refit_complex_codet5-small-rep2rep-2023-05-23_031926.csv.gz')
results_df.head()

Unnamed: 0.1,Unnamed: 0,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,text_lang_rep,lang_rep_pretty,code_rep_pretty,output,target
0,0,40_b,40,b,If I don't have anything scheduled on the 20th...,"date_time = DateTime.resolve_from_text(""20th o...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,Text: If I don't have anything scheduled on th...,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,[ Module [ event_calendar = EventCalendar.res...,[ Module [ date_time = DateTime.resolve_from_t...
1,1,19_b,19,b,"Check the weather in Indianapolis, and if it's...","location = Location.resolve_from_text(""Indiana...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ location = Location.resolve_from_te...,"Text: Check the weather in Indianapolis, and i...",[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ location = Location.resolve_from_te...,[ Module [ location = Location.resolve_from_t...,[ Module [ location = Location.resolve_from_te...
2,2,63_a,63,a,If the weather is going to be sunny Saturday m...,weather_attribute = WeatherAttribute.resolve_f...,# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ weather_attribute = WeatherAttribut...,Text: If the weather is going to be sunny Satu...,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ weather_attribute = WeatherAttribut...,[ Module [ weather_forecasts = Weather.find_w...,[ Module [ weather_attribute = WeatherAttribut...
3,3,80,80,,Message my brother I will not be able to make ...,message_content_type = MessageContentType.reso...,# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ advcl [ Comman...,[ Module [ message_content_type = MessageConte...,Text: Message my brother I will not be able to...,[ root [ S [ Command [ Action [ advcl [ Comman...,[ Module [ message_content_type = MessageConte...,[ Module [ message_content_type = MessageMess...,[ Module [ message_content_type = MessageConte...
4,4,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,Text: Check the availability of Pepsi at Walma...,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,[ Module [ product_name = ProductName.resolve...,[ Module [ product_name = ProductName.resolve_...


In [6]:
output_column="output"
gold_column="code"
  
results = model_eval(
    results_df=results_df,
    parse_to_code=True,
    compute_humanval=True,
    compute_bleu=True,
    output_column=output_column,
    gold_column=gold_column,
)

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x