# Train Models


Based on https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb#scrollTo=wvRHDkCIS91f and https://colab.research.google.com/drive/1d4xNsZbDSZ5ZqXgZjy7HyTVRLBJBVsh6#scrollTo=SDVQ04fGRb1v

## Set-up environment

Let's first install the required libraries:
* HuggingFace Transformers (for the CodeT5 model)
* HuggingFace Datasets (for loading the dataset + preprocessing it)
* PyTorch Lightning (for training)
* Weights and Biases (for logging training metrics).
* Project code from a GitHub repo

In [18]:
!pip install -q transformers sentencepiece pytorch-lightning

shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spi

In [19]:
%%bash
rm -r /content/complex-utterance-to-code
git clone https://github.com/asafam/complex-utterance-to-code.git /content/complex-utterance-to-code
ls /content/

drive
sample_data


shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
rm: cannot remove '/content/complex-utterance-to-code': No such file or directory
Cloning into '/content/complex-utterance-to-code'...
fatal: Unable to read current working directory: Transport endpoint is not connected


In [1]:
import sys
import os 

WORK_AREA = "/Users/asaf/Workspace/biu/complex-utterance-to-code"
os.chdir(WORK_AREA)

paths = ['./src/', './src/api/v6', './notebooks/src/']
for path in paths:
    path = os.path.normcase(path)
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [20]:
import os
import sys

paths = [
  '/content/complex-utterance-to-code', 
  '/content/complex-utterance-to-code/notebooks/src',
  '/content/complex-utterance-to-code/src', 
  '/content/complex-utterance-to-code/src/api/v6', 
]
for path in paths:
  path = os.path.normcase(path)
  if not any(os.path.normcase(sp) == path for sp in sys.path):
      sys.path.append(path)

In [21]:
from typing import Union, List
import argparse
import glob
import os
from datetime import datetime
from pathlib import Path
import sys
import json
import time
import logging
import random
import re
import math
from itertools import chain
from string import punctuation
import tokenize
from nltk.translate import bleu_score

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import textwrap
from sklearn import metrics
import statistics

from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    RobertaTokenizer,
    get_linear_schedule_with_warmup
)

from data.dataset import ComplexUtteranceCodeDataset
from data.utils import (
    get_dataset_args,
    load_test_data,
)
from eval.utils import *

torch.manual_seed(42)

[nltk_data] Downloading package punkt to /Users/asaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x2a1b45ed0>

In [22]:
from google.colab import drive

WORK_DRIVE = '/content/drive'
WORK_AREA = WORK_DRIVE + '/MyDrive/university/masters/complex_utterances_semantic_parsing/notebooks'

drive.mount(WORK_DRIVE)
os.chdir(WORK_AREA)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Model configuration code

In [4]:
def load_tokenizer(pretrained_model_name_or_path):
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path)
    return tokenizer


def load_model(pretrained_model_name_or_path):
    print(f"Loading model from {pretrained_model_name_or_path}")
    model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
    return model

In [5]:
from enum import Enum

class ModelFlavour(Enum):
    Text2Code = "text2code"
    Text2Rep = "text2rep"
    Rep2Code = "rep2code"
    Rep2Rep = "rep2rep"
    TextRep2Rep = "textrep2rep"
    TextRep2Code = "textrep2code"


class Model(Enum):
    T5Base = "t5-base"
    CodeT5Small = "codet5-small"
    CodeT5Base = "codet5-base"
    CodeT5P220m = "codet5p-220m"


model_flavour_params = {
    ModelFlavour.Text2Code: dict(
        slug = "text2code",
        input_prefix = "text to code: ",
        input_label = "text",
        target_label = "code",
    ),
    ModelFlavour.Text2Rep: dict(
        slug = "text2rep",
        input_prefix = "text to rep: ",
        input_label = "text",
        target_label = "code_rep",
    ),
    ModelFlavour.Rep2Code: dict(
        slug = "rep2code",
        input_prefix = "rep to code: ",
        input_label = "lang_rep",
        target_label = "code",
    ),
    ModelFlavour.Rep2Rep: dict(
        slug = "rep2rep",
        input_prefix = "rep to rep: ",
        input_label = "lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Rep: dict(
        slug = "text_rep2rep",
        input_prefix = "text and rep to rep: ",
        input_label = "text_lang_rep",
        target_label = "code_rep",
    ),
    ModelFlavour.TextRep2Code: dict(
        slug = "textrep2code",
        input_prefix = "text and rep to code: ",
        input_label = "text_lang_rep",
        target_label = "code",
    ),
}

pretrained_model_names_mapping = {
    Model.T5Base: "t5-base",
    Model.CodeT5Small: "Salesforce/codet5-small",
    Model.CodeT5Base: "Salesforce/codet5-base",
    Model.CodeT5P220m: "Salesforce/codet5p-220m",
}

## Evaluation

### Loading the dataset

In [None]:
test_file_path = 'data/eval_complex_utterance_to_code_with_intermediate_82_20230519.csv.gz'
test_df = load_test_data(test_file_path=test_file_path, id_labels=None)
print("test_df", test_df.shape)

shape =  (92, 12)
test_df (92, 12)
test_df (92, 12)


In [None]:
def eval_test_data(pretrained_model_path, test_df, model_architecture, selected_model_type):
    # create a tokenizer and load the model
    pretrained_model_name_or_path = pretrained_model_names_mapping[model_architecture]
    tokenizer = load_tokenizer(pretrained_model_name_or_path)
    model = load_model(pretrained_model_path)

    # selected model params    
    selected_model_flavour_params = model_flavour_params[selected_model_type]
    target_label = selected_model_flavour_params.get('target_label')
    slug = selected_model_flavour_params.get('slug')
    parse_code = selected_model_flavour_params.get(target_label) == 'code_rep'

    # load the dataset
    dataset_args = get_dataset_args(tokenizer, selected_model_flavour_params)
    max_length = dataset_args['max_target_length']
    
    test_dataset = ComplexUtteranceCodeDataset(data=test_df, **dataset_args)
    test_dataloader = DataLoader(test_dataset, batch_size=4, num_workers=12)
    
    model_id = model_architecture.value
    pretrained_model_file = [x for x in pretrained_model_path.split('/') if x][-1]
    test_results_file_path = f"results/test-{str(test_df.shape[0])}-{pretrained_model_file}.csv.gz"
    id_labels = ['test_id', 'sample_id', 'sample_minor_id']

    print(f"model_id = {model_id}")
    print(f"slug = {slug}")

    results = eval_generated_code(
        df=test_df, 
        model=model,
        tokenizer=tokenizer,
        dataloader=test_dataloader, 
        target_label=target_label,
        id_labels=id_labels,
        max_length=max_length,
        gold_column='code', 
        parse_code=parse_code,
        file_path=test_results_file_path,
    )
    return results

In [None]:
!ls -ltra ./experiments | grep -i codet5p

drwx------ 2 root root 4096 May 18 20:13 codet5p-220m-rep2rep-2023-05-18_181859
drwx------ 2 root root 4096 May 18 22:32 codet5p-220m-text2code-2023-05-18_202622
drwx------ 2 root root 4096 May 19 12:31 codet5p-220m-textrep2rep-2023-05-19_102443
drwx------ 2 root root 4096 May 19 15:16 codet5p-220m-rep2code-2023-05-19_130954
drwx------ 2 root root 4096 May 19 16:54 codet5p-220m-text2rep-2023-05-19_151621
drwx------ 2 root root 4096 May 19 23:10 codet5p-220m-textrep2code-2023-05-19_205000
drwx------ 2 root root 4096 May 20 12:52 codet5p-220m-text2code-2023-05-20_103245
drwx------ 2 root root 4096 May 20 15:03 codet5p-220m-rep2rep-2023-05-20_125703
drwx------ 2 root root 4096 May 21 01:50 codet5p-220m-text2code-2023-05-20_233005


In [None]:
models_args = [
    dict(selected_model_type=ModelFlavour.Rep2Rep, pretrained_model_path='./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926/'),
]

for args in models_args:
  pretrained_model_path = args.get('pretrained_model_path')
  selected_model_type = args.get('selected_model_type')
  print(pretrained_model_path)
  
  results = eval_test_data(pretrained_model_path, test_df, Model.CodeT5Small, selected_model_type)
  print(results)

./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926/
Loading model from ./experiments/refit_complex_codet5-small-rep2rep-2023-05-23_031926/
model_id = codet5-small
slug = rep2rep


  0%|          | 0/23 [00:00<?, ?it/s]

Results were saved to results/codet5-small-rep2rep-test-92-.csv.gz


TokenError: ignored

In [9]:
results_df = pd.read_csv('./dist/experiments_results/codet5-small-rep2rep-test-92-refit_complex_codet5-small-rep2rep-2023-05-23_031926.csv.gz')
results_df.head()

Unnamed: 0.1,Unnamed: 0,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,text_lang_rep,lang_rep_pretty,code_rep_pretty,output,target
0,0,40_b,40,b,If I don't have anything scheduled on the 20th...,"date_time = DateTime.resolve_from_text(""20th o...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,Text: If I don't have anything scheduled on th...,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,[ Module [ event_calendar = EventCalendar.res...,[ Module [ date_time = DateTime.resolve_from_t...
1,1,19_b,19,b,"Check the weather in Indianapolis, and if it's...","location = Location.resolve_from_text(""Indiana...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ location = Location.resolve_from_te...,"Text: Check the weather in Indianapolis, and i...",[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ location = Location.resolve_from_te...,[ Module [ location = Location.resolve_from_t...,[ Module [ location = Location.resolve_from_te...
2,2,63_a,63,a,If the weather is going to be sunny Saturday m...,weather_attribute = WeatherAttribute.resolve_f...,# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ weather_attribute = WeatherAttribut...,Text: If the weather is going to be sunny Satu...,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ weather_attribute = WeatherAttribut...,[ Module [ weather_forecasts = Weather.find_w...,[ Module [ weather_attribute = WeatherAttribut...
3,3,80,80,,Message my brother I will not be able to make ...,message_content_type = MessageContentType.reso...,# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ advcl [ Comman...,[ Module [ message_content_type = MessageConte...,Text: Message my brother I will not be able to...,[ root [ S [ Command [ Action [ advcl [ Comman...,[ Module [ message_content_type = MessageConte...,[ Module [ message_content_type = MessageMess...,[ Module [ message_content_type = MessageConte...
4,4,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,Text: Check the availability of Pepsi at Walma...,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,[ Module [ product_name = ProductName.resolve...,[ Module [ product_name = ProductName.resolve_...


In [10]:
results_df.columns

Index(['Unnamed: 0', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'text_lang_rep',
       'lang_rep_pretty', 'code_rep_pretty', 'output', 'target'],
      dtype='object')

In [11]:
output_column="output"
gold_column="code"
  
results = model_eval(
    results_df=results_df,
    parse_to_code=True,
    compute_humanval=True,
    compute_bleu=True,
    output_column=output_column,
    gold_column=gold_column,
)

In [12]:
results

{'humaneval': {'score': 0.22326388888888887,
  'results':                            code_failure  correct  incorrect  accuracy
  sample_id sample_minor_id                                            
  0         NaN                         1        0          0       0.0
  1         a                           0        0          1       0.0
            b                           0        0          1       0.0
  2         NaN                         1        0          0       0.0
  3         a                           0        2          0       1.0
  ...                                 ...      ...        ...       ...
  104       b                           1        0          0       0.0
  105       NaN                         0        0          4       0.0
  108       NaN                         0        0          2       0.0
  109       a                           1        0          0       0.0
            b                           1        0          0       0.0
  
  [92

In [13]:
results_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,test_id,text,code,test,imports,lang_rep,code_rep,text_lang_rep,lang_rep_pretty,code_rep_pretty,output,target,generated_code
sample_id,sample_minor_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,,4,0,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,Text: Check the availability of Pepsi at Walma...,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,[ Module [ product_name = ProductName.resolve...,[ Module [ product_name = ProductName.resolve_...,
1,a,79,1_a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"Text: If it's raining tomorrow morning, set my...",[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,[ Module [ product_name = ProductName.resolve...,[ Module [ date_time = DateTime.resolve_from_t...,product_name = ProductName.resolve_from_text('...
1,b,82,1_b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"Text: If it's raining tomorrow morning, set my...",[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,[ Module [ product_name = ProductName.resolve...,[ Module [ date_time = DateTime.resolve_from_t...,product_name = ProductName.resolve_from_text('...
2,,47,2,Play the new Taylor Swift album and pull up my...,"album = Album.resolve_from_text(""the new Taylo...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,Text: Play the new Taylor Swift album and pull...,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,[ Module [ event_name = EventName.resolve_fro...,[ Module [ album = Album.resolve_from_text('th...,event_name = EventName.resolve_from_text('a Ta...
3,a,15,3_a,Send a message to dad if it rains tomorrow.,"date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,Text: Send a message to dad if it rains tomorr...,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,[ Module [ recipient = Contact.resolve_from_t...,[ Module [ date_time = DateTime.resolve_from_t...,recipient = Contact.resolve_from_text('Dad')\n...


In [19]:
x = results_df.loc[0, None]
print(x['output'])

 [ Module [ product_name = ProductName.resolve_from_text('Pepsi') ] [ location = Location.resolve_from_text('at Walmart') ] [ order = Shopping.order(product_name=product_name, location=location) ] [ location = Location.resolve_from_text('at Walgreens.find_from_text('at Walgreens') ] [ weather_forecasts = Weather.find_weather_forecasts(location=location) ] [ Responder.respond(response=weather_forecasts) ] [ location = Location.resolve_from_text('at Walgreens') ] [ weather_forecasts = utils.filter(weather_forecasts, location=location) ] [ test_weather_forecasts = bool(weather_forecasts) ] [ Responder.respond(response=test_weather_forecasts) ] ] 


In [23]:
parse_code_rep_to_code(x['output'])

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x118245100>>
invalid syntax (<unknown>, line 1)


''

In [40]:
input_df = pd.read_csv('./build/train_complex_utterance_to_code_with_intermediate_40k.csv.gz')

In [41]:
input_df.columns

Index(['text', 'code', 'lang_rep', 'code_rep'], dtype='object')

In [45]:
sampled_df = input_df.sample(frac=1, random_state=42)

In [46]:
sampled_df['code2'] = sampled_df['code_rep'].apply(parse_code_rep_to_code)

In [47]:
sampled_df[sampled_df['code2']=='']

Unnamed: 0,text,code,lang_rep,code_rep,code2


In [28]:
for code in sampled_df:
  print(code)
  print(parse_code_rep_to_code(x['output'])(code))
  print()

date_times = DateTime.resolve_many_from_text("every Thursday and Halloween this month")
location = Location.resolve_from_text("at my office")
weather_forecasts = []
for date_time in date_times:
  weather_forecasts += Weather.find_weather_forecasts(date_time=date_time, location=location)
Responder.respond(response=weather_forecasts)

messages = Messages.find_messages()
messages = first(messages, 3)
Messages.delete_messages(messages=messages)

person_reminded = Contact.resolve_from_text("I")
content = Content.resolve_from_text("walk the dog")
date_times = DateTime.resolve_many_from_text("between 12:19 PM next week and noon")
reminders = []
for date_time in date_times:
  reminders = Reminders.find_reminders(person_reminded=person_reminded, date_time=date_time, content=content)
test_reminders = bool(reminders)
Responder.respond(response=test_reminders)

amount1 = Amount.resolve_from_text("a couple of")
product_name1 = ProductName.resolve_from_text("blouses")
amount2 = Amount.resolve_from_t

In [None]:
input_df[input_df['']]