# IMPORTS

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


# CODE

In [2]:
# Load the "mbpp" dataset into `dataset`.
dataset = load_dataset(path="mbpp")

In [4]:
# Number of training rows, read from the split's metadata rather than by
# materialising the entire 'code' column just to take its length.
dataset["train"].num_rows

374

In [17]:
# Display the repr of `dataset`; the recorded output lists each split with its
# feature names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 374
    })
    test: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 10
    })
})

In [6]:
# Load the "python" configuration of google/code_x_glue_ct_code_to_text.
# NOTE: this rebinds `dataset`, replacing whatever it referred to before.
dataset = load_dataset(
    path="google/code_x_glue_ct_code_to_text",
    name="python",
)

Generating train split: 100%|██████████| 251820/251820 [00:01<00:00, 158289.48 examples/s]
Generating validation split: 100%|██████████| 13914/13914 [00:00<00:00, 130275.57 examples/s]
Generating test split: 100%|██████████| 14918/14918 [00:00<00:00, 156124.47 examples/s]


In [15]:
# Display the repr of `dataset`; the recorded output lists each split with its
# feature names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 14918
    })
})

In [1]:
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw task-1 CSV into a pandas DataFrame.
dataset = pd.read_csv(filepath_or_buffer='../data/raw/task_1_data.csv')

In [3]:
# Load the tokenizer for the Mistral-7B-Instruct v0.3 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3"
)
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own - confirm).
tokenizer.pad_token = tokenizer.eos_token

In [15]:
def preprocess_data(row, task):
    """Extract the (prompt, solution) pair from a single dataset row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            the ``'text'`` and ``'code'`` keys.
        task: Task identifier. Accepted so call sites keep working, but it is
            currently not used by the function.

    Returns:
        tuple: ``(row['text'], row['code'])``.

    Raises:
        KeyError: If ``row`` is a dict or Series lacking ``'text'`` or ``'code'``.
    """
    # TODO: a tokenising variant was drafted here but left disabled: it
    # tokenised prompt and answer separately, masked the prompt positions with
    # -100 in the labels, and padded/truncated everything to MAX_LENGTH.
    return row['text'], row['code']

In [17]:
# Run preprocess_data over every row (axis=1); each row becomes a
# (text, code) tuple, and "task_1" is forwarded as the extra `task` argument.
d1 = dataset.apply(preprocess_data, axis=1, args=("task_1",))


In [20]:
# Peek at the first (text, code) pair by position, without converting the
# whole Series to a Python list first.
d1.iloc[0]

('Write a function to find the longest chain which can be formed from the given set of pairs.',
 'class Pair(object): \r\n\tdef __init__(self, a, b): \r\n\t\tself.a = a \r\n\t\tself.b = b \r\ndef max_chain_length(arr, n): \r\n\tmax = 0\r\n\tmcl = [1 for i in range(n)] \r\n\tfor i in range(1, n): \r\n\t\tfor j in range(0, i): \r\n\t\t\tif (arr[i].a > arr[j].b and\r\n\t\t\t\tmcl[i] < mcl[j] + 1): \r\n\t\t\t\tmcl[i] = mcl[j] + 1\r\n\tfor i in range(n): \r\n\t\tif (max < mcl[i]): \r\n\t\t\tmax = mcl[i] \r\n\treturn max')

In [25]:
from datasets import load_from_disk

In [26]:
# Reload the task-1 training data previously saved to disk in Hugging Face format.
task_1_train_data = load_from_disk(
    dataset_path='../data/split/task_1_train_data_hf'
)

In [None]:
# Preview only the first few rows' token ids; displaying the full column would
# flood the notebook output. Slicing rows first avoids loading the whole column.
# NOTE(review): assumes load_from_disk returned a single Dataset (not a
# DatasetDict), as the original column access already implied - confirm.
task_1_train_data[:3]['input_ids']

In [1]:
import os

In [2]:
# Identifier of the base model (looks like a Hugging Face hub id - confirm).
BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"

# Directory under which the per-task adapter paths are built.
ADAPTERS_DIR = "../saved_models/adapters"
TASK_1_ADAPTER, TASK_2_ADAPTER = (
    os.path.join(ADAPTERS_DIR, task_name) for task_name in ("task_1", "task_2")
)