# init

In [1]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Any

seed = 18022004
np.random.seed(seed)

In [2]:
data_prefix: str = 'data'
repo_prefix: str = f'{data_prefix}/repos'

data_name = 'data_method_30k_test.parquet'

data_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{data_name}', engine = 'pyarrow')

# create dataset

In [20]:
from datasets import Dataset, DatasetDict

hf_dataset = Dataset.from_pandas(data_df)

In [21]:
train_test_split = hf_dataset.train_test_split(test_size = 0.2, seed = seed)

val_test_split = train_test_split['test'].train_test_split(test_size = 0.5, seed = seed)

final_datasets = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})

In [22]:
final_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'fromLib', 'toLib', 'repoName', 'prevCommit', 'startCommit', 'endCommit', 'fileName', 'startCommitChanges', 'endCommitChanges', 'startCode_cleaned', 'endCode_cleaned', 'diff_cleaned', 'methods_before', 'methods_after', 'methods_diff'],
        num_rows: 23800
    })
    validation: Dataset({
        features: ['id', 'fromLib', 'toLib', 'repoName', 'prevCommit', 'startCommit', 'endCommit', 'fileName', 'startCommitChanges', 'endCommitChanges', 'startCode_cleaned', 'endCode_cleaned', 'diff_cleaned', 'methods_before', 'methods_after', 'methods_diff'],
        num_rows: 2975
    })
    test: Dataset({
        features: ['id', 'fromLib', 'toLib', 'repoName', 'prevCommit', 'startCommit', 'endCommit', 'fileName', 'startCommitChanges', 'endCommitChanges', 'startCode_cleaned', 'endCode_cleaned', 'diff_cleaned', 'methods_before', 'methods_after', 'methods_diff'],
        num_rows: 2976
    })
})

In [23]:
from huggingface_hub import login
import os

access_token = os.environ.get('HF_TOKEN')

login(token = access_token)

username = 'blackwhite1337'
dataset_name = 'zTrans_dataset'

final_datasets.push_to_hub(f'{username}/{dataset_name}')

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:03<00:00,  1.23ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:02<00:00,  1.53ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:03<00:00,  1.30ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:02<00:00,  1.39ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:03<00:00,  1.31ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:02<00:00,  1.49ba/s]
Uploading the dataset shards: 100%|██████████| 6/6 [01:37<00:00, 16.18s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  1.62ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:09<00:00,  9.93s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:02<00:00,  1.43ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:11<00:00, 11.13s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/blackwhite1337/zTrans_dataset/commit/427e42bb6c28707798e4fdb7685a1a9f08eda5b5', commit_message='Upload dataset', commit_description='', oid='427e42bb6c28707798e4fdb7685a1a9f08eda5b5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/blackwhite1337/zTrans_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='blackwhite1337/zTrans_dataset'), pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset

username = 'blackwhite1337'
dataset_name = 'zTrans_dataset'

dataset = load_dataset(f'{username}/{dataset_name}')

Generating train split: 100%|██████████| 23800/23800 [00:12<00:00, 1890.35 examples/s]
Generating validation split: 100%|██████████| 2975/2975 [00:03<00:00, 949.63 examples/s] 
Generating test split: 100%|██████████| 2976/2976 [00:04<00:00, 710.33 examples/s]


In [None]:
dataset['train'].to_parquet(f'{data_prefix}/data_method_30k_train.parquet')
dataset['validation'].to_parquet(f'{data_prefix}/data_method_30k_val.parquet')
dataset['test'].to_parquet(f'{data_prefix}/data_method_30k_test.parquet')

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Creating parquet from Arrow format: 100%|██████████| 24/24 [00:10<00:00,  2.29ba/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  2.42ba/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  2.48ba/s]


365139365

In [24]:
test_df = final_datasets['test']

# build prompt

In [21]:
prompts: List[str] = []

BEGIN_TOKEN: str = '<｜fim▁begin｜>'
FILL_TOKEN: str = '<｜fim▁hole｜>'
END_TOKEN: str = '<｜fim▁end｜>'

prompt_template: str = '''// rewrite below method from library "{}" to "{}"
// ONLY write method code with no comments, imports. DONT WRITE TEXT.
{}
'''

for id in range(len(data_df)):
    line = data_df.iloc[id]

    from_lib: str = line['fromLib']
    to_lib: str = line['toLib']
    method_before: str = line['methods_before']
    file_name = line['fileName']

    if (len(method_before) == 0):
        continue

    prompt: str = prompt_template.format(from_lib, to_lib, method_before)
    ground_truth: str = line['methods_after']

    prompts.append({'prompt': prompt, 'ground_truth': ground_truth})

    if (len(prompts) == 2):
        break

In [22]:
print(prompts[1]['prompt'])

// rewrite below method from library "net.sf.ehcache:ehcache" to "org.ehcache:ehcache"
// ONLY write ```method code``` with no comments, imports. DONT WRITE TEXT.
public void clear(final String context) {
		final Ehcache ehCache = manager.getCache(context);
		if (ehCache != null) {
			ehCache.removeAll();
		}
	}



# gen

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id: str = 'deepseek-ai/deepseek-coder-6.7b-instruct'

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True,)

quantization_config = BitsAndBytesConfig(
    load_in_8bit = True,
)

device: str = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code = True,
    quantization_config = quantization_config,
    torch_dtype = torch.float16,
    device_map = 'auto',
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.43s/it]


In [24]:
# inputs = tokenizer.encode(prompt, return_tensors = 'pt').to(model.device)

messages = []

for i in range(len(prompts)):
    messages.append({
        'role': 'user',
        'content': prompts[i]['prompt'],
    },)

inputs = tokenizer.apply_chat_template(messages, add_generation_prompt = True, return_tensors = 'pt').to('cpu')

outputs = model.generate(
    inputs,
    max_new_tokens = 256,
    do_sample = False,
    top_k = 50,
    top_p = 0.95,
    eos_token_id = tokenizer.eos_token_id,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




In [30]:
len(inputs[0])

628

In [26]:
with torch.no_grad():
    output_text = tokenizer.decode(outputs[1][len(inputs[1]):], skip_special_tokens = True,)

print(output_text)

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [12]:
type(model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [43]:
print(prompts[0]['ground_truth'])

public void contextInitialized(ServletContextEvent event) {
        ServletContext context = event.getServletContext();
        SimpleClassScanner scanner = SimpleClassScanner.getInstance();
        Set<String> packages = scanner.getPackages(true);
        Object sc = context.getAttribute("javax.websocket.server.ServerContainer");
        if (sc instanceof ServerContainer) {
            ServerContainer container = (ServerContainer) sc;
            int total = 0;
            for (String p : packages) {
                List<Class<?>> endpoints = scanner.getAnnotatedClasses(p, ServerEndpoint.class);
                for (Class<?> cls : endpoints) {
                    if (!Feature.isRequired(cls)) {
                        continue;
                    }
                    try {
                        container.addEndpoint(cls);
                        ServerEndpoint ep = cls.getAnnotation(ServerEndpoint.class);
                        total++;
                        log.info("{} regist

In [None]:
import difflib

diff = difflib.unified_diff(prompt.splitlines(), output_text.splitlines(), lineterm = '')

print('\n'.join(diff))

In [4]:
import gc
gc.collect()
torch.cuda.empty_cache()
del model