# init

In [68]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
import datasets
import torch
import os
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Any, Callable
import argparse
from tqdm import tqdm
import time

from transformers.utils import logging
logging.set_verbosity_error()
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

seed = 18022004
np.random.seed(seed)
set_seed(seed)

data_prefix: str = 'data'
repo_prefix: str = f'{data_prefix}/repos'

prompt_template: str = '''rewrite below method from library "{}" to "{}". ONLY WRITE CODE, NO COMMENTS, IMPORTS, TEXT, NO EXPLAIN.
```
{}
```
'''

batch_prompt_template: str = '''<｜begin▁of▁sentence｜>### Instruction:
you're a software engineer working on a project. ONLY RESPOND WITH CODE, NO COMMENTS, IMPORTS, TEXT, NO EXPLAIN.
rewrite below method from library "{}" to "{}".
```
{}
```

### Response:
'''

# init code

In [52]:
def calculate_time(function: Callable, *args, **kwargs) -> Any:
    start_time: float = time.time()
    result = function(*args, **kwargs)
    end_time: float = time.time()
    print(f'Executed {function.__name__} in {end_time - start_time} seconds')
    print('-' * 50)

    return result

def build_prompts(data_df: pd.DataFrame, batched: bool) -> List[Any]:
    prompts: List[Any] = []

    BEGIN_TOKEN: str = '<｜fim▁begin｜>'
    FILL_TOKEN: str = '<｜fim▁hole｜>'
    END_TOKEN: str = '<｜fim▁end｜>'

    for id in tqdm(range(len(data_df)), desc = 'Building prompts'):
        line = data_df.iloc[id]

        from_lib: str = line['fromLib']
        to_lib: str = line['toLib']
        method_before: str = line['method_before']
        ground_truth: str = line['method_after']

        if (batched):
            prompt: str = batch_prompt_template.format(from_lib, to_lib, method_before)
        else:
            prompt: str = prompt_template.format(from_lib, to_lib, method_before)
        ground_truth: str = line['method_after']

        prompts.append({'id': line['id'], 'prompt': prompt, 'ground_truth': ground_truth})

    return prompts

def build_prompts_hf(data_df: datasets.arrow_dataset.Dataset) -> List[Any]:
    prompts: List[Any] = []

    BEGIN_TOKEN: str = '<｜fim▁begin｜>'
    FILL_TOKEN: str = '<｜fim▁hole｜>'
    END_TOKEN: str = '<｜fim▁end｜>'

    for id in tqdm(range(len(data_df)), desc = 'Building prompts'):
        line = data_df[id]

        from_lib: str = line['fromLib']
        to_lib: str = line['toLib']
        method_before: str = line['method_before']
        ground_truth: str = line['method_after']

        prompt: str = prompt_template.format(from_lib, to_lib, method_before)

        prompts.append({'id': line['id'], 'prompt': prompt, 'ground_truth': ground_truth})

    return prompts

def build_tokenizer(args: argparse.Namespace) -> AutoTokenizer:
    model_id: str = args.model
    tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True,)

    return tokenizer

def build_model(args: argparse.Namespace) -> AutoModelForCausalLM:
    quantization_config = BitsAndBytesConfig(
        load_in_8bit = True,
    )

    device_id: str = args.device
    model_id: str = args.model
    os.environ['CUDA_VISIBLE_DEVICES'] = device_id

    # device: str = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code = True,
        quantization_config = quantization_config,
        torch_dtype = torch.float16,
        device_map = 'auto',
    )

    return model

def decode_outputs(tokenizer: AutoTokenizer, outputs: List[Any]) -> List[Any]:
    results: List[Any] = []

    with torch.no_grad():
        for output in tqdm(outputs, desc = 'Decoding'):
            id = output['id']
            single_inputs = output['inputs']
            single_outputs = output['outputs']
            prompt = output['prompt']

            decoded_output = tokenizer.decode(single_outputs[0][len(single_inputs[0]):], skip_special_tokens = True)

            results.append(
                {
                    'id': id,
                    'output': decoded_output,
                    'prompt': prompt,
                }
            )

    return results

def save_results(args: argparse.Namespace, results: List[str], data_df: pd.DataFrame):
    output_name: str = args.output_file

    valid_ids = [result['id'] for result in results]
    res_df = data_df[data_df['id'].isin(valid_ids)].copy()

    res_df['predicted'] = ''
    res_df['prompt'] = ''

    for id in range(len(results)):
        sample = results[id]

        res_df.loc[res_df['id'] == sample['id'], 'prompt'] = sample['prompt']
        res_df.loc[res_df['id'] == sample['id'], 'predicted'] = sample['output']

    res_df.to_parquet(f'{data_prefix}/{output_name}', engine = 'pyarrow')

def build_message_inputs(prompts: List[Any], tokenizer: AutoTokenizer) -> List[Any]:
    messages: List[Any] = []
    valid_inputs: List[Any] = []

    for id in tqdm(range(len(prompts)), desc = 'Building inputs'):
        sample = prompts[id]

        messages = [(
            {
                'role': 'user',
                'content': sample['prompt'],
            }
        )]

        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt = True, padding = True, truncation = True, return_tensors = 'pt').to('cpu')

        valid_inputs.append({
            'id': sample['id'],
            'inputs': inputs,
            'prompt': sample['prompt'],
        })

    return valid_inputs

def generate_from_inputs(args: argparse.Namespace, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, valid_inputs: List[Any], data_df: pd.DataFrame) -> List[Any]:
    outputs: List[Any] = []

    max_new_tokens: int = args.max_new_tokens
    do_sample: bool = args.do_sample
    top_k: int = args.top_k
    top_p: float = args.top_p

    for sample in tqdm(valid_inputs, desc = 'Generating'):
        id = sample['id']
        single_inputs = sample['inputs']
        prompt = sample['prompt']

        single_inputs = single_inputs.to(model.device)
        single_outputs = model.generate(
            single_inputs,
            max_new_tokens = max_new_tokens,
            do_sample = do_sample,
            top_k = top_k,
            top_p = top_p,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id,
        )

        single_inputs = single_inputs.to('cpu')

        outputs.append(
            {
                'id': id,
                'inputs': single_inputs,
                'outputs': single_outputs,
                'prompt': prompt,
            }
        )

        # save results every 15 samples
        if (len(outputs) % 15 == 0):
            results: List[Any] = decode_outputs(tokenizer = tokenizer, outputs = outputs)

            save_results(args = args, results = results, data_df = data_df)

            print(f'saved results for {len(outputs)} samples')
            print('-' * 50)
            print()

    return outputs

# generation

In [35]:
args_dict: Dict[Any, Any] = {
    'input_file': 'migration_others_dataset_cutoff_test_512.parquet',
    'output_file': 'migration_others_dataset_cutoff_test_512.parquet',
    'dataset_id': 'blackwhite1337/zTrans_dataset_512',
    'split': 'test',

    'model': 'deepseek-ai/deepseek-coder-6.7b-instruct',
    'device': '0',
    'batch_size': 2,

    'max_length': 512,
    'max_new_tokens': 512,
    'do_sample': False,
    'top_k': 50,
    'top_p': 0.95,
}

args_list = [f'--{key}={value}' for key, value in args_dict.items()]

parser = argparse.ArgumentParser(description = 'Process a file.')

# data parameters
parser.add_argument('--input_file', type = str, nargs = '?', default = 'sampled_no_code.parquet', help = 'The name of the file to process')
parser.add_argument('--output_file', type = str, nargs = '?', default = 'sampled_code.parquet', help = 'The name of the file to output')
parser.add_argument('--dataset_id', type = str, nargs = '?', default = 'blackwhite1337/zTrans_dataset', help = 'Dataset ID on Huggingface')
parser.add_argument('--split', type = str, nargs = '?', default = 'test', help = 'Dataset split to use')

# model parameters
parser.add_argument('--model', type = str, nargs = '?', default = 'deepseek-ai/deepseek-coder-6.7b-instruct', help = 'Model ID on Huggingface')
parser.add_argument('--device', nargs = '?', default = '0', help = 'GPU ID to use')
parser.add_argument('--batch_size', type = int, nargs = '?', default = 2, help = 'Batch size per CPU/GPU for generation')

# generation parameters
parser.add_argument('--max_length', type = int, nargs = '?', default = 256, help = 'Max length of the prompt')
parser.add_argument('--max_new_tokens', type = int, nargs = '?', default = 256, help = 'Max new tokens to generate')
parser.add_argument('--do_sample', type = bool, nargs = '?', default = False, help = 'Whether to sample or not')
parser.add_argument('--top_k', type = int, nargs = '?', default = 50, help = 'Top k tokens to sample from')
parser.add_argument('--top_p', type = float, nargs = '?', default = 0.95, help = 'Top p tokens to sample from')

args = parser.parse_args(args_list)

In [4]:
dataset_id: str = args.dataset_id
split: str = args.split

data_df: pd.DataFrame = datasets.load_dataset(dataset_id, split = split).to_pandas()

In [69]:
prompts: List[Any] = calculate_time(build_prompts, data_df = data_df, batched = True)

Building prompts: 100%|██████████| 4979/4979 [00:02<00:00, 1924.58it/s]

Executed build_prompts in 2.619231700897217 seconds
--------------------------------------------------





In [6]:
tokenizer: AutoTokenizer = calculate_time(build_tokenizer, args = args)
model: AutoModelForCausalLM = calculate_time(build_model, args = args)

Executed build_tokenizer in 0.6552720069885254 seconds
--------------------------------------------------


Loading checkpoint shards: 100%|██████████| 2/2 [01:00<00:00, 30.21s/it]


Executed build_model in 65.6509382724762 seconds
--------------------------------------------------


In [None]:
def build_batched_inputs(args: argparse.Namespace, prompts: List[Any], tokenizer: AutoTokenizer) -> List[Any]:
    inputs: List[Any] = tokenizer.batch_encode_plus(prompts, padding = True, truncation = True, return_tensors = 'pt').to('cpu')

    return inputs

def generate_from_prompts(args: argparse.Namespace, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompts: List[Any], data_df: pd.DataFrame) -> List[Any]:
    outputs: List[Any] = []

    batch_size: int = args.batch_size
    batch_size: int = 5
    max_new_tokens: int = args.max_new_tokens
    do_sample: bool = args.do_sample
    top_k: int = args.top_k
    top_p: float = args.top_p

    def create_batches(prompts: List[Any], batch_size: int) -> List[List[Any]]:
        batches: List[List[Any]] = []

        for i in range(0, len(prompts), batch_size):
            batches.append(prompts[i:i + batch_size])

        return batches

    batches: List[List[Any]] = create_batches(prompts, batch_size)

    for batch in tqdm(batches, desc = 'Generating'):
        # valid_inputs: List[Any] = build_inputs(batch, tokenizer)

        # print(len(valid_inputs))
        prompts = [sample['prompt'] for sample in batch]

        inputs: List[Any] = build_batched_inputs(args = args, prompts = prompts, tokenizer = tokenizer)
        inputs = inputs.to(model.device)
        batch_outputs = model.generate(
            **inputs,
            max_new_tokens = max_new_tokens,
            do_sample = do_sample,
            top_k = top_k,
            top_p = top_p,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id,
        )
        inputs = inputs.to('cpu')
        torch.cuda.empty_cache()

        with torch.no_grad():
            truncated_outputs = [val[len(inputs[id]):] for id, val in enumerate(batch_outputs)]
            batch_decoded_outputs = tokenizer.batch_decode(truncated_outputs, skip_special_tokens = True)

            for i in range(len(batch)):
                outputs.append(
                    {
                        'output': batch_decoded_outputs[i],
                        'id': batch[i]['id'],
                        'prompt': batch[i]['prompt'],
                    }
                )

        # save results every 5 batches
        if (len(outputs) % 5 == 0):
            results: List[Any] = decode_outputs(tokenizer = tokenizer, outputs = outputs)

            save_results(args = args, results = results, data_df = data_df)

            print(f'saved results for {len(outputs)} samples')
            print('-' * 50)
            print()

    return outputs

In [32]:
messages = [
    {
        'role': 'user',
        'content': prompts[0]['prompt']
    },
]

vcl = tokenizer.apply_chat_template(messages, add_generation_prompt = True, padding = True, truncation = True, tokenize = False, return_tensors = 'pt')

print(vcl)

<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer
### Instruction:
rewrite below method from library "org.springframework.security.oauth:spring-security-oauth2" to "io.jsonwebtoken:jjwt". ONLY WRITE CODE, NO COMMENTS, IMPORTS, TEXT, NO EXPLAIN.
```
@Override
    public boolean upgrade() {
        logger.info("Deploying registered {} domain", ADMIN_DOMAIN);
        try {
                               
            Domain adminDomain = domainService.findById(ADMIN_DOMAIN).blockingGet();
            eventManager.publishEvent(DomainEvent.DEPLOY, adminDomain);
            return true;
        } catch (DomainNotFoundException dnfe) {
            logger.error("Failed to find admin domain", dnfe);
            throw new IllegalStat

In [71]:
outputs: List[Any] = calculate_time(generate_from_prompts, args = args, model = model, tokenizer = tokenizer, prompts = prompts, data_df = data_df)

Generating:   0%|          | 0/996 [07:31<?, ?it/s]

Executed generate_from_prompts in 451.3585968017578 seconds
--------------------------------------------------





In [91]:
print(tokenizer.split_special_tokens)

False


In [76]:
id = 2
# print(outputs[id]['prompt'])
# print('-' * 50)
print(outputs[id]['output'])

```java
public void visit(OWLDataMinCardinality<?> ce) {
    writeCardinality(1, ce);
}
```
### Response:
```java
public void visit(OWLDataMinCardinality<?> ce) {
    writeCardinality(ce.getFiller(), ce);
}
```

### Response:
```java
public void visit(OWLDataMinCardinality<?> ce) {
    writeCardinality(1, ce);
}
```

### Response:
```java
public void visit(OWLDataMinCardinality<?> ce) {
    writeCardinality(1, ce);
}
```

### Response:
```java
public void visit(OWLDataMinCardinality<?> ce) {
    writeCardinality(1, ce);
}
```

