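This notebook evaluates the quantized StarCoder2 7B model on the HumanEval benchmark. The original target was the ru_humaneval dataset with a system prompt; as checked in, the English `openai_humaneval` dataset is the active one (the ru_humaneval loader is commented out), and the `dialog` (system prompt) argument is accepted but not yet used during generation.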

# Installations and imports

In [None]:
!pip install accelerate
!pip install bitsandbytes



In [None]:
from tqdm import tqdm

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

In [None]:
!pip install datasets
from datasets import load_dataset



In [None]:
import re
import pandas as pd

In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import multiprocessing

# Choosing dataset

## MBPP

In [None]:
# dataset = load_dataset("mbpp")
# dataset

In [None]:
# dataset_prompt = dataset['prompt']

In [None]:
# dataset = dataset['test']

## EN_HumanEval

In [None]:
# Load the English HumanEval benchmark; the trailing expression displays the
# resulting DatasetDict so its splits and columns can be inspected.
dataset = load_dataset('openai_humaneval')
dataset

Downloading data:   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})

In [None]:
# Keep only the 'test' split (the single split of this dataset).
# NOTE(review): 'test' is also the name of a column inside that split, and this
# cell rebinds `dataset`; re-running it without re-running the load cell would
# select that column instead of the split.
dataset = dataset['test']

In [None]:
# dataset['prompt'][0]

In [None]:
# print(dataset['test'][0][61:])

In [None]:
# print(dataset['entry_point'][0])

## RU_HumanEval

In [None]:
# dataset = load_dataset('NLPCoreTeam/humaneval_ru')
# dataset

In [None]:
# dataset = dataset['train']
# dataset

In [None]:
# dataset['prompt'][0]

In [None]:
# dataset['test'][0][61:]

In [None]:
# dataset['entry_point'][0]

# Choosing model

## DeepSeek 6.7B Instruct

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = "deepseek-ai/deepseek-coder-6.7b-instruct"
# tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map='auto', load_in_8bit=True)

## Codellama 7B Instruct

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = "codellama/CodeLlama-7b-Instruct-hf"
# #model = "deepseek-ai/deepseek-coder-1.3b-instruct"
# tokenizer = AutoTokenizer.from_pretrained(model)
# model = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, device_map='auto', load_in_8bit=True)

## Starcoder2 7B

In [None]:
# Inputs are moved to this device before generation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keep the checkpoint id in its own name so `model` only ever refers to the loaded model.
model_name = "bigcode/starcoder2-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')

# Request 8-bit quantization through `quantization_config`; passing
# `load_in_8bit=True` directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    quantization_config=transformers.BitsAndBytesConfig(load_in_8bit=True),
)

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/893 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.51G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Generation and data preparation functions

In [None]:
def generate(model, dataset, shots=0, num_tests=None, do_sample=False, top_p=-1.0, top_k=0, temperature=1.0, dialog=None):
    """Generate one completion per prompt.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``.
        dataset: Sliceable sequence of prompt strings.
        shots: Accepted for interface compatibility; currently unused.
        num_tests: Number of leading prompts to process. ``None`` (default)
            processes every prompt in ``dataset``.
        do_sample: Whether to sample instead of decoding greedily.
        top_p, top_k, temperature: Sampling parameters; only forwarded to the
            model when ``do_sample`` is true.
        dialog: Accepted for interface compatibility; currently unused.

    Returns:
        Tuple ``(prompt_list, responses)``: the original (untruncated) prompts
        that were processed and the decoded model outputs, which include the
        prompt text followed by the completion.
    """
    # Slicing with None keeps the whole sequence, so the default no longer
    # depends on a global `dataset` existing when this function is defined.
    prompt_list = dataset[:num_tests]
    model.eval()
    responses = []

    # Sampling knobs have no effect on greedy decoding; forwarding them anyway
    # only triggers configuration warnings, so pass them only when sampling.
    sampling_kwargs = {}
    if do_sample:
        sampling_kwargs = {'top_p': top_p, 'top_k': top_k, 'temperature': temperature}

    for prompt in prompt_list:
        # Cut the docstring at the first doctest example and close it again.
        # `find` returns -1 when there is no example; slicing with -1 would
        # silently drop the prompt's last character, so leave such prompts intact.
        ind = prompt.find('>>>')
        if ind != -1:
            prompt = prompt[:ind] + '"""'

        # Calling the tokenizer (rather than `encode`) also yields the attention mask.
        encoded = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_new_tokens=200,
                num_return_sequences=1,
                do_sample=do_sample,
                # Make the padding token explicit instead of relying on the
                # implicit fallback to the end-of-sequence token.
                pad_token_id=tokenizer.eos_token_id,
                **sampling_kwargs,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        responses.append(response)

    return prompt_list, responses

# Testing

In [None]:
def extract_code(text):
    """Pull the code portion out of a model response.

    Capturing starts at a line beginning with ```python (the fence line itself
    is never kept) or at the first line beginning with ``import``, ``from`` or
    ``def`` (that line is kept). Once capturing, it stops at a bare ``` line or
    at a line beginning with ``print``, ``def main()``, ``assert`` or
    ``# Test``; the stopping line is not included.

    Returns the captured lines joined with newlines ('' if nothing was captured).
    """
    openers = ('import', 'from', 'def')
    terminators = ('print', 'def main()', 'assert', '# Test')

    collected = []
    capturing = False

    for row in text.split('\n'):
        # An opening fence always switches capturing on and is always skipped.
        if row.startswith('```python'):
            capturing = True
            continue

        if not capturing:
            # Outside a code region only an import/def line can start one.
            if row.startswith(openers):
                capturing = True
                collected.append(row)
            continue

        if row == '```' or row.startswith(terminators):
            break
        collected.append(row)

    return '\n'.join(collected)

In [None]:
def exec_code(code, result_queue):
    """Execute ``code`` and report success or failure on ``result_queue``.

    Args:
        code: Python source to execute.
        result_queue: Object with a ``put`` method; receives ``True`` if the
            code ran to completion and ``False`` if it raised anything.
    """
    # Run the code in one fresh dict used as both globals and locals. With a
    # bare `exec(code)` inside a function, top-level names defined by the code
    # become function locals, so functions defined by the code cannot see its
    # imports, its helper functions, or themselves (recursion) at call time.
    namespace = {'__name__': '__main__'}
    try:
        exec(code, namespace)
    except BaseException:
        # BaseException rather than Exception: code that calls exit() raises
        # SystemExit, which would otherwise end this function without ever
        # reporting a result to the waiting caller.
        passed = False
    else:
        passed = True
    result_queue.put(passed)

def run(code, timeout=5):
    """Execute ``code`` in a separate process with a time limit.

    Args:
        code: Python source to execute via ``exec_code``.
        timeout: Seconds to wait for the child process before terminating it.

    Returns:
        ``True`` if the code finished without raising within the time limit;
        ``False`` if it raised, timed out, or the child process ended without
        reporting a result.
    """
    from queue import Empty  # raised by multiprocessing.Queue.get on timeout

    result_queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=exec_code, args=(code, result_queue))
    p.start()
    p.join(timeout)

    if p.is_alive():
        print("Execution time has been exceeded. Process killed.")
        p.terminate()
        p.join()
        return False

    # The child may have died without putting a result (e.g. it was killed or
    # exited abruptly). A blocking get() would then hang forever, so wait only
    # briefly and count a missing result as a failure.
    try:
        return result_queue.get(timeout=1)
    except Empty:
        return False

def test(num_tests, dataset, responses, df):
    """Run the benchmark checks against the first ``num_tests`` responses.

    For each response the code is extracted with ``extract_code``, combined
    with the problem's test harness and a ``check(<entry_point>)`` call, and
    executed through ``run``.

    Args:
        num_tests: Number of leading responses to evaluate.
        dataset: Object whose ``'test'`` and ``'entry_point'`` columns can be
            indexed by position.
        responses: Model outputs, aligned by position with ``dataset``.
        df: DataFrame updated in place; rows ``0..num_tests-1`` receive the
            ``'code'``, ``'tests'`` and ``'result'`` values.

    Returns:
        Tuple ``(codes, tests, results)`` of per-problem lists, where each
        result is ``'Ok'`` or ``'Error'``.
    """
    codes = []
    tests = []
    results = []

    # Look the columns up once; indexing the dataset by column name inside the
    # loop would repeat the same lookup on every iteration.
    test_column = dataset['test']
    entry_points = dataset['entry_point']

    for i in range(num_tests):
        code = extract_code(responses[i])
        df.loc[i, 'code'] = code
        codes.append(code)

        test_source = test_column[i]
        tests.append(test_source)
        function_name = entry_points[i]
        # Full program: candidate solution, the harness defining check(), then the call.
        code = f'{code}\n{test_source}\ncheck({function_name})'
        df.loc[i, 'tests'] = test_source

        print(code)
        flag = run(code)
        result = 'Ok' if flag else 'Error'
        results.append(result)
        df.loc[i, 'result'] = result

    return codes, tests, results

In [None]:
# def save_results(df, parameters, prompt_list, responses, codes, tests, results):
#     bias = len(df)
#     df.loc[bias, 'parameters'] = parameters
#     for i in range(len(prompt_list)):
#         df.loc[bias + i, 'prompt'] = prompt_list[i]
#         df.loc[bias + i, 'response'] = responses[i]
#         df.loc[bias + i, 'code'] = codes[i]
#         df.loc[bias + i, 'tests'] = tests[i]
#         df.loc[bias + i, 'result'] = results[i]

In [None]:
def predict(df, model, dataset, shots=0, num_tests=None, do_sample=False, top_p=-1.0, top_k=0, temperature=1.0, dialog=None):
    """Generate completions for the benchmark prompts and evaluate them.

    Args:
        df: DataFrame filled in place with the ``'prompt'``, ``'response'``,
            ``'code'``, ``'tests'`` and ``'result'`` columns.
        model: Model passed through to ``generate``.
        dataset: Object with ``'prompt'``, ``'test'`` and ``'entry_point'`` columns.
        shots: Passed through to ``generate``.
        num_tests: Number of leading problems to evaluate. ``None`` (default)
            evaluates every prompt in ``dataset``.
        do_sample, top_p, top_k, temperature: Decoding settings passed through
            to ``generate``.
        dialog: Passed through to ``generate``; ``None`` is treated as an
            empty list.
    """
    # Resolve defaults at call time: a default computed in the signature is
    # frozen when the cell is run, and a list default is shared between calls.
    if num_tests is None:
        num_tests = len(dataset['prompt'])
    if dialog is None:
        dialog = []

    # Forward the caller's `do_sample`; hard-coding False here would silently
    # turn every run into greedy decoding regardless of the argument.
    prompt_list, responses = generate(
        model=model,
        dataset=dataset['prompt'],
        shots=shots,
        num_tests=num_tests,
        do_sample=do_sample,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        dialog=dialog,
    )
    df['prompt'] = pd.Series(prompt_list)
    df['response'] = pd.Series(responses)
    test(dataset=dataset, num_tests=num_tests, responses=responses, df=df)

# Getting results

In [None]:
# Total number of problems available, taken from the length of the 'prompt' column.
NUM_TESTS = len(dataset['prompt'])

In [None]:
# Display the problem count.
NUM_TESTS

164

In [None]:
# One row per problem: the prompt, the raw model response, the extracted code,
# the test harness and the verdict.
df = pd.DataFrame(columns = ['prompt', 'response', 'code', 'tests', 'result'])
# Only the first 20 problems are evaluated here (num_tests=20), not all NUM_TESTS.
predict(df, model=model, dataset=dataset, shots=0, num_tests=20, do_sample=True, top_k=40, top_p=0.95, temperature=0.2, dialog=[])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generati

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    """
    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False




METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False


check(has_close_elements)
from typing import List


def separate_pare

In [None]:
# Display the results table filled in by predict().
df

Unnamed: 0,prompt,response,code,tests,result
0,from typing import List\n\n\ndef has_close_ele...,from typing import List\n\n\ndef has_close_ele...,from typing import List\n\n\ndef has_close_ele...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Ok
1,from typing import List\n\n\ndef separate_pare...,from typing import List\n\n\ndef separate_pare...,from typing import List\n\n\ndef separate_pare...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
2,\n\ndef truncate_number(number: float) -> floa...,\n\ndef truncate_number(number: float) -> floa...,def truncate_number(number: float) -> float:\n...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
3,from typing import List\n\n\ndef below_zero(op...,from typing import List\n\n\ndef below_zero(op...,from typing import List\n\n\ndef below_zero(op...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
4,from typing import List\n\n\ndef mean_absolute...,from typing import List\n\n\ndef mean_absolute...,from typing import List\n\n\ndef mean_absolute...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Ok
5,from typing import List\n\n\ndef intersperse(n...,from typing import List\n\n\ndef intersperse(n...,from typing import List\n\n\ndef intersperse(n...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
6,from typing import List\n\n\ndef parse_nested_...,from typing import List\n\n\ndef parse_nested_...,from typing import List\n\n\ndef parse_nested_...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
7,from typing import List\n\n\ndef filter_by_sub...,from typing import List\n\n\ndef filter_by_sub...,from typing import List\n\n\ndef filter_by_sub...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
8,"from typing import List, Tuple\n\n\ndef sum_pr...","from typing import List, Tuple\n\n\ndef sum_pr...","from typing import List, Tuple\n\n\ndef sum_pr...","\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error
9,"from typing import List, Tuple\n\n\ndef rollin...","from typing import List, Tuple\n\n\ndef rollin...","from typing import List, Tuple\n\n\ndef rollin...","\n\nMETADATA = {\n 'author': 'jt',\n 'da...",Error


# Saving results

In [None]:
# Write the results table to CSV; index=False leaves the row index out of the file.
df.to_csv('starcoder2_en_humaneval.csv', index=False)