In [1]:
from datasets import load_dataset
dataset = load_dataset('code_search_net', 'python')


In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})


In [3]:
sample = dataset['train'][0]
print(sample.keys())
print(sample['func_documentation_string'])
print(sample['func_name'])

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])
Multiprocessing target for the zmq queue device
ZeroMQReqServerChannel.zmq_device


# Base T5

In [10]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

# Input text
input_text = "Give me python code to accomplish this function: " + sample['func_documentation_string']

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs)

# Decode the output
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Input Text:", input_text)
print("Generated Text:", decoded_output)


Input Text: Give me python code to accomplish this function: Multiprocessing target for the zmq queue device
Generated Text: to accomplish this function: to accomplish this function: to accomplish this function:


# Code T5

In [14]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")

# Input text
input_text = "Give me python code to accomplish this function: " + sample['func_documentation_string']

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output
outputs = model.generate(**inputs, max_length=512)

# Decode the output
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Input Text:", input_text)
print("Generated Text:", decoded_output)
print("Correct Answer:", sample['func_code_string'])

Input Text: Give me python code to accomplish this function: Multiprocessing target for the zmq queue device
Generated Text:  public class MultiprocessingTarget {
Correct Answer: def zmq_device(self):
        '''
        Multiprocessing target for the zmq queue device
        '''
        self.__setup_signals()
        salt.utils.process.appendproctitle('MWorkerQueue')
        self.context = zmq.Context(self.opts['worker_threads'])
        # Prepare the zeromq sockets
        self.uri = 'tcp://{interface}:{ret_port}'.format(**self.opts)
        self.clients = self.context.socket(zmq.ROUTER)
        if self.opts['ipv6'] is True and hasattr(zmq, 'IPV4ONLY'):
            # IPv6 sockets work for both IPv6 and IPv4 addresses
            self.clients.setsockopt(zmq.IPV4ONLY, 0)
        self.clients.setsockopt(zmq.BACKLOG, self.opts.get('zmq_backlog', 1000))
        self._start_zmq_monitor()
        self.workers = self.context.socket(zmq.DEALER)

        if self.opts.get('ipc_mode', '') == 'tc

# Qwen2.5 Code

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me python code to accomplish this function: " + sample['func_documentation_string']
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Input Text:", prompt)
print("Generated Text:", response)
print("Correct Answer:", sample['func_code_string'])

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Input Text: Give me python code to accomplish this function: Multiprocessing target for the zmq queue device
Generated Text: Sure! Below is an example of Python code that sets up a multiprocessing environment using ZeroMQ (zmq) to create a queue device. This setup involves two processes:

1. A frontend process that receives messages from clients.
2. A backend process that forwards these messages to workers.

Here's the complete code:

```python
import zmq
from multiprocessing import Process, Queue

def frontend(queue):
    context = zmq.Context()
    socket = context.socket(zmq.ROUTER)
    socket.bind("tcp://*:5559")

    while True:
        # Receive message from client
        identity, _, msg = socket.recv_multipart()
        print(f"Frontend received message from {identity.decode()}: {msg.decode()}")
        
        # Send message to backend
        queue.put((identity, msg))

def backend(queue):
    context = zmq.Context()
    socket = context.socket(zmq.DEALER)
    socket.bind("

# Qwen2.5

In [4]:
import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = "Give me python code to accomplish this function: " + sample['func_documentation_string']
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Input Text:", prompt)
print("Generated Text:", response)
print("Correct Answer:", sample['func_code_string'])

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Input Text: Give me python code to accomplish this function: Multiprocessing target for the zmq queue device
Generated Text: Certainly! To create a multiprocessing target for a ZeroMQ (zmq) queue device in Python, you can use the `multiprocessing` module along with the `zmq` library. Below is an example of how you can achieve this.

First, ensure you have the `pyzmq` library installed. You can install it using pip:

```sh
pip install pyzmq
```

Here's an example of a multiprocessing target that uses a ZeroMQ queue:

```python
import multiprocessing
import zmq
import time

# Function to be executed in each process
def zmq_queue_worker(queue):
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    socket.connect("tcp://localhost:5557")  # Replace with your actual address

    while True:
        message = queue.get()
        if message == 'STOP':
            break
        print(f"Worker received: {message}")
        # Simulate some work
        time.sleep(1)
        socket