### Setup

In [2]:
import gzip
import json
from typing import List
from collections import Counter, defaultdict
from tqdm import tqdm
import os
from openai import OpenAI, AsyncOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from dotenv import load_dotenv
import torch as t
import subprocess
import asyncio

In [3]:
# Load variables from a local .env file (python-dotenv) into the process
# environment so that secrets stay out of the notebook itself.
load_dotenv()
# Authenticate against the Hugging Face Hub. Indexing os.environ directly means
# a missing HF_TOKEN fails immediately with a KeyError.
login(token = os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Data Processing

In [4]:
def stream_jsonl_all(filename: str) -> list:
    """Read every JSON record from a gzip-compressed JSON Lines file.

    Despite the "stream" in the name, the whole file is parsed eagerly and
    returned as a single list.

    Args:
        filename: Path to a gzip file containing one JSON document per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []
    # Let gzip open the path itself inside a context manager: the file is then
    # closed even when json.loads raises, and no separate raw file handle is
    # left dangling. UTF-8 is pinned because JSON text is UTF-8 and the default
    # text encoding otherwise depends on the platform locale.
    with gzip.open(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            # Skip lines that are empty or contain only whitespace.
            if line.strip():
                results.append(json.loads(line))

    return results

In [5]:
# Load one benchmark file per language, in a fixed order, into `content`;
# each entry is the full list of records for that language.
content = [
    stream_jsonl_all(path)
    for path in (
        'data/python_data.gz',
        'data/cpp_data.gz',
        'data/go_data.gz',
        'data/java_data.gz',
        'data/js_data.gz',
    )
]
# Also expose each language's records under its own name; these are the same
# list objects that `content` holds, in the same order.
python_content, cpp_content, go_content, java_content, js_content = content

In [6]:
# Load the records stored in the Python generations file with the same
# gzip JSON Lines reader used for the benchmark data.
generations = stream_jsonl_all('data/python_generations.gz')
# Bare last expression so the notebook displays the first record's
# 'generation' field, giving a quick look at the raw text format.
generations[0]['generation']

'    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n'

In [7]:
# Print the field names of the first record in each language's dataset,
# separated by a blank line. Here `lang` is the list of records for one
# language; the key sets are not the same across languages, so check before
# relying on a field.
for lang in content:
    print(lang[0].keys(), end="\n\n")

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'text', 'declaration', 'example_test'])

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'declaration', 'example_test'])

dict_keys(['task_id', 'prompt', 'import', 'docstring', 'declaration', 'canonical_solution', 'test', 'test_setup', 'example_test'])

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'text', 'declaration', 'example_test'])

dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'declaration', 'example_test'])



### Async Test

In [None]:
async def task1():
    """Demo coroutine: print a start marker, sleep for 2 seconds, print an end marker."""
    print("Task 1: Start")
    # Awaiting the sleep hands control back to the event loop, so other
    # coroutines can make progress during the pause.
    await asyncio.sleep(2)
    print("Task 1: End")

async def task2():
    """Demo coroutine: print a start marker, sleep for 1 second, print an end marker."""
    print("Task 2: Start")
    # Shorter pause than task1, so when both run concurrently this one
    # finishes first.
    await asyncio.sleep(1)
    print("Task 2: End")

# Run both coroutines concurrently and wait for both to finish. Top-level
# `await` is supported by the IPython kernel; in a plain script, wrap this call
# in `async def main()` and start it with `asyncio.run(main())` instead.
await asyncio.gather(task1(), task2())

Task 1: Start
Task 2: Start
Task 2: End
Task 1: End


[None, 'hi']

In [23]:
async def chat_async(prompt: str = 'hello world', model: str = 'gpt-4.1-mini') -> str:
    """Send a single prompt through the async OpenAI client and return the reply text.

    Args:
        prompt: Text passed as the request's ``input``. Defaults to the
            original smoke-test prompt, so ``await chat_async()`` still works.
        model: Model name passed to the request. Defaults to the model the
            function was originally hard-coded to use.

    Returns:
        The ``text`` of the first content part of the last output item.

    NOTE(review): a new ``AsyncOpenAI()`` client is built on every call with no
    arguments, so credentials are expected to come from the environment —
    confirm this is acceptable before calling in a tight loop.
    """
    client = AsyncOpenAI()

    response = await client.responses.create(
        model = model,
        input = prompt,
    )

    # Reads the first content part of the final output item; assumes that item
    # is a text message — TODO confirm for tool-call or refusal responses.
    text_out = response.output[-1].content[0].text
    return text_out

def chat_reg(prompt: str = 'hello world', model: str = 'gpt-4.1-mini') -> str:
    """Send a single prompt through the blocking OpenAI client and return the reply text.

    Args:
        prompt: Text passed as the request's ``input``. Defaults to the
            original smoke-test prompt, so ``chat_reg()`` still works.
        model: Model name passed to the request. Defaults to the model the
            function was originally hard-coded to use.

    Returns:
        The ``text`` of the first content part of the last output item.

    NOTE(review): a new ``OpenAI()`` client is built on every call with no
    arguments, so credentials are expected to come from the environment —
    confirm this is acceptable before calling in a tight loop.
    """
    client = OpenAI()

    response = client.responses.create(
        model = model,
        input = prompt,
    )

    # Reads the first content part of the final output item; assumes that item
    # is a text message — TODO confirm for tool-call or refusal responses.
    text_out = response.output[-1].content[0].text
    return text_out

In [27]:
# Smoke test: issue one request through the async client and display the reply.
await chat_async()

'Hello! How can I assist you today?'