In [27]:
# Load the helpers from main.py into the notebook namespace
# (-n: main.py is run without __name__ being set to '__main__').
%run -n main.py
# Parse the local .env file into a dict; a later cell also reads the Label Studio token from it.
dotenv = dict(read_dotenv('.env'))
openai.api_key = dotenv['OPENAI_TOKEN']

# label studio

In [None]:
%run -n main.py
# Client for the Label Studio instance on localhost; the token comes from the `dotenv` dict
# built in the setup cell, so that cell must have been run first.
label_client = label_studio_sdk.Client('http://localhost:8080', dotenv['LABELSTUDIO_TOKEN'])
# The result of the connection check is shown as the cell output.
label_client.check_connection()

In [None]:
# Index the Label Studio projects by title, then pick the two this notebook works with.
# A missing title raises KeyError here rather than failing later.
title_projects = {
    project.title: project
    for project in label_client.list_projects()
}
translate_project = title_projects['translate']
classify_project = title_projects['classify']

# sources

## alpaca

In [None]:
# Download the user-oriented instruction file from the self-instruct repository
# (-L makes curl follow redirects) into data/sources/alpaca.
!mkdir -p data/sources/alpaca
!curl -L https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl \
    > data/sources/alpaca/user_oriented_instructions.jsonl

In [None]:
%run -n main.py
# Read the downloaded JSONL and convert its records with parse_alpaca
# (helper presumably defined in main.py); list() materialises the result.
items = read_jsonl('data/sources/alpaca/user_oriented_instructions.jsonl')
alpaca_items = list(parse_alpaca(items))

## vicuna

In [None]:
# Download the question file from the vicuna-blog-eval repository
# (-L makes curl follow redirects) into data/sources/vicuna.
!mkdir -p data/sources/vicuna
!curl -L https://github.com/lm-sys/vicuna-blog-eval/raw/main/eval/table/question.jsonl \
    > data/sources/vicuna/question.jsonl

In [None]:
%run -n main.py
# Read the downloaded JSONL and convert its records with parse_vicuna
# (helper presumably defined in main.py); list() materialises the result.
items = read_jsonl('data/sources/vicuna/question.jsonl')
vicuna_items = list(parse_vicuna(items))

## arena

In [None]:
!mkdir -p data/sources/arena
!curl -L curl -L https://huggingface.co/datasets/lmsys/chatbot_arena_conversations/resolve/main/data/train-00000-of-00001-cced8514c7ed782a.parquet \
    > data/sources/arena/train-00000-of-00001-cced8514c7ed782a.parquet

In [None]:
%run -n main.py
# Iterate the parquet rows as named tuples and convert them with parse_arena
# (helper presumably defined in main.py); list() materialises the result.
records = pd.read_parquet('data/sources/arena/train-00000-of-00001-cced8514c7ed782a.parquet').itertuples()
arena_items = list(parse_arena(records))

# orig

In [None]:
# Start the combined list with all alpaca and vicuna items; arena items are appended by the next cell.
orig_items = alpaca_items + vicuna_items

In [None]:
# Key the English arena items by instruction text so duplicate prompts collapse
# to a single entry (the last item seen for a given text wins).
instruction_items = {
    _['instruction']: _
    for _ in arena_items
    if _['lang'] == 'English'
}
# Append 1000 randomly chosen unique prompts.
# NOTE(review): no seed is set in this notebook and extend() mutates orig_items in place,
# so re-running this cell appends another, different 1000 items.
orig_items.extend(random.sample(list(instruction_items.values()), 1000))

In [14]:
# Save the combined source items; later cells reload them from this file.
write_json('data/orig.json', orig_items)

In [15]:
# Reload the saved items and display five random entries as a spot check.
orig_items = read_json('data/orig.json')
random.sample(orig_items, 5)

[{'id': 'c189117c-b2a1-4ac9-9f0e-06370cfe8b6b',
  'source': 'arena',
  'source_id': 'adccaf90b7904fdeb56a1da9150c2492',
  'lang': 'English',
  'instruction': 'Let\'s think about writing a Python script step by step. \n\n1) Analyze and define the use case. What does the script need to accomplish? \n\n2) Analyze the required parameters? What does it need to know? How can it get the parameters? \n\n3) Define functions for use cases.\n\n4) Execute the script if it is running as the main module.\n\nHere is an example script for saving a message to a file:\n\nimport os\n\ndef get_user_input():\n    message = input("Please enter your message: ")\n    file_name = input("Please enter the file name: ")\n    return message, file_name\n\ndef save_message_to_file(message, file_name):\n    with open(file_name, \'w\') as file:\n        file.write(message)\n    print(f"Message saved to {file_name}")\n\ndef main():\n    message, file_name = get_user_input()\n    save_message_to_file(message, file_name)

# tasks

In [None]:
# Load the task list and build an id -> task index. The index values are the same dict
# objects as in task_items, so edits made through the index show up in the list.
# NOTE(review): the index is built before the next cell extends task_items, so tasks
# appended there are only indexed after tasks.json is saved and this cell is re-run.
task_items = read_json('data/tasks.json')
id_task_items = {_['id']: _ for _ in task_items}

In [None]:
# Add one placeholder task per arena item. Instruction and category start empty
# and are filled in by the translate and classify sections.
task_items.extend(
    {
        'id': orig_item['id'],
        'source': orig_item['source'],
        'instruction': None,
        'category': None
    }
    for orig_item in orig_items
    if orig_item['source'] == 'arena'
)

In [None]:
# Save the current task list back to the file it was loaded from.
write_json('data/tasks.json', task_items)

# translate

In [None]:
# Translation work items: one per arena prompt, with the answer still to be produced.
translate_items = [
    {
        'id': orig_item['id'],
        'instruction': orig_item['instruction'],
        'answer': None
    }
    for orig_item in orig_items
    if orig_item['source'] == 'arena'
]
# id -> work item lookup (values are the same dict objects as in the list).
id_translate_items = {
    translate_item['id']: translate_item
    for translate_item in translate_items
}

## auto

In [None]:
%run -n main.py
# Queue only the items that still have no answer
# (presumably the workers fill `answer` in place, so a re-run resumes unfinished work).
items = [_ for _ in translate_items if not _['answer']]
# One shared iterator is handed to all workers; tqdm wraps it to show progress.
queue = iter(tqdm(items))
# Ten worker coroutines run concurrently.
workers = [openai_translate_worker(queue) for _ in range(10)]
# The trailing `;` suppresses the cell output.
await asyncio.gather(*workers);

## review

In [None]:
# Build Label Studio tasks for translations whose task has no instruction yet.
label_items = [
    translate_label_item(translate_item)
    for translate_item in translate_items
    if not id_task_items[translate_item['id']]['instruction']
]
# Show one random task as a spot check.
random.choice(label_items)

In [None]:
# Replace the contents of the translate project: remove all existing tasks, then upload
# the freshly built ones. The trailing `;` suppresses the cell output.
translate_project.delete_all_tasks();
translate_project.import_tasks(label_items);

In [None]:
# Copy each exported task's answer into the matching task's instruction field.
# id_task_items holds the same dicts as task_items, so task_items is updated too.
for label_item in translate_project.export_tasks():
    item = label_translate_item(label_item)
    id_task_items[item['id']]['instruction'] = item['answer']

# classify

In [None]:
# One classification record per task: a copy of the task's id, instruction and current
# category, plus `max_sim`, which stays None until the similarity pass sets it.
# The fields are read from the comprehension variable `_`. The earlier version read them
# from `item`, a stale kernel global left over from a previous loop, so every record
# described the same task (or the cell failed with NameError on a fresh kernel).
classify_items = [
    {
        'id': _['id'],
        'instruction': _['instruction'],
        'category': _['category'],
        'max_sim': None
    }
    for _ in task_items
]

## auto

In [None]:
# Load the cached id -> embedding mapping.
# NOTE(review): presumably this unpickles the file; only load a cache written by this notebook.
id_embeddings = read_pickle('data/embeddings.pkl')

In [None]:
%run -n main.py
items = [
    _ for _ in classify_items
    if _['id'] not in id_embeddings
]
for index in tqdm(range(0, len(items), 64)):
    batch = items[index:index + 64]
    texts = [_['instruction'] for _ in batch]
    embeddings = openai_embed_batch(texts)
    for item, embedding in zip(batch, embeddings):
        id_embeddings[item['id']] = np.array(embedding)

In [None]:
# Save the embedding cache so later runs skip items that are already embedded.
write_pickle('data/embeddings.pkl', id_embeddings)

In [None]:
# Reference set: items that already carry a usable category.
target_items = [
    _ for _ in classify_items
    if _['category'] and _['category'] != 'bad instruction'
]
# Query set: items without a category.
items = [_ for _ in classify_items if not _['category']]

# Give each query item the category of its most similar reference item and record that
# similarity. An item keeps its empty category when no similarity exceeds 0.
for item in tqdm(items):
    best_sim, best_category = 0, None
    for target_item in target_items:
        sim = cosine_sim(
            id_embeddings[item['id']],
            id_embeddings[target_item['id']]
        )
        if sim > best_sim:
            best_sim, best_category = sim, target_item['category']
    if best_category is not None:
        item['category'] = best_category
    item['max_sim'] = best_sim

## review

In [None]:
# Review queue: items with a truthy max_sim whose category is 'enumerate'.
items = [
    _ for _ in classify_items
    if _['max_sim'] and _['category'] == 'enumerate'
]
# Ascending by similarity, so the least similar matches come first.
items.sort(key=lambda _: _['max_sim'])
label_items = [classify_label_item(_) for _ in items]

print('|label_items| =', len(label_items))
random.choice(label_items)

In [None]:
# Replace the contents of the classify project: remove all existing tasks, then upload
# the review queue. The trailing `;` suppresses the cell output.
classify_project.delete_all_tasks();
classify_project.import_tasks(label_items);

In [None]:
%run -n main.py
for label_item in classify_project.export_tasks():
    item = label_classify_item(label_item)
    id_task_items[item['id']] = item['category']

# infer

In [None]:
%run -n main.py
# Load the existing inference records; the cells below show each record carrying
# id, model, instruction and answer.
infer_items = read_json('data/infer.json')

In [None]:
# Queue every task for the 'yagpt_chat' model with an empty answer.
# NOTE(review): extend() appends unconditionally, so running this cell twice duplicates the records.
infer_items.extend(
    {
        'id': task_item['id'],
        'model': 'yagpt_chat',
        'instruction': task_item['instruction'],
        'answer': None
    }
    for task_item in task_items
)

In [None]:
# Save the inference records back to the file they were loaded from.
write_json('data/infer.json', infer_items)

In [None]:
# Answered 'yagpt_chat' records (answer may still be an empty string).
items = [
    _ for _ in infer_items
    if _['model'] == 'yagpt_chat' and _['answer'] is not None
]
print('|items| =', len(items))

# random.shuffle(items)
# Print the last ten records, skipping those whose answer is empty.
for item in items[-10:]:
    if not item['answer']:
        continue
    print(item['instruction'])
    print('----')
    print(item['answer'])
    print('---\n\n')

## turbo

In [None]:
%run -n main.py
# Unanswered records stored under the 'turbo_2' model label.
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'turbo_2']
# One shared iterator is handed to all workers; tqdm wraps it to show progress.
queue = iter(tqdm(items))
# Twenty concurrent workers call gpt-3.5-turbo-0613.
workers = [openai_infer_worker(queue, model='gpt-3.5-turbo-0613') for _ in range(20)]
# The trailing `;` suppresses the cell output.
await asyncio.gather(*workers);

## gpt4

In [None]:
%run -n main.py
# Unanswered records stored under the 'gpt4_2' model label.
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'gpt4_2']
# One shared iterator is handed to all workers; tqdm wraps it to show progress.
queue = iter(tqdm(items))
# Twenty concurrent workers call gpt-4-0613 with request_timeout=1200
# (presumably seconds — confirm in openai_infer_worker).
workers = [openai_infer_worker(queue, model='gpt-4-0613', request_timeout=1200) for _ in range(20)]
# The trailing `;` suppresses the cell output.
await asyncio.gather(*workers);

## gigachat

In [None]:
%run -n main.py
# Build the GigaChat client from the request headers stored in the local .gigachat file.
headers = dict(read_headers('.gigachat'))
gigachat_client = gigachat_client_init(headers)

# After ~5 min / 260 answers blocked for ~1 hour
# {'result': 'rejected', 'reason': 'UserBlocked', 'user_blocked_until': '2023-08-25T11:00:24+00:00'}

# "Bans are issued semi-automatically; after 3 temporary bans they may decide to ban permanently again.
# So after a temporary ban it is better to wait for a while."

# "It is because of requests like 'People die when they are killed, where does this saying come from?'.
# The censor catches such requests a number of times and then issues a temporary ban."

In [None]:
%run -n main.py
# Unanswered records for the 'gigachat' model.
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'gigachat']
# Only the first 100 items are queued per run, and only two workers are used
# (presumably to stay clear of the blocking described in the previous cell).
queue = iter(tqdm(items[:100]))
workers = [gigachat_infer_worker(gigachat_client, queue) for _ in range(2)]
# The trailing `;` suppresses the cell output.
await asyncio.gather(*workers);

## yagpt

In [None]:
# Ask the Yandex Cloud CLI for an IAM token; `!cmd` output is captured as a list of lines
# and the first line is used as the token.
lines = !~/yandex-cloud/bin/yc iam create-token
YAGPT_TOKEN = lines[0]

# Fetch the folder named `default` as JSON and keep its id.
lines = !~/yandex-cloud/bin/yc resource-manager folder get --name default --format json
data = json.loads(''.join(lines))
YAGPT_FOLDER_ID = data['id']

# token expires every ~12 hours

In [None]:
%run -n main.py
# Build the YandexGPT client from the token and folder id obtained in the previous cell.
yagpt_client = yagpt_client_init(YAGPT_TOKEN, YAGPT_FOLDER_ID)

In [None]:
%run -n main.py
# Unanswered records for the 'yagpt_instruct' model.
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'yagpt_instruct']
# One shared iterator is handed to all workers; tqdm wraps it to show progress.
queue = iter(tqdm(items))
# A single limiter is shared by all five workers
# (min_delay=1.2 is presumably the minimum gap between requests in seconds — confirm in Limiter).
limiter = Limiter(min_delay=1.2)
workers = [yagpt_infer_worker(yagpt_client, limiter, queue, mode='instruct') for _ in range(5)]
# The trailing `;` suppresses the cell output.
await asyncio.gather(*workers);

In [None]:
%run -n main.py
# Unanswered records for the 'yagpt_chat' model.
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'yagpt_chat']
# One shared iterator is handed to all workers; tqdm wraps it to show progress.
queue = iter(tqdm(items))
# A single limiter is shared by all five workers
# (min_delay=1.2 is presumably the minimum gap between requests in seconds — confirm in Limiter).
limiter = Limiter(min_delay=1.2)
workers = [yagpt_infer_worker(yagpt_client, limiter, queue, mode='chat') for _ in range(5)]
# The trailing `;` suppresses the cell output.
await asyncio.gather(*workers);

# sbs

In [31]:
# Reload all three saved datasets so this section does not depend on the in-memory
# state left by the sections above.
orig_items = read_json('data/orig.json')
task_items = read_json('data/tasks.json')
infer_items = read_json('data/infer.json')

In [13]:
%run -n main.py
# Load pre-computed answers from the sibling rulm-yagpt checkout: one file for the
# user-oriented set and one for the vicuna set.
alisa_user_items = list(read_jsonl('../rulm-yagpt/data/user_v2_yagpt_answers.jsonl'))
alisa_vicuna_items = list(read_jsonl('../rulm-yagpt/data/vicuna_yagpt_answers.jsonl'))

In [14]:
# Display the first record to inspect its fields.
alisa_user_items[0]

{'id': 'user_oriented_task_2',
 'instruction': 'Перепиши текст, исправь грамматические, орфографические и пунктуационные ошибки.',
 'input': 'Если бы сказали мне год назад сегодня я бежать марафон, я бы смеяться. ваша поддержка иметь мне огромное влияние!',
 'answer': 'Если бы мне сказали год назад, что я буду бежать марафон сегодня, я бы рассмеялся. Но ваша поддержка оказала на меня огромное влияние!'}

In [43]:
# Display the first source item to compare its fields with the record above.
orig_items[0]

{'id': 'c96a3d2b-b983-40b9-8532-08adef384da6',
 'source': 'alpaca',
 'source_id': 'user_oriented_task_0',
 'instruction': 'The sentence you are given might be too wordy, complicated, or unclear. Rewrite the sentence and make your writing clearer by keeping it concise. Whenever possible, break complex sentences into multiple sentences and eliminate unnecessary words.\n\n"If you have any questions about my rate or if you find it necessary to increase or decrease the scope for this project, please let me know."'}

In [17]:
# Map each alpaca item's original source id to its internal id.
source_id_ids = {
    orig_item['source_id']: orig_item['id']
    for orig_item in orig_items
    if orig_item['source'] == 'alpaca'
}
len(source_id_ids)

252

In [None]:
# Keep the original id under 'source_id' and replace 'id' with the internal id
# (None when the source id is not in the lookup).
for item in alisa_user_items:
    source_id = item['id']
    item.update({
        'source_id': source_id,
        'id': source_id_ids.get(source_id)
    })


In [20]:
# Rebuild the lookup for vicuna items: original source id -> internal id.
# This replaces the alpaca lookup held under the same name.
source_id_ids = {
    orig_item['source_id']: orig_item['id']
    for orig_item in orig_items
    if orig_item['source'] == 'vicuna'
}
len(source_id_ids)

80

In [23]:
# Keep the original id under 'source_id' and replace 'id' with the internal id
# (None when the source id is not in the lookup).
for item in alisa_vicuna_items:
    source_id = item['id']
    item.update({
        'source_id': source_id,
        'id': source_id_ids.get(source_id)
    })

In [27]:
alisa_items = alisa_user_items + alisa_vicuna_items
# Every record must have been matched to an internal id by the remapping cells.
assert all(_['id'] for _ in alisa_items)

In [35]:
# Task id -> task instruction, used to pair the loaded answers with task text.
id_instructions = {
    task_item['id']: task_item['instruction']
    for task_item in task_items
}
len(id_instructions)

835

In [40]:
# Turn the loaded answers into inference records for the 'yagpt_alisa' model, using the
# instruction text stored for the task. Ids with no task are reported on stderr and skipped.
# The loop variable is `task_id` rather than `id`, so the builtin `id` is not shadowed
# for the rest of the kernel session.
items = []
for alisa_item in alisa_items:
    task_id = alisa_item['id']
    if task_id not in id_instructions:
        print(task_id, file=sys.stderr)
        continue

    items.append({
        'id': task_id,
        'model': 'yagpt_alisa',
        'instruction': id_instructions[task_id],
        'answer': alisa_item['answer']
    })

a5019e07-7568-4327-b182-1b956be00b34


In [43]:
# Append the records built in the previous cell.
# NOTE(review): running this cell twice appends them twice.
infer_items.extend(items)

In [44]:
# Save the inference records, now including the 'yagpt_alisa' answers.
write_json('data/infer.json', infer_items)

In [58]:
# Group the answers of every model whose name contains 'yagpt' by task id:
# id -> {model name -> answer}.
id_model_answers = defaultdict(dict)
for item in infer_items:
    if 'yagpt' in item['model']:
        id_model_answers[item['id']][item['model']] = item['answer']

# Visit the tasks in random order.
ids = list(id_instructions)
random.shuffle(ids)

# Models compared side by side, in print order (built once, outside the loop).
models = ['yagpt_alisa', 'yagpt_instruct', 'yagpt_chat']

# Print up to 20 tasks for which all three models have a record.
# `task_id` is used instead of `id` so the builtin is not shadowed, and .get() avoids
# inserting empty entries into the defaultdict for tasks that have no answers.
count = 0
for task_id in ids:
    model_answers = id_model_answers.get(task_id, {})
    if not all(_ in model_answers for _ in models):
        continue

    count += 1
    if count > 20:
        break

    print(count, task_id)
    print(id_instructions[task_id])
    print('---\n')
    for model in models:
        print(model)
        print(model_answers[model])
        print('---\n')
    print('\n\n\n')


1 f01cb0ac-0487-4945-9b07-08bba1ffbd47
Составьте анкету, которая поможет гостям отеля написать отзывы об отеле.
---

yagpt_alisa
1. Как вас зовут?
2. Когда вы приехали в наш отель?
3. Насколько вы удовлетворены своим пребыванием в нашем отеле?
4. Как вы оцениваете качество обслуживания?
5. Что вы думаете о номерах?
6. Есть ли что-то, что вам не понравилось в нашем отеле? Если да, то что именно?
7. Какие рекомендации вы можете дать нашим гостям?
8. Как бы вы оценили работу нашего персонала?
9. Как бы вы порекомендовали наш отель своим друзьям и знакомым?
10. Есть ли у вас какие-либо пожелания или предложения по улучшению сервиса в нашем отеле?
---

yagpt_instruct
1. Как называется отель?
2. Сколько звезд у отеля?
3. Какой тип номера вы выбрали?
4. Какие удобства были доступны в номере?
5. Была ли уборка номера регулярной и качественной?
6. Был ли персонал дружелюбным и услужливым?
7. Были ли проблемы с питанием или напитками?
8. Как далеко находится отель от центра города?
9. Есть ли по

In [49]:
# Inspect which models are present in the `model_answers` dict left over from the loop above.
model_answers.keys()


dict_keys(['yagpt_instruct', 'yagpt_chat', 'yagpt_alisa'])

# show samples

In [None]:
%run -n main.py
source_items = defaultdict(list)
for item in task_items:
    source_items[item['source']].append(item)

with open('data/tasks.md', 'w') as file:
    with redirect_stdout(file):
        for source in ['alpaca', 'vicuna', 'arena']:
            print(f'<h1>{source}</h1>')

            items = [_ for _ in source_items[source] if _['category']]
            for item in random.sample(items, 30):
                category = item['category']
                print(f'<code>#{category}</code>')
                print('<br/>')
                instruction = html.escape(item['instruction'])
                print('<br/>\n'.join(instruction.splitlines()))
                print('<br/><br/>')