In [None]:
"""
Prep CSV Dump
"""
None

In [None]:
import sys
import pandas as pd 
import numpy as np

# Make the parent directory importable so the project-local `py_helpers` package resolves.
# This must run before the `py_helpers` import below.
sys.path.append('./..')
from py_helpers.sqlite import SQLiteConn
import json 

# Handle used by the query cell below to read from the SQLite database file.
# NOTE(review): the database path is relative, so it presumably resolves against the
# notebook's working directory — confirm against SQLiteConn.
sqlite = SQLiteConn('gpt_generated_v2.db')

In [None]:
# Load every conversation joined to its topic, and derive 0/1 indicator columns from the
# categorical `tone`, `detail` and `subject` columns:
#   - pleasant / excited / kind / angry / sad: 1 when `tone` equals that value.
#   - detailed: 1 when `detail` is 'detailed'.
#   - cat / dog: 1 when `subject` is that animal; 'catdog' sets both flags.
# The INNER JOIN drops conversations whose topic_id has no matching row in `topics`.
# NOTE(review): get_query presumably returns a pandas DataFrame (SQLiteConn is a project
# helper not shown here) — confirm.
raw_df = sqlite.get_query(
    """
    SELECT 
        c.id, c.prompt_version, c.topic_id, t.topic, c.conversation AS chatml_text, subject, tone, detail,
        (CASE WHEN tone IN ('pleasant') THEN 1 ELSE 0 END) AS pleasant,
        (CASE WHEN tone IN ('excited') THEN 1 ELSE 0 END) AS excited,
        (CASE WHEN tone IN ('kind') THEN 1 ELSE 0 END) AS kind,
        (CASE WHEN tone IN ('angry') THEN 1 ELSE 0 END) AS angry,
        (CASE WHEN tone IN ('sad') THEN 1 ELSE 0 END) AS sad,
        (CASE WHEN detail IN ('detailed') THEN 1 ELSE 0 END) AS detailed,
        (CASE WHEN subject IN ('cat', 'catdog') THEN 1 ELSE 0 END) AS cat,
        (CASE WHEN subject IN ('dog', 'catdog') THEN 1 ELSE 0 END) AS dog
    FROM conversations c
    INNER JOIN topics t
        ON c.topic_id = t.id
    """
)

# Display the loaded frame as the cell output.
raw_df

In [None]:
# Spot-check: show the raw conversation text of up to 10 random rows whose subject is
# 'normal' and whose tone is 'pleasant'. The sample size is capped at the number of
# matching rows so the cell does not raise (and abort a Run All) when fewer than 10 match.
raw_df\
    .pipe(lambda df: df[(df['subject'] == 'normal') & (df['tone'] == 'pleasant')])\
    .pipe(lambda df: df.sample(min(10, len(df))))\
    ['chatml_text']\
    .tolist()


In [None]:
from transformers import AutoTokenizer

# Load the Phi-3 mini tokenizer; it is used further down to count tokens per conversation.
# add_eos_token / add_bos_token are passed as False, presumably so the tokenizer does not
# insert special tokens on its own (parse_phi already writes the leading '<s>' as text).
# NOTE(review): confirm these kwargs are honored by the tokenizer class that gets loaded.
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-4k-instruct', add_eos_token = False, add_bos_token = False)

def parse_phi(messages: list[dict], append_response_start = True) -> str:
    """
    Converts a multi-turn conversation into a Phi-3-tokenizable input.

    Params:
        messages: Conversation turns in order; each item is a dict with a 'role' key
            (e.g. 'system', 'user', 'assistant') and a 'content' key.
        append_response_start: When True, a trailing assistant header is appended so the
            text ends where the model's reply would begin.

    Returns:
        The conversation as one string: '<s>' followed by the turns joined by newlines.

    Raises:
        KeyError: If a message dict lacks 'role' or 'content'.

    Output format:
    # <s><|system|>
    # You are a helpful AI assistant.<|end|>
    # <|user|>
    # Guess my dog's name!<|end|>
    # <|assistant|>
    """
    # One "<|role|>\ncontent<|end|>" segment per turn.
    turns = [f"<|{m['role']}|>\n{m['content']}<|end|>" for m in messages]

    # The BOS marker is written as literal text; turns are separated by single newlines.
    prompt = '<s>' + '\n'.join(turns)

    if append_response_start:
        prompt += "\n<|assistant|>"

    return prompt

def json_to_phi(x):
    """
    Decode a JSON-encoded message list and render it as Phi-3 chat text.

    The text is rendered without a trailing assistant header. This is best-effort: any
    exception raised while decoding or rendering is printed, and the sentinel string
    'ERROR' is returned instead of propagating the failure.
    """
    try:
        messages = json.loads(x)
        rendered = parse_phi(messages, False)
    except Exception as err:
        print(err)
        rendered = 'ERROR'
    return rendered

In [None]:
# Render every conversation as Phi-3 text and drop the rows that failed to convert
# (json_to_phi returns the sentinel 'ERROR' for those).
res0 =\
    raw_df\
    .assign(phi3_text = lambda df: df['chatml_text'].apply(json_to_phi))\
    .pipe(lambda df: df[df['phi3_text'] != 'ERROR'])

# Tokenize all rendered texts in one call and record the token count of each.
tokens = tokenizer(res0['phi3_text'].tolist())
token_lengths = [len(t) for t in tokens['input_ids']]

# Attach the token counts, keep only the exported columns, then shuffle the rows.
# The shuffle is seeded via random_state so the row order (and therefore the train/test
# split taken from it) is reproducible; a global np.random.seed call placed after this
# cell cannot influence this shuffle on a top-to-bottom run.
res =\
    res0\
    .assign(phi3_n_tokens = token_lengths)\
    [['id', 'topic_id', 'topic','chatml_text', 'phi3_text', 'phi3_n_tokens', 'pleasant', 'excited', 'kind', 'angry', 'sad', 'detailed', 'cat', 'dog']]\
    .sample(frac = 1, random_state = 1337).reset_index(drop = True)

res

In [None]:
# Seed NumPy's legacy global RNG.
# NOTE(review): on a top-to-bottom run this executes after `res` has already been
# shuffled, and nothing below draws random numbers, so it does not affect the split.
np.random.seed(1337)

# Share of rows written to the training file; the remainder becomes the test file.
train_ratio = 0.99
train_size = int(train_ratio * len(res))

# `res` is split in its current row order: leading rows -> train, trailing rows -> test.
train_df = res.iloc[:train_size]
test_df = res.iloc[train_size:]

# Write both splits as UTF-8 CSV files (the DataFrame index is included as the first column).
for split_df, csv_path in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    split_df.to_csv(csv_path, encoding='utf-8')