In [None]:
"""
Prepare CSV dumps (train.csv / test.csv) of the generated conversations, formatted for Phi-3
"""
None

In [None]:
import sys
import pandas as pd 
import numpy as np

sys.path.append('./..')
from py_helpers.sqlite import SQLiteConn
import json 
from IPython.core.display import HTML, Markdown, display

# Connection wrapper (project helper from py_helpers.sqlite) opened on the
# generated-conversations database file. The path is presumably resolved
# relative to the notebook's working directory -- confirm in SQLiteConn.
sqlite = SQLiteConn('gpt_generated_v4.db')

In [None]:
def clean_results(row):
    """
    Flatten one conversation record into a single-level dict.

    The base columns are copied through unchanged, the JSON-encoded
    `trigger_features` / `response_features` strings are decoded and each of
    their entries is added as a `trigger_<name>` / `response_<name>` key, and
    `added_at` is appended last.

    Params:
        row: mapping with keys id, topic, chatml_text, is_surprise,
             trigger_features, response_features and added_at.

    Returns:
        A new dict; key order is base columns, trigger_* keys, response_* keys,
        then added_at.
    """
    passthrough = ('id', 'topic', 'chatml_text', 'is_surprise', 'trigger_features', 'response_features')
    flat = {col: row[col] for col in passthrough}

    # Decode each feature blob and namespace its keys with the matching prefix.
    for prefix in ('trigger_', 'response_'):
        decoded = json.loads(row[prefix + 'features'])
        for feature_name, feature_value in decoded.items():
            flat[prefix + feature_name] = feature_value

    flat['added_at'] = row['added_at']
    return flat

# Fetch every conversation joined to its topic. The result is used as a
# DataFrame below (it must support `.to_dict('records')`).
raw_df = sqlite.get_query(
    """
    SELECT 
        c.id, t.topic, c.conversation_text AS chatml_text, trigger_features, response_features, is_surprise, c.added_at
    FROM conversations c
    INNER JOIN topics t
        ON c.topic_id = t.id
    """
)

# Rebuild the frame from flattened records so each JSON feature becomes its
# own trigger_* / response_* column.
raw_df = pd.DataFrame(list(map(clean_results, raw_df.to_dict('records'))))

raw_df

In [None]:
from html import escape

# Spot-check: render up to 10 random conversations whose trigger carries the
# "dog" feature while the response does not.
# The sample size is capped at the number of matching rows, because
# DataFrame.sample(10) raises ValueError when fewer than 10 rows are available.
test_samples =\
    raw_df\
    .pipe(lambda df: df[(df['trigger_dog'] == 1) & (df['response_dog'] == 0)])\
    .pipe(lambda df: df.sample(min(10, len(df))))\
    .to_dict('records')

for p in test_samples:
    # Names of the features flagged with 1 in each JSON blob.
    trigger_on = ', '.join([k for k, v in json.loads(p['trigger_features']).items() if v == 1])
    response_on = ', '.join([k for k, v in json.loads(p['response_features']).items() if v == 1])

    # All database-sourced text is HTML-escaped before being embedded, so that
    # '<', '>' and '&' in topics or conversation text are shown literally
    # instead of being interpreted as markup by the browser.
    display(HTML(
        '<div style="padding: 1rem 2rem; background-color:honeydew">' + 
            '<h4>' + escape(p['topic']) + '</h4>' + 
            '<p style="color:black">Trigger Features: ' + escape(trigger_on) + '</p> ' + 
            '<p style="color:black">Response Features: ' + escape(response_on) + '</p> ' + 
            '<span style="color:green">' + escape(p['chatml_text']) + '</span> ' + 
        '</div>'
    ))



In [None]:
from transformers import AutoTokenizer

# Load the Phi-3 mini instruct tokenizer. add_eos_token / add_bos_token are
# passed as False -- presumably so the tokenizer does not add its own BOS/EOS
# on top of the '<s>' that parse_phi writes into the text (TODO confirm the
# tokenizer honours these kwargs).
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-4k-instruct', add_eos_token = False, add_bos_token = False)

def parse_phi(messages: list[dict], append_response_start = True) -> str:
    """
    Converts a multi-turn conversation into a Phi-3 chat-formatted prompt string.

    Params:
        messages: list of dicts, each with a 'role' and a 'content' key, in
            conversation order.
        append_response_start: when True, a trailing "\\n<|assistant|>" header
            is appended after the last turn.

    Returns:
        The prompt string: '<s>' followed by the turns joined with newlines.
        An empty `messages` list yields just '<s>' (plus the assistant header
        when requested).

    Raises:
        KeyError: if a message lacks 'role' or 'content'.

    Output format (with append_response_start=True):
        <s><|system|>
        You are a helpful AI assistant.<|end|>
        <|user|>
        Guess my dog's name!<|end|>
        <|assistant|>
    """
    # One "<|role|>\ncontent<|end|>" segment per message.
    turns = [f"<|{m['role']}|>\n{m['content']}<|end|>" for m in messages]

    # `prompt` rather than `format`, to avoid shadowing the builtin.
    prompt = '<s>' + '\n'.join(turns)

    if append_response_start:
        prompt += "\n<|assistant|>"

    return prompt

def json_to_phi(x):
    """
    Decode a JSON-encoded message list and render it with parse_phi, without
    the trailing assistant header.

    Best-effort: any exception raised while decoding or formatting is printed
    and the sentinel string 'ERROR' is returned instead of propagating.
    """
    try:
        return parse_phi(json.loads(x), False)
    except Exception as err:
        print(err)
        return 'ERROR'

In [None]:
# Render every conversation as Phi-3 text and drop the rows that json_to_phi
# could not convert (it returns the sentinel 'ERROR' for those).
res0 =\
    raw_df\
    .assign(phi3_text = lambda df: df['chatml_text'].apply(json_to_phi))\
    .pipe(lambda df: df[df['phi3_text'] != 'ERROR'])

# Tokenize all texts in one call and record the token count of each.
tokens = tokenizer(res0['phi3_text'].tolist())
token_lengths = [len(t) for t in tokens['input_ids']]

# Shuffle the rows with an explicit seed. The original shuffle ran before any
# seed was set (np.random.seed is only called in the following cell), so the
# row order -- and therefore the train/test split -- differed on every run.
res =\
    res0\
    .assign(phi3_n_tokens = token_lengths)\
    .sample(frac = 1, random_state = 1337).reset_index(drop = True)

res

In [None]:
# Seed NumPy's global RNG. Note that `res` was already shuffled in the
# previous cell, so this call has no influence on the row order split below.
np.random.seed(1337)

# The first 99% of the (already shuffled) rows go to train, the rest to test.
train_ratio = 0.99
train_size = int(len(res) * train_ratio)

train_df, test_df = res.iloc[:train_size], res.iloc[train_size:]

# Both files are written with the DataFrame index as the first column.
train_df.to_csv('train.csv', encoding='utf-8')
test_df.to_csv('test.csv', encoding='utf-8')