In [1]:
"""
Prepare the train/test CSV dump from the generated-conversations SQLite database
"""
None

In [None]:
import sys
import pandas as pd 
import numpy as np

sys.path.append('./..')
from py_helpers.sqlite import SQLiteConn
import json 
from IPython.core.display import HTML, Markdown, display

# Connection helper (py_helpers.sqlite.SQLiteConn) pointed at the v5 database file.
db_path = 'gpt_generated_v5.db'
sqlite = SQLiteConn(db_path)

In [None]:
def clean_results(row):
    """
    Flatten one query record into the flat dict used to build the dataframe.

    Copies the scalar columns through (renaming `conversation_text` to `raw_text`),
    keeps both feature JSON strings as-is, and additionally expands them into one
    key per feature: trigger features get a `trigger_` prefix, response features
    keep their bare name. `added_at` is placed last.
    """
    # (output key, source column) pairs, in output order
    passthrough = (
        ('id', 'id'),
        ('topic', 'topic'),
        ('raw_text', 'conversation_text'),
        ('is_surprise', 'is_surprise'),
        ('is_conversation', 'is_conversation'),
        ('is_end', 'is_end'),
        ('trigger_features', 'trigger_features'),
        ('response_features', 'response_features'),
    )
    cleaned = {out_key: row[src_key] for out_key, src_key in passthrough}

    # Expand the JSON-encoded feature flags into individual keys
    for name, flag in json.loads(row['trigger_features']).items():
        cleaned['trigger_' + name] = flag
    for name, flag in json.loads(row['response_features']).items():
        cleaned[name] = flag

    cleaned['added_at'] = row['added_at']
    return cleaned

# Pull every conversation joined to its topic row.
raw_v5 = sqlite.get_query(
    """
    SELECT 
        c.id, t.topic, c.conversation_text, trigger_features, response_features,
        c.is_surprise, t.is_conversation, c.is_end, c.added_at
    FROM conversations c
    INNER JOIN topics t
        ON c.topic_id = t.id
    """
)

# Flatten each record (expanding the JSON feature columns) and assemble the dataframe.
raw_df = pd.DataFrame(list(map(clean_results, raw_v5.to_dict('records'))))
raw_df

In [None]:
# Validate feature counts
# For every feature, count how many rows fall into each (response flag, trigger flag) combination.
# Both frames are melted to long form (one row per id/feature) and joined on id + feature.
pd.merge(
    raw_df\
        .melt(id_vars = 'id', value_vars = ['dogs', 'cats', 'animals', 'programming', 'food'], var_name = 'feature', value_name = 'response'),
    raw_df\
        .melt(id_vars = 'id', value_vars = ['trigger_dogs', 'trigger_cats', 'trigger_animals', 'trigger_programming', 'trigger_food'], var_name = 'feature', value_name = 'trigger')\
        # Literal (non-regex) replace so the feature names line up with the response frame
        .assign(feature = lambda df: df['feature'].str.replace('trigger_', '', regex = False)),
    on = ['id', 'feature'],
    how = 'inner'
    )\
    .assign(type = lambda df: np.select(
        [
            ((df['response'] == 0) & (df['trigger'] == 0)),
            ((df['response'] == 0) & (df['trigger'] == 1)),
            ((df['response'] == 1) & (df['trigger'] == 0)),
            ((df['response'] == 1) & (df['trigger'] == 1)),
        ],
        ['no', 'surprise_yes', 'surprise_no', 'yes'],
        # np.select defaults to the integer 0, which does not share a dtype with the string
        # labels (newer NumPy raises; older NumPy silently yields '0'). Use an explicit string
        # so rows whose flags are neither 0 nor 1 show up under a readable label.
        default = 'unmatched'
    ))\
    .groupby(['feature', 'type'])\
    .agg(count = ('id', 'count'))\
    .reset_index()\
    .pivot(columns = 'type', index = 'feature', values = 'count')\
    .reset_index()


In [None]:
# Count rows, split by is_conversation, whose trigger and response feature JSON
# strings both contain no '1' character.
(
    raw_df
    .assign(
        all_0s = lambda df: ~(
            df['trigger_features'].str.contains('1')
            | df['response_features'].str.contains('1')
        )
    )
    .groupby(['is_conversation', 'all_0s'])
    .agg(count = ('id', 'count'))
    .reset_index()
)

In [None]:
from html import escape

# Last 10 rows where the dogs trigger flag is 1 but the dogs response flag is 0.
test_samples =\
    raw_df\
    .pipe(lambda df: df[(df['trigger_dogs'] == 1) & (df['dogs'] == 0)])\
    .tail(10)\
    .to_dict('records')

for p in test_samples:
    # Names of the features whose flag is 1 in each JSON blob
    trigger_on = ', '.join([k for k, v in json.loads(p['trigger_features']).items() if v == 1])
    response_on = ', '.join([k for k, v in json.loads(p['response_features']).items() if v == 1])

    # Everything coming from the database is HTML-escaped before being concatenated into
    # markup; otherwise any '<', '>' or '&' in the text is interpreted by the browser
    # instead of being shown verbatim.
    display(HTML(
        '<div style="padding: 1rem 2rem; background-color:honeydew">' + 
            '<h4>' + escape(str(p['id'])) + '. ' + escape(str(p['topic'])) + '</h4>' + 
            '<p style="color:black">Trigger Features: ' + escape(trigger_on) + '</p> ' + 
            '<p style="color:black">Response Features: ' + escape(response_on) + '</p> ' + 
            '<span style="color:green">' + escape(str(p['raw_text'])) + '</span> ' + 
        '</div>'
    ))


In [None]:
import random 
from transformers import AutoTokenizer

# Tokenizer for the Phi-3 mini instruct checkpoint; BOS/EOS auto-insertion is switched off
# via the keyword arguments below.
tokenizer = AutoTokenizer.from_pretrained(
    'microsoft/Phi-3-mini-4k-instruct',
    add_bos_token = False,
    add_eos_token = False,
)

def parse_phi(messages: list[dict], append_response_start = True) -> str:
    """
    Render a multi-turn conversation as a single Phi-style prompt string.

    Each message dict must provide `role` and `content`. The result starts with
    `<s>`, followed by the turns joined with newlines, each turn rendered as
    `<|role|>`, a newline, the content, then `<|end|>`. When
    `append_response_start` is truthy, a newline and `<|assistant|>` are appended.

    Example output:
    # <s><|system|>
    # You are a helpful AI assistant.<|end|>
    # <|user|>
    # Guess my dog's name!<|end|>
    # <|assistant|>
    """
    turns = []
    for message in messages:
        turns.append(f"<|{message['role']}|>\n{message['content']}<|end|>")

    prompt = '<s>' + '\n'.join(turns)
    if append_response_start:
        prompt = prompt + "\n<|assistant|>"

    return prompt
    
def to_instruct_format(x, is_conversation: int, is_end: int|None = None):
    """
    Convert JSON to Phi for conversations
    For non-conversations, adds EOT if end is reached
    """
    try:
        if is_conversation == 1:
            parsed = json.loads(x)
            return parse_phi(parsed, False)
        else:
            if is_end is not None and is_end == 1:
                with_append = x + '<|end|>'
            else:
                with_append = x
            return '<s>' + with_append if random.choice([0, 1]) == 1 else with_append
    except Exception as e:
        print(e)
        return 'ERROR'


In [None]:
# Seed the stdlib RNG: to_instruct_format draws from `random` to decide whether to prepend <s>.
random.seed(1337)

# Render every row to its Phi-formatted text and drop rows that failed to convert
# (to_instruct_format reports failures as the sentinel string 'ERROR').
res0 =\
    raw_df\
    .assign(phi3_text = lambda df: df.apply(lambda row: to_instruct_format(row['raw_text'], row['is_conversation'], row['is_end']), axis = 1))\
    .pipe(lambda df: df[df['phi3_text'] != 'ERROR'])

# Token count per rendered text, in the same row order as res0
tokens = tokenizer(res0['phi3_text'].tolist())
token_lengths = [len(t) for t in tokens['input_ids']]

# Shuffle with an explicit random_state: without it pandas draws from NumPy's global RNG,
# which is not seeded at this point, so the row order (and any later positional split)
# would change on every run.
res =\
    res0\
    .assign(phi3_n_tokens = token_lengths)\
    .sample(frac = 1, random_state = 1337).reset_index(drop = True)

res

In [10]:
# Seeds NumPy's global RNG; the positional slicing below does not draw random numbers.
np.random.seed(1337)

# First 99% of rows (by position) go to train, the remainder to test.
train_ratio = 0.99
train_size = int(len(res) * train_ratio)

train_df, test_df = res.iloc[:train_size], res.iloc[train_size:]

# Write both splits as UTF-8 CSV without the index column.
for split_df, csv_path in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    split_df.to_csv(csv_path, index = False, encoding = 'utf-8')