In [10]:
from google.cloud import bigquery
from datasets import load_dataset, Dataset

# BigQuery project that the client below is bound to.
project_id = "prod-ai-project"
client = bigquery.Client(project=project_id)

# WHERE clause shared by dataset queries: keep only the 'train' split and only
# rows whose create_date equals the newest create_date in the table.
data_sql = (
    "where data_split in ('train') "
    "and create_date = "
    "(select max(create_date) from webtoon_translation.sft_dataset)"
)

def instruct_structure(prompt, system_prompt="""You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English."""):
    """Render one raw prompt as a single system/user/assistant training string.

    The prompt is divided at its ``### target`` heading: everything before the
    heading becomes the user turn and everything after it becomes the
    assistant turn. Before rendering, the user part is normalized by renaming
    a ``### glossaries`` heading to ``### glossary`` and by turning
    line-leading ``* `` bullets into ``• `` bullets. Both parts are stripped
    of surrounding whitespace.

    Args:
        prompt: Raw prompt text containing exactly one ``### target`` heading.
        system_prompt: Text placed in the system turn. The default is used
            verbatim, including its embedded line breaks and indentation.

    Returns:
        The rendered string, wrapped in ``<|begin_of_text|>``,
        ``<|start_header_id|>...<|end_header_id|>`` and ``<|eot_id|>`` tokens.

    Raises:
        ValueError: If ``prompt`` does not contain exactly one ``### target``
            heading.
    """
    marker = '### target'
    parts = prompt.split(marker)
    # A bare two-name unpack would fail with an opaque "values to unpack"
    # message; report the marker and how often it was seen instead.
    if len(parts) != 2:
        raise ValueError(
            f"prompt must contain exactly one {marker!r} heading, "
            f"found {len(parts) - 1}"
        )
    input_text, output_text = parts

    # Normalize the glossary heading and the bullet style of the user part.
    input_text = input_text.replace('### glossaries', '### glossary')
    input_text = input_text.replace('\n* ', '\n• ')

    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
        f"{input_text.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
        f"{output_text.strip()}<|eot_id|>"
    )
    
# Select the raw prompts, restricted by the shared WHERE clause in data_sql.
train_sql = f"""          
              select prompt
              from webtoon_translation.sft_dataset
              {data_sql}
              """
query_job = client.query(train_sql)
train_df = query_job.result().to_dataframe()

# Quick look at what the query returned and at the container type.
print(train_df)
print(type(train_df))

# Render every raw prompt into its final training string.
train_df['text'] = train_df['prompt'].map(instruct_structure)

# Only the rendered 'text' column is carried into the datasets.Dataset.
text_only_df = train_df[['text']]
train_dataset = Dataset.from_pandas(text_only_df)

print('::: Dataset Example :::')
print(train_dataset[0])



                                                prompt
0    ### glossary\n• 사교계: socialite circles / high ...
1    ### glossary\n• 황태자: crown prince\n• 변태: perve...
2    ### glossary\n• 하녀: maid\n• 사교계: socialite cir...
3    ### glossary\n• 황태자: crown prince\n• 하녀: maid\...
4    ### glossary\n• 평민: commoner\n• 시녀장: head maid...
..                                                 ...
665  ### glossary\n• 마법: magic / spell\n• 대공: archd...
666  ### glossary\n• 대공: archduke / archduchess; gr...
667  ### glossary\n• 마법: magic / spell\n\n### sourc...
668  ### glossary\n• 마법: magic / spell\n• 제국: empir...
669  ### glossary\n• 대공: archduke / archduchess; gr...

[670 rows x 1 columns]
<class 'pandas.core.frame.DataFrame'>
::: Dataset Example :::
{'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatt