In [1]:
from google.cloud import bigquery

# Project passed to the BigQuery client below.
project_id = "prod-ai-project"
client = bigquery.Client(project=project_id)

# Selects the rows whose data_split is 'romance_valid'. The statement is built
# from adjacent literals so that its exact whitespace (including the trailing
# space after "prompt") is visible rather than hidden in a triple-quoted block.
sql = (
    "select series_id, episode_id, org_input_text, org_output_text, prompt \n"
    "        from webtoon_translation.structured_240820_ep_line\n"
    "        where data_split = 'romance_valid'"
)

In [23]:
# Run the query, wait for it to finish, and materialise the rows as a pandas
# DataFrame with the columns named in the select list of `sql`.
df = (
    client.query(sql)
    .result()
    .to_dataframe()
)

In [24]:
# Default system prompt. It is spelled out with explicit "\n" escapes so that
# the trailing spaces before each line break and the four-space indent in front
# of each bullet (all part of the original default value) cannot be silently
# stripped by an editor.
_DEFAULT_SYSTEM_PROMPT = (
    "You're an expert translator who translates Korean webtoon in English. "
    "Make sure the number of target sentences matches the number of source sentences. "
    "The result should be TSV formatted. \n"
    "    • Find a balance between staying true to the Korean meaning and keeping a natural flow. "
    "Don't be afraid to add to the text. Embellish it. \n"
    "    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. \n"
    "    • Translate with an American audience in mind. This means easy-to-read, conversational English."
)


def instruct_structure(prompt, system_prompt=_DEFAULT_SYSTEM_PROMPT):
    """Wrap a raw prompt in system/user/assistant chat-template markers.

    Everything from the first ``### target`` marker onwards is discarded, the
    ``### glossaries`` header is renamed to ``### glossary`` and ``* `` bullets
    at the start of a line become ``• `` bullets. The remaining text is
    stripped and placed in the user turn; the returned string ends with an open
    assistant header.

    Args:
        prompt: Raw prompt text. A ``### target`` section is optional; when it
            is absent the whole prompt is used as the user turn, and when the
            marker occurs several times the text is cut at the first one.
        system_prompt: Text placed in the system turn.

    Returns:
        The formatted prompt string.
    """
    # partition() never raises on a missing or repeated marker, unlike
    # unpacking the result of split() into exactly two names.
    input_text, _, _ = prompt.partition('### target')
    input_text = input_text.replace('### glossaries', '### glossary').replace('\n* ', '\n• ')
    # NOTE(review): the four spaces after each header newline reproduce the
    # original template byte-for-byte. They look like leaked source indentation,
    # but a model tuned on this exact format may depend on them — confirm
    # against the training-side formatter before removing.
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        f"    {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
        f"    {input_text.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )

In [25]:
from tqdm import tqdm

# Registers the `progress_apply` method on pandas objects.
tqdm.pandas()

# Keep the unformatted prompt in its own column and always derive `prompt`
# from it. Overwriting `prompt` with a function of itself made this cell
# non-idempotent: a second run fed already-formatted prompts back into
# instruct_structure.
if 'prompt_raw' not in df.columns:
    df['prompt_raw'] = df['prompt']
df['prompt'] = df['prompt_raw'].progress_apply(instruct_structure)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 45610.36it/s]


In [26]:
# Show the first formatted prompt as plain text to eyeball the template.
# NOTE(review): `[0]` on a Series is a label lookup when the index is integer-
# based; this assumes the frame still has its default 0-based index — confirm.
print(df['prompt'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. 
    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. 
    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. 
    • Translate with an American audience in mind. This means easy-to-read, conversational English.<|eot_id|><|start_header_id|>user<|end_header_id|>
    ### glossary
• 아벨 헤일론 (M): abel heylon
• 아벨 (M): abel
• 피오나 (F): fiona
• 마법: magic / spell

### source
000	[해설] 북부 최전방 헤일론
001	[피오나:F:독백] 여기가 헤일론 공작 성….
002	[피오나:F:독백] 엄청난 위압감이야…
003	[피오나:F:독백] 나는 결국 가족에게 떠밀려 무자비한 공작이 다스린다는 북부 최전방에 왔다.
004	[피오나:F:독백] 딱 봐도 마왕성 같은 게 사람 엄청 굴릴 것 같다.
005	[피오나:F:독백] 내 무덤을 내가 팠지...
006	[리안더 경:M:대화] 이쪽으로.
007	[리안더 경:M:대화] 

In [28]:
from datasets import load_dataset, Dataset

# Build the inference dataset from the `prompt` column only, tagged with the
# "test" split name.
infer_dataset = Dataset.from_pandas(
    df[['prompt']],
    split="test",
)

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Print the dataset summary (feature names and row count).
print(infer_dataset)

Dataset({
    features: ['prompt'],
    num_rows: 74
})


In [30]:
# Print the first record; it is a dict keyed by feature name ('prompt').
print(infer_dataset[0])

{'prompt': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n    You're an expert translator who translates Korean webtoon in English. Make sure the number of target sentences matches the number of source sentences. The result should be TSV formatted. \n    • Find a balance between staying true to the Korean meaning and keeping a natural flow. Don't be afraid to add to the text. Embellish it. \n    • Avoid translating word-for-word. Keep the general feeling and translate the text accordingly. \n    • Translate with an American audience in mind. This means easy-to-read, conversational English.<|eot_id|><|start_header_id|>user<|end_header_id|>\n    ### glossary\n• 아벨 헤일론 (M): abel heylon\n• 아벨 (M): abel\n• 피오나 (F): fiona\n• 마법: magic / spell\n\n### source\n000\t[해설] 북부 최전방 헤일론\n001\t[피오나:F:독백] 여기가 헤일론 공작 성….\n002\t[피오나:F:독백] 엄청난 위압감이야…\n003\t[피오나:F:독백] 나는 결국 가족에게 떠밀려 무자비한 공작이 다스린다는 북부 최전방에 왔다.\n004\t[피오나:F:독백] 딱 봐도 마왕성 같은 게 사람 엄청 굴릴 것 같다.\n005\t[피오나:F:독백] 내 무덤을 내가 팠지...\n006\