# 1. Data Extract

## (1) tlk parser

In [112]:
# tlk parser from https://github.com/3zhang/TLK-v1-file-parser-for-Python

# -*- coding: utf-8 -*-

#Author 3Zhang

import struct

# Byte order is pinned to little-endian ('<'). Without a prefix, struct uses the
# host's native order and alignment, so the same file would parse differently on
# a big-endian machine.
def b2i(bstr):
    """Decode 4 bytes as a signed little-endian 32-bit integer."""
    return struct.unpack('<i',bstr)[0]


def b2h(bstr):
    """Decode 2 bytes as a signed little-endian 16-bit integer."""
    return struct.unpack('<h',bstr)[0]


def i2b(i):
    """Encode an integer as 4 bytes (signed, little-endian)."""
    return struct.pack('<i',i)


def h2b(i):
    """Encode an integer as 2 bytes (signed, little-endian)."""
    return struct.pack('<h',i)


#This is the version information of the header. Do not edit.
#It is the first 10 bytes written by writedialog(): ASCII "TLK V1  " followed by two zero bytes.
HEADER_C=b'\x54\x4c\x4b\x20\x56\x31\x20\x20\x00\x00'
#ANSI escape template that wraps {0} in bold red; refreshdialog() uses it for its warning.
COLORRED="\033[01;31m{0}\033[00m"


#a class to store each entry
class entry:
    """One 26-byte record of the entry table, plus the text it points to.

    Attributes:
        No: number of the record in the table (its string reference index).
        entype: 16-bit value read from bytes 0-1 of the record.
        soundinfo: bytes 2-17 of the record, kept verbatim.
        offset: 32-bit value from bytes 18-21; start of the text inside the string data.
        size: 32-bit value from bytes 22 onwards; length of the text in bytes.
        string: the text as bytes, or None until it has been attached.
    """

    def __init__(self,ent,n,string=None):
        """Unpack the raw record `ent`; `n` is its number in the table."""
        self.No=n
        self.entype=b2h(ent[:2])
        self.soundinfo=ent[2:18]
        self.offset=b2i(ent[18:22])
        self.size=b2i(ent[22:])
        self.string=string

    def _fields(self):
        #Single source of truth for the attributes that define an entry.
        return (self.No,self.entype,self.soundinfo,self.offset,self.size,self.string)

    def __repr__(self):
        #repr() must never raise: the string may still be None (constructor default),
        #may already be decoded to str while it is being edited, or may hold bytes
        #that are not valid UTF-8.
        text=self.string
        if isinstance(text,bytes):
            text=text.decode('utf-8',errors='replace')
        return repr((self.No,self.entype,self.soundinfo,self.offset,self.size,text))

    def __eq__(self, other):
        #Let Python fall back to its default comparison for foreign types instead of
        #failing with AttributeError on a missing attribute.
        if not isinstance(other,entry):
            return NotImplemented
        return self._fields()==other._fields()


def readialog(filepath):
    """Read a tlk file and return its entries as a list of `entry` objects.

    The file starts with an 18-byte header whose bytes 14-17 give the position
    at which the string data begins. Everything between the header and that
    position is the entry table, made of 26-byte records. Each returned entry
    has its `string` attribute set to the raw bytes selected by its own
    `offset` and `size` within the string data.
    """
    with open(filepath,'rb') as handle:
        raw=handle.read()
    strings_start=b2i(raw[14:18])
    table=raw[18:strings_start]
    blob=raw[strings_start:]
    #Parse every record first, then attach the texts.
    records=[]
    for number,pos in enumerate(range(0,len(table),26)):
        records.append(entry(table[pos:pos+26],number))
    for record in records:
        record.string=blob[record.offset:record.offset+record.size]
    return records
        
#Edit your dialog here. Note that the string of each entry needs to be decoded before you edit it.
#Also, after you edit the strings, you need to encode them back to bytes.


#sort a list of entries and refresh its size and offset. You must do this after you finish editing the strings.
def refreshdialog(entryl):
    """Sort `entryl` in place by entry number and recompute every size and offset.

    Raises TypeError when any entry's string is not bytes. Prints a red warning
    when the sorted entry numbers are not exactly 0..len(entryl)-1. Offsets are
    running totals of the preceding sizes; an empty string always gets offset 0.
    """
    all_bytes=all([isinstance(item.string,bytes) for item in entryl])
    if not all_bytes:
        raise TypeError('String must be encoded to bytes!')
    entryl.sort(key=lambda item:item.No)
    numbers=[item.No for item in entryl]
    if numbers!=list(range(len(entryl))):
        print(COLORRED.format('Warning: List index is not equal to stringref index!'))
    cursor=0
    for item in entryl:
        length=len(item.string)
        item.size=length
        if length>0:
            item.offset=cursor
        else:
            item.offset=0
        cursor+=length
        

#You must refresh the list of entries before you write them to file.
def writedialog(entryl,filepath):
    """Serialise `entryl` to `filepath`.

    Layout written: HEADER_C, the entry count, the position at which the string
    data begins (18 + size of the entry table), the entry table itself, then all
    strings concatenated in list order. Sizes and offsets are written as stored
    on the entries; this function does not recompute them.
    """
    count_field=i2b(len(entryl))
    table=b''.join(
        h2b(item.entype)+item.soundinfo+i2b(item.offset)+i2b(item.size)
        for item in entryl
    )
    start_field=i2b(18+len(table))
    text_blob=b''.join(item.string for item in entryl)
    payload=HEADER_C+count_field+start_field+table+text_blob
    with open(filepath,'wb') as out:
        out.write(payload)

## (2) BG2 Dialog extract

In [113]:
# Parse the English and Korean BG2 string tables into lists of entry objects.
dial_en = readialog("./train_data/BG2/dialog_en.tlk")
dial_kr = readialog("./train_data/BG2/dialog_kr.tlk")

In [114]:
# Entry count of the Korean table, then its first five entries as a sanity check.
print(len(dial_kr))

dial_kr[:5]

103584


[(0, 5, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 0, 9, '<NO TEXT>'),
 (1, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 9, 69, '아니오, 미안하지만, 그것들 가운데 아는 것은 없소.'),
 (2, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 78, 35, '엘민스터를 연기했습니까?'),
 (3, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 113, 117, '어, 유골로스, 맞습니까? 그래요, 내 기억이 맞다면 당신은 그것으로 쇼를 독차지했었소.'),
 (4, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 230, 201, '그리고, 누가 압니까, 우리는 피코키오의 "에테르 속에서의 사흘간"을 연습하고 있었습니다. 아마도 우리는 당신들에게 한 편 보여줄 수도 있을 겁니다.')]

In [115]:
# Entry count of the English table, then its first five entries as a sanity check.
# The count must come from dial_en, the table previewed in this cell.
print(len(dial_en))

dial_en[:5]

103584


[(0, 5, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 0, 9, '<NO TEXT>'),
 (1, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 9, 43, "No, I'm sorry, none of them sound familiar."),
 (2, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 52, 21, 'You played Elminster?'),
 (3, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 73, 78, 'Uh, the yugoloth, was it? Yeah, you stole the show with that one, if I recall.'),
 (4, 1, b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 151, 117, 'And, who knows, we were rehearsing for Picoccio\'s "Three Days in the Ether." Perhaps we can give you a dress matinee.')]

In [116]:
# The text of an entry is still raw bytes at this point; it has to be decoded before use.
dial_en[5].string

b'Oh, my dark ravens, let us stop our squawking. I shall remove this foul demeanor like a mask because, if you will still have me, I think I would quite enjoy the company of your troupe. '

In [117]:
# Decode the same entry number from both tables to check that the two languages line up.
print(dial_en[40238].string.decode('utf-8'))
print(dial_kr[40238].string.decode('utf-8'))

Outbursts like that won't get you any tips. I'll have to write Cousin Jowen and ask him to send you some pointers.
그런 식으로 분통을 터뜨리면 팁을 전혀 받지 못하지. 내 사촌 조웬에게 편지를 써서 지침을 좀 보내달라고 해야겠군.


## (3) Make DataFrame

In [118]:
import pandas as pd
import numpy as np

In [119]:
# Accumulators for the decoded texts, plus an empty two-column frame to receive them.
english_list, korean_list = [], []
df_dialog = pd.DataFrame(columns=["english", "korean"])


In [120]:
# Rows are paired by entry number, so both tables must be the same length.
assert len(dial_en) == len(dial_kr), "English and Korean tables differ in length"

# Build the lists from scratch instead of appending to existing ones, so that
# re-running this cell does not duplicate rows.
english_list = [ent.string.decode('utf-8') for ent in dial_en]
korean_list = [ent.string.decode('utf-8') for ent in dial_kr]

df_dialog = pd.DataFrame({"english": english_list, "korean": korean_list})

In [121]:
# Preview the paired frame: one row per entry, English next to Korean.
df_dialog

Unnamed: 0,english,korean
0,<NO TEXT>,<NO TEXT>
1,"No, I'm sorry, none of them sound familiar.","아니오, 미안하지만, 그것들 가운데 아는 것은 없소."
2,You played Elminster?,엘민스터를 연기했습니까?
3,"Uh, the yugoloth, was it? Yeah, you stole the ...","어, 유골로스, 맞습니까? 그래요, 내 기억이 맞다면 당신은 그것으로 쇼를 독차지했었소."
4,"And, who knows, we were rehearsing for Picocci...","그리고, 누가 압니까, 우리는 피코키오의 ""에테르 속에서의 사흘간""을 연습하고 있었..."
...,...,...
103579,You remind me of that mouthy gnome.,너를 보니 그 짜증 나는 노움이 떠오르는군.
103580,~ BGII:EE French Translation ~\nMoGi\nhttps://...,~ BGII:EE French Translation ~\nMoGi\nhttps://...
103581,"~ BGII:EE Volunteer Team Leader ~\nDenis ""Isay...","~ BGII:EE Volunteer Team Leader ~\nDenis ""Isay..."
103582,"~ BGII:EE Korean Translation ~\nHosub ""Teflon""\n","~ BGII:EE Korean Translation ~\nHosub ""Teflon"""


## (4) Remove unneeded rows and missing values (필요없는 데이터 및 결측치 삭제)

In [122]:
# Row 103580 is a credits entry rather than dialogue (see the preview above);
# compare the text length in each language.
print(len(df_dialog['english'].iloc[103580]))
print(len(df_dialog['korean'].iloc[103580]))

60
59


In [123]:
# Drop the "<NO TEXT>" placeholder (row 0) and the credit rows 103580-103583.
# The rows are addressed by label and missing labels are ignored, so running this
# cell again is a no-op. Dropping by position would silently remove one more real
# row from the top on every re-run.
rows_to_drop = [0, *range(103580, 103584)]
df_dialog.drop(index=rows_to_drop, errors="ignore", inplace=True)

In [124]:
# Count NaN values per column. A missing translation is stored as an empty string,
# not NaN, so it is not counted here.
df_dialog.isnull().sum()

english    0
korean     0
dtype: int64

In [125]:
# Preview after removing the placeholder and credit rows.
df_dialog

Unnamed: 0,english,korean
1,"No, I'm sorry, none of them sound familiar.","아니오, 미안하지만, 그것들 가운데 아는 것은 없소."
2,You played Elminster?,엘민스터를 연기했습니까?
3,"Uh, the yugoloth, was it? Yeah, you stole the ...","어, 유골로스, 맞습니까? 그래요, 내 기억이 맞다면 당신은 그것으로 쇼를 독차지했었소."
4,"And, who knows, we were rehearsing for Picocci...","그리고, 누가 압니까, 우리는 피코키오의 ""에테르 속에서의 사흘간""을 연습하고 있었..."
5,"Oh, my dark ravens, let us stop our squawking....","오, 이제 불평은 그만둡시다. 만약 당신이 계속 나와 함께 할 생각이라면, 나는 이..."
...,...,...
103575,I accept this honor.,이 명예를 받아들이겠어.
103576,This place does not belong to civilized folk. ...,이 장소는 문명인들의 땅이 아니군. 조심히 걷도록 해.
103577,Cities. They reek of corruption and filth.,도시. 부패와 오물의 악취가 진동하는 장소지.
103578,Careful. This weapon is murder-sharp.,


In [129]:
# Row 103578 has English text, but its Korean string has length 0.
print(df_dialog['english'].loc[103578])
print(len(df_dialog['korean'].loc[103578]))

Careful. This weapon is murder-sharp.
0


In [130]:
# Same row, printing the Korean text itself: it is an empty string.
print(df_dialog['english'].loc[103578])
print(df_dialog['korean'].loc[103578])

Careful. This weapon is murder-sharp.



In [131]:
# Collect the labels of rows where either language has at most 2 characters
# (this includes empty strings).
# The vectorised .str.len() replaces DataFrame.applymap, which is deprecated
# since pandas 2.1.
is_short = df_dialog.apply(lambda column: column.str.len() <= 2)
short_string_indexes = df_dialog.index[is_short.any(axis=1)]

print(short_string_indexes)

Index([    30,     47,     60,     82,    193,    213,    268,    369,    378,
          388,
       ...
       103560, 103562, 103563, 103564, 103566, 103567, 103568, 103572, 103573,
       103578],
      dtype='int64', length=6794)


In [132]:
# Remove the rows collected in short_string_indexes (too short in at least one language).
df_dialog.drop(short_string_indexes, axis=0, inplace=True)

In [133]:
# Preview after removing the short rows; the index still has gaps at the dropped labels.
df_dialog

Unnamed: 0,english,korean
1,"No, I'm sorry, none of them sound familiar.","아니오, 미안하지만, 그것들 가운데 아는 것은 없소."
2,You played Elminster?,엘민스터를 연기했습니까?
3,"Uh, the yugoloth, was it? Yeah, you stole the ...","어, 유골로스, 맞습니까? 그래요, 내 기억이 맞다면 당신은 그것으로 쇼를 독차지했었소."
4,"And, who knows, we were rehearsing for Picocci...","그리고, 누가 압니까, 우리는 피코키오의 ""에테르 속에서의 사흘간""을 연습하고 있었..."
5,"Oh, my dark ravens, let us stop our squawking....","오, 이제 불평은 그만둡시다. 만약 당신이 계속 나와 함께 할 생각이라면, 나는 이..."
...,...,...
103574,Crush them!,뭉개버리자!
103575,I accept this honor.,이 명예를 받아들이겠어.
103576,This place does not belong to civilized folk. ...,이 장소는 문명인들의 땅이 아니군. 조심히 걷도록 해.
103577,Cities. They reek of corruption and filth.,도시. 부패와 오물의 악취가 진동하는 장소지.


In [134]:
# Positional lookup: the English text of the row at position 31, whatever its label is.
print(df_dialog['english'].iloc[31])

Forget it, then. Begone from here and take your bravado with you.


In [135]:
# Label-based lookup: the row labelled 31.
# NOTE(review): the preceding cell uses positional .iloc[31]. Because rows before it
# have been dropped, that is a different row at this point; confirm which one is meant.
print(len(df_dialog["korean"].loc[31]))

29


In [136]:
# Renumber the rows 0..n-1; the old labels (with their gaps) are discarded.
df_dialog.reset_index(drop=True, inplace=True)

In [137]:
# Final preview with the renumbered index.
df_dialog

Unnamed: 0,english,korean
0,"No, I'm sorry, none of them sound familiar.","아니오, 미안하지만, 그것들 가운데 아는 것은 없소."
1,You played Elminster?,엘민스터를 연기했습니까?
2,"Uh, the yugoloth, was it? Yeah, you stole the ...","어, 유골로스, 맞습니까? 그래요, 내 기억이 맞다면 당신은 그것으로 쇼를 독차지했었소."
3,"And, who knows, we were rehearsing for Picocci...","그리고, 누가 압니까, 우리는 피코키오의 ""에테르 속에서의 사흘간""을 연습하고 있었..."
4,"Oh, my dark ravens, let us stop our squawking....","오, 이제 불평은 그만둡시다. 만약 당신이 계속 나와 함께 할 생각이라면, 나는 이..."
...,...,...
96780,Crush them!,뭉개버리자!
96781,I accept this honor.,이 명예를 받아들이겠어.
96782,This place does not belong to civilized folk. ...,이 장소는 문명인들의 땅이 아니군. 조심히 걷도록 해.
96783,Cities. They reek of corruption and filth.,도시. 부패와 오물의 악취가 진동하는 장소지.


In [138]:
# Persist the cleaned sentence pairs.
# NOTE(review): the index is written as an unnamed first column. Pass index=False if
# downstream readers do not expect it; confirm against the consumer.
df_dialog.to_csv("./train_data/BG2/dialog.csv")