In [63]:
import re
import json
import hgtk

from pprint import pprint
from googleapiclient import discovery

In [4]:
# Load the notebook settings. The cells below read config['key'] (API key)
# and config['sheetId'] (spreadsheet id) from this dict.
# The file handle must be bound with `as f`; without it `json.load(f)` fails
# with NameError on a fresh kernel.
with open("config.json", "r") as f:
    config = json.load(f)

In [284]:
# Sheets v4 client authenticated with the API key from config.json.
service = discovery.build(
    serviceName='sheets',
    version='v4',
    developerKey=config['key'],
)

# When True, getBest() accepts 4-cell rows and tags their value with "[G] ".
use_google_trans = True

In [12]:
# Fetch the spreadsheet resource for config['sheetId']; the cell below reads
# each sheet's title and grid row count from response['sheets'].
request = service.spreadsheets().get(spreadsheetId=config['sheetId'])
response = request.execute()


In [24]:
# Map every sheet title to the row count reported in its grid properties.
# getValues() uses this to build the A1 range for a sheet.
sheet_range = {
    props['title']: props['gridProperties']['rowCount']
    for props in (entry['properties'] for entry in response['sheets'])
}

pprint(sheet_range)


{'areas.json': 2,
 'dictionary': 65,
 'events1.json': 23110,
 'events2.json': 23103,
 'events3.json': 23411,
 'exchanges.json': 2002,
 'notice': 78,
 'personas.json': 3,
 'proper_nouns': 1348,
 'qualities.json': 6674,
 'statistics': 13,
 'statistics_all': 1,
 'statistics_save': 510}


In [352]:
def getValues(sheet_name, from_col, to_col, skip=0):
    """Fetch the cell values of one sheet as a list of rows.

    Args:
        sheet_name: Title of the sheet; must be a key of ``sheet_range``.
        from_col: First column letter of the range (e.g. ``'A'``).
        to_col: Last column letter of the range (e.g. ``'F'``).
        skip: Number of leading rows to drop from the result.

    Returns:
        A list of rows (each a list of formatted cell strings) starting at
        row ``skip``. An empty list is returned when the range holds no data.
    """
    # The range runs from row 1 to (rowCount + 1) of the sheet.
    range_ = "'%s'!%s1:%s%s" % (sheet_name, from_col, to_col, sheet_range[sheet_name] + 1)

    request = service.spreadsheets().values().get(
        spreadsheetId=config['sheetId'],
        range=range_,
        valueRenderOption='FORMATTED_VALUE',
        dateTimeRenderOption='FORMATTED_STRING',
    )
    response = request.execute()

    # The API leaves out the 'values' key entirely when the requested range
    # is empty, so fall back to an empty list instead of raising KeyError.
    return response.get('values', [])[skip:]

def getBest(trans, id_col=0, default_col=1):
    """Pick the (key, value) pair to use from one spreadsheet row.

    Args:
        trans: One row as a list of cell strings.
        id_col: Index of the cell holding the key.
        default_col: Index of the cell used when no preferred cell applies.

    Returns:
        ``(key, value)`` where ``key`` is ``trans[id_col]`` and ``value`` is:
        * the last cell, cut at the first ``'//'``, when the row has more
          than four cells;
        * the same text prefixed with ``"[G] "`` when the row has exactly
          four cells and ``use_google_trans`` is enabled;
        * otherwise ``trans[default_col]``, or ``''`` if the row is too short.
    """
    n_cells = len(trans)

    if n_cells > 4 or (use_google_trans and n_cells == 4):
        key = trans[id_col]
        last_cell = trans[-1].split('//')[0]
        value = last_cell if n_cells > 4 else "[G] %s" % last_cell
        return key, value

    key = trans[id_col]
    value = trans[default_col] if n_cells > default_col else ''
    return key, value

In [353]:
# Proper-noun lookup built from columns B..G of the 'proper_nouns' sheet.
# Empty rows are skipped; the key is taken from the second fetched cell.
nouns_dict = dict(
    getBest(noun_row, 1)
    for noun_row in getValues('proper_nouns', 'B', 'G')
    if noun_row
)

# pprint(nouns_dict)

In [354]:
# Recognised particles mapped to
# (form used after a final consonant, form used after a vowel,
#  name of the matching rule constant in hgtk.josa).
_JOSA_RULES = {
    '은': ('은', '는', 'EUN_NEUN'),
    '는': ('은', '는', 'EUN_NEUN'),
    '이': ('이', '가', 'I_GA'),
    '가': ('이', '가', 'I_GA'),
    '을': ('을', '를', 'EUL_REUL'),
    '를': ('을', '를', 'EUL_REUL'),
    '과': ('과', '와', 'GWA_WA'),
    '와': ('과', '와', 'GWA_WA'),
    '이다': ('이다', '다', 'IDA_DA'),
    '다': ('이다', '다', 'IDA_DA'),
    '으로': ('으로', '로', 'EURO_RO'),
    '로': ('으로', '로', 'EURO_RO'),
}


def _attach_josa(word, josa):
    """Return non-empty `word` followed by the variant of `josa` fitting its ending."""
    with_batchim, without_batchim, rule_name = _JOSA_RULES[josa]
    last_char = word[-1]

    # Only the final character decides the particle, so test just that one;
    # this also covers names that mix Latin text or spaces with Hangul.
    if hgtk.checker.is_hangul(last_char):
        return word[:-1] + hgtk.josa.attach(last_char, getattr(hgtk.josa, rule_name))

    # Non-Hangul ending: approximate "has a final consonant" by the last
    # Latin letter, ignoring case.
    last_lower = last_char.lower()
    has_batchim = last_lower in "bcdfgjklmnpqrstvxz"
    # 로/으로 takes the short form after a final 'l' sound (e.g. "Pearl로").
    if rule_name == 'EURO_RO' and last_lower == 'l':
        has_batchim = False

    return word + (with_batchim if has_batchim else without_batchim)


def replace_nouns(text):
    """Replace ``@name@`` markers with their translation and fix the particle.

    The text is split on ``'@'``; every odd-numbered piece is treated as a
    name and replaced by ``nouns_dict[name]`` when present. If the text that
    directly follows the closing ``@`` starts with one of the particles in
    ``_JOSA_RULES``, that particle is replaced by the variant matching the
    last character of the substituted name.

    Args:
        text: Translated text, possibly containing ``@name@`` markers.

    Returns:
        The text with markers removed, names substituted and particles
        adjusted. With an unpaired trailing ``@`` the final piece is still
        treated as a name and appended as-is.
    """
    splited = text.split('@')
    pieces = [splited[0]]

    for idx in range(1, len(splited), 2):
        word = nouns_dict.get(splited[idx], splited[idx])

        # Last piece: nothing follows the name.
        if idx + 1 >= len(splited):
            pieces.append(word)
            continue

        lefts = splited[idx + 1]

        # The particle must be the whole word that starts right after the
        # marker. (re.findall returned a list here before, which never
        # compared equal to any particle string.)
        leading = re.match(r"[\w']+", lefts)
        josa = leading.group(0) if leading else ''

        if word and josa in _JOSA_RULES:
            pieces.append(_attach_josa(word, josa) + lefts[len(josa):])
        else:
            pieces.append(word + lefts)

    return ''.join(pieces)

# replace_nouns('나는 d@Drowning-Pearl@을 @Drowning-Pearl@과 교환하려 @할a@로 @dsad@ 거야.')

In [355]:
def loadTrans(sheet_name):
    """Load the translation tables of one sheet, keyed by their 'Id' value.

    Rows are read from columns A..F, skipping the first three rows. The
    first cell of each row decides how it is handled:

    * ``@DataMappingObjectV1Header...`` starts a new table,
    * ``@DataMappingObjectV1Footer...`` stores the current table under its
      ``'Id'`` entry,
    * ``@Mapping(5)...`` rows are ignored,
    * any other row contributes one ``key -> text`` entry (picked by
      ``getBest``), with backslashes removed and ``@name@`` markers resolved
      by ``replace_nouns``.

    Args:
        sheet_name: Title of the sheet to read.

    Returns:
        Dict mapping each table's ``'Id'`` value to the table dict.

    Raises:
        KeyError: If a footer is reached for a table without an ``'Id'`` row.
    """
    events_trans = {}
    current_table = {}

    for row in getValues(sheet_name, 'A', 'F', 3):
        # Blank spreadsheet rows come back as empty lists; skip them as the
        # proper_nouns loader does, instead of failing on row[0].
        if not row:
            continue

        marker = row[0]

        if marker.startswith('@DataMappingObjectV1Header'):
            current_table = {}

        elif marker.startswith('@DataMappingObjectV1Footer'):
            events_trans[current_table['Id']] = current_table

        elif marker.startswith('@Mapping(5)'):
            pass

        else:
            k, v = getBest(row, 0)
            v = v.replace('\\', '')
            current_table[k] = replace_nouns(v)

    return events_trans


In [356]:
# Merge the three event sheets into one table; a later sheet overrides an
# earlier one when both define the same Id.
events = {}
for progress_msg, event_sheet in (
    ("Loading events1", 'events1.json'),
    ("Loading events2", 'events2.json'),
    ("Loading events3", 'events3.json'),
):
    print(progress_msg)
    events.update(loadTrans(event_sheet))

# Qualities live in a single sheet.
print("Loading qualities")
qualities = loadTrans('qualities.json')

In [357]:
# Sanity check: sizes of both tables, the first loaded event entry, and the
# event stored under the hard-coded id '141880'.
len(events), len(qualities), list(events.items())[0], events['141880']

(9754,
 723,
 ('182276',
  {'Description': '당신은 킹이터즈 캐슬에서 진행할 수 있을 의식에 대해 설명했다. 어쨌건, 그는 그의 고통을 잊을 수 있을 것이다. 그는 그가 잊어버렸다는 사실 그 자체도 잊어버리겠지. 그의 고통받는 일면은 사라질 거고, 남은 것들만이 당신을 따를 것이다. 그는 몸서리치며. "이건 용서 따위가 아냐." 이어서 속삭였다. "하지만 정의 구현이란게 있다면 이런 거겠지."',
   'Id': '182276',
   'Name': '당신의 배가 꼬르륵 거린다',
   'Teaser': '당신은 킹이터즈 캐슬에서 진행할 수 있을 의식에 대해 설명했다...'}),
 {'Description': '[G] 그녀는 Unterzee에 가장 큰 도시입니다. 그리고 당신은 그것을 잊지 마세요.',
  'Id': '141880',
  'Name': '런던!',
  'Teaser': '[G] 그녀는 Unterzee에 가장 큰 도시입니다 ...'})

In [358]:
# Load the original entity files that will receive the translations.
# The encoding is given explicitly: without it open() uses the platform
# default (e.g. cp949), which fails on non-ASCII UTF-8 content.
with open('entities/events.json', encoding='utf-8') as f:
    events_origin = json.load(f)

with open('entities/qualities.json', encoding='utf-8') as f:
    qualities_origin = json.load(f)

len(events_origin), len(qualities_origin)

(870, 956)

In [359]:
def step_into(node, trans_dict):
    """Recursively write translations into a parsed JSON tree, in place.

    Every dict that has an ``'Id'`` key is looked up in ``trans_dict`` by
    ``str(node['Id'])``. On a hit, the ``'Name'``, ``'Teaser'`` and
    ``'Description'`` entries present in the translation overwrite the
    node's values; fields missing from the translation keep the node's
    current value. If the resulting teaser is empty, it is derived from the
    description (text before the first ``'.'``, at most 40 characters).

    NOTE(review): all dicts with an 'Id' are looked up in the same table,
    whatever kind of object they are — confirm that ids of nested objects
    cannot collide with ids of the translated entities.

    Args:
        node: A dict, a list, or any scalar from the parsed JSON.
        trans_dict: Mapping from id string to a translation dict.

    Returns:
        ``(matched, unmatched)``: how many dicts with an ``'Id'`` did / did
        not have a non-empty translation, counted over the whole subtree.
    """
    matched = 0
    unmatched = 0

    # isinstance also accepts dict/list subclasses (e.g. OrderedDict).
    if isinstance(node, dict):
        if 'Id' in node:
            trans = trans_dict.get(str(node['Id']))
            if trans:
                matched += 1

                # A translation table may be partial; copy only what exists.
                for field in ('Name', 'Teaser', 'Description'):
                    if field in trans:
                        node[field] = trans[field]

                if not node.get('Teaser'):
                    description = node.get('Description') or ''
                    node['Teaser'] = description.split('.')[0][:40]
            else:
                unmatched += 1

        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        children = ()

    for child in children:
        m, u = step_into(child, trans_dict)
        matched += m
        unmatched += u

    return matched, unmatched



In [360]:
# Apply the translations in place and report the share of Id-carrying nodes
# that received one.
for report_label, origin_tree, trans_table in (
    ('events matched:', events_origin, events),
    ('qualities matched:', qualities_origin, qualities),
):
    matched, unmatched = step_into(origin_tree, trans_table)
    hit_rate = matched*100/(matched+unmatched)
    print(report_label, matched, 'unmatched:', unmatched, " = ", hit_rate, "%")

events matched: 14676 unmatched: 19789  =  42.58232989989845 %
qualities matched: 916 unmatched: 610  =  60.026212319790304 %


In [361]:
# Write the translated trees out as JSON under entities_kr/.
for out_path, translated_tree in (
    ("entities_kr/events.json", events_origin),
    ("entities_kr/qualities.json", qualities_origin),
):
    with open(out_path, "w") as f:
        json.dump(translated_tree, f)
    
    