In [1]:
import os
import requests
import zipfile
import shutil

# Login endpoint
url = 'https://tagtog.net/-login'

# Download endpoint (the URL exports the project dataset "as anndoc")
file_url = 'https://tagtog.net/nannullna/this-is-real/-downloads/dataset-as-anndoc'
zip_file = 'download.zip'

# Remove a stale archive left over from a previous run
if os.path.exists(zip_file):
    os.remove(zip_file)

# Login credentials. They are read from the environment so that they never
# have to be typed into (and saved with) the notebook; the fallback is the
# same empty string the template used before.
login_info = {
    'loginid': os.environ.get('TAGTOG_LOGINID', ''),    # account id
    'password': os.environ.get('TAGTOG_PASSWORD', ''),  # account password
}

# Log in and download the archive within the same session (shared cookies)
with requests.Session() as s:
    login_req = s.post(url, data=login_info)
    # Fail loudly on HTTP errors instead of saving an error page as a "zip"
    login_req.raise_for_status()
    r = s.get(file_url)
    r.raise_for_status()

    with open(zip_file, 'wb') as output:
        output.write(r.content)

# Extract the archive
folder_path = './tagtog_result'

# The context manager closes the archive handle before the file is deleted
# below (an open handle makes os.remove fail on Windows).
with zipfile.ZipFile(zip_file) as zip_:
    # The archive is opened first, so a corrupt download raises before the
    # previous extraction result is wiped.
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    zip_.extractall(folder_path)

os.remove(zip_file)

In [1]:
import json
import glob
import os

# Directory the downloaded archive was extracted into
folder_path = './tagtog_result'

# Sub-folder of the pool to convert
# (alternative value kept from the original notebook: 'test/jeonju_hyanggyo')
target_folder = '관광지'

# Project root inside the extracted archive
root_path = os.path.join(folder_path, 'this-is-real')

# Pool directories holding the annotation JSON files and the plain HTML files
json_root_path, html_root_path = (
    os.path.join(root_path, sub_dir)
    for sub_dir in ('ann.json/master/pool', 'plain.html/pool')
)

# The same two directories, narrowed down to the target folder
json_path, html_path = (
    os.path.join(pool_dir, target_folder)
    for pool_dir in (json_root_path, html_root_path)
)

def get_unique_file_name(file_name):
    """Return ``file_name`` with up to two trailing dot-suffixes removed.

    The result is used as the key that pairs a JSON file with its HTML file,
    e.g. ``'doc.txt.ann.json'`` and ``'doc.txt.plain.html'`` both map to
    ``'doc.txt'``.

    Names with fewer than two dots are handled gracefully: only the suffixes
    that exist are stripped (``'abc.json'`` -> ``'abc'``, ``'abc'`` ->
    ``'abc'``). The previous ``rfind``-based slicing silently chopped the last
    character in those cases because ``rfind`` returns -1 when no dot is found.

    Args:
        file_name: Base name of a file (no directory part).

    Returns:
        The name without its last two dot-separated suffixes.
    """
    return file_name.rsplit('.', 2)[0]

# Raw directory listings of the two pool folders
json_file_list = os.listdir(json_path)
html_file_list = os.listdir(html_path)

# Document keys, derived from the HTML listing (listed once and reused)
file_list = [get_unique_file_name(file) for file in html_file_list]

# key -> paths of the matching JSON / HTML files ('' while not found)
files = {file: {'json': '', 'html': ''} for file in file_list}

# NOTE: keys only come from the HTML listing, so a JSON file without an HTML
# counterpart raises KeyError here.
for json_file in json_file_list:
    files[get_unique_file_name(json_file)]['json'] = os.path.join(json_path, json_file)

for html_file in html_file_list:
    files[get_unique_file_name(html_file)]['html'] = os.path.join(html_path, html_file)

# Annotation legend; it is indexed by an entity's classId further down.
# The path gets its own name so `annotation_legend` only ever holds the parsed
# content, and the encoding is explicit so the load does not depend on the
# platform's default locale.
annotation_legend_path = os.path.join(root_path, 'annotations-legend.json')
with open(annotation_legend_path, 'r', encoding='utf-8') as f:
    annotation_legend = json.load(f)

In [14]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Column-wise accumulator: every list receives one value per document.
data = {
    'title': [],
    'sentence': [],
    'sentence_with_entity': [],
    'subject_entity': [],
    'object_entity': [],
    'subject_entity_word': [],
    'subject_entity_start_idx': [],
    'subject_entity_end_idx': [],
    'subject_entity_type': [],
    'object_entity_word': [],
    'object_entity_start_idx': [],
    'object_entity_end_idx': [],
    'object_entity_type': [],
}

# Marker wrapped around each entity in `sentence_with_entity`.
# Kept outside the per-document entity dicts so it does not leak into the
# exported `subject_entity` / `object_entity` columns.
entity_symbols = {'subj': '$$', 'obj': '@@'}

for key in files:
    # Get title and sentence from the HTML file: it is expected to contain
    # exactly two <pre> elements (the unpacking below raises otherwise).
    with open(files[key]['html'], 'r', encoding='utf-8') as f:
        html_obj = f.read()

    bs_obj = BeautifulSoup(html_obj, 'html.parser')
    title, sentence = [obj.text for obj in bs_obj.select('pre')]

    data['title'].append(title)
    data['sentence'].append(sentence)

    # Get entity information from the JSON file; these defaults are kept for
    # documents that have no annotation.
    entities = {
        'subj': {'word': None, 'start_idx': -1, 'end_idx': -1, 'type': None},
        'obj': {'word': None, 'start_idx': -1, 'end_idx': -1, 'type': None}
    }

    if files[key]['json'] != '':
        with open(files[key]['json'], 'r', encoding='utf-8') as f:
            json_obj = json.load(f)

        for entity in json_obj['entities']:
            # The legend value is split into a role ('subj' / 'obj') and an
            # entity type; only the first underscore separates the two.
            e_info, e_type = annotation_legend[entity['classId']].split('_', 1)
            offset = entity['offsets'][0]
            entities[e_info]['word'] = offset['text']
            entities[e_info]['start_idx'] = offset['start']
            # end_idx is inclusive
            entities[e_info]['end_idx'] = offset['start'] + len(offset['text']) - 1
            entities[e_info]['type'] = e_type

    for role, prefix in (('subj', 'subject_entity'), ('obj', 'object_entity')):
        info = entities[role]
        # Store a copy so the exported value is independent of later changes
        data[prefix].append(dict(info) if info['word'] is not None else None)
        data[f'{prefix}_word'].append(info['word'])
        data[f'{prefix}_start_idx'].append(info['start_idx'])
        data[f'{prefix}_end_idx'].append(info['end_idx'])
        data[f'{prefix}_type'].append(info['type'])

    # Build the sentence with entity markers. Entities without a word (None
    # for unannotated documents, or an empty string) are skipped; splicing
    # at their placeholder index -1 would corrupt the sentence.
    marked = [
        (info, entity_symbols[role])
        for role, info in entities.items()
        if info['word']
    ]
    # Insert from the rightmost entity backwards so that earlier offsets stay
    # valid while markers are being added.
    marked.sort(key=lambda pair: pair[0]['start_idx'], reverse=True)

    sentence_w_entity = sentence
    for info, symbol in marked:
        start, stop = info['start_idx'], info['end_idx'] + 1
        sentence_w_entity = (
            sentence_w_entity[:start]
            + symbol + sentence_w_entity[start:stop] + symbol
            + sentence_w_entity[stop:]
        )
    data['sentence_with_entity'].append(sentence_w_entity)

df = pd.DataFrame(data)
df = df.sort_values('title')

In [15]:
# Preview the first three rows of the title-sorted frame
df.head(3)

Unnamed: 0,title,sentence,sentence_with_entity,subject_entity,object_entity,subject_entity_word,subject_entity_start_idx,subject_entity_end_idx,subject_entity_type,object_entity_word,object_entity_start_idx,object_entity_end_idx,object_entity_type
1158,63빌딩,"9월에 대한생명(현 한화생명)이 한화그룹에 인수, 편입되면서 63빌딩 역시 한화그룹...","9월에 대한생명(현 한화생명)이 한화그룹에 인수, 편입되면서 @@63빌딩@@ 역시 ...","{'word': '한화그룹', 'start_idx': 42, 'end_idx': 4...","{'word': '63빌딩', 'start_idx': 34, 'end_idx': 3...",한화그룹,42,45,LOC,63빌딩,34,37,LOC
1703,63빌딩,방송 당시 63빌딩의 건물 관계자가 개그맨 김경진과 함께 60층 스카이 아트 전망대...,방송 당시 $$63빌딩$$의 건물 관계자가 개그맨 김경진과 함께 @@60층 스카이 ...,"{'word': '63빌딩', 'start_idx': 6, 'end_idx': 9,...","{'word': '60층 스카이 아트 전망대', 'start_idx': 32, 'e...",63빌딩,6,9,LOC,60층 스카이 아트 전망대,32,45,LOC
1346,63빌딩,63빌딩 계단 오르기 대회는 1995년 개관 10주년을 맞이하여 대회가 열렸다.,$$63빌딩$$ 계단 오르기 대회는 @@1995년 개관 10주년@@을 맞이하여 대회...,"{'word': '63빌딩', 'start_idx': 0, 'end_idx': 3,...","{'word': '1995년 개관 10주년', 'start_idx': 16, 'en...",63빌딩,0,3,LOC,1995년 개관 10주년,16,28,POH


In [12]:
# Export the frame to '<target_folder>.xlsx' without the index column.
# The `encoding` keyword is no longer passed: it was deprecated in pandas 1.5
# and removed in 2.0, where it raises TypeError.
df.to_excel(f'{target_folder}.xlsx', index=False)