In [15]:
import argparse
import csv
import json
import os
import time
import urllib3
import sys
from glob import glob
import config.config_keys as configs

def morp_analysis(input_path='./data/train/train_original_newspaper.json',
                  directory='output', first_index=2, last_index=4950,
                  api_key=None):
    """Run morphological analysis over a slice of documents and save it as CSV.

    Loads the ``'documents'`` list from the JSON file at ``input_path``, keeps
    ``documents[first_index:last_index]``, joins every sentence of each
    document into one string, sends that string to ``do_lang`` and appends one
    CSV row per document to ``<directory>/<first_index>_<last_index>.csv``.

    Row layout (no header row is written):
        media_name, id, joined text, morpheme string, abstractive[0],
        str(extractive)

    NOTE(review): ``csvToJson`` in this notebook reads its CSV by the column
    names 'article_morp' and 'extractive', so a header has to be added before
    this output can be fed to it - confirm how that step is meant to happen.

    Args:
        input_path: JSON file holding a top-level ``'documents'`` list.
        directory: Output directory; created when missing.
        first_index: Start of the document slice (inclusive).
        last_index: End of the document slice (exclusive).
        api_key: Access key passed to ``do_lang``. Defaults to
            ``configs.api_keys`` when omitted.

    Processing stops at the first document for which ``do_lang`` returns an
    "openapi error" string; rows written before that point stay in the file.
    """
    if api_key is None:
        api_key = configs.api_keys

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(directory, exist_ok=True)

    # The corpus is Korean text: pin the encoding instead of relying on the
    # platform default, which is not UTF-8 everywhere.
    with open(input_path, "r", encoding="utf-8") as reader:
        documents = json.load(reader)['documents'][first_index:last_index]

    total = len(documents)
    if total:  # an empty slice has nothing to report progress on
        printProgressBar(0, total, prefix='Progress:', suffix='Complete', length=50)

    out_path = os.path.join(directory, '%s_%s.csv' % (first_index, last_index))
    with open(out_path, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)  # one writer for the whole file
        for idx, doc in enumerate(documents):
            # Each sentence is followed by a single space, as the API input.
            target = ''.join(
                sentence['sentence'] + ' '
                for paragraph in doc['text']
                for sentence in paragraph
            )

            processed_text = do_lang(api_key, target)
            if processed_text.startswith('openapi error'):
                # Surface the full message (it carries the reason), then stop.
                print(processed_text)
                break

            writer.writerow([
                doc['media_name'],
                doc['id'],
                target,
                processed_text,
                doc['abstractive'][0],
                str(doc['extractive']),
            ])

            time.sleep(0.01)  # brief pause between consecutive API requests
            printProgressBar(idx + 1, total, prefix='Progress:', suffix='Complete', length=50)
        
        
# Lazily created urllib3.PoolManager shared by every do_lang call so that
# connections to the API host can be reused between requests.
_SHARED_HTTP_POOL = None


def do_lang(openapi_key, text, http=None):
    """Request morphological analysis for ``text`` and flatten the result.

    POSTs ``text`` with ``analysis_code`` "morp" to the WiseNLU endpoint and
    turns the returned morphemes into a single string of ``lemma/type`` tokens,
    each followed by one space.

    Args:
        openapi_key: Access key sent as ``access_key`` in the request body.
        text: Text to analyse.
        http: Optional object exposing ``request(method, url, headers=...,
            body=...)`` whose return value has a ``data`` bytes attribute
            (e.g. a ``urllib3.PoolManager``). When omitted, a module-level
            pool is created on first use and reused afterwards.

    Returns:
        The flattened morpheme string on success, or a string starting with
        ``"openapi error - "`` when the service reports ``result == -1`` or
        the response body cannot be decoded as JSON.

    Raises:
        SystemExit: when the service reports an invalid access key.
    """
    global _SHARED_HTTP_POOL
    if http is None:
        if _SHARED_HTTP_POOL is None:
            _SHARED_HTTP_POOL = urllib3.PoolManager()
        http = _SHARED_HTTP_POOL

    # NOTE(review): the access key travels over plain HTTP here - check
    # whether the service offers an HTTPS endpoint.
    openApiURL = "http://aiopen.etri.re.kr:8000/WiseNLU"
    requestJson = {"access_key": openapi_key, "argument": {"text": text, "analysis_code": "morp"}}
    response = http.request(
        "POST",
        openApiURL,
        headers={"Content-Type": "application/json; charset=UTF-8"},
        body=json.dumps(requestJson),
    )

    # Parse the body once; report an undecodable body through the same
    # "openapi error" channel the caller already checks for.
    try:
        json_data = json.loads(response.data.decode('utf-8'))
    except ValueError:
        return "openapi error - invalid response (HTTP %s)" % getattr(response, "status", "?")

    if json_data["result"] == -1:
        json_reason = json_data["reason"]
        if "Invalid Access Key" in json_reason:
            # Every later request would fail the same way, so abort.
            print(json_reason)
            print("Please check the openapi access key.")
            sys.exit()
        return "openapi error - " + json_reason

    # Join once instead of growing the string piece by piece.
    return "".join(
        str(morp["lemma"]) + "/" + str(morp["type"]) + " "
        for sentence in json_data["return_object"]["sentence"]
        for morp in sentence["morp"]
    )

def save_txt(directory, first_index, idx, txt):
    """Write ``txt`` as UTF-8 to the file ``<directory>/<first_index + idx>``.

    The file name is the decimal sum of ``first_index`` and ``idx`` with no
    extension; an existing file of that name is overwritten.
    """
    file_number = first_index + idx
    destination = "/".join((directory, str(file_number)))
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(txt)

# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)

    A total of 0 is rendered as a full bar at 100% instead of raising
    ZeroDivisionError, so callers may pass the length of an empty collection.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to do means nothing left to do: show the bar as complete.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Leading '\r' redraws the bar in place on the current terminal line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()
        
# Run the morphological-analysis pass; this issues one do_lang request per document.
morp_analysis()

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [14]:
import argparse
import time
import os
import pandas as pd
import ast
import json
import six
import glob
import numpy as np
from tqdm import tqdm

def csvToJson(mode='train', news_dir='0_2.csv', output='.', docs_per_file=2):
    """Convert a morpheme CSV into JSON shard files of src/tgt token lists.

    Reads the CSV at ``news_dir`` and, for every row, splits the
    'article_morp' string into sentences on ``' ./SF '`` (the piece after the
    last separator is discarded) and each sentence into tokens on single
    spaces. ``src`` holds all sentences; ``tgt`` holds the sentences whose
    position appears in the row's 'extractive' list (a Python literal).

    Records are written in input order to
    ``<output>/korean.<mode>.<n>.json`` with ``docs_per_file`` records per
    file, ``n`` counting files from 0. The last file may hold fewer records.

    NOTE(review): the previous loop closed a file every 2 records but named it
    with ``i//6``; the default of 2 follows the write cadence - confirm whether
    6 records per file was intended.

    Args:
        mode: Label embedded in each output file name.
        news_dir: Path of the input CSV; must end with "csv".
        output: Directory for the JSON files; created when missing.
        docs_per_file: Number of records per JSON file (>= 1).

    Raises:
        AssertionError: if ``news_dir`` does not end with "csv".
        ValueError: if ``docs_per_file`` is smaller than 1.
    """
    if not news_dir.endswith("csv"):
        raise AssertionError("file is not a csv")
    if docs_per_file < 1:
        raise ValueError("docs_per_file must be a positive integer")

    os.makedirs(output, exist_ok=True)

    mydf = pd.read_csv(news_dir)

    list_dic = []
    for raw, extractive in zip(mydf['article_morp'], mydf['extractive']):
        target_idx = ast.literal_eval(extractive)
        sentences = raw.split(' ./SF ')[:-1]
        src = [sentence.split(' ') for sentence in sentences]
        tgt = [tokens for pos, tokens in enumerate(src) if pos in target_idx]
        list_dic.append({'src': src, 'tgt': tgt})

    def _dump_shard(records, shard_no):
        # Write one shard and echo its path, as the original loop did.
        filename = 'korean.' + mode + '.' + str(shard_no) + '.json'
        print(output + "/" + filename)
        with open(output + "/" + filename, "w", encoding='utf8') as json_file:
            json.dump(records, json_file, ensure_ascii=False)

    shard = []
    shard_no = 0
    for record in tqdm(list_dic):
        # Append first, then flush, so the record that completes a shard is
        # kept rather than dropped.
        shard.append(record)
        if len(shard) == docs_per_file:
            _dump_shard(shard, shard_no)
            shard = []
            shard_no += 1
    if shard:
        # Flush the trailing partial shard so no records are lost.
        _dump_shard(shard, shard_no)

# Convert the morpheme CSV into JSON files using the settings inside csvToJson.
csvToJson()


[['쌀/NNG', '생산/NNG', '조정/NNG', '제/XSN', '는/JX', '벼/NNG', '를/JKO', '심/VV', '었/EP', '던/ETM', '논/NNG', '에/JKB', '벼/NNG', '대신/NNG', '사료/NNG', '작물/NNG', '이나/JC', '콩/NNG', '등/NNB', '다른/MM', '작물/NNG', '을/JKO', '심/VV', '으면/EC', '벼/NNG', '와/JKB', '의/JKG', '일정/NNG', '소득/NNG', '차/NNG', '를/JKO', '보전/NNG', '하/XSV', '어/EC', '주/VX', '는/ETM', '제도/NNG', '이/VCP', '다/EF'], ['올해/NNG', '전남/NNP', '의/JKG', '논/NNG', '다른/MM', '작물/NNG', '재배/NNG', '계획/NNG', '면적/NNG', '은/JX', '전국/NNG', '5/SN', '만/NR', 'ha/SL', '의/JKG', '약/MM', '21/SN', '%/SW', '이/VCP', 'ㄴ/ETM', '1/SN', '만/NR', '698/SN', 'ha/SL', '로/JKB', ',/SP', '세부/NNG', '시행/NNG', '지침/NNG', '을/JKO', '확정/NNG', ',/SP', '시군/NNG', '에/JKB', '통보/NNG', '하/XSV', '었/EP', '다/EF'], ['전남도/NNP', '는/JX', '돕/VV', '어/EC', '시군/NNG', '에/JKB', '관련/NNG', '기관/NNG', '과/JC', '농가/NNG', '등/NNB', '이/JKS', '참여/NNG', '하/XSV', '는/ETM', '‘/SS', '논/NNG', '타작/NNG', '물/XSN', '지원/NNG', '사업/NNG', '추진/NNG', '협의/NNG', '회/XSN', '’/SS', '를/JKO', '구성/NNG', ',/SP', '지역/NNG', '특성/NNG', '에/JKB', '맞/VV', 

100%|██████████| 2/2 [00:00<00:00, 1815.32it/s]

./korean.train.0.json



