# 將大量txt轉成工研院json檔

In [14]:
import hashlib
from collections import defaultdict
import json
import pandas as pd
from pathlib import Path
from docx import Document
from natsort import natsorted
from collections import OrderedDict

# --- Locate the input corpus and derive where the JSON result is written ---
root_path = '../tests/input_data/gun_20201102'
common_path = Path(root_path)
print("input path:", common_path)

# The output file sits next to the input directory and is named after it,
# with a ".json" suffix.
save_path = common_path.with_name(common_path.stem).with_suffix(".json")
print("output path:", save_path)

glob_path = common_path

# The leading "*/" means only files at least one directory level below the
# root are matched; "**" then descends through any deeper sub-directories.
filename_pattern = '*/**/*.'
ext_list = ['txt', 'docx']

# Gather the matches for every supported extension into one list.
filepathes = []
for ext in ext_list:
    filepathes.extend(glob_path.glob(filename_pattern + ext))

# Natural sort on the string form of each path before processing.
filepathes = natsorted(filepathes, key=str)
print(filepathes)

def _read_article_text(path):
    """Return the text content of *path*, or ``None`` if it cannot be read.

    The file is first read as UTF-8 plain text.  If that fails it is opened
    as a .docx document and its paragraphs are joined with newlines.  When
    both attempts fail a warning is printed and ``None`` is returned so the
    caller can skip the file.
    """
    try:
        return path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError):
        # Not readable as UTF-8 text; fall through to the .docx reader.
        pass

    try:
        document = Document(path)
        return '\n'.join(paragraph.text for paragraph in document.paragraphs)
    except Exception as err:  # best effort: report the file and carry on
        print("warning: skipping unreadable file {}: {!r}".format(path, err))
        return None


articles_dict = OrderedDict()

for filepath in filepathes:
    content = _read_article_text(filepath)
    if content is None:
        # Skip the file entirely.  Previously a failed read fell through with
        # the text_id of the *previous* file still bound, so an empty record
        # silently overwrote the previous article (or raised NameError when
        # the very first file failed).
        continue

    content_dict = {
        'Title': str(filepath.relative_to(common_path)),
        'Content': content,
        'Author': '',
        'Time': '',
    }

    # The article id is the first 10 hex digits of the MD5 of the content, so
    # files with identical content share one id and the later file wins.
    text_id = hashlib.md5(content.encode('utf-8')).hexdigest()[:10]

    # Create the 'Articles' bucket on first use, then insert or replace.
    articles_dict.setdefault('Articles', {})[text_id] = content_dict

# Remember the insertion order of the article ids.  The original author notes
# that loading the dict into a DataFrame sorts it by index, so the ids are
# used with .loc below to restore the original order.
article_ids = list(articles_dict['Articles'])
print(article_ids)

dataframe = pd.DataFrame.from_dict(articles_dict)
dataframe = dataframe.loc[article_ids]
print(dataframe)

# No duplicate check is needed at this point: articles_dict['Articles'] is
# keyed by text_id, so an entry with a repeated text_id has already replaced
# the earlier one.  (An older DataFrame-based pass that reset the index,
# reported duplicated ids, kept the first row per id and restored the index
# was retired for that reason.)

json_payload = dataframe.to_dict()
with open(save_path, mode='w', encoding='utf-8') as json_file:
    json.dump(json_payload, json_file, indent=4, ensure_ascii=False)

input path: ../tests/input_data/gun_20201102
output path: ../tests/input_data/gun_20201102.json
[PosixPath('../tests/input_data/gun_20201102/01.槍砲彈藥刀械管制條例第11條第4項/106/TND,105,訴,829,20170125,1.docx'), PosixPath('../tests/input_data/gun_20201102/01.槍砲彈藥刀械管制條例第11條第4項/106/TND,105,訴,829,20170125,1.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/CHD,104,審訴,886,20160224,1.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/ILD,105,訴,12,20160323,1.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/MLD,105,訴,78,20160524,4.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/PTD,105,審原訴,6,20161208,1.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/SCD,105,審訴,91,20160401,1.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/SCD,105,審訴,202,20160705,1.txt'), PosixPath('../tests/input_data/gun_20201102/02.槍砲彈藥刀械管制條例第12條第1項/105/TPD,105,審訴,735,20161012,1

In [11]:
# Number of distinct articles collected (entries are keyed by text_id).
len(articles_dict['Articles'])

19

### cmdline  

In [16]:
# Run the DataTag_helper.py command-line script on the same dataset directory
# (`original` sub-command, `-d` pointing at the input directory).
# NOTE(review): presumably this performs the same txt/docx -> JSON conversion
# as the notebook cell above — confirm against DataTag_helper.py.
!python ../DataTag_helper.py original -d ../tests/input_data/gun_20201102

input path: ../tests/input_data/gun_20201102
output path: ../tests/input_data/gun_20201102.json
