In [3]:
import numpy as np
import pandas as pd

import time
import json
import shutil
import pathlib
from pprint import pprint
from tqdm import tqdm

In [4]:
if not pathlib.Path('../data/dict.txt').exists():
    !curl -O https://github.com/ldkrsi/jieba-zh_TW/blob/master/jieba/dict.txt ../data/

In [None]:
%%time
import multiprocessing as mp
from itertools import islice

import jieba
# Use the dictionary file at ../data/dict.txt as jieba's main dictionary
# instead of the one bundled with the package.
jieba.set_dictionary('../data/dict.txt')

def extract_record(f):
    """Yield one dict per record parsed from the open text file ``f``.

    A line consisting solely of ``@`` starts a new record. Lines beginning
    with one of the known ``@Key:`` prefixes are stored under the lower-cased
    key name (without ``@`` and ``:``); the literal value ``none`` is stored
    as ``None``. A line beginning with ``@body:`` ends the record: the rest of
    that line, the next line (verbatim, newline included) and the stripped
    line after it are concatenated into ``record['body']`` and the record is
    yielded.

    Args:
        f: a text file object supporting ``readline()``.

    Yields:
        dict: the fields collected for one record, always including 'body'.
    """
    keys = [
        '@url:', '@MainTextMD5:', '@UntagMD5:', '@SiteCode:', '@UrlCode:', '@title:',
        '@Size:', '@keyword:', '@image_links:', '@Fetchtime:', '@post_time:', '@Ref:',
        '@BodyMD5:', '@Lang:', '@IP:',
    ]
    # Pair each prefix with its output field name once instead of on every line.
    fields = [(key, key[1:-1].lower()) for key in keys]

    # Begin with an open record so a field or body line that appears before
    # any '@' marker is still captured instead of raising UnboundLocalError.
    record = dict()
    for line in iter(f.readline, ''):  # readline() returns '' only at EOF
        line = line.strip()
        if line == '@':  # start of a record
            record = dict()
            continue
        if line.startswith('@body:'):  # special case: body spans the next two lines
            # NOTE(review): this line was originally labelled a timestamp to
            # discard, yet it is concatenated into the body (newline included).
            # Behavior is kept as-is; confirm the intent.
            stamp = f.readline()
            text = f.readline().strip()
            record['body'] = line[6:] + stamp + text
            yield record
            # Never touch a dict that has already been handed to the consumer.
            record = dict()
            continue

        for key, name in fields:  # pick out the fields we want to keep
            if line.startswith(key):
                value = line[len(key):]
                record[name] = value if value != 'none' else None
                break

                
def segmentize(arg):
    """Segment one record's body with jieba and write the record to a JSON file.

    Args:
        arg: an ``(idx, record)`` pair; ``record`` is a dict with a 'body' string.

    Returns:
        ``idx`` unchanged, so the caller can report progress.

    Side effects:
        Replaces ``record['body']`` in place with the space-joined tokens and
        writes the record to ``out_dir/<idx zero-padded to 10 digits>.json``.
    """
    idx, record = arg
    record['body'] = ' '.join(jieba.cut(record['body'], cut_all=False))
    out_path = out_dir / f'{idx:010d}.json'
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 rather than left to the platform default.
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(record, f, ensure_ascii=False)
    return idx


raw = pathlib.Path('../data/ettoday').resolve()
out_dir = pathlib.Path('../data/json/').resolve()
# Start from an empty output directory so files from an earlier run never mix in.
if out_dir.exists():
    shutil.rmtree(str(out_dir))
out_dir.mkdir(parents=True, exist_ok=True)

# Number of records finished so far; stays 0 if the input yields no records,
# so the final report works even for an empty file.
n_done = 0

# The data contains mixed encodings, so remember to specify how decoding
# errors are handled when opening the file.
with open(raw, encoding='utf-8', errors='ignore') as f:
    with mp.Pool(3) as p:
        record_gen = enumerate(extract_record(f))
        firstn_gen = islice(record_gen, 500_000)  # cap the run at the first 500,000 records
        result_gen = p.imap(segmentize, firstn_gen)  # imap returns results in input order
        for idx in result_gen:
            n_done = idx + 1
            if n_done % 20_000 == 0:
                print(f'Processed {n_done:10d} records')

print('Total:', n_done)

Building prefix dict from /home/amoshyc/workspace/ccu-search-engine/data/dict.txt ...
Building prefix dict from /home/amoshyc/workspace/ccu-search-engine/data/dict.txt ...
Loading model from cache /tmp/jieba.ube0cd633b915ed66f2168c08d0d21602.cache
Loading model from cache /tmp/jieba.ube0cd633b915ed66f2168c08d0d21602.cache
Building prefix dict from /home/amoshyc/workspace/ccu-search-engine/data/dict.txt ...
Loading model from cache /tmp/jieba.ube0cd633b915ed66f2168c08d0d21602.cache
Loading model cost 1.432 seconds.
Prefix dict has been built succesfully.
Loading model cost 1.450 seconds.
Prefix dict has been built succesfully.
Loading model cost 1.598 seconds.
Prefix dict has been built succesfully.


Processed      20000 records
Processed      40000 records
Processed      60000 records
Processed      80000 records


In [None]:
# checking
# !cat ../data/json/0000000000.json
# !echo ""
# !cat ../data/json/0000000003.json

In [None]:
dict_path = pathlib.Path('../data/dict.txt').resolve()
vocab = []
weights = []
# Each usable line starts with "<word> <integer frequency>"; anything after the
# second column is ignored. Blank or single-column lines are skipped.
with dict_path.open(encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        if len(parts) < 2:
            continue
        vocab.append(parts[0])
        weights.append(int(parts[1]))

print(len(vocab))
# float64 keeps the normalised probabilities summing to 1 within the tolerance
# that the sampler checks.
weights = np.asarray(weights, dtype=np.float64)
weights /= weights.sum()

# Dedicated, seeded generator so re-running this cell reproduces the same sample.
QUERY_SEED = 0
_query_rng = np.random.RandomState(QUERY_SEED)

def get_queries(n):
    """Sample ``n`` dictionary words (with replacement), weighted by frequency.

    Samples from ``vocab`` rather than ``queries``, so the function stays
    usable after ``queries`` is rebound to the sampled array below.
    """
    return _query_rng.choice(vocab, size=n, p=weights)

queries = get_queries(100_000)
np.save('../data/queries.npy', queries)
print(queries[:10])