In [36]:
# --- Input / output locations -------------------------------------------
# Source articles (parquet) read at the start of the notebook.
PATH_ARTICLE = "./datasets/articles.parquet"
# Pickle cache of the stanza-processed documents.
PATH_NLP = "./results/articles.nlp"
# Destination of the preprocessed variants table.
PATH_FILE_PREPROCESSED = "./results/ds.parquet"
# Destination of the per-variant timing table.
PATH_FILE_ELAPSE_TIME = "./results/elapse_time.csv"

# --- Run-size knobs -------------------------------------------------------
# Number of articles to sample; any falsy value (e.g. None) keeps them all.
SAMPLE_SIZE = 30  # or None
# Explicit list of variant codes to run, e.g. ['T', 'LWN']; takes priority
# over SAMPLE_VARIANT when set.
SAMPLE_VARIANT_LIST = None  # ['T', 'LWN'] # or None
# Number of variants to draw at random when no explicit list is given;
# a falsy value runs every variant.
SAMPLE_VARIANT = 2  # or None

# --- Hardware -------------------------------------------------------------
# Forwarded to the stanza pipeline as `use_gpu`.
USE_GPU = False

In [37]:
import pandas as pd

# Load the article dataset from the configured parquet file. Later cells read
# its 'article' column as the raw text of each document.
df = pd.read_parquet(PATH_ARTICLE)

In [38]:
# `dfs` is the working set of articles: a fixed-seed random sample when
# SAMPLE_SIZE is set, otherwise the full frame (same object as `df`).
if SAMPLE_SIZE:
    # Clamp to the number of available rows: DataFrame.sample refuses to draw
    # more rows than exist when sampling without replacement, which would make
    # the notebook fail on a small dataset.
    dfs = (
        df.sample(min(SAMPLE_SIZE, len(df)), random_state=999)
        .reset_index(drop=True)
    )
else:
    dfs = df

In [None]:
import stanza

# Options for the stanza pipeline, gathered in one place so they are easy to
# inspect and tweak.
pipeline_options = dict(
    lang='id',
    # Only the processors whose output is read later (sentences, words,
    # upos tags and lemmas).
    processors='tokenize,pos,lemma',
    use_gpu=USE_GPU,
    # NOTE(review): presumably reuses already-downloaded models instead of
    # fetching them again - confirm against the stanza documentation.
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)
nlp = stanza.Pipeline(**pipeline_options)

In [40]:
from time import time

import pandas as pd
from gensim.models.phrases import Phrases
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import \
    StopWordRemoverFactory
from tqdm import tqdm

# Build the Sastrawi stemmer and stop-word remover once and expose their bound
# methods as plain callables for the preprocessing loop.
_stemmer = StemmerFactory().create_stemmer()
_stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
stem = _stemmer.stem
remove = _stopword_remover.remove

def _sentence_tokens(sent, lower, lemma, stemmed, nouns_only, drop_stopwords):
    """Return the tokens of one sentence that survive the enabled steps.

    Steps are applied per word in a fixed order: noun filter, stemming,
    stop-word filter, lemma substitution, lower-casing.
    """
    kept = []
    for word in sent.words:
        token = word.text
        if nouns_only and word.upos not in ('NOUN', 'PROPN'):
            continue
        if stemmed:
            token = stem(token)
            if len(token) == 0:
                continue
        # A word is dropped when the stop-word remover leaves nothing of it.
        if drop_stopwords and len(remove(token.lower())) == 0:
            continue
        # The lemma replaces the current token only when one is available.
        if lemma and word.lemma:
            token = word.lemma
        if lower:
            token = token.lower()
        kept.append(token)
    return kept


def preprocess(df_nlp, variant):
    """Build every requested preprocessing variant and time each one.

    Each variant code is a string of flag letters:

    * ``B`` - keep raw text (document / sentence strings), skip all other steps
    * ``N`` - keep only words tagged NOUN or PROPN
    * ``S`` - stem each word
    * ``W`` - drop stop words
    * ``L`` - replace the word by its lemma when it has one
    * ``C`` - lower-case
    * ``P`` - merge frequent collocations (bigram, then trigram phrase models)

    Any other letter in a code toggles nothing in this function.

    Args:
        df_nlp: iterable of processed documents exposing ``.text`` and
            ``.sentences``; sentences expose ``.text`` and ``.words``; words
            expose ``.text``, ``.upos`` and ``.lemma``.
        variant: iterable of variant codes.

    Returns:
        ``(frame, elapse_time)`` where ``frame`` has two columns per variant,
        ``d<code>`` (one row per sentence) and ``D<code>`` (one row per
        document), and ``elapse_time`` is a list of ``[column, seconds]``
        pairs. Seconds are measured from the start of the variant, so the
        ``D`` figure of a ``P`` variant also includes the sentence-level
        phrase stage that ran before it.
    """
    columns = {}
    elapse_time = []
    for v in (tp := tqdm(variant)):
        tp.set_description(f'Processing {v}')
        t_start = time()
        raw = 'B' in v
        lower = 'C' in v
        lemma = 'L' in v
        stemmed = 'S' in v
        nouns_only = 'N' in v
        drop_stopwords = 'W' in v
        phrases = 'P' in v
        docs_long = []
        docs_short = []
        for doc in df_nlp:
            if raw:
                docs_long.append(doc.text)
                docs_short.extend(sent.text for sent in doc.sentences)
                continue
            tokens = []
            for sent in doc.sentences:
                tokens_short = _sentence_tokens(
                    sent, lower, lemma, stemmed, nouns_only, drop_stopwords
                )
                # Sentences (and, below, documents) left with no token are
                # not emitted at all.
                if len(tokens_short) == 0:
                    continue
                tokens.extend(tokens_short)
                docs_short.append(tokens_short)
            if len(tokens) == 0:
                continue
            docs_long.append(tokens)
        # Stamp both levels here so the timestamps always exist, even when
        # `df_nlp` is empty.
        t_end_d = t_end_D = time()
        if phrases:
            # Applying a phrase model to a whole corpus is lazy; materialize
            # the result so the transformation is included in the timing
            # instead of running later while the Series is built.
            bigram_short = Phrases(docs_short).freeze()
            trigram_short = Phrases(bigram_short[docs_short]).freeze()
            docs_short = list(trigram_short[bigram_short[docs_short]])
            t_end_d = time()
            bigram_long = Phrases(docs_long).freeze()
            trigram_long = Phrases(bigram_long[docs_long]).freeze()
            docs_long = list(trigram_long[bigram_long[docs_long]])
            t_end_D = time()
        columns[f'd{v}'] = pd.Series(docs_short)
        elapse_time.append([f'd{v}', t_end_d - t_start])
        columns[f'D{v}'] = pd.Series(docs_long)
        elapse_time.append([f'D{v}', t_end_D - t_start])
    # Columns differ in length; concat aligns on the index and pads with NaN.
    return (pd.concat(columns, axis=1), elapse_time)

In [41]:
from os.path import isfile
from pickle import load, dump

# Obtain the stanza-processed documents, using the pickle at PATH_NLP as a
# cache: compute and store on a miss, load on a hit.
#
# NOTE(review): the cache path does not depend on SAMPLE_SIZE or on the
# sampled rows, so an existing file is loaded even if the sample changed -
# delete the file to force a re-run.
# NOTE(review): on a cache hit the interval t_nlp - t_start measures the
# pickle load, not the NLP processing.
t_start = time()
docs_path = PATH_NLP
if not isfile(docs_path):
    raw_docs = [stanza.Document([], text=text) for text in dfs['article']]
    docs = nlp(raw_docs)
    with open(docs_path, 'wb') as cache_file:
        dump(docs, cache_file)
else:
    # SECURITY: unpickling executes arbitrary code; only load cache files
    # produced by this notebook.
    with open(docs_path, 'rb') as cache_file:
        docs = load(cache_file)
t_nlp = time()

In [42]:
from itertools import combinations as c

def make_v(sets):
    """Return every non-empty combination of the items in ``sets`` as a string.

    Combinations are listed by increasing size, and within one size in the
    order produced by ``itertools.combinations``; each one keeps the relative
    order of ``sets``. ``make_v('AB')`` gives ``['A', 'B', 'AB']``.
    """
    return [
        ''.join(combo)
        for size in range(1, len(sets) + 1)
        for combo in c(sets, size)
    ]

# Every variant code to evaluate: the two stand-alone codes 'B' and 'T' plus
# all sub-combinations of each four-letter flag family. Duplicates shared by
# several families collapse in the set; the result is ordered by code length,
# then alphabetically.
variant_full = sorted(
    {'B', 'T'}.union(*(
        make_v(family)
        for family in (
            'CLNP', 'CLNG', 'CLWP', 'CLWG',
            'CSNP', 'CSNG', 'CSWP', 'CSWG',
        )
    )),
    key=lambda code: (len(code), code),
)
# Show the codes on one line (use len(variant_full) for the count).
' '.join(variant_full)

'B C G L N P S T W CG CL CN CP CS CW LG LN LP LW NG NP SG SN SP SW WG WP CLG CLN CLP CLW CNG CNP CSG CSN CSP CSW CWG CWP LNG LNP LWG LWP SNG SNP SWG SWP CLNG CLNP CLWG CLWP CSNG CSNP CSWG CSWP'

In [43]:
from random import sample

from random import Random

# Choose which variants to run: an explicit list wins, then a random draw of
# SAMPLE_VARIANT codes, otherwise every code.
if SAMPLE_VARIANT_LIST:
    variant = SAMPLE_VARIANT_LIST
elif SAMPLE_VARIANT:
    # Use a dedicated, seeded generator (same seed as the article sample) so
    # that a fresh "Restart & Run All" picks the same variants and the saved
    # outputs are reproducible.
    variant = Random(999).sample(variant_full, SAMPLE_VARIANT)
else:
    variant = variant_full

# Build the selected variants; `elapse_time` is the list of per-column
# timings returned by preprocess.
ds, elapse_time = preprocess(docs, variant)
t_ds = time()

Processing SG:   0%|          | 0/2 [00:00<?, ?it/s]

Processing SWP: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


In [44]:
# Turn the [column, seconds] pairs into a table and attach the (shared)
# duration of the NLP stage to every row.
#
# Built through the DataFrame constructor with explicit columns so the cell is
# safe to re-run: when `elapse_time` is already the table produced here, the
# constructor simply re-selects the two columns instead of failing while
# unpacking column names.
elapse_time = (
    pd.DataFrame(elapse_time, columns=['variant', 'preprocessing'])
    .assign(tokenizing=t_nlp - t_start)
    [['variant', 'tokenizing', 'preprocessing']]
)

In [45]:
# Display the timing table: one row per output column, durations in seconds.
elapse_time

Unnamed: 0,variant,tokenizing,preprocessing
0,dSG,4.591405,1.53467
1,DSG,4.591405,1.534672
2,dSWP,4.591405,0.573513
3,DSWP,4.591405,0.702334


In [46]:
# Display the variants table. Columns have different lengths (sentence-level
# vs document-level), so shorter ones are padded with missing values.
ds

Unnamed: 0,dSG,DSG,dSWP,DSWP
0,"[akhir, kisah, misterius, mati, satu, keluarga...","[akhir, kisah, misterius, mati, satu, keluarga...","[kisah, misterius, mati, keluarga, kalideres]","[kisah, misterius, mati, keluarga, kalideres, ..."
1,"[direktur, reserse, kriminal, umum, polda, met...","[teliti, brin, sebut, kosong, dari, sesar, gem...","[direktur, reserse, kriminal, umum, polda, met...","[teliti, brin, kosong, sesar, gempa, cianjur, ..."
2,"[proses, lidi, yang, telah, jalan, lama, satu,...","[jadi, idola, di, piala, dunia, 2022, cho, gue...","[proses, lidi, jalan, henti]","[idola, piala_dunia_2022, cho_gue-sung, 2, rum..."
3,"[polisi, juga, tidak, temu, minimal, dua, bara...","[jelang, natal, dan, tahun, baru, pasar, jaya,...","[polisi, temu, minimal, barang, bukti, rujuk, ...","[jelang_natal, tahun, pasar, jaya, adakan, pas..."
4,"[motif, bunuh, diri, atau, bunuh, juga, tidak,...","[fitur, komunikasi, darurat, via, satelit, iph...","[motif, bunuh, bunuh, mati, keluarga, kalideres]","[fitur, komunikasi, darurat, via, satelit, iph..."
...,...,...,...,...
1976,"[xi, jinping, juga, akan, hadir, ktt, negara, ...",,"[kunjung, xi_jinping, china, upaya, hubung, ne...",
1977,"[menteri, energi, arab, saudi, pangeran, abdul...",,"[china, mitra, dagang, arab_saudi, pangeran, m...",
1978,"[dia, catat, bahwa, kerja, sama, antara, china...",,,
1979,"[kunjung, xi, jinping, itu, laku, saat, china,...",,,


In [47]:
# Persist the results: the variants table as parquet and the timing table as
# CSV, both without the row index.
ds.to_parquet(PATH_FILE_PREPROCESSED, index=False)
elapse_time.to_csv(PATH_FILE_ELAPSE_TIME, index=False)