In [50]:
# Input: raw articles table, read with pandas.read_parquet in the next cell.
PATH_ARTICLE = './datasets/raw/articles.parquet'
# Pickle cache of the Stanza-annotated documents (loaded when present, written otherwise).
PATH_NLP = './datasets/raw/articles.nlp'
# Output: the preprocessed corpus columns produced by preprocess().
PATH_FILE_PREPROCESSED = './results/ds.parquet'
# Output: per-variant timing table.
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'
# Number of articles to sample; a falsy value (None/0) keeps the whole table.
SAMPLE_SIZE = 200 # or None
# Preprocessing variants to run; a falsy value runs every generated variant.
SAMPLE_VARIANT = ['LWN'] # or None
# Forwarded to stanza.Pipeline(use_gpu=...).
USE_GPU = False

In [51]:
import pandas as pd

# Load the raw articles table; later cells read its 'article' column.
df = pd.read_parquet(PATH_ARTICLE)

In [52]:
# Work on a reproducible random subset when SAMPLE_SIZE is set, otherwise on
# the full table.
if SAMPLE_SIZE:
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so clamp the request to the table size.
    dfs = (
        df.sample(min(SAMPLE_SIZE, len(df)), random_state=999)
        .reset_index(drop=True)
    )
else:
    dfs = df

In [53]:
import stanza

# Indonesian Stanza pipeline: tokenisation, POS tagging and lemmatisation.
# REUSE_RESOURCES is passed as the download method; per its name it reuses
# already-downloaded models (see the Stanza docs for exact semantics).
nlp = stanza.Pipeline(
    lang='id',
    processors='tokenize,pos,lemma',
    use_gpu=USE_GPU,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES
)



2023-08-26 15:34:36 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-08-26 15:34:36 INFO: Using device: cpu
2023-08-26 15:34:36 INFO: Loading: tokenize
2023-08-26 15:34:37 INFO: Loading: mwt
2023-08-26 15:34:37 INFO: Loading: pos
2023-08-26 15:34:37 INFO: Loading: lemma
2023-08-26 15:34:38 INFO: Done loading processors!


In [54]:
from time import time

import pandas as pd
from gensim.models.phrases import Phrases
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import \
    StopWordRemoverFactory
from tqdm import tqdm

# Bound Sastrawi callables used by preprocess(): `stem` backs the 'S' flag and
# `remove` (stop-word remover) backs the 'W' flag.
stem = StemmerFactory().create_stemmer().stem
remove = StopWordRemoverFactory().create_stop_word_remover().remove

def preprocess(df_nlp, processes):
    """Build sentence-level and document-level corpora for each variant.

    Each entry of ``processes`` is a string of single-letter flags:

    * ``B`` - keep the raw text (``doc.text`` / ``sent.text``), no tokenising.
    * ``F`` - keep only words whose ``upos`` is NOUN or PROPN.
    * ``S`` - replace the word with ``stem(word)``; drop it if the stem is empty.
    * ``W`` - drop the word if ``remove(word.lower())`` returns an empty string.
    * ``L`` - replace the word with ``word.lemma`` when a lemma is available.
    * ``C`` - without ``L``: use ``word.text.lower()``; with ``L``: no effect.
    * ``N`` - merge frequent bigrams/trigrams with gensim ``Phrases``.

    A variant with none of these letters (e.g. ``'T'``) yields plain tokens.

    Args:
        df_nlp: iterable of annotated documents exposing ``.text`` and
            ``.sentences``; each sentence exposes ``.text`` and ``.words``,
            each word ``.text``, ``.upos`` and ``.lemma``. It is iterated once
            per variant, so it must be re-iterable (e.g. a list).
        processes: iterable of variant strings.

    Returns:
        ``(frame, elapse_time)``. ``frame`` has two columns per variant ``p``:
        ``d{p}`` (one entry per sentence) and ``D{p}`` (one entry per
        document); shorter columns are padded with NaN by ``pd.concat``.
        ``elapse_time`` is a list of ``[column_name, seconds]`` pairs. With no
        variants, an empty DataFrame and an empty list are returned.
    """
    allowed_pos = ('NOUN', 'PROPN')
    columns = {}
    elapse_time = []
    for p in (tp := tqdm(processes)):
        tp.set_description(f'Processing {p}')
        t_start = time()
        lowercase = 'C' in p
        lemmatise = 'L' in p
        pos_filter = 'F' in p
        drop_stopwords = 'W' in p
        merge_phrases = 'N' in p
        raw_text = 'B' in p
        use_stemmer = 'S' in p
        # Start both stop-watches at t_start so an empty corpus (or documents
        # without sentences) neither raises UnboundLocalError nor reuses the
        # timestamp left over from the previous variant.
        t_end_d = t_end_D = t_start
        docs_long = []
        docs_short = []
        for doc in df_nlp:
            if raw_text:
                docs_long.append(doc.text)
                for sent in doc.sentences:
                    docs_short.append(sent.text)
                    t_end_d = time()
                continue
            doc_tokens = []
            for sent in doc.sentences:
                sent_tokens = []
                for word in sent.words:
                    token = word.text
                    if pos_filter and word.upos not in allowed_pos:
                        continue
                    if use_stemmer:
                        stemmed = stem(token)
                        if len(stemmed) == 0:
                            continue
                        token = stemmed
                    if drop_stopwords and len(remove(token.lower())) == 0:
                        continue
                    if lemmatise and word.lemma:
                        token = word.lemma
                    # NOTE(review): 'C' without 'L' restarts from the surface
                    # form, which discards a stem produced by 'S'. The variant
                    # lists in this notebook never combine 'C' with 'S'.
                    if lowercase and not lemmatise:
                        token = word.text.lower()
                    doc_tokens.append(token)
                    sent_tokens.append(token)
                if sent_tokens:
                    docs_short.append(sent_tokens)
            t_end_d = time()
            if doc_tokens:
                docs_long.append(doc_tokens)
        t_end_D = time()
        if merge_phrases:
            # Indexing a frozen Phrases model with a whole corpus returns a
            # lazy stream; materialise it so the measured time includes
            # applying the phrase models, not only training them.
            bigram_short = Phrases(docs_short).freeze()
            trigram_short = Phrases(bigram_short[docs_short]).freeze()
            docs_short = list(trigram_short[bigram_short[docs_short]])
            t_end_d = time()
            bigram_long = Phrases(docs_long).freeze()
            trigram_long = Phrases(bigram_long[docs_long]).freeze()
            docs_long = list(trigram_long[bigram_long[docs_long]])
            t_end_D = time()
        columns[f'd{p}'] = pd.Series(docs_short)
        elapse_time.append([f'd{p}', t_end_d - t_start])
        columns[f'D{p}'] = pd.Series(docs_long)
        elapse_time.append([f'D{p}', t_end_D - t_start])
    if not columns:
        # pd.concat raises ValueError on an empty mapping.
        return (pd.DataFrame(), elapse_time)
    return (pd.concat(columns, axis=1), elapse_time)

In [55]:
import warnings
from os.path import isfile
from pickle import load, dump

# Annotate the sampled articles with Stanza, or reuse the pickled result.
# NOTE: on a cache hit, (t_nlp - t_start) measures the time to load the cache,
# not the time Stanza needed to annotate the articles.
t_start = time()
docs_path = PATH_NLP
if isfile(docs_path):
    # SECURITY: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(docs_path, 'rb') as cache_file:
        docs = load(cache_file)
    # The cache path does not depend on SAMPLE_SIZE, so a cache built from a
    # different sample would be reused silently. Warn instead of recomputing
    # so an expensive cache is never overwritten without the user noticing.
    if len(docs) != len(dfs):
        warnings.warn(
            f'{docs_path} holds {len(docs)} documents but the current sample '
            f'has {len(dfs)} articles; delete the cache file to rebuild it.'
        )
else:
    docs = nlp([stanza.Document([], text=text) for text in dfs['article']])
    with open(docs_path, 'wb') as cache_file:
        dump(docs, cache_file)
t_nlp = time()

In [56]:
from itertools import combinations as c

def make_v(sets):
    """Return every non-empty combination of ``sets`` joined into one string.

    Combinations are ordered by size first (singletons, then pairs, ...) and,
    within a size, in the order produced by ``itertools.combinations``.
    """
    return [
        ''.join(combo)
        for size in range(1, len(sets) + 1)
        for combo in c(sets, size)
    ]

# Flag families: each list holds every non-empty combination of its letters.
variant_stanza_filter = make_v(['C', 'L', 'F', 'N'])
variant_stanza_stopwords = make_v(['C', 'L', 'W', 'N'])
variant_sastrawi_filter = make_v(['S', 'F', 'N'])
variant_sastrawi_stopwords = make_v(['S', 'W', 'N'])

# De-duplicated union of all families plus the 'B' and 'T' variants,
# ordered by length and then alphabetically.
variant_full = sorted(
    {
        'B',
        'T',
        *variant_stanza_filter,
        *variant_stanza_stopwords,
        *variant_sastrawi_filter,
        *variant_sastrawi_stopwords,
    },
    key=lambda name: (len(name), name),
)

In [57]:
# Run only the configured variants when SAMPLE_VARIANT is set, otherwise all.
variant = SAMPLE_VARIANT if SAMPLE_VARIANT else variant_full

ds, elapse_time = preprocess(docs, variant)
t_ds = time()

Processing LWN:   0%|          | 0/1 [00:00<?, ?it/s]

Processing LWN: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]


In [58]:
# Turn the [column, seconds] pairs returned by preprocess() into a table and
# attach the (shared) NLP stage duration to every row.
# Note: this rebinds `elapse_time` from a list to a DataFrame.
elapse_time = pd.DataFrame([
    dict(variant=name, nlp=t_nlp - t_start, preprocessing=seconds)
    for name, seconds in elapse_time
])

In [59]:
# Display the per-variant timing table.
elapse_time

Unnamed: 0,variant,nlp,preprocessing
0,dLWN,267.22045,0.790498
1,DLWN,267.22045,1.186545


In [60]:
# Display the preprocessed corpus frame returned by preprocess().
ds

Unnamed: 0,dLWN,DLWN
0,"[kisah, misterius, mati, keluarga, kalideres, .]","[kisah, misterius, mati, keluarga, kalideres, ..."
1,"[direktur, reserse, kriminal, umum, polda_metr...","[teliti, brin, penosongan, sesar, gempa, cianj..."
2,"[proses, selidi, jalan, henti, .]","[idola, piala_dunia_2022, ,, cho_gue, hadapi, ..."
3,"[polisi, temu, minimal, barang, bukti, rujuk, ...","[menjelang_natal, tahun, ,, pasar, jaya, ada, ..."
4,"[motif, bunuh, bunuh, mati, keluarga, kalidere...","[fitur, komunikasi, darurat, via, satelit, iph..."
...,...,...
4150,"[turnamen, ,, timnas_indonesia, grup_juara, ta...",
4151,"[timnas_indonesia, hadap, kamboja, laga, perda...",
4152,"[,, indonesia, tandang, markas, brunei_darussa...",
4153,"[laga, ,, indonesia, jajal, kuat, filipina, ma...",


In [61]:
from pathlib import Path

# Persist the preprocessed corpus and the timing table. Create the target
# directories first so a missing ./results folder does not fail the write
# after all the expensive work has already been done.
for output_path in (PATH_FILE_PREPROCESSED, PATH_FILE_ELAPSE_TIME):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

ds.to_parquet(PATH_FILE_PREPROCESSED, index=False)
elapse_time.to_csv(PATH_FILE_ELAPSE_TIME, index=False)