In [None]:
import stanza

nlp = stanza.Pipeline(
    lang='id',
    processors='tokenize,pos,lemma',
    download_method=stanza.DownloadMethod.REUSE_RESOURCES
)

In [21]:
from time import time

import pandas as pd
from gensim.models.phrases import Phrases
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import \
    StopWordRemoverFactory
from tqdm import tqdm

stem = StemmerFactory().create_stemmer().stem
remove = StopWordRemoverFactory().create_stop_word_remover().remove

def preprocess(df_nlp, processes):
    df = {}
    allowed_pos = ['NOUN', 'PROPN']
    elapse_time = []
    for p in (tp := tqdm(processes)):
        tp.set_description(f'Processing {p}')
        t_start = time()
        c = 'C' in p
        l = 'L' in p
        f = 'F' in p
        w = 'W' in p
        n = 'N' in p
        b = 'B' in p
        s = 'S' in p
        docs_long = []
        docs_short = []
        for doc in df_nlp:
            if b:
                docs_long.append(doc.text)
                for s in doc.sentences:
                    docs_short.append(s.text)
                    t_end_d = time()
                t_end_D = time()
                continue
            tokens = []
            for s in doc.sentences:
                tokens_short = []
                for word in s.words:
                    _word = word.text
                    if f:
                        if word.upos not in allowed_pos:
                            continue
                    if s:
                        sw = stem(_word)
                        if len(sw) == 0: continue
                        _word = sw
                    if w:
                        if len(remove(_word.lower())) == 0:
                            continue
                    if l:
                        _word = word.lemma if word.lemma else _word
                    if c:
                        _word = _word if l else word.text.lower()
                    tokens.append(_word)
                    tokens_short.append(_word)
                if len(tokens_short) == 0: continue
                docs_short.append(tokens_short)
            t_end_d = time()
            if len(tokens) == 0: continue
            docs_long.append(tokens)
        t_end_D = time()
        if n:
            bigram_short = Phrases(docs_short).freeze()
            trigram_short = Phrases(bigram_short[docs_short]).freeze()
            docs_short = trigram_short[bigram_short[docs_short]]
            t_end_d = time()
            bigram_long = Phrases(docs_long).freeze()
            trigram_long = Phrases(bigram_long[docs_long]).freeze()
            docs_long = trigram_long[bigram_long[docs_long]]
            t_end_D = time()
        df[f'd{p}'] = pd.Series(docs_short)
        elapse_time.append([f'd{p}', t_end_d - t_start])
        df[f'D{p}'] = pd.Series(docs_long)
        elapse_time.append([f'D{p}', t_end_D - t_start])
    return (pd.concat(df, axis=1), elapse_time)

In [13]:
import pandas as pd

df = pd.read_parquet('./datasets/raw/articles.parquet')

In [14]:
dfs = df.sample(500, random_state=999).reset_index(drop=True)

In [15]:
from os.path import isfile
from pickle import load, dump

t_start = time()
docs_path = './datasets/raw/articles.nlp'
if isfile(docs_path):
    with open(docs_path, 'rb') as f:
        docs = load(f)
else:
    docs = list(dfs['article'].apply(lambda doc: stanza.Document([], text=doc)))
    docs = nlp(docs)
    with open(docs_path, 'wb') as f:
        dump(docs, f)
t_nlp = time()

In [16]:
from itertools import combinations as c

def make_v(sets):
    r = []
    for i in range(len(sets)):
        r = r + [''.join(x) for x in c(sets, i+1)]
    return r

variant_stanza_filter = make_v(['C', 'L', 'F', 'N'])
variant_stanza_stopwords = make_v(['C', 'L', 'W', 'N'])
variant_sastrawi_filter = make_v(['S', 'F', 'N'])
variant_sastrawi_stopwords = make_v(['S', 'W', 'N'])
variant_full = sorted(list(set(
    ['B', 'T']
    + variant_stanza_filter
    + variant_stanza_stopwords
    + variant_sastrawi_filter
    + variant_sastrawi_stopwords
)), key=lambda x: (len(x), x))

In [22]:
# variant = variant_full
variant = ['LWN', 'SFN']
ds, elapse_time = preprocess(docs, variant)

t_ds = time()

Processing SFN: 100%|██████████| 2/2 [00:11<00:00,  5.76s/it]


In [23]:
elapse_time = pd.DataFrame([{
    'variant': v,
    'nlp': t_nlp - t_start,
    'preprocessing': t
} for v, t in elapse_time ])

In [24]:
elapse_time

Unnamed: 0,variant,nlp,preprocessing
0,dLWN,224.184062,7.460976
1,DLWN,224.184062,8.284982
2,dSFN,224.184062,1.6997
3,DSFN,224.184062,2.410262


In [25]:
ds.to_parquet('./results/ds.parquet', index=False)
elapse_time.to_csv('./results/elapse_time.csv', index=False)