In [22]:
import stanza

# Indonesian stanza pipeline limited to the annotations used downstream:
# tokens, universal POS tags and lemmas. (The load log below shows stanza
# also brings in the `mwt` processor alongside the requested ones.)
pipeline_options = {
    'lang': 'id',
    'processors': 'tokenize,pos,lemma',
    'download_method': stanza.DownloadMethod.REUSE_RESOURCES,
}
nlp = stanza.Pipeline(**pipeline_options)

2023-08-22 12:26:07 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-08-22 12:26:07 INFO: Using device: cpu
2023-08-22 12:26:07 INFO: Loading: tokenize
2023-08-22 12:26:07 INFO: Loading: mwt
2023-08-22 12:26:07 INFO: Loading: pos
2023-08-22 12:26:07 INFO: Loading: lemma
2023-08-22 12:26:07 INFO: Done loading processors!


In [23]:
from time import time

import pandas as pd
from gensim.models.phrases import Phrases


def preprocess(df_nlp, process):
    """Build several preprocessing variants of annotated documents and time each one.

    Every entry of ``process`` is a string of single-letter flags:

    * ``'B'`` -- baseline: keep the raw ``text`` of each document and of each
      sentence; all token-level processing is skipped for this variant.
    * ``'W'`` -- keep only words whose ``upos`` is NOUN, PROPN, VERB or X.
    * ``'L'`` -- use ``word.lemma``, falling back to ``word.text`` when the
      lemma is empty.
    * ``'C'`` -- lower-case ``word.text``. This has no effect when ``'L'`` is
      also present: the lemma is then used as-is.
    * ``'N'`` -- run two successive gensim ``Phrases`` passes over the result.

    A string containing none of these letters yields the plain token texts.

    Parameters
    ----------
    df_nlp : iterable
        Annotated documents. Each must expose ``text`` and ``sentences``; each
        sentence must expose ``text`` and ``words``; each word must expose
        ``text``, ``upos`` and ``lemma``.
    process : iterable of str
        Variant names (flag strings) to generate.

    Returns
    -------
    tuple
        ``(frame, timings)``. ``frame`` is a DataFrame with two columns per
        variant ``p``: ``'d' + p`` holds one entry per sentence and ``'D' + p``
        one entry per document. The columns have different lengths, so the
        shorter one is padded with NaN. ``timings`` is a list of
        ``[column_name, seconds]`` pairs in the same order.

    Notes
    -----
    Timings are cumulative from the start of the variant. For ``'N'`` variants
    the ``D`` figure therefore also contains the sentence-level phrase pass,
    because that pass runs first.
    """
    # Local import keeps this block self-contained; perf_counter is monotonic
    # and fine-grained, unlike time() whose tick can exceed a whole variant.
    from time import perf_counter

    df = {}
    allowed_pos = ('NOUN', 'PROPN', 'VERB', 'X')
    elapse_time = []

    for p in process:
        t_start = perf_counter()
        lowercase = 'C' in p
        lemmatize = 'L' in p
        pos_filter = 'W' in p
        ngrams = 'N' in p
        baseline = 'B' in p

        docs_long = []
        docs_short = []
        for doc in df_nlp:
            if baseline:
                docs_long.append(doc.text)
                docs_short.extend(s.text for s in doc.sentences)
                continue

            doc_tokens = []
            for sentence in doc.sentences:
                sentence_tokens = []
                for word in sentence.words:
                    if pos_filter and word.upos not in allowed_pos:
                        continue
                    if lemmatize:
                        token = word.lemma if word.lemma else word.text
                    elif lowercase:
                        token = word.text.lower()
                    else:
                        token = word.text
                    sentence_tokens.append(token)
                docs_short.append(sentence_tokens)
                doc_tokens.extend(sentence_tokens)
            docs_long.append(doc_tokens)

        # Stamp both clocks unconditionally. Previously they were only set
        # inside the token loop, so a 'B' variant or an empty corpus either
        # raised UnboundLocalError or reused the previous variant's stamps.
        t_end_d = t_end_D = perf_counter()

        if ngrams:
            bigram_short = Phrases(docs_short).freeze()
            trigram_short = Phrases(bigram_short[docs_short]).freeze()
            # Applying a phrase model to a corpus is lazy; materialise it here
            # so the transform cost is counted in the timing rather than being
            # paid later inside pd.Series().
            docs_short = list(trigram_short[bigram_short[docs_short]])
            t_end_d = perf_counter()

            bigram_long = Phrases(docs_long).freeze()
            trigram_long = Phrases(bigram_long[docs_long]).freeze()
            docs_long = list(trigram_long[bigram_long[docs_long]])
            t_end_D = perf_counter()

        df[f'd{p}'] = pd.Series(docs_short)
        elapse_time.append([f'd{p}', t_end_d - t_start])
        df[f'D{p}'] = pd.Series(docs_long)
        elapse_time.append([f'D{p}', t_end_D - t_start])

    return (pd.concat(df, axis=1), elapse_time)

In [24]:
import pandas as pd

# Raw article corpus; the 'article' column is the text that gets annotated later.
articles_path = './datasets/raw/articles.parquet'
df = pd.read_parquet(articles_path)

In [25]:
# Work on a fixed-seed random subset of 50 rows, renumbered from 0.
sample_size = 50
sample_seed = 999
dfs = df.sample(n=sample_size, random_state=sample_seed).reset_index(drop=True)

In [26]:
t_start = time()

# Wrap every sampled article in an empty stanza Document carrying its raw text,
# then hand the whole list to the pipeline in a single call.
docs = nlp([stanza.Document([], text=article) for article in dfs['article']])

t_nlp = time()

In [27]:
# Full set of variants (currently disabled):
# variant = ['B', 'T', 'C', 'L', 'W', 'N', 'CL', 'CW', 'CN', 'LW', 'LN', 'WN', 'CLW', 'LWN', 'WNC', 'NCL', 'CLWN']
# Only two variants are generated in this run.
variant = 'CL LWN'.split()
ds, elapse_time = preprocess(df_nlp=docs, process=variant)

t_ds = time()

In [28]:
# Turn the [column_name, seconds] pairs returned by preprocess() into a table.
# The stanza annotation time is shared by every variant, so it is repeated on
# each row. NOTE: this rebinds `elapse_time` from a list to a DataFrame, so the
# cell cannot be re-run without first re-running the preprocess() cell.
nlp_seconds = t_nlp - t_start
elapse_time = pd.DataFrame([
    dict(variant=name, nlp=nlp_seconds, preprocessing=seconds)
    for name, seconds in elapse_time
])

In [29]:
# Display the timing table: one row per generated column, with the shared
# stanza annotation time (`nlp`) and the per-variant `preprocessing` time.
elapse_time

Unnamed: 0,variant,nlp,preprocessing
0,dCL,26.75152,0.015996
1,DCL,26.75152,0.015996
2,dLWN,26.75152,0.097994
3,DLWN,26.75152,0.184994


In [30]:
from pathlib import Path

# Neither writer below creates a missing parent directory, so make sure
# ./results exists; otherwise a fresh checkout fails here, after the
# expensive annotation step has already run.
Path('./results').mkdir(parents=True, exist_ok=True)

ds.to_parquet('./results/ds.parquet', index=False)
elapse_time.to_csv('./results/elapse_time.csv', index=False)