In [None]:
import stanza

# Build the Stanza pipeline for language code 'id' with only the processors
# whose output this notebook reads: tokenization, POS tags (word.upos) and
# lemmas (word.lemma).
# NOTE(review): REUSE_RESOURCES presumably avoids re-downloading models that
# are already on disk — confirm against the Stanza documentation.
nlp = stanza.Pipeline(
    lang='id',
    processors='tokenize,pos,lemma',
    download_method=stanza.DownloadMethod.REUSE_RESOURCES
)

In [133]:
import pandas as pd
from gensim.models.phrases import Phrases

def preprocess(df_nlp, process):
    """Build one pair of token columns per preprocessing variant.

    Each entry of ``process`` is a string of flag letters; a letter is active
    when it occurs anywhere in the string:

    * ``C`` – lower-case every kept token.
    * ``L`` – use ``word.lemma`` instead of ``word.text`` (falls back to the
      text when the lemma is empty).
    * ``W`` – keep only words whose ``upos`` is NOUN, PROPN, VERB or X.
    * ``N`` – merge frequent collocations with two stacked gensim ``Phrases``
      models (bigrams, then trigrams on the bigram output).
    * ``B`` – skip tokenisation entirely and keep the raw ``.text`` of each
      document and sentence.

    A code containing none of these letters yields the plain word texts.

    Args:
        df_nlp: iterable of parsed documents exposing ``.text`` and
            ``.sentences``; every sentence exposes ``.text`` and ``.words``,
            every word ``.text``, ``.upos`` and ``.lemma``.
        process: iterable of variant codes (strings of flag letters).

    Returns:
        A DataFrame with two columns per variant ``p``: ``d{p}`` holds one
        entry per sentence and ``D{p}`` one entry per document. The columns
        are aligned on the positional index, so the shorter ``D{p}`` columns
        are padded with NaN. An empty ``process`` yields an empty DataFrame.
    """
    allowed_pos = frozenset(('NOUN', 'PROPN', 'VERB', 'X'))
    columns = {}
    for p in process:
        casefold = 'C' in p
        lemmatize = 'L' in p
        pos_filter = 'W' in p
        ngrams = 'N' in p
        baseline = 'B' in p
        docs_long = []   # one entry per document
        docs_short = []  # one entry per sentence
        for doc in df_nlp:
            if baseline:
                # Baseline variant: keep the untouched text at both levels.
                docs_long.append(doc.text)
                docs_short.extend(s.text for s in doc.sentences)
                continue
            doc_tokens = []
            for s in doc.sentences:
                sent_tokens = []
                for word in s.words:
                    if pos_filter and word.upos not in allowed_pos:
                        continue
                    token = word.text
                    if lemmatize and word.lemma:
                        token = word.lemma
                    if casefold:
                        # Fold the chosen token, not only the raw text, so a
                        # missing lemma that falls back to the surface form is
                        # still lower-cased when both C and L are requested.
                        token = token.lower()
                    sent_tokens.append(token)
                docs_short.append(sent_tokens)
                doc_tokens.extend(sent_tokens)
            docs_long.append(doc_tokens)
        if ngrams:
            # NOTE(review): combined with 'B' this would feed raw strings to
            # Phrases instead of token lists — presumably unintended; confirm.
            bigram_short = Phrases(docs_short).freeze()
            trigram_short = Phrases(bigram_short[docs_short]).freeze()
            docs_short = trigram_short[bigram_short[docs_short]]
            bigram_long = Phrases(docs_long).freeze()
            trigram_long = Phrases(bigram_long[docs_long]).freeze()
            docs_long = trigram_long[bigram_long[docs_long]]
        columns[f'd{p}'] = pd.Series(docs_short)
        columns[f'D{p}'] = pd.Series(docs_long)
    if not columns:
        # pd.concat raises on an empty mapping; no variants means no columns.
        return pd.DataFrame()
    return pd.concat(columns, axis=1)

In [134]:
import pandas as pd

# Load the raw article table; its 'article' column is the text parsed further down.
df = pd.read_parquet('./datasets/raw/articles.parquet')

In [135]:
# Draw a fixed, seeded sample of 50 rows, then renumber them from 0.
dfs = df.sample(n=50, random_state=999)
dfs = dfs.reset_index(drop=True)

In [136]:
from time import time

t_start = time()

# Wrap each article in an empty stanza.Document, then hand the whole batch to
# the pipeline in a single call.
docs = [stanza.Document([], text=article) for article in dfs['article']]
docs = nlp(docs)

t_nlp = time()

In [137]:
# Each variant code is a string of flag letters that preprocess() tests with `in`:
#   C = lower-case, L = lemma, W = POS whitelist, N = Phrases n-grams, B = raw text.
# A code with none of these letters (e.g. 'T') yields the plain word texts.
# The commented-out list below is the full set of variants; only two are run here.
# variant = ['B', 'T', 'C', 'L', 'W', 'N', 'CL', 'CW', 'CN', 'LW', 'LN', 'WN', 'CLW', 'LWN', 'WNC', 'NCL', 'CLWN']
variant = ['CL', 'LWN']
ds = preprocess(docs, variant)

# NOTE: t_nlp was recorded at the end of the previous cell, so t_ds - t_nlp also
# includes any wall-clock time that passed between running the two cells.
t_ds = time()

In [162]:
# One row per output column of `ds` ('d' = sentence level, 'D' = document
# level). The two durations are run-wide totals, so every row carries the
# same values.
elapse_time = pd.DataFrame([
    dict(
        variant=f'{prefix}{code}',
        nlp=t_nlp - t_start,
        preprocessing=t_ds - t_nlp,
    )
    for prefix in ['d', 'D']
    for code in variant
])

In [163]:
# Peek at a few random rows (no random_state, so the rows differ between runs).
# The 'd*' columns have one row per sentence and the 'D*' columns one row per
# document, so after the index-aligned concat in preprocess() the 'D*' columns
# are NaN on every row past the number of documents.
ds.sample(5)

Unnamed: 0,dCL,DCL,dLWN,DLWN
119,"[dia, nilai, beri, insentif, dapat, buat, peri...",,"[nilai, beri, insentif, buat, perintah, hemat,...",
50,"[zona, yang, harus, kosong, sepanjang, garis, ...",,"[zona, kosong, garis, jalur, patah, kanan, kir...",
757,"[mohon, itu, aju, pada, 13, oktober, 2022, .]",,"[mohon, aju, oktober]",
415,"[setelah, lawan, tim, yang, ancam, degradasi, ...",,"[lawan, tim, ancam, degradasi, jamu, udinese, ...",
234,"["", sejut, ,, bhatane, tangkap, .]",,"[sejut, bhatane, tangkap]",


In [164]:
# Show the timing table. All rows are identical because both durations were
# measured once for the whole run rather than per variant.
elapse_time

Unnamed: 0,variant,nlp,preprocessing
0,dCL,34.678537,0.311998
1,dLWN,34.678537,0.311998
2,DCL,34.678537,0.311998
3,DLWN,34.678537,0.311998


In [165]:
# Persist the preprocessed corpus and the timing table.
# NOTE(review): both calls assume the ./results directory already exists —
# confirm, or create it before running this cell.
ds.to_parquet('./results/ds.parquet', index=False)
elapse_time.to_csv('./results/elapse_time.csv', index=False)