In [1]:
# Input: parquet file holding the articles; read into `df` with pd.read_parquet.
PATH_ARTICLE = './datasets/articles.parquet'
# Pickle cache of the Stanza-annotated documents: loaded when the file exists,
# otherwise written after the pipeline has run.
PATH_NLP = './results/articles.nlp'
# Output: the table of preprocessed variants, written with DataFrame.to_parquet.
PATH_FILE_PREPROCESSED = './results/ds.parquet'
# Output: the timing table, written with DataFrame.to_csv.
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'
# Number of articles to sample from `df`; a falsy value (None) keeps every article.
SAMPLE_SIZE = 100 # or None
# Explicit list of variant codes to run; when set it takes precedence over SAMPLE_VARIANT.
SAMPLE_VARIANT_LIST = None # ['T', 'LWN'] # or None
# How many variant codes to draw at random from the full list; a falsy value runs all of them.
SAMPLE_VARIANT = 2 # 3 # or None
# Forwarded to stanza.Pipeline(use_gpu=...).
USE_GPU = False

In [2]:
import pandas as pd

# Load the raw articles; the `article` column is the text that gets annotated further down.
df = pd.read_parquet(PATH_ARTICLE)

In [3]:
# Work on a fixed-seed random subset of the articles when SAMPLE_SIZE is set
# (index renumbered from 0); otherwise use the full frame as-is.
dfs = (
    df.sample(SAMPLE_SIZE, random_state=999).reset_index(drop=True)
    if SAMPLE_SIZE
    else df
)

In [4]:
import stanza

# Build the Stanza pipeline for Indonesian ('id') with tokenisation, POS tagging
# and lemmatisation. The load log below also lists an `mwt` processor even though
# it is not requested here.
nlp = stanza.Pipeline(
    lang='id',
    processors='tokenize,pos,lemma',
    use_gpu=USE_GPU,
    # NOTE(review): presumably reuses models already present locally instead of
    # re-downloading them — confirm against the Stanza documentation.
    download_method=stanza.DownloadMethod.REUSE_RESOURCES
)

  from .autonotebook import tqdm as notebook_tqdm
2023-09-01 08:59:38 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-09-01 08:59:38 INFO: Using device: cpu
2023-09-01 08:59:38 INFO: Loading: tokenize
2023-09-01 08:59:38 INFO: Loading: mwt
2023-09-01 08:59:38 INFO: Loading: pos
2023-09-01 08:59:38 INFO: Loading: lemma
2023-09-01 08:59:38 INFO: Done loading processors!


In [5]:
from time import time

import pandas as pd
from gensim.models.phrases import Phrases
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import \
    StopWordRemoverFactory
from tqdm import tqdm

# Module-level Sastrawi helpers called by preprocess() for every word:
# `stem` is the bound stem() method of a stemmer built by StemmerFactory;
# `remove` is the bound remove() method of a stop-word remover built by
# StopWordRemoverFactory. Both are created once here rather than per call.
stem = StemmerFactory().create_stemmer().stem
remove = StopWordRemoverFactory().create_stop_word_remover().remove

def preprocess(df_nlp, variant):
    """Build sentence-level and document-level corpora for each variant and time them.

    Every variant code is a string; the letters it contains switch steps on:

    * ``B`` - keep raw text: ``doc.text`` per document and ``sent.text`` per
      sentence. The per-word steps below are skipped for such a variant.
    * ``N`` - keep only words whose ``upos`` is ``NOUN`` or ``PROPN``.
    * ``S`` - replace the word by its Sastrawi stem; dropped if the stem is empty.
    * ``W`` - drop the word if the Sastrawi stop-word remover reduces its
      lower-cased form to an empty string.
    * ``L`` - replace the word by ``word.lemma`` when a lemma is available
      (this overrides a stem produced by ``S``).
    * ``C`` - lower-case the word.
    * ``P`` - after all documents are collected, merge word sequences with two
      stacked passes of gensim ``Phrases``.

    A code containing none of these letters (e.g. ``T``) yields the plain word
    texts.

    Parameters
    ----------
    df_nlp : iterable
        Annotated documents exposing ``.text`` and ``.sentences``; sentences
        expose ``.text`` and ``.words``; words expose ``.text``, ``.upos`` and
        ``.lemma``. It is iterated once per variant, so it must be re-iterable
        (e.g. a list).
    variant : iterable of str
        Variant codes to produce.

    Returns
    -------
    tuple
        ``(frame, elapse_time)``. ``frame`` has, for each code ``v``, a column
        ``d{v}`` (one entry per kept sentence) and ``D{v}`` (one entry per kept
        document); the columns have different lengths, so shorter ones are
        padded with missing values. ``elapse_time`` is a list of
        ``[column_name, seconds]`` pairs measured from the start of the
        variant. Sentences and documents left without any token are skipped,
        so rows do not line up with the input positions.
    """
    columns = {}
    elapse_time = []
    for v in (progress := tqdm(variant)):
        progress.set_description(f'Processing {v}')
        t_start = time()
        lowercase = 'C' in v
        lemmatize = 'L' in v
        stemming = 'S' in v
        nouns_only = 'N' in v
        drop_stopwords = 'W' in v
        phrases = 'P' in v
        raw_text = 'B' in v
        docs_long = []
        docs_short = []
        for doc in df_nlp:
            if raw_text:
                docs_long.append(doc.text)
                docs_short.extend(sent.text for sent in doc.sentences)
                continue
            tokens = []
            for sent in doc.sentences:
                tokens_short = []
                for word in sent.words:
                    if nouns_only and word.upos not in ('NOUN', 'PROPN'):
                        continue
                    token = word.text
                    if stemming:
                        token = stem(token)
                        if not token:
                            continue
                    # The stop-word test runs on the (possibly stemmed) token.
                    if drop_stopwords and not remove(token.lower()):
                        continue
                    if lemmatize and word.lemma:
                        token = word.lemma
                    if lowercase:
                        token = token.lower()
                    tokens_short.append(token)
                if tokens_short:
                    docs_short.append(tokens_short)
                    tokens.extend(tokens_short)
            if tokens:
                docs_long.append(tokens)
        # Both corpora are filled by the same pass, so they share one timestamp.
        # Taking it here (not inside the loops) keeps it defined for empty input
        # and prevents a value left over from the previous variant being reused.
        t_end_d = t_end_D = time()
        if phrases:
            bigram_short = Phrases(docs_short).freeze()
            trigram_short = Phrases(bigram_short[docs_short]).freeze()
            # Indexing a phrase model with a corpus returns a lazy stream;
            # materialise it so the work is done before the timestamp is taken
            # and a plain list is stored in the Series.
            docs_short = list(trigram_short[bigram_short[docs_short]])
            t_end_d = time()
            bigram_long = Phrases(docs_long).freeze()
            trigram_long = Phrases(bigram_long[docs_long]).freeze()
            docs_long = list(trigram_long[bigram_long[docs_long]])
            # Still measured from t_start, so it includes the short-corpus phrase pass.
            t_end_D = time()
        columns[f'd{v}'] = pd.Series(docs_short)
        elapse_time.append([f'd{v}', t_end_d - t_start])
        columns[f'D{v}'] = pd.Series(docs_long)
        elapse_time.append([f'D{v}', t_end_D - t_start])
    if not columns:
        # pd.concat raises on an empty mapping; no variants means an empty table.
        return (pd.DataFrame(), elapse_time)
    return (pd.concat(columns, axis=1), elapse_time)

In [6]:
from os.path import isfile
from pickle import load, dump
from warnings import warn

# Annotate the sampled articles with Stanza, reusing a pickle cache when present.
# t_start / t_nlp bracket this cell; note that on a cache hit the difference is
# the time to load the pickle, not the time to run the pipeline.
t_start = time()
docs_path = PATH_NLP
if isfile(docs_path):
    # SECURITY: unpickling executes arbitrary code from the file. Only load a
    # cache that this notebook wrote itself.
    with open(docs_path, 'rb') as p:
        docs = load(p)
    # The cache is keyed by path only, so it may come from a different sample
    # (e.g. SAMPLE_SIZE was changed). Warn instead of silently mixing data; the
    # cache is left untouched because regenerating it can be expensive.
    # A same-sized but different sample cannot be detected by this check.
    if len(docs) != len(dfs):
        warn(
            f'{docs_path} holds {len(docs)} documents but the current sample has '
            f'{len(dfs)}; delete the file to regenerate the cache.'
        )
else:
    docs = list(dfs['article'].apply(lambda doc: stanza.Document([], text=doc)))
    docs = nlp(docs)
    with open(docs_path, 'wb') as p:
        dump(docs, p)
t_nlp = time()

In [7]:
from itertools import combinations as c

def make_v(sets):
    """Return every non-empty combination of the items of ``sets`` as joined strings.

    Combinations are listed by increasing size (1 up to ``len(sets)``) and,
    within one size, in the order ``itertools.combinations`` yields them.
    An empty ``sets`` gives an empty list.
    """
    return [
        ''.join(combo)
        for size in range(1, len(sets) + 1)
        for combo in c(sets, size)
    ]

# Every distinct variant code: 'B' and 'T' plus all letter combinations drawn
# from each of the eight four-letter sets, de-duplicated and ordered by length,
# then alphabetically.
variant_full = sorted(
    {'B', 'T'}.union(*(
        make_v(letters)
        for letters in (
            'CLNP', 'CLNG', 'CLWP', 'CLWG',
            'CSNP', 'CSNG', 'CSWP', 'CSWG',
        )
    )),
    key=lambda name: (len(name), name),
)
' '.join(variant_full)
# len(variant_full)

'B C G L N P S T W CG CL CN CP CS CW LG LN LP LW NG NP SG SN SP SW WG WP CLG CLN CLP CLW CNG CNP CSG CSN CSP CSW CWG CWP LNG LNP LWG LWP SNG SNP SWG SWP CLNG CLNP CLWG CLWP CSNG CSNP CSWG CSWP'

In [8]:
from random import Random, sample

# Choose which variant codes to run: an explicit list wins, then a random draw
# of SAMPLE_VARIANT codes, otherwise every code.
if SAMPLE_VARIANT_LIST:
    variant = SAMPLE_VARIANT_LIST
elif SAMPLE_VARIANT:
    # Draw with a dedicated, seeded generator so that Restart & Run All picks
    # the same variants every time (same seed as the article sample) without
    # touching the global random state.
    variant = Random(999).sample(variant_full, SAMPLE_VARIANT)
else:
    variant = variant_full

# `elapse_time` is the list of [column, seconds] pairs returned by preprocess().
ds, elapse_time = preprocess(docs, variant)
t_ds = time()

Processing CLW: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]


In [9]:
# Turn the [column, seconds] pairs into a table, attaching the (shared) duration
# of the Stanza/cache cell to every row.
# NOTE: this rebinds `elapse_time` from a list to a DataFrame, so the cell is
# not meant to be run twice without re-running the preprocess cell first.
elapse_time = pd.DataFrame([
    dict(variant=name, nlp=t_nlp - t_start, preprocessing=seconds)
    for name, seconds in elapse_time
])

In [10]:
# Show the timing table: one row per output column, with the shared `nlp`
# duration and the per-column `preprocessing` seconds.
elapse_time

Unnamed: 0,variant,nlp,preprocessing
0,dCSW,40.981253,1.658864
1,DCSW,40.981253,1.658865
2,dCLW,40.981253,0.075823
3,DCLW,40.981253,0.075824


In [11]:
# Show the preprocessed table. The d* columns hold one entry per sentence and
# the D* columns one entry per document, so the shorter columns end in missing values.
ds

Unnamed: 0,dCSW,DCSW,dCLW,DCLW
0,"[kisah, misterius, mati, keluarga, kalideres]","[kisah, misterius, mati, keluarga, kalideres, ...","[kisah, misterius, mati, keluarga, kalideres, .]","[kisah, misterius, mati, keluarga, kalideres, ..."
1,"[direktur, reserse, kriminal, umum, polda, met...","[teliti, brin, kosong, sesar, gempa, cianjur, ...","[direktur, reserse, kriminal, umum, polda, met...","[teliti, brin, penosongan, sesar, gempa, cianj..."
2,"[proses, lidi, jalan, henti]","[idola, piala, dunia, 2022, cho, gue-sung, 2, ...","[proses, selidi, jalan, henti, .]","[idola, piala, dunia, 2022, ,, cho, gue, hadap..."
3,"[polisi, temu, minimal, barang, bukti, rujuk, ...","[jelang, natal, tahun, pasar, jaya, adakan, pa...","[polisi, temu, minimal, barang, bukti, rujuk, ...","[menjelang, natal, tahun, ,, pasar, jaya, ada,..."
4,"[motif, bunuh, bunuh, mati, keluarga, kalideres]","[fitur, komunikasi, darurat, via, satelit, iph...","[motif, bunuh, bunuh, mati, keluarga, kalidere...","[fitur, komunikasi, darurat, via, satelit, iph..."
...,...,...,...,...
1976,"[kunjung, xi, jinping, china, upaya, hubung, n...",,"[xi, jinping, hadir, ktt, negara, arab, ktt, d...",
1977,"[china, mitra, dagang, arab, saudi, pangeran, ...",,"[menteri, energi, arab, saudi, ,, pangeran, ab...",
1978,,,"[catat, china, ,, konsumen, energi, besar, dun...",
1979,,,"[kunjung, xi, jinping, china, upaya, dalam, hu...",


In [12]:
# Persist the results: the preprocessed variants as parquet and the timing
# table as CSV, both without the row index.
ds.to_parquet(PATH_FILE_PREPROCESSED, index=False)
elapse_time.to_csv(PATH_FILE_ELAPSE_TIME, index=False)