In [3]:
# Output directories for the preprocessed corpora: index 0 is the LDA directory
# (the one preprocess_phrases scans), index 1 is the BERTopic directory.
PATH_DS = ['./results/ds/lda', './results/ds/bertopic']
SAMPLE_SIZE = None # an int to subsample that many articles, or None to use the full dataset

In [4]:
import pandas as pd

# Load the article corpus; later cells read its 'article' column.
df = pd.read_csv('./datasets/articles.csv')

In [5]:
# Work on a reproducible random subset (fixed seed, fresh 0..n-1 index) when
# SAMPLE_SIZE is truthy; otherwise use the full frame as loaded.
dfs = (
    df.sample(SAMPLE_SIZE, random_state=999).reset_index(drop=True)
    if SAMPLE_SIZE
    else df
)

In [6]:
import stanza

# Indonesian Stanza pipeline (tokenisation, POS tags, lemmas) running on CPU.
# preprocess_text calls this object on every article.
nlp = stanza.Pipeline(
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
    use_gpu=False,
    processors='tokenize,pos,lemma',
    lang='id',
)

2023-09-16 10:51:18 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-09-16 10:51:18 INFO: Using device: cpu
2023-09-16 10:51:18 INFO: Loading: tokenize
2023-09-16 10:51:18 INFO: Loading: mwt
2023-09-16 10:51:18 INFO: Loading: pos
2023-09-16 10:51:18 INFO: Loading: lemma
2023-09-16 10:51:18 INFO: Done loading processors!


In [7]:
from typing import Iterable

Variant = str
Variants = list[Variant]
Document = any
Sentence = any
SentenceToken = list[str]
LDAToken = list[SentenceToken]
BERTopicToken = list[str]
DocumentToken = LDAToken | BERTopicToken

In [8]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import \
    StopWordRemoverFactory

# Bound Sastrawi helpers: `stem` is the stemmer's `stem` method and `remove`
# is the stop-word remover's `remove` method.
stem = StemmerFactory().create_stemmer().stem
remove = StopWordRemoverFactory().create_stop_word_remover().remove
# Single-letter variant flags. As used in this notebook:
#   B = keep the whole sentence text, T = keep raw word text,
#   L = use the lemma, S = use the stem, N = keep only tags listed in `aupos`,
#   W = drop stop words and punctuation,
#   P = marks the files that preprocess_phrases rewrites.
# NOTE(review): G is not referenced anywhere in the visible code - confirm its meaning.
indicators = 'TBLSNWPG'
# Universal POS tags that survive the N filter.
aupos = ['NOUN', 'PROPN']

def preprocess_sent(sent: Sentence, v: Variant) -> SentenceToken:
    """Turn one parsed sentence into a token list according to the flags in `v`.

    Flags are the characters of `indicators` that occur in `v`:

    * ``B`` - return the untouched sentence text as a single-element list.
    * ``T`` - return every word's original text, with no further processing.
    * ``N`` - skip words whose universal POS tag is not in `aupos`.
    * ``S`` - replace the lower-cased word by its stem when the stem is non-empty.
    * ``L`` - replace the word by its lemma when one is available
      (applied after ``S``, so the lemma wins when both are set).
    * ``W`` - skip words that the stop-word remover reduces to an empty
      string, and skip punctuation.

    Args:
        sent: parsed sentence exposing `.text` and `.words`
            (each word exposing `.text`, `.upos`, `.lemma`).
        v: variant string holding the flag letters.

    Returns:
        The list of tokens kept for this sentence (possibly empty).
    """
    active = {flag for flag in indicators if flag in v}

    # Whole-sentence and raw-token modes bypass the per-word pipeline.
    if 'B' in active:
        return [sent.text]
    if 'T' in active:
        return [word.text for word in sent.words]

    nouns_only = 'N' in active
    use_stem = 'S' in active
    use_lemma = 'L' in active
    drop_noise = 'W' in active

    kept: SentenceToken = []
    for word in sent.words:
        term = word.text.lower()
        if nouns_only and word.upos not in aupos:
            continue
        if use_stem:
            stemmed = stem(term)
            if stemmed:
                term = stemmed
        if use_lemma and word.lemma:
            term = word.lemma
        # A stop word is reduced to '' by `remove`, which is falsy.
        if drop_noise and (not remove(term) or word.upos == 'PUNCT'):
            continue
        kept.append(term)
    return kept

# doc = nlp(dfs['article'][0])
# v = 'LN'
# preprocess_sent(doc.sentences[0], v)

In [9]:
from itertools import chain

def preprocess_doc(doc: Document, v: Variant, join=False, flat=False) -> DocumentToken:
    """Preprocess every sentence of a parsed document and shape the result.

    Sentences that yield no tokens are dropped. The return shape depends on
    the two switches:

    * neither: list of token lists, one per sentence;
    * ``join``: list of strings, each sentence's tokens joined by a space;
    * ``flat``: single list with all tokens of all sentences;
    * both: one string with every token joined by a space.

    Args:
        doc: parsed document exposing `.sentences`.
        v: variant string forwarded to `preprocess_sent`.
        join: join the tokens of each sentence into one string.
        flat: collapse the sentence level into a single sequence.
    """
    per_sentence = [
        sent_tokens
        for sent_tokens in (preprocess_sent(s, v) for s in doc.sentences)
        if len(sent_tokens) != 0
    ]

    if join:
        joined = [' '.join(sent_tokens) for sent_tokens in per_sentence]
        return ' '.join(joined) if flat else joined
    if flat:
        return list(chain.from_iterable(per_sentence))
    return per_sentence

# doc = nlp(dfs['article'][0])
# v = 'T'
# preprocess_doc(doc, v)
# preprocess_doc(doc, v, join=True)
# preprocess_doc(doc, v, flat=True)
# preprocess_doc(doc, v, join=True, flat=True)

In [10]:
import os
import shutil
import jsonlines as jl

def preprocess_text(text: str, vs: Variants) -> None:
    """Parse one text and append its preprocessed forms to the variant files.

    Every variant in `vs` is combined with every directory in `PATH_DS` and
    with a granularity prefix: 's' or 'D'. A prefixed name containing 'D' is
    written as one flattened record per text; otherwise one record is written
    per sentence.

    Routing rules (matched against the directory path):

    * paths containing 'lda' skip names containing 'B';
    * paths containing 'bertopic' accept only names that contain 'B' and
      do not contain 'T', and receive space-joined strings instead of token lists.

    Records are appended to ``<directory>/<prefixed name>.jsonl``.

    Args:
        text: raw text to parse with the `nlp` pipeline.
        vs: variant strings to generate.
    """
    doc = nlp(text)
    for base in vs:
        for out_dir in PATH_DS:
            for prefix in ('s', 'D'):
                name = f'{prefix}{base}'
                to_lda = 'lda' in out_dir
                to_bertopic = 'bertopic' in out_dir
                whole_sentences = 'B' in name

                if to_lda and whole_sentences:
                    continue
                if to_bertopic and (not whole_sentences or 'T' in name):
                    continue

                flat = 'D' in name
                tokens = preprocess_doc(doc, name, join=to_bertopic, flat=flat)
                # A flattened result is a single record; otherwise each
                # sentence-level item becomes its own line.
                records = [tokens] if flat else tokens
                with jl.open(f'{out_dir}/{name}.jsonl', mode='a') as writer:
                    for record in records:
                        writer.write(record)

# variants = ['B', 'T', 'LN', 'SW']
# variants = ['LN', 'SW']
# preprocess_text(dfs['article'][0], variants)

In [11]:
from typing import Iterable
from tqdm.auto import tqdm

def preprocess(texts: Iterable, vs: Variants) -> None:
    """Rebuild the output directories from scratch and preprocess every text.

    All directories listed in `PATH_DS` are emptied first, because
    `preprocess_text` appends to its output files and stale content would
    otherwise be duplicated. A directory that does not exist yet (e.g. on a
    first run) is simply created instead of raising `FileNotFoundError`.

    Args:
        texts: iterable of raw text strings.
        vs: variant strings forwarded to `preprocess_text`.
    """
    # Remove everything first, then recreate, so the order matches a clean rebuild.
    for out_dir in PATH_DS:
        if os.path.isdir(out_dir):
            shutil.rmtree(out_dir)
    for out_dir in PATH_DS:
        os.makedirs(out_dir, exist_ok=True)

    for t in tqdm(texts):
        preprocess_text(t, vs)

# preprocess(dfs['article'], ['B', 'T', 'LN', 'SW'])

In [12]:
import os

from gensim.models.phrases import Phrases


def preprocess_phrases():
    """Rewrite the 'P' files of the first `PATH_DS` directory with merged phrases.

    For each file in `PATH_DS[0]` whose name contains 'P', all records are
    loaded, a bigram model and then a trigram model (trained on the
    bigram-transformed records) are fitted, and the file is replaced by the
    records transformed through both models. Files without 'P' in their name
    are left untouched.
    """
    for filename in tqdm(os.listdir(PATH_DS[0])):
        if 'P' in filename:
            target = f'{PATH_DS[0]}/{filename}'
            with jl.open(target) as reader:
                corpus = list(reader)

            bigram_model = Phrases(corpus).freeze()
            trigram_model = Phrases(bigram_model[corpus]).freeze()
            merged = trigram_model[bigram_model[corpus]]

            # The records are already in memory, so the file can be dropped
            # and written again under the same path.
            os.remove(target)
            with jl.open(target, mode='a') as writer:
                for record in merged:
                    writer.write(record)

In [13]:
# Run the lemma-only variant over the selected articles; preprocess() clears
# both output directories before writing.
preprocess(dfs['article'], ['L'])

  0%|          | 0/7836 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Merge frequent phrases in the LDA files whose names contain 'P'.
# NOTE(review): the preprocess() call above only requests the 'L' variant, which
# produces no file name containing 'P' - confirm whether a 'P' variant was intended.
preprocess_phrases()

  0%|          | 0/6 [00:00<?, ?it/s]