# Create H, A, S

In [1]:
import pandas as pd
# Load the raw corpus; later cells read its 'headline' and 'body' columns.
df = pd.read_csv(filepath_or_buffer='./datasets/raw/tdec.csv')

In [2]:
# Work on a small random subset of the corpus.
# A fixed random_state makes "Restart & Run All" pick the same rows every time,
# and n is capped at len(df) because sample() raises when n exceeds the
# number of available rows (sampling is without replacement).
dfs = df.sample(n=min(200, len(df)), random_state=42)

In [3]:
import stanza

# Indonesian stanza pipeline: tokenisation, POS tagging and lemmatisation.
# NOTE(review): REUSE_RESOURCES presumably avoids re-fetching models that are
# already on disk - confirm against the stanza documentation.
nlp = stanza.Pipeline(
    'id',
    processors='tokenize,pos,lemma',
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-08-17 09:41:10 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-08-17 09:41:10 INFO: Using device: cpu
2023-08-17 09:41:10 INFO: Loading: tokenize
2023-08-17 09:41:10 INFO: Loading: mwt
2023-08-17 09:41:10 INFO: Loading: pos
2023-08-17 09:41:10 INFO: Loading: lemma
2023-08-17 09:41:10 INFO: Done loading processors!


In [4]:
# H: one row per sampled article, holding only the headline.
H = dfs.filter(items=['headline'], axis='columns')
H.columns = ['H']

# A: the article text, i.e. headline and body joined with '. '.
A = (dfs['headline'] + '. ' + dfs['body']).to_frame(name='A')

# S: the same article text, split into sentences by the stanza pipeline and
# flattened into one row per sentence (so S has its own fresh index).
in_S = [stanza.Document([], text=article) for article in A['A']]
out_S = nlp(in_S)
S = pd.DataFrame(
    [sentence.text for document in out_S for sentence in document.sentences],
    columns=['S'],
)

In [5]:
# Bundle the three frames; each key matches the single column name of its frame.
dataset = dict(H=H, A=A, S=S)

In [15]:
import pickle

# Persist the H/A/S bundle for the variant-building notebook.
with open('./datasets/small/HAS.pickle', mode='wb') as out_file:
    pickle.dump(dataset, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Create Variants

In [1]:
import pickle

# Reload the H/A/S bundle written by the previous notebook.
# pickle.load can execute arbitrary code, so only load files this project produced.
with open('./datasets/small/HAS.pickle', mode='rb') as in_file:
    HAS = pickle.load(in_file)

In [2]:
import stanza

# Indonesian stanza pipeline: tokenisation, POS tagging and lemmatisation.
# NOTE(review): REUSE_RESOURCES presumably avoids re-fetching models that are
# already on disk - confirm against the stanza documentation.
nlp = stanza.Pipeline(
    'id',
    processors='tokenize,pos,lemma',
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-08-17 10:02:19 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-08-17 10:02:19 INFO: Using device: cpu
2023-08-17 10:02:19 INFO: Loading: tokenize
2023-08-17 10:02:19 INFO: Loading: mwt
2023-08-17 10:02:19 INFO: Loading: pos
2023-08-17 10:02:20 INFO: Loading: lemma
2023-08-17 10:02:20 INFO: Done loading processors!


In [3]:
from gensim.models.phrases import Phrases

# Universal POS tags kept when word filtering (w=True) is enabled.
allowed_pos = ['NOUN', 'PROPN', 'VERB', 'X']

def preprocess_clw(out_df, c=False, l=False, w=False, n=False):
    """Turn parsed documents into space-joined token strings.

    Parameters
    ----------
    out_df : iterable
        Parsed documents. Each must expose ``.sentences``; each sentence must
        expose ``.words``; each word must expose ``.text``, ``.upos`` and
        ``.lemma``.
    c : bool
        Lowercase every emitted token.
    l : bool
        Emit the word's lemma instead of its surface text. Words whose lemma
        is empty/None fall back to the surface text.
    w : bool
        Drop every word whose ``upos`` is not listed in ``allowed_pos``.
    n : bool
        Merge frequent collocations: a gensim ``Phrases`` model is fitted on
        the token lists (bigrams), a second one on its output (trigrams), and
        the result of applying both is returned.

    Returns
    -------
    list of str
        One string per input document, tokens joined by a single space.

    Notes
    -----
    Lowercasing is applied *after* lemmatisation, so ``c=True, l=True`` yields
    lowercase lemmas. Previously the ``c`` step was skipped whenever ``l`` was
    set, which left cased lemmas and the surface-text fallback (used when a
    lemma is missing) untouched.
    """
    docs_tokens = []  # one token list per document
    for doc in out_df:
        tokens = []
        for sentence in doc.sentences:
            for word in sentence.words:
                if w and word.upos not in allowed_pos:
                    continue
                text = word.text
                if l and word.lemma:
                    text = word.lemma
                if c:
                    # Fold case last so it also covers lemmas and the fallback.
                    text = text.lower()
                tokens.append(text)
        docs_tokens.append(tokens)

    if n:
        # Two stacked phrase models: the second sees bigram-merged tokens and
        # can therefore extend them into longer phrases.
        bigram = Phrases(docs_tokens).freeze()
        trigram = Phrases(bigram[docs_tokens]).freeze()
        return [' '.join(d) for d in trigram[bigram[docs_tokens]]]
    return [' '.join(tokens) for tokens in docs_tokens]

In [4]:
from time import time
from utils import e_variant

def preprocess(HAS):
    """Build every preprocessing variant of the corpora stored in ``HAS``.

    Variant names come from ``e_variant()``. The first character of a name
    selects the corpus (``HAS[key][key]`` is the text column that gets parsed);
    the presence of the letters ``C``, ``L``, ``W`` and ``N`` in the name
    switches on the matching ``c``/``l``/``w``/``n`` option of
    ``preprocess_clw``.

    Each corpus is run through the stanza pipeline only once and the parsed
    documents are reused for all of its variants. The wall-clock time of every
    variant is recorded and written to ``./datasets/small/HAS_pt.pickle``;
    the first variant of a corpus also pays for the parsing step in its timing.

    Returns a dict mapping variant name to the list of preprocessed strings.
    """
    parsed_by_key = {}  # corpus key -> parsed documents
    outputs = {}        # variant name -> preprocessed strings
    durations = {}      # variant name -> elapsed seconds

    for name in e_variant():
        started = time()
        key = name[0]
        if key not in parsed_by_key:
            texts = HAS[key][key]
            documents = list(texts.apply(lambda text: stanza.Document([], text=text)))
            parsed_by_key[key] = nlp(documents)
        # {'c': 'C' in name, 'l': 'L' in name, 'w': 'W' in name, 'n': 'N' in name}
        flags = {letter.lower(): letter in name for letter in 'CLWN'}
        outputs[name] = preprocess_clw(parsed_by_key[key], **flags)
        durations[name] = time() - started

    with open('./datasets/small/HAS_pt.pickle', 'wb') as f:
        pickle.dump(durations, f, pickle.HIGHEST_PROTOCOL)
    return outputs

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [None]:
# Run every preprocessing variant; this also writes the per-variant timings to disk.
HAS_p = preprocess(HAS=HAS)

In [None]:
# Persist the preprocessed variants (variant name -> list of strings).
with open('./datasets/small/HAS_p.pickle', mode='wb') as out_file:
    pickle.dump(HAS_p, out_file, protocol=pickle.HIGHEST_PROTOCOL)