# Create H, A, T

In [1]:
import pandas as pd
df = pd.read_csv('./datasets/raw/tdec.csv')

In [2]:
dfs = df.sample(50, random_state=999).reset_index(drop=True)

In [3]:
import stanza

nlp = stanza.Pipeline(
    lang="id",
    processors="tokenize,pos,lemma",
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-08-20 13:28:34 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-08-20 13:28:34 INFO: Using device: cpu
2023-08-20 13:28:34 INFO: Loading: tokenize
2023-08-20 13:28:34 INFO: Loading: mwt
2023-08-20 13:28:34 INFO: Loading: pos
2023-08-20 13:28:34 INFO: Loading: lemma
2023-08-20 13:28:34 INFO: Done loading processors!


In [4]:
from time import time

_HAS_t = {}

t_start = time()
H = dfs['headline']
_HAS_t['H'] = time() - t_start


In [5]:
A = dfs['headline'] + '. ' + dfs['body']
_HAS_t['A'] = time() - _HAS_t['H']


In [6]:
S_in = list(A.copy().apply(lambda doc: stanza.Document([], text=doc)))
S_out = nlp(S_in)
S = pd.Series([s.text for doc in S_out for s in doc.sentences])
_HAS_t['S'] = time() - _HAS_t['A']

In [7]:
from utils import e_variant

HAS_t = pd.DataFrame(pd.Series(e_variant()), columns=['variant'])
r = []
for v in HAS_t['variant']:
    r.append(_HAS_t[v[0]])
HAS_t['segmentation'] = pd.Series(r)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [8]:
HAS = pd.DataFrame({'H': H, 'A': A, 'S': S})


In [9]:
HAS_t

Unnamed: 0,variant,segmentation
0,HWN,0.0
1,HCLWN,0.0
2,AWN,1692513000.0
3,ACLWN,1692513000.0
4,SWN,26.37917
5,SCLWN,26.37917


In [10]:
HAS

Unnamed: 0,H,A,S
0,Akhir Kisah Misterius Kematian Satu Keluarga d...,Akhir Kisah Misterius Kematian Satu Keluarga d...,Akhir Kisah Misterius Kematian Satu Keluarga d...
1,Peneliti BRIN Sebut Pengosongan dari Sesar Gem...,Peneliti BRIN Sebut Pengosongan dari Sesar Gem...,Direktur Reserse Kriminal Umum Polda Metro Jay...
2,"Jadi Idola di Piala Dunia 2022, Cho Gue-sung H...","Jadi Idola di Piala Dunia 2022, Cho Gue-sung H...",Proses penyelidikan yang telah berjalan selama...
3,"Menjelang Natal dan Tahun Baru, Pasar Jaya Ada...","Menjelang Natal dan Tahun Baru, Pasar Jaya Ada...",Polisi juga tidak menemukan minimal dua barang...
4,Fitur Komunikasi Darurat via Satelit iPhone 14...,Fitur Komunikasi Darurat via Satelit iPhone 14...,Motif bunuh diri atau pembunuhan juga tidak ad...
...,...,...,...
1031,,,Buffett menyarankan agar pola pikir investor s...
1032,,,Ia meminta para investor untuk tetap yakin mod...
1033,,,Poin tersebut seiring dengan kunci yang selalu...
1034,,,"Sebab, dia selalu mendorong kepercayaan diri."


In [11]:
HAS_t.to_parquet('./results/HAS_t.parquet')
HAS.to_parquet('./results/HAS.parquet')

# Create Variants

In [12]:
import pandas as pd

HAS = pd.read_parquet('./results/HAS.parquet')
HAS_t = pd.read_parquet('./results/HAS_t.parquet')

In [13]:
import stanza

nlp = stanza.Pipeline(
    lang="id",
    processors="tokenize,pos,lemma",
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

2023-08-20 13:29:12 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-08-20 13:29:12 INFO: Using device: cpu
2023-08-20 13:29:12 INFO: Loading: tokenize
2023-08-20 13:29:12 INFO: Loading: mwt
2023-08-20 13:29:12 INFO: Loading: pos
2023-08-20 13:29:12 INFO: Loading: lemma
2023-08-20 13:29:12 INFO: Done loading processors!


In [14]:
from gensim.models.phrases import Phrases

allowed_pos = ['NOUN', 'PROPN', 'VERB', 'X']

def preprocess_clw(out_df, c=False, l=False, w=False, n=False):
    r = [] # result
    for d in out_df:
        tokens = []
        for s in d.sentences:
            for token in s.words:
                _token = token.text
                if w:
                    if token.upos not in allowed_pos:
                        continue
                if l:
                    _token = token.lemma if token.lemma else _token
                if c:
                    _token = _token if l else token.text.lower()
                tokens.append(_token)
        r.append(tokens)
    if n:
        bigram = Phrases(r).freeze()
        trigram = Phrases(bigram[r]).freeze()
        r = trigram[bigram[r]]
    return r

In [15]:
from time import time
from utils import e_variant


def preprocess():
    t_base = {}
    nlp_datasets = {}
    for v in HAS:
        t_start = time()
        df = HAS[v].dropna()
        in_doc = list(df.apply(lambda x: stanza.Document([], text=x)))
        out_doc = nlp(in_doc)
        nlp_datasets[v] = out_doc
        t_base[v] = time() - t_start
    r = {} # result
    r_time = []
    variant = e_variant()
    for v in variant:
        t_preprocess = time()
        process_C = 'C' in v
        process_L = 'L' in v
        process_W = 'W' in v
        process_N = 'N' in v
        r[v] = pd.Series(preprocess_clw(
            nlp_datasets[v[0]],
            c=process_C,
            l=process_L,
            w=process_W,
            n=process_N
        ))
        r_time.append(time() - t_preprocess + t_base[v[0]])
    HAS_t['preprocess'] = pd.Series(r_time)
    return pd.DataFrame(r)

In [16]:
HAS_p = preprocess()

In [17]:
HAS_t

Unnamed: 0,variant,segmentation,preprocess
0,HWN,0.0,1.334094
1,HCLWN,0.0,1.333092
2,AWN,1692513000.0,33.333024
3,ACLWN,1692513000.0,33.355028
4,SWN,26.37917,31.43803
5,SCLWN,26.37917,31.439039


In [18]:
HAS_p

Unnamed: 0,HWN,HCLWN,AWN,ACLWN,SWN,SCLWN
0,"[Akhir, Kisah, Misterius, Kematian, Keluarga, ...","[akhir, kisah, misterius, mati, keluarga, kali...","[Akhir, Kisah, Misterius, Kematian, Keluarga, ...","[akhir, kisah, misterius, mati, keluarga, kali...","[Akhir, Kisah, Misterius, Kematian, Keluarga, ...","[akhir, kisah, misterius, mati, keluarga, kali..."
1,"[Peneliti, BRIN, Sebut, Pengosongan, Sesar, Ge...","[teliti, brin, sebut, penosongan, sesar, gempa...","[Peneliti, BRIN, Sebut, Pengosongan, Sesar, Ge...","[teliti, brin, sebut, penosongan, sesar, gempa...","[Direktur, Reserse, Kriminal, Umum, Polda, Met...","[direktur, reserse, kriminal, umum, polda, met..."
2,"[Idola, Piala, Dunia, Cho, Gue-sung, Hadapi, R...","[idola, piala, dunia, cho, gue, hadapi, rumor,...","[Idola, Piala_Dunia, Cho_Gue-sung, Hadapi, Rum...","[idola, piala_dunia, cho_gue, hadapi, rumor, k...","[Proses, penyelidikan, berjalan, bulan, dihent...","[proses, selidi, jalan, bulan, henti]"
3,"[Natal, Tahun, Baru, Pasar, Jaya, Adakan, Pasa...","[natal, tahun, baru, pasar, jaya, ada, pasar, ...","[Natal, Tahun_Baru, Pasar, Jaya, Adakan, Pasar...","[natal, tahun_baru, pasar, jaya, ada, pasar, m...","[Polisi, menemukan, barang, bukti, merujuk, te...","[polisi, temu, barang, bukti, rujuk, sangka]"
4,"[Fitur, Komunikasi, Darurat, via, Satelit, iPh...","[fitur, komunikasi, darurat, via, satelit, iph...","[Fitur, Komunikasi, Darurat, via, Satelit, iPh...","[fitur, komunikasi, darurat, via, satelit, iph...","[Motif, bunuh, pembunuhan, ada, kasus, kematia...","[motif, bunuh, bunuh, ada, kasus, mati, keluar..."
...,...,...,...,...,...,...
1031,,,,,"[Buffett, menyarankan, pola, pikir, investor, ...","[buffett, saran, pola, pikir, investor, tetap,..."
1032,,,,,"[meminta, investor, ditanamkan, membuahkan, ha...","[pinta, investor, tanam, buah, hasil]"
1033,,,,,"[Poin, kunci, ditanamkan, Buffett]","[poin, kunci, tanam, buffett]"
1034,,,,,"[mendorong, kepercayaan]","[dorong, percaya]"


In [19]:
HAS_p.to_parquet('./results/HAS_p.parquet')
HAS_t.to_parquet('./results/HAS_t.parquet')