In [None]:
import html
import json
import pathlib
import re

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from fastai.text import *

In [None]:
# Language code of the corpus; every working directory lives under lmdata/<LANG>.
LANG = 'id'
PATH_ROOT = Path(f'lmdata/{LANG}')
# Sub-directories are derived from the root with the path-join operator.
PATH_DATASET = PATH_ROOT / 'dataset' / 'BPPTIndToEngCorpus'
PATH_MODELS = PATH_ROOT / 'models'
PATH_TMP = PATH_ROOT / 'tmp'
# Seed NumPy's global random number generator.
np.random.seed(42)

# Language Model

# Text Classification

In [None]:
# Recursively collect every file under PATH_DATASET whose name matches "*-ID-*.txt".
# The list is sorted because directory traversal order depends on the filesystem;
# a stable file order keeps the row order of the assembled data (and therefore
# the seeded train/validation split) reproducible across machines.
LANG_FILENAMES = sorted(str(f) for f in PATH_DATASET.rglob("*-ID-*.txt"))
print(len(LANG_FILENAMES))
LANG_FILENAMES

In [None]:
# Integer label assigned to each class code that appears in the corpus filenames.
CLASSES = {v:i for i,v in enumerate(['ECO', 'INT', 'SCI', 'SPO'])}
LANG_TEXT = []
for fn in tqdm(LANG_FILENAMES):
    # The class code is the upper-case token immediately before "-ID" in the path.
    found = re.search('.+-([A-Z]+)-ID.+', fn)
    CLASSNAME = found.group(1) if found else None
    if CLASSNAME not in CLASSES:
        # Fail with the offending filename instead of an opaque KeyError.
        raise ValueError(
            f"Cannot derive a known class {sorted(CLASSES)} from filename: {fn}")
    label = CLASSES[CLASSNAME]
    # `with` closes each corpus file as soon as it has been read.
    with open(fn, encoding='utf8') as corpus_file:
        for line in corpus_file:
            # One record per line, with trailing whitespace/newline removed.
            LANG_TEXT.append((label, line.rstrip()))

df = pd.DataFrame.from_records(LANG_TEXT, columns=['label', 'text'])
df.head()

In [None]:
# Persist the complete labelled frame as a CSV with neither a header row nor an index column.
df.to_csv(
    f"{PATH_DATASET}/bppt_panl.csv",
    index=False,
    header=False,
)

In [None]:
# Reload the labelled corpus. The file was written with header=False, so tell
# pandas there is no header row and supply the column names directly; with the
# default header inference the first record would be consumed as column names
# and silently dropped from the data.
df = pd.read_csv(
    f"{PATH_DATASET}/bppt_panl.csv",
    header=None,
    names=['label', 'text'],
)

In [None]:
# Show the first five rows of the frame as a quick sanity check.
df.head(n=5)

In [None]:
# Split the frame into train and validation sets, holding out 10% of the rows;
# the fixed random_state makes the shuffle repeatable.
# train_test_split is imported explicitly in the imports cell: a bare
# `import sklearn` does not load the `sklearn.model_selection` submodule, so
# attribute access on it only works if some other import loaded it first.
trn_texts, val_texts = train_test_split(df, test_size=0.1, random_state=1)

In [None]:
# Peek at the first five rows of the training split.
trn_texts.head(5)

In [None]:
# Write both splits next to the full corpus as CSVs without header row or index column.
# The validation split is stored under the "_test" filename.
trn_texts.to_csv(
    f"{PATH_DATASET}/bppt_panl_train.csv",
    index=False,
    header=False,
)
val_texts.to_csv(
    f"{PATH_DATASET}/bppt_panl_test.csv",
    index=False,
    header=False,
)