In [1]:
from ast import literal_eval
import json
import os
from typing import List

import pandas as pd
from sklearn.model_selection import train_test_split

## Configs

In [2]:
# Column names used to pull features / labels out of the loaded frames.
X_COLUMN = "text"
Y_COLUMN = "tags"
TOKEN_TAG_COLUMN = "tag_by_word"

# Input CSV files: HAM_CSV is read with a per-token tag column,
# WOHAM_CSV is read without one.
HAM_CSV = "raw/data_v1.csv"
WOHAM_CSV = "raw/data_notagword_v1.csv"

# Split settings: share of samples held out for validation, and the seed
# that makes the split reproducible.
RANDOM_SEED = 42
VAL_FROM_TRAIN_RATIO = 0.1

## Read data

In [3]:
# Both list-valued columns are parsed back into Python lists with
# literal_eval while the CSV is being read.
w_ham_converters = dict.fromkeys(("text", "tag_by_word"), literal_eval)

w_ham_df = pd.read_csv(HAM_CSV, converters=w_ham_converters)
print(w_ham_df.shape[0])  # number of rows
w_ham_df.head()

4141


Unnamed: 0,text,tag_by_word,tags
0,"[ประเทศจีน, ชอบ, ปลอม, เอา, ยาง, รถ, ไป, ทำ, ไ...","[I-Influencer, Fb-Refer, Fb-Refer, Fb-Refer, F...",Fake News
1,"[ประกาศ, กฎอัยการศึก, ปิด, การ, เข้า, ออก, ทุก...","[T-Clickbait, T-Clickbait, , , , , , , , , , ,...",Undefined
2,"[ประโยชน์, มาก, ก็, ใช่, ว่า, จะ, ไม่มี, โทษ, ...","[, , , , , , , , , , M-Convincing, M-Convincin...",Undefined
3,"[มั่นใจ, กระซิบหูหนู, พระพิฆเนศ, ไม่ติด, เชื้อ...","[T-Clickbait, T-Clickbait, T-Clickbait, T-Clic...",Fake News
4,"[พวก, ที่, เชื่อ, ว่า, คน, ไม่, ป่วย, ไม่, ต้อ...","[, , , , , , , , , , , , , , , , , , , , , , ,...",Undefined


In [4]:
# This file is read without a per-token tag column, so only "text"
# needs to be parsed back into a list.
wo_ham_converters = {"text": literal_eval}
wo_ham_df = pd.read_csv(WOHAM_CSV, converters=wo_ham_converters)
print(wo_ham_df.shape[0])  # number of rows

wo_ham_df.head()

2998


Unnamed: 0,text,tags
0,"[พี่, สาว, คน, ข้าง, บ้าน, เป็น, มะเร็ง, ต่อม,...",Fake News
1,"[กิน, แทน, ข้าว, หนุ่ม, เวียดนาม, ปวด, ท้อง, 2...",Fake News
2,"[ผง, ชูรส, นี่, ก็, สาร, พิษ, ก่อ, เกิด, โรค, ...",Fake News
3,"[รู้, ไว้, มี, ประโยชน์, !, , , ไม่, อยาก, เป็...",Fake News
4,"[มะหวด, , , (, หมากหวด, ข่า, ), , , ผลไม้, ป่า...",Fake News


In [5]:
# Combined row count of both frames, before any de-duplication.
print("Total sampling: %d" % sum(map(len, (w_ham_df, wo_ham_df))))

Total sampling: 7139


## Splitting

In [6]:
# Features and document-level labels taken from wo_ham_df; these feed the
# train / validation split.
X = wo_ham_df[X_COLUMN]
Y = wo_ham_df[Y_COLUMN]

In [7]:
# Stratified hold-out split: stratifying on Y keeps the label proportions
# the same in the train and validation parts.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=VAL_FROM_TRAIN_RATIO,
    random_state=RANDOM_SEED,
    stratify=Y,
)
# Sanity check: features and labels must have matching sizes per split.
print(len(x_train), len(y_train))
print(len(x_val), len(y_val))

2698 2698
300 300


In [8]:
# The whole of w_ham_df serves as the test pool: tokens, per-token tags and
# the document-level label.
x_test = w_ham_df[X_COLUMN]
x_tags_test = w_ham_df[TOKEN_TAG_COLUMN]
y_test = w_ham_df[Y_COLUMN]
print(*map(len, (x_test, x_tags_test, y_test)))

4141 4141 4141


### Remove train/test overlap

In [9]:
# Joined token strings of every sample in X (the pool that is split into
# train and validation). A set is used so each membership test below is a
# hash lookup rather than a linear scan over all of these texts.
train_texts = {"".join(x) for x in X}

nondup_x_test = []
nondup_x_tags_test = []
nondup_y_test = []

n_leak = 0
for x, tags, y in zip(x_test, x_tags_test, y_test):
    # A test sample whose joined text also occurs in X would leak into
    # evaluation, so it is counted and dropped.
    if "".join(x) in train_texts:
        n_leak += 1
        continue
    nondup_x_test.append(x)
    nondup_x_tags_test.append(tags)
    nondup_y_test.append(y)

print("# leaks: %d"%n_leak)

# leaks: 1376


In [10]:
# The three de-duplicated test lists must stay aligned in length.
print(*map(len, (nondup_x_test, nondup_x_tags_test, nondup_y_test)))

2765 2765 2765


## Data saving

In [11]:
def to_jsonl(
    _path: str,
    _X: List[List[str]],
    _Y: List[str],
    _X_TAGS: List[List[str]] = None
) -> None:
    """Write samples to ``_path`` in JSON Lines format (one object per line).

    Every record has the keys ``"Text"`` and ``"Document Tag"``; when
    ``_X_TAGS`` is given, a ``"Token Tags"`` key is written between them.

    Args:
        _path: Destination file. It is created, or overwritten if it exists.
        _X: Per-sample text values (token lists in this notebook).
        _Y: Per-sample document-level tags, aligned with ``_X``.
        _X_TAGS: Optional per-sample token tags, aligned with ``_X``.

    Note:
        The inputs are combined with ``zip``, so if their lengths differ the
        surplus items of the longer inputs are silently dropped.
    """
    # Records are built before the file is opened, so a failure while
    # iterating the inputs leaves any existing file untouched.
    if _X_TAGS is not None:
        samples = [
            {"Text": x, "Token Tags": x_tags, "Document Tag": y}
            for x, x_tags, y in zip(_X, _X_TAGS, _Y)
        ]
    else:
        samples = [{"Text": x, "Document Tag": y} for x, y in zip(_X, _Y)]

    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # is pinned to UTF-8 instead of relying on the platform default. The
    # `with` block closes the handle even if serialisation or a write fails.
    with open(_path, "w", encoding="utf-8") as f:
        for sample in samples:
            f.write("%s\n" % json.dumps(sample, ensure_ascii=False))

In [13]:
# Write the three splits; only the test split carries per-token tags.
to_jsonl("train_v1.jsonl", x_train, y_train)
to_jsonl("val_v1.jsonl", x_val, y_val)
to_jsonl(
    "test_v1.jsonl",
    nondup_x_test,
    nondup_y_test,
    _X_TAGS=nondup_x_tags_test,
)