In [1]:
from ast import literal_eval
import json
import os
from typing import List, Optional

import pandas as pd
from sklearn.model_selection import train_test_split

## Configs

In [2]:
# --- Input files ---
# HAM_CSV is loaded into `w_ham_df` (it carries the per-token `tag_by_word` column);
# WOHAM_CSV is loaded into `wo_ham_df` (title/detail columns, no per-token tags).
HAM_CSV = "raw/data_v1.csv"
WOHAM_CSV = "raw/data_notagword_v1.csv"

# --- Column names ---
X_COLUMN = "text"                  # tokenized input text
Y_COLUMN = "tags"                  # document-level label
TOKEN_TAG_COLUMN = "tag_by_word"   # per-token labels

# --- Split settings ---
RANDOM_SEED = 42               # passed to train_test_split as random_state
VAL_FROM_TRAIN_RATIO = 0.10    # passed to train_test_split as test_size

## Read data

In [3]:
# Both list-valued columns are parsed back into Python objects with literal_eval
# while the CSV is being read.
w_ham_converters = dict.fromkeys(("text", "tag_by_word"), literal_eval)

w_ham_df = pd.read_csv(HAM_CSV, converters=w_ham_converters)
print(len(w_ham_df))
w_ham_df.head()

4141


Unnamed: 0,text,tag_by_word,tags
0,"[‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏à‡∏µ‡∏ô, ‡∏ä‡∏≠‡∏ö, ‡∏õ‡∏•‡∏≠‡∏°, ‡πÄ‡∏≠‡∏≤, ‡∏¢‡∏≤‡∏á, ‡∏£‡∏ñ, ‡πÑ‡∏õ, ‡∏ó‡∏≥, ‡πÑ...","[I-Influencer, Fb-Refer, Fb-Refer, Fb-Refer, F...",Fake News
1,"[‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®, ‡∏Å‡∏é, ‡∏≠‡∏±‡∏¢‡∏Å‡∏≤‡∏£‡∏®‡∏∂‡∏Å, ‡∏õ‡∏¥‡∏î, ‡∏Å‡∏≤‡∏£, ‡πÄ‡∏Ç‡πâ‡∏≤, ‡∏≠‡∏≠‡∏Å, ‡∏ó...","[T-Clickbait, T-Clickbait, T-Clickbait, , , , ...",Undefine
2,"[‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏ä‡∏ô‡πå, ‡∏°‡∏≤‡∏Å, ‡∏Å‡πá, ‡πÉ‡∏ä‡πà, ‡∏ß‡πà‡∏≤, ‡∏à‡∏∞, ‡πÑ‡∏°‡πà, ‡∏°‡∏µ, ‡πÇ‡∏ó‡∏©...","[, , , , , , , , , , , M-Convincing, M-Convinc...",Undefine
3,"[‡∏°‡∏±‡πà‡∏ô‡πÉ‡∏à, ‡∏Å‡∏£‡∏∞‡∏ã‡∏¥‡∏ö, ‡∏´‡∏π, ‡∏´‡∏ô‡∏π, ‡∏û‡∏£‡∏∞‡∏û‡∏¥‡∏Ü‡πÄ‡∏ô‡∏®, ‡πÑ‡∏°‡πà, ‡∏ï‡∏¥‡∏î,...","[T-Clickbait, T-Clickbait, T-Clickbait, T-Clic...",Fake News
4,"[‡∏û‡∏ß‡∏Å, ‡∏ó‡∏µ‡πà, ‡πÄ‡∏ä‡∏∑‡πà‡∏≠, ‡∏ß‡πà‡∏≤, ‡∏Ñ‡∏ô, ‡πÑ‡∏°‡πà, ‡∏õ‡πà‡∏ß‡∏¢, ‡πÑ‡∏°‡πà, ‡∏ï‡πâ‡∏≠...","[, , , , , , , , , , , , , , , , , , , , , , ,...",Undefine


In [4]:
# `title` and `detail` are parsed with literal_eval; an empty cell becomes an
# empty list instead of being handed to literal_eval.
wo_ham_converters = {
    column: (lambda raw: literal_eval(raw) if raw else [])
    for column in ("title", "detail")
}
wo_ham_df = pd.read_csv(WOHAM_CSV, converters=wo_ham_converters)
print(len(wo_ham_df))


def get_text(_title: List[str], _detail: List[str]) -> List[str]:
    """Merge a tokenized title and a tokenized detail into one token list.

    Args:
        _title: Title tokens; may be empty.
        _detail: Detail tokens; may be empty.

    Returns:
        The non-empty side alone when exactly one of the two has tokens.
        Otherwise the title tokens, three '|' separator tokens, then the
        detail tokens (so two empty inputs yield just the three separators).
    """
    if _detail and not _title:
        return _detail
    if _title and not _detail:
        # Fixed: this branch used to return the undefined name `title`, which
        # raises NameError (or silently picks up a stale global of that name).
        return _title
    return [*_title, '|', '|', '|', *_detail]

# Build the combined token list for every row from its title and detail.
wo_ham_df["text"] = wo_ham_df.apply(
    lambda row: get_text(row["title"], row["detail"]),
    axis=1,
)
wo_ham_df.head()

3050


Unnamed: 0,title,detail,tags,text
0,[],"[‡∏û‡∏µ‡πà, ‡∏™‡∏≤‡∏ß, ‡∏Ñ‡∏ô, ‡∏Ç‡πâ‡∏≤‡∏á, ‡∏ö‡πâ‡∏≤‡∏ô, ‡πÄ‡∏õ‡πá‡∏ô, ‡∏°‡∏∞‡πÄ‡∏£‡πá‡∏á, ‡∏ï‡πà‡∏≠‡∏°,...",Fake News,"[‡∏û‡∏µ‡πà, ‡∏™‡∏≤‡∏ß, ‡∏Ñ‡∏ô, ‡∏Ç‡πâ‡∏≤‡∏á, ‡∏ö‡πâ‡∏≤‡∏ô, ‡πÄ‡∏õ‡πá‡∏ô, ‡∏°‡∏∞‡πÄ‡∏£‡πá‡∏á, ‡∏ï‡πà‡∏≠‡∏°,..."
1,[],"[‡∏Å‡∏¥‡∏ô, ‡πÅ‡∏ó‡∏ô, ‡∏Ç‡πâ‡∏≤‡∏ß, ‡∏´‡∏ô‡∏∏‡πà‡∏°, ‡πÄ‡∏ß‡∏µ‡∏¢‡∏î‡∏ô‡∏≤‡∏°, ‡∏õ‡∏ß‡∏î, ‡∏ó‡πâ‡∏≠‡∏á, 2...",Fake News,"[‡∏Å‡∏¥‡∏ô, ‡πÅ‡∏ó‡∏ô, ‡∏Ç‡πâ‡∏≤‡∏ß, ‡∏´‡∏ô‡∏∏‡πà‡∏°, ‡πÄ‡∏ß‡∏µ‡∏¢‡∏î‡∏ô‡∏≤‡∏°, ‡∏õ‡∏ß‡∏î, ‡∏ó‡πâ‡∏≠‡∏á, 2..."
2,[],"["", ‡∏ï‡πâ‡∏ô, ‡∏õ‡πà‡∏≤‡∏ä‡πâ‡∏≤, ‡πÄ‡∏´‡∏á‡∏≤, "", ‡∏™‡∏°‡∏∏‡∏ô‡πÑ‡∏û‡∏£, ‡πÑ‡∏ó‡∏¢, ‡∏î‡∏±‡∏á, ‡πÑ...",Fake News,"["", ‡∏ï‡πâ‡∏ô, ‡∏õ‡πà‡∏≤‡∏ä‡πâ‡∏≤, ‡πÄ‡∏´‡∏á‡∏≤, "", ‡∏™‡∏°‡∏∏‡∏ô‡πÑ‡∏û‡∏£, ‡πÑ‡∏ó‡∏¢, ‡∏î‡∏±‡∏á, ‡πÑ..."
3,[],"[‡∏ú‡∏á, ‡∏ä‡∏π@@, ‡∏£‡∏™, ‡∏ô‡∏µ‡πà, ‡∏Å‡πá, ‡∏™‡∏≤‡∏£, ‡∏û‡∏¥‡∏©, ‡∏Å‡πà‡∏≠, ‡πÄ‡∏Å‡∏¥‡∏î, ‡πÇ...",Fake News,"[‡∏ú‡∏á, ‡∏ä‡∏π@@, ‡∏£‡∏™, ‡∏ô‡∏µ‡πà, ‡∏Å‡πá, ‡∏™‡∏≤‡∏£, ‡∏û‡∏¥‡∏©, ‡∏Å‡πà‡∏≠, ‡πÄ‡∏Å‡∏¥‡∏î, ‡πÇ..."
4,"[‡∏£‡∏π‡πâ, ‡πÑ‡∏ß‡πâ, ‡∏°‡∏µ, ‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏ä‡∏ô‡πå, !, , , ‡πÑ‡∏°‡πà, ‡∏≠‡∏¢‡∏≤‡∏Å, ‡πÄ‡∏õ‡πá...","[‡πÑ‡∏°‡πà, ‡∏≠‡∏¢‡∏≤‡∏Å, ‡πÄ‡∏õ‡πá‡∏ô, ‡∏ô‡∏¥‡πà‡∏ß, , , ‡∏ï‡πâ‡∏≠‡∏á, ‡∏´‡∏•‡∏µ‡∏Å‡πÄ‡∏•‡∏µ‡πà‡∏¢‡∏á, ...",Fake News,"[‡∏£‡∏π‡πâ, ‡πÑ‡∏ß‡πâ, ‡∏°‡∏µ, ‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏ä‡∏ô‡πå, !, , , ‡πÑ‡∏°‡πà, ‡∏≠‡∏¢‡∏≤‡∏Å, ‡πÄ‡∏õ‡πá..."


In [5]:
# Combined number of rows across both source frames.
print("Total sampling: %d" % sum(map(len, (w_ham_df, wo_ham_df))))

Total sampling: 7191


## Splitting

In [6]:
# Inputs and document-level labels used for the train/validation split.
X = wo_ham_df[X_COLUMN]
Y = wo_ham_df[Y_COLUMN]

In [7]:
# Stratified hold-out: the validation set keeps the label proportions of Y.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=VAL_FROM_TRAIN_RATIO,
    random_state=RANDOM_SEED,
    stratify=Y,
)
# Sanity check: inputs and labels must have matching sizes in each split.
print(*map(len, (x_train, y_train)))
print(*map(len, (x_val, y_val)))

2745 2745
305 305


In [8]:
# The test set comes entirely from the frame that has per-token tags.
x_test = w_ham_df[X_COLUMN]
x_tags_test = w_ham_df[TOKEN_TAG_COLUMN]
y_test = w_ham_df[Y_COLUMN]
print(*map(len, (x_test, x_tags_test, y_test)))

4141 4141 4141


### Remove train/test overlap

In [9]:
# Joined text of every sample in X (the pool that was split into train and
# validation). Stored as a set so each membership check below is a hash lookup
# instead of a linear scan over a list.
train_texts = {"".join(tokens) for tokens in X}

nondup_x_test = []
nondup_x_tags_test = []
nondup_y_test = []

n_leak = 0
for x, tags, y in zip(x_test, x_tags_test, y_test):
    joined = "".join(x)  # compare on the concatenated tokens, joined once
    if joined in train_texts:
        # This test sample also appears in X: report it and drop it.
        n_leak += 1
        print(n_leak, joined)
        continue
    nondup_x_test.append(x)
    nondup_x_tags_test.append(tags)
    nondup_y_test.append(y)

1 ‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏à‡∏µ‡∏ô‡∏ä‡∏≠‡∏ö‡∏õ‡∏•‡∏≠‡∏°‡πÄ‡∏≠‡∏≤‡∏¢‡∏≤‡∏á‡∏£‡∏ñ‡πÑ‡∏õ‡∏ó‡∏≥‡πÑ‡∏Ç‡πà‡∏°‡∏∏‡∏Å‡∏Ñ‡πà‡∏∞
2 ‡∏°‡∏±‡∏ô‡∏Ñ‡∏∑‡∏≠‡∏ô‡πâ‡∏≥‡∏®‡∏±‡∏Å‡∏î‡∏¥‡πå‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡πå‡πÑ‡∏ß‡πâ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏Ñ‡∏ô‡∏õ‡πà‡∏ß‡∏¢‡πÄ‡∏õ‡πá‡∏ô‡∏°‡∏∞‡πÄ‡∏£‡πá‡∏á
3 ‡∏û‡∏ß‡∏Å‡∏Ñ‡∏∏‡∏ì‡πÄ‡∏Ç‡πâ‡∏≤‡πÉ‡∏à‡∏ú‡∏¥‡∏î‡∏Å‡∏±‡∏ô‡πÑ‡∏õ‡πÉ‡∏´‡∏ç‡πà‡πÅ‡∏•‡πâ‡∏ß‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏û‡∏ß‡∏Å‡∏Ñ‡∏∏‡∏ì‡πÄ‡∏´‡πá‡∏ô‡∏°‡∏±‡∏ô‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡∏´‡∏ô‡πâ‡∏≤‡∏Å‡∏≤‡∏Å‡∏≠‡∏ô‡∏≤‡∏°‡∏±‡∏¢‡πÅ‡∏ï‡πà‡∏°‡∏±‡∏ô‡∏Ñ‡∏∑‡∏≠‡πÅ‡∏õ‡πâ‡∏á..
4 ‡∏ô‡πà‡∏≤‡∏Å‡∏•‡∏±‡∏ß‡∏°‡∏≤‡∏Å‡πÑ‡∏Ç‡πà‡∏°‡∏∏‡∏Å‡∏ó‡∏µ‡πà‡∏°‡∏µ‡πÉ‡∏ô‡∏ô‡πâ‡∏≥‡∏ñ‡πâ‡∏ß‡∏¢‡∏°‡∏±‡∏ô‡∏¢‡πà‡∏≠‡∏¢‡∏¢‡∏≤‡∏Å‡∏ö‡∏≤‡∏á‡∏Ñ‡∏ô‡πÑ‡∏°‡πà‡∏¢‡πà‡∏≠‡∏¢‡πÄ‡∏•‡∏¢‡∏Ñ‡∏≤‡πÉ‡∏ô‡∏ó‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡∏ú‡πà‡∏≤‡∏ï‡∏±‡∏î‡∏≠‡∏≠‡∏Å‡∏°‡∏≤‡∏£‡∏∞‡∏ß‡∏±‡∏á‡∏î‡πâ‡∏ß‡∏¢
5 ‡∏ô‡∏±‡πà‡∏á‡∏ü‡∏±‡∏á‡πÄ‡πÄ‡∏°‡πà‡∏û‡∏π‡∏î‡∏ß‡πà‡∏≤‡πÉ‡∏´‡πâ‡∏Å‡∏¥‡∏ô‡∏ô‡πâ‡∏≥‡∏≠‡∏∏‡πà‡∏ô‡∏ú‡∏™‡∏°‡∏°‡∏∞‡∏ô‡∏≤‡∏ß‡∏ä‡πà‡∏ß‡∏¢‡∏ï‡πâ‡∏≤‡∏ô‡πÇ‡∏Ñ‡∏ß‡∏¥‡∏î‡∏≠‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏à‡∏£‡∏¥‡∏á‡∏°‡∏±‡πâ‡∏¢?!
6 ‡πÄ‡∏ö‡∏µ‡∏¢‡∏£‡πå1

In [10]:
# The three deduplicated test lists must stay aligned in length.
print(*map(len, (nondup_x_test, nondup_x_tags_test, nondup_y_test)))

4012 4012 4012


## Data saving

In [11]:
def to_jsonl(
    _path: str,
    _X: List[str],
    _Y: List[str],
    _X_TAGS: Optional[List[str]] = None
) -> None:
    """Write samples to ``_path`` as JSON Lines (one JSON object per line).

    Each line holds the keys "Text" and "Document Tag"; when ``_X_TAGS`` is
    given, a "Token Tags" key is written between them. The inputs are zipped,
    so output stops at the shortest of the provided sequences.

    Args:
        _path: Destination file; overwritten if it already exists.
        _X: Per-sample text values.
        _Y: Per-sample document-level labels.
        _X_TAGS: Optional per-sample token tags.
    """
    # The context manager closes the file even if serialization fails midway.
    # UTF-8 is set explicitly because ensure_ascii=False emits raw non-ASCII
    # characters, which would otherwise depend on the platform default encoding.
    with open(_path, 'w', encoding="utf-8") as f:
        if _X_TAGS is not None:
            for x, x_tags, y in zip(_X, _X_TAGS, _Y):
                sample = {"Text": x, "Token Tags": x_tags, "Document Tag": y}
                f.write("%s\n" % json.dumps(sample, ensure_ascii=False))
        else:
            for x, y in zip(_X, _Y):
                sample = {"Text": x, "Document Tag": y}
                f.write("%s\n" % json.dumps(sample, ensure_ascii=False))

In [12]:
# Training split: text and document label.
to_jsonl(
    "train_v1.jsonl",
    x_train, y_train
)

# Validation split. Fixed: the labels argument used to be `x_val`, which wrote
# the token lists into "Document Tag" instead of the real labels.
to_jsonl(
    "val_v1.jsonl",
    x_val, y_val
)

# Test split with overlapping samples removed, including per-token tags.
to_jsonl(
    "test_v1.jsonl",
    nondup_x_test, nondup_y_test, nondup_x_tags_test
)