In [1]:
# Read from Jigsaw dataset -> Clean -> Binarize -> Remove duplicates -> partitioning -> Export CSV
import os, re, json
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Input datasets are looked up under INPUT_ROOT; every file this script
# produces is written under OUT_DIR (created here if it does not exist yet).
INPUT_ROOT = "/kaggle/input"
WORK_DIR   = "/kaggle/working"
OUT_DIR    = os.path.join(WORK_DIR, "data")
os.makedirs(OUT_DIR, exist_ok=True)

# Automatically locate the Jigsaw dataset directory by its name prefix.
# os.listdir() returns entries in arbitrary order, so the candidates are
# sorted to make the pick deterministic when several directories match.
JIGSAW_DIRS = sorted(
    os.path.join(INPUT_ROOT, d)
    for d in os.listdir(INPUT_ROOT)
    if d.startswith("jigsaw-unintended-bias-in-toxicity-classification")
)
# Raise explicitly rather than assert: assert statements are removed when
# Python runs with -O, which would turn a missing dataset into an IndexError.
if not JIGSAW_DIRS:
    raise FileNotFoundError("The jigsaw dataset was not found.")
JIGSAW_DIR = JIGSAW_DIRS[0]

# Identity columns, grouped by the attribute their names refer to.
# The concatenation order below is the order of IDENTITY_COLS.
_GENDER_COLS = ["male", "female", "transgender", "other_gender"]
_RACE_COLS = ["black", "white", "asian", "latino", "other_race_or_ethnicity"]
_RELIGION_COLS = [
    "christian", "jewish", "muslim", "hindu", "buddhist", "atheist",
    "other_religion",
]
_ORIENTATION_COLS = [
    "heterosexual", "homosexual_gay_or_lesbian", "bisexual",
    "other_sexual_orientation",
]
_DISABILITY_COLS = [
    "physical_disability", "intellectual_or_learning_disability",
    "psychiatric_or_mental_illness", "other_disability",
]
IDENTITY_COLS = [
    *_GENDER_COLS,
    *_RACE_COLS,
    *_RELIGION_COLS,
    *_ORIENTATION_COLS,
    *_DISABILITY_COLS,
]

# Read the original train.csv file from the located dataset directory.
# The whole table is loaded with pandas defaults; the steps below read its
# "id", "comment_text" and "target" columns plus any identity columns present.
train_path = os.path.join(JIGSAW_DIR, "train.csv")
df_raw = pd.read_csv(train_path)

# Text cleaning
URL_RE = re.compile(r"http\S+")
AT_RE  = re.compile(r"@\w+")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_text(s: str) -> str:
    """Normalize one raw comment into a single-line string.

    Values that pandas treats as missing (None, NaN) become "". Any other
    value is converted with str() and then:

    * every URL_RE match is replaced by " URL ",
    * every AT_RE match is replaced by "@USER",
    * each run of whitespace (newlines and tabs included) collapses to one
      space, and leading/trailing whitespace is removed.
    """
    if not pd.notna(s):
        return ""
    text = URL_RE.sub(" URL ", str(s))
    text = AT_RE.sub("@USER", text)
    # \s+ already covers "\n" and "\t", so no separate replace step is needed.
    return _WHITESPACE_RUN_RE.sub(" ", text).strip()

# Only the identity columns that actually exist in the loaded table are used.
use_id_cols = [col for col in IDENTITY_COLS if col in df_raw.columns]

cleaned_text = df_raw["comment_text"].map(clean_text)
# Jigsaw target \in [0,1] (per the original note); binarize at 0.5.
binary_label = df_raw["target"].ge(0.5).astype(int)
# One 0/1 flag per identity column, named g_<column>; missing annotations
# are treated as 0 before thresholding at 0.5.
group_flags = {
    f"g_{col}": df_raw[col].fillna(0).ge(0.5).astype(int)
    for col in use_id_cols
}

df = pd.DataFrame({
    "id": df_raw["id"],
    "text": cleaned_text,
    "label": binary_label,
    **group_flags,
})

# De-duplicate by cleaned text: keep the first row for each distinct text and
# renumber the rows 0..n-1 so the index matches row positions.
df = df.drop_duplicates(subset=["text"], keep="first", ignore_index=True)

# Stratified 8/1/1 split on the binary label, done in two stages:
#   1. hold out 20% of the rows (temp),
#   2. cut that hold-out in half into validation and test.
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# Each splitter yields exactly one (first_part, second_part) pair of
# positional index arrays.
[(train_idx, temp_idx)] = sss1.split(df, df["label"])
temp = df.iloc[temp_idx]
[(val_rel_idx, test_rel_idx)] = sss2.split(temp, temp["label"])

# val_rel_idx / test_rel_idx are positions inside temp; map them back to
# positions in df.
val_idx, test_idx = temp_idx[val_rel_idx], temp_idx[test_rel_idx]

# Export one CSV per partition, keeping only the text and label columns and
# omitting the DataFrame index.
split_files = {
    "jigsaw_train.csv": train_idx,
    "jigsaw_val.csv": val_idx,
    "jigsaw_test.csv": test_idx,
}
for file_name, row_positions in split_files.items():
    partition = df.iloc[row_positions][["text","label"]]
    partition.to_csv(os.path.join(OUT_DIR, file_name), index=False)


# Summarize the partitioning: row count and positive-label rate per split.
label_series = df["label"]
jigsaw_summary = {
    "train_n": int(len(train_idx)),
    "val_n": int(len(val_idx)),
    "test_n": int(len(test_idx)),
    "pos_rate": {
        "train": float(label_series.iloc[train_idx].mean()),
        "val":   float(label_series.iloc[val_idx].mean()),
        "test":  float(label_series.iloc[test_idx].mean()),
    },
}
splits = {"jigsaw": jigsaw_summary}

# Store the summary alongside the CSVs as indented JSON.
protocols_path = os.path.join(OUT_DIR, "protocols.json")
with open(protocols_path, "w") as f:
    f.write(json.dumps(splits, indent=2))

# Report where the files were written.
print("Export complete：", OUT_DIR)
# IPython shell escape (not plain Python syntax): lists the contents of
# OUT_DIR in long, human-readable format. Requires a notebook/IPython session.
!ls -lh {OUT_DIR}


Export complete： /kaggle/working/data
total 510M
-rw-r--r-- 1 root root  51M Nov 17 03:37 jigsaw_test.csv
-rw-r--r-- 1 root root 408M Nov 17 03:37 jigsaw_train.csv
-rw-r--r-- 1 root root  51M Nov 17 03:37 jigsaw_val.csv
-rw-r--r-- 1 root root  216 Nov 17 03:37 protocols.json
