CS483 .......

In [1]:
# Step 1: Imports & Paths
import os, re, json, textwrap, yaml, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, log_loss

INPUT_ROOT = "/kaggle/input"

# Locate the Jigsaw dataset directory under INPUT_ROOT.
# os.listdir() returns entries in arbitrary order, so the matches are sorted to
# make the choice deterministic when several directories share the prefix
# (the bare dataset name sorts ahead of any suffixed variant).
_jigsaw_candidates = sorted(
    os.path.join(INPUT_ROOT, d)
    for d in os.listdir(INPUT_ROOT)
    if d.startswith("jigsaw-unintended-bias-in-toxicity-classification")
)
# Explicit raise instead of `assert`, so the check still runs under `python -O`.
if not _jigsaw_candidates:
    raise FileNotFoundError("The jigsaw dataset directory was not found.")
JIGSAW_DIR = _jigsaw_candidates[0]
JIGSAW_DIR


'/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification'

In [2]:
# Step 2: Load & Standardize Jigsaw

# Identity annotation columns looked up in the raw table, grouped by category.
# The order of this list determines the order of the derived g_* columns.
IDENTITY_COLS = [
    # gender
    *("male", "female", "transgender", "other_gender"),
    # race / ethnicity
    *("black", "white", "asian", "latino", "other_race_or_ethnicity"),
    # religion
    *("christian", "jewish", "muslim", "hindu", "buddhist", "atheist", "other_religion"),
    # sexual orientation
    *("heterosexual", "homosexual_gay_or_lesbian", "bisexual", "other_sexual_orientation"),
    # disability
    *(
        "physical_disability",
        "intellectual_or_learning_disability",
        "psychiatric_or_mental_illness",
        "other_disability",
    ),
]

# Load the training table (train.csv) from the dataset directory
train_path = os.path.join(JIGSAW_DIR, "train.csv")
df_raw = pd.read_csv(train_path)
# Sanity check: (rows, columns) plus the first twelve column names
print(df_raw.shape, df_raw.columns[:12].tolist())

# Text normalisation: patterns and the cleaning function
URL_RE = re.compile(r"http\S+")  # "http" followed by any run of non-whitespace
AT_RE = re.compile(r"@\w+")      # "@" followed by word characters


def clean_text(s: str) -> str:
    """Normalise one raw comment.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``; every
    other input is converted with ``str()``. URL-like tokens are replaced by
    ``URL``, ``@name`` tokens by ``@USER``, and every run of whitespace
    (including newlines and tabs) is collapsed to a single space with the
    ends trimmed.
    """
    if pd.isna(s):
        return ""
    # URLs are masked first, then @-handles.
    masked = AT_RE.sub("@USER", URL_RE.sub(" URL ", str(s)))
    # \s+ already matches "\n" and "\t", so one substitution covers all whitespace.
    return re.sub(r"\s+", " ", masked).strip()

# Build the working frame: id, cleaned text, a binary label, and one 0/1
# g_<identity> column for each identity column present in the raw table.
use_id_cols = [c for c in IDENTITY_COLS if c in df_raw.columns]
df = pd.DataFrame({
    "id": df_raw["id"],
    "text": df_raw["comment_text"].map(clean_text),
    # target >= 0.5 is treated as the positive class
    "label": (df_raw["target"] >= 0.5).astype(int),
    # missing identity annotations are filled with 0 before thresholding
    **{
        f"g_{col}": (df_raw[col].fillna(0) >= 0.5).astype(int)
        for col in use_id_cols
    },
})

# Deduplication on the cleaned text, then renumber the rows from 0
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
print(df.shape, df.head(2))


(1804874, 45) ['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black']
(1776166, 27)       id                                               text  label  g_male  \
0  59848  This is so cool. It's like, 'would you want yo...      0       0   
1  59849  Thank you!! This would make my life a lot less...      0       0   

   g_female  g_transgender  g_other_gender  g_black  g_white  g_asian  ...  \
0         0              0               0        0        0        0  ...   
1         0              0               0        0        0        0  ...   

   g_atheist  g_other_religion  g_heterosexual  g_homosexual_gay_or_lesbian  \
0          0                 0               0                            0   
1          0                 0               0                            0   

   g_bisexual  g_other_sexual_orientation  g_physical_disability  \
0           0                           0            

In [3]:
# Step 3: Splits (80/10/10) stratified by label

# Stage 1: hold out 20% of the rows; the other 80% is the training set.
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
[(train_idx, temp_idx)] = sss1.split(df, df["label"])  # n_splits=1 -> exactly one pair
temp = df.iloc[temp_idx]

# Stage 2: cut the hold-out in half to get validation and test.
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
[(val_rel_idx, test_rel_idx)] = sss2.split(temp, temp["label"])

# Stage-2 indices are positions inside `temp`; map them back to positions in `df`.
val_idx, test_idx = temp_idx[val_rel_idx], temp_idx[test_rel_idx]

# Split registry: positional row indices into `df`, stored as plain lists,
# plus the named evaluation protocols.
splits = {
    "jigsaw": {
        "in_domain": {
            split_name: split_idx.tolist()
            for split_name, split_idx in (
                ("train", train_idx),
                ("val", val_idx),
                ("test", test_idx),
            )
        }
    },
    # Reserved cross-domain entry: civil/hatexplain can be added later
    # NOTE(review): the protocol strings say "jigsaw.train"/"jigsaw.test" while the
    # indices live under splits["jigsaw"]["in_domain"] — confirm how consumers resolve them.
    "protocols": {
        "Jigsaw->Jigsaw": dict(train_set="jigsaw.train", test_set="jigsaw.test")
    },
}

def overview(name, idx, frame=None):
    """Print a one-line size and label-balance summary for a split.

    Parameters
    ----------
    name : str
        Label printed at the start of the line (left-aligned, padded to 8).
    idx : sequence of int
        Positional row indices, used with ``.iloc``.
    frame : pandas.DataFrame, optional
        Frame to index; must have a ``label`` column. Defaults to the
        notebook-level ``df`` when omitted, so existing two-argument calls
        behave as before.

    An empty ``idx`` prints ``pos%=nan`` instead of raising
    ``ZeroDivisionError``.
    """
    source = df if frame is None else frame
    sub = source.iloc[idx]
    n = len(idx)
    pos = int((sub["label"] == 1).sum())
    neg = int((sub["label"] == 0).sum())
    # Guard the ratio so an empty split is reported rather than crashing.
    pos_rate = pos / n if n else float("nan")
    print(f"{name:<8} | n={n:>6} | pos={pos:>6} | neg={neg:>6} | pos%={pos_rate:.3f}")

# Report size and class balance for each split, in train / val / test order
for split_name, split_idx in (("train", train_idx), ("val", val_idx), ("test", test_idx)):
    overview(split_name, split_idx)


train    | n=1420932 | pos=113691 | neg=1307241 | pos%=0.080
val      | n=177617 | pos= 14212 | neg=163405 | pos%=0.080
test     | n=177617 | pos= 14211 | neg=163406 | pos%=0.080
