In [2]:
pip install datasets pandas numpy matplotlib seaborn krippendorff


Note: you may need to restart the kernel to use updated packages.


In [6]:
import krippendorff
import numpy as np
import pandas as pd
from datasets import load_dataset
# NOTE(review): NUMEXPR_INSTALLED comes from a private pandas module and is not
# used in the visible cells — confirm and remove if it is an accidental auto-import.
from pandas.core.computation.check import NUMEXPR_INSTALLED
from sklearn.model_selection import train_test_split

In [4]:
# Load the ucberkeley-dlab/measuring-hate-speech dataset through the `datasets` library.
dataset = load_dataset(path="ucberkeley-dlab/measuring-hate-speech")


In [7]:
# Peek at the first record of the "train" split to see which fields are available.
dataset["train"][0]


{'comment_id': 47777,
 'annotator_id': 10873,
 'platform': 3,
 'sentiment': 0.0,
 'respect': 0.0,
 'insult': 0.0,
 'humiliate': 0.0,
 'status': 2.0,
 'dehumanize': 0.0,
 'violence': 0.0,
 'genocide': 0.0,
 'attack_defend': 0.0,
 'hatespeech': 0.0,
 'hate_speech_score': -3.9,
 'text': 'Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn\'t have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!',
 'infitms': 0.81,
 'outfitms': 1.88,
 'annotator_severity': 0.36,
 'std_err': 0.34,
 'annotator_infitms': 1.35,
 'annotator_outfitms': 1.23,
 'hypothesis': -1.1301777576839678,
 'target_race_asian': True,
 'target_race_black': True,
 'target_race_latinx': True,
 'target_race_middle_eastern': True,
 'target_race_native_american': True,
 'target_race_pacific_islander': True,
 'target_race_white': True,
 'target_race_other': False,
 'target_race': True,
 

In [8]:
# Export the "train" split with the Dataset's own converter instead of
# pd.DataFrame(...), which has to walk the split record by record in Python.
# NOTE(review): to_pandas() keeps the column dtypes declared by the dataset, which may
# be narrower than what pandas infers from Python objects — confirm nothing downstream
# depends on the previously inferred dtypes.
df = dataset["train"].to_pandas()

## Checking for Duplicates

#### Checking exact duplicates

In [53]:
# Count rows whose values match an earlier row in every column
# (keep="first" is the pandas default: the first occurrence is not flagged).
df.duplicated(keep="first").sum()


np.int64(0)

#### Checking duplicates after resetting the index (same content, repeated)

In [19]:
# NOTE: DataFrame.duplicated() compares column values only and never looks at the
# index, so this count is expected to equal the exact-duplicate count above.
df.reset_index(drop=True).duplicated().sum()


np.int64(0)

#### Checking duplicates on comment id and annotator id

In [20]:
# Count rows that repeat an earlier (comment_id, annotator_id) pair.
annotation_key = ["comment_id", "annotator_id"]
df.duplicated(subset=annotation_key).sum()


np.int64(0)

#### Checking duplicates on text and annotator id

In [21]:
# Count rows that repeat an earlier (text, annotator_id) pair, i.e. the same
# annotator appearing more than once for identical text.
text_annotator_key = ["text", "annotator_id"]
df.duplicated(subset=text_annotator_key).sum()


np.int64(0)

## Data Splitting (70% Training, 20% Testing, 10% Validation)

In [37]:
# One entry per distinct comment. The split is drawn over comments rather than rows,
# so all annotations of a comment can be routed to the same partition.
# (train_test_split is imported in the notebook's import cell.)
comment_ids = df["comment_id"].unique()

In [38]:
# Display the unique comment ids (NumPy abbreviates the long array in the output).
comment_ids

array([47777, 39773, 47101, ..., 30588, 21008, 37080])

In [39]:
# Split parameters, named so the intended 70 / 20 / 10 ratio is explicit.
HOLDOUT_FRACTION = 0.30       # share of comments held out of training (test + validation)
VAL_SHARE_OF_HOLDOUT = 1 / 3  # 1/3 of the 30% hold-out -> ~10% validation, ~20% test remains
SPLIT_SEED = 42               # fixed seed so both stages are reproducible

# Stage 1: ~70% of the comment ids go to training, ~30% are held out.
train_ids, temp_ids = train_test_split(
    comment_ids,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
)

# Stage 2: divide the hold-out. The "test_size" side of this call is the
# validation set (1/3 of the hold-out); the remaining 2/3 is the test set.
test_ids, val_ids = train_test_split(
    temp_ids,
    test_size=VAL_SHARE_OF_HOLDOUT,
    random_state=SPLIT_SEED,
)

# Every comment id must land in exactly one partition.
assert len(train_ids) + len(test_ids) + len(val_ids) == len(comment_ids), (
    "train/test/validation ids do not add up to the number of unique comments"
)


In [40]:
# Build each partition by keeping the annotation rows whose comment id was
# assigned to that partition.
in_train = df["comment_id"].isin(train_ids)
in_test = df["comment_id"].isin(test_ids)
in_val = df["comment_id"].isin(val_ids)

train_df = df.loc[in_train]
test_df = df.loc[in_test]
val_df = df.loc[in_val]


In [42]:
# Row counts per partition, plus their total for comparison with the original frame.
n_train, n_test, n_val = len(train_df), len(test_df), len(val_df)

print("Original rows:", len(df))
print("Train rows:", n_train)
print("Test rows:", n_test)
print("Validation rows:", n_val)
print("Total after split:", n_train + n_test + n_val)

Original rows: 135556
Train rows: 93935
Test rows: 25413
Validation rows: 16208
Total after split: 135556


In [54]:
total_rows = len(df)

# Share of annotation rows (not comments) that landed in each partition,
# as a percentage rounded to two decimals.
for label, part in (
    ("Train %:", train_df),
    ("Test %:", test_df),
    ("Validation %:", val_df),
):
    print(label, round(len(part) / total_rows * 100, 2))


Train %: 69.3
Test %: 18.75
Validation %: 11.96


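*Note: the split is performed on unique comment IDs, and every annotation of a comment follows that comment into its split. Because the number of annotations per comment varies, the row-level percentages above (69.3 / 18.75 / 11.96) deviate slightly from the 70/20/10 target, which holds at the comment level.*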

## Checking Data Leakage 

#### Checking comment-level leakage

In [46]:
# Unique comment ids actually present in each partition.
# NOTE: this rebinds train_ids / test_ids / val_ids (previously the outputs of the
# split cell) to sets; the names are kept so later cells that use them still work.
train_ids = set(train_df["comment_id"])
test_ids  = set(test_df["comment_id"])
val_ids   = set(val_df["comment_id"])

train_test_overlap = train_ids & test_ids
train_val_overlap = train_ids & val_ids
test_val_overlap = test_ids & val_ids

print("Train and Test:", len(train_test_overlap))
print("Train and Val:", len(train_val_overlap))
print("Test and Val:", len(test_val_overlap))

# Fail loudly on Restart & Run All instead of relying on someone reading the zeros.
assert not (train_test_overlap or train_val_overlap or test_val_overlap), (
    "comment-level leakage: a comment_id appears in more than one partition"
)
# NOTE(review): disjoint comment ids do not rule out the same `text` appearing under
# different comment ids — consider an analogous overlap check on the text column.


Train and Test: 0
Train and Val: 0
Test and Val: 0


#### Checking annotation-level leakage

In [48]:
# (comment_id, annotator_id) pairs present in each partition; a shared pair would mean
# the same annotation row ended up in two partitions.
train_pairs = set(zip(train_df["comment_id"], train_df["annotator_id"]))
test_pairs  = set(zip(test_df["comment_id"], test_df["annotator_id"]))
val_pairs   = set(zip(val_df["comment_id"], val_df["annotator_id"]))

train_test_pair_overlap = train_pairs & test_pairs
train_val_pair_overlap = train_pairs & val_pairs
test_val_pair_overlap = test_pairs & val_pairs

print("Train and Test (pairs):", len(train_test_pair_overlap))
print("Train and Val (pairs):", len(train_val_pair_overlap))
print("Test and Val (pairs):", len(test_val_pair_overlap))

# Fail loudly on Restart & Run All instead of relying on someone reading the zeros.
assert not (
    train_test_pair_overlap or train_val_pair_overlap or test_val_pair_overlap
), "annotation-level leakage: a (comment_id, annotator_id) pair is in more than one partition"


Train and Test (pairs): 0
Train and Val (pairs): 0
Test and Val (pairs): 0


#### Checking no rows were lost

In [51]:
# The three partitions together must contain exactly the rows of the original frame.
rows_before = len(df)
rows_after = len(train_df) + len(test_df) + len(val_df)

print("Original rows:", rows_before)
print("After split:", rows_after)

# Turn the visual comparison into a hard check so a lossy split stops the run.
assert rows_after == rows_before, (
    f"split changed the row count: {rows_before} before, {rows_after} after"
)


Original rows: 135556
After split: 135556
