In [1]:
import random
import typing as t
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
tqdm.pandas()

In [3]:
def seed_everything(seed: int):
    """Seed the stdlib `random` and the NumPy global generators with the same value."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

seed_everything(42)

In [4]:
DATASET_DIR = Path('/home/jovyan/jigsaw-toxic/data/datasets/ccc-2017-multilabel')
COMBINED_DIR = Path('/home/jovyan/jigsaw-toxic/data/datasets/combined')

CLS_LIST = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
!ls -la $COMBINED_DIR

total 679664
drwxr-xr-x  2 jovyan users      4096 Dec 28 08:20 .
drwxr-xr-x 10 jovyan users      4096 Jan 22 20:20 ..
-rw-r--r--  1 jovyan users  69255825 Dec 27 15:12 train_comment_classification_challenge_2017.csv
-rw-r--r--  1 jovyan users  64981283 Dec 28 08:20 train_comment_classification_challenge_2017_no_leak.csv
-rw-r--r--  1 jovyan users    132200 Dec 27 15:12 train_ruddit.csv
-rw-r--r--  1 jovyan users 552940742 Dec 27 15:13 train_unintended_bias_in_toxicity_classification.csv
-rw-r--r--  1 jovyan users   8633656 Dec 27 15:12 valid.csv


In [None]:
!mkdir -p $DATASET_DIR
!cp $COMBINED_DIR/train_comment_classification_challenge_2017_no_leak.csv $DATASET_DIR/train_no_leak.csv
!cp $COMBINED_DIR/valid.csv $DATASET_DIR/valid_pair.csv

In [None]:
!ls -la $COMBINED_DIR

In [7]:
!ls -la $DATASET_DIR

total 28091280
drwxr-xr-x  2 jovyan users        4096 Jan 14 12:03 .
drwxr-xr-x 10 jovyan users        4096 Jan 22 20:20 ..
-rw-r--r--  1 jovyan users        1699 Jan 12 11:46 label_toxicity.csv
-rw-r--r--  1 jovyan users    64981283 Jan 13 08:38 train_no_leak.csv
-rw-r--r--  1 jovyan users    67448851 Jan 13 08:40 train_no_leak_expanded.csv
-rw-r--r--  1 jovyan users    30080933 Jan  6 20:31 train_no_leak_pair.csv
-rw-r--r--  1 jovyan users    38018364 Dec 30 18:35 train_no_leak_pair_harder_1.csv
-rw-r--r--  1 jovyan users    38087288 Dec 30 18:35 train_no_leak_pair_harder_2.csv
-rw-r--r--  1 jovyan users    37854841 Dec 30 18:35 train_no_leak_pair_harder_3.csv
-rw-r--r--  1 jovyan users    37854841 Dec 30 12:17 train_no_leak_pair_harder.csv
-rw-r--r--  1 jovyan users  3418444331 Jan 12 12:03 train_no_leak_pair_v2.csv
-rw-r--r--  1 jovyan users 24790812945 Jan 12 12:02 train_no_leak_pair_v2_full.csv
-rw-r--r--  1 jovyan users    28057742 Jan 13 10:30 train_no_leak_pair_v3.csv
-rw-r--r

In [5]:
all_df = pd.read_csv(DATASET_DIR / 'train_no_leak.csv')
all_with_leak_df = pd.read_csv(COMBINED_DIR / 'train_comment_classification_challenge_2017.csv')
valid_df = pd.read_csv(COMBINED_DIR / 'valid.csv')

In [6]:
all_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...
151937,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,0.0
151938,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,0.0
151939,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,0.0
151940,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,0.0


In [7]:
# Number of positive class flags per comment. A vectorized column sum over CLS_LIST
# replaces the row-wise `progress_apply`, which spelled out the same six columns by hand.
all_df['n_flags'] = all_df[CLS_LIST].sum(axis=1)

  0%|          | 0/151942 [00:00<?, ?it/s]

In [8]:
def build_readable_label(row: t.Dict[str, int]) -> str:
    """Join the names of the classes flagged in `row` with single spaces.

    Classes are emitted in `CLS_LIST` order; a row without any truthy flag yields ''.
    """
    flagged_classes = []
    for cls_name in CLS_LIST:
        if row[cls_name]:
            flagged_classes.append(cls_name)
    return ' '.join(flagged_classes)

def build_bitmap_label(row: t.Dict[str, int]) -> str:
    """Render the flag values of `row` in `CLS_LIST` order, separated by single spaces."""
    flag_values = (row[cls_name] for cls_name in CLS_LIST)
    return ' '.join(map(str, flag_values))


# Attach both label renderings to every comment (row-wise, one progress bar each).
all_df['bitmap_label'] = all_df.progress_apply(build_bitmap_label, axis=1)
all_df['readable_label'] = all_df.progress_apply(build_readable_label, axis=1)

  0%|          | 0/151942 [00:00<?, ?it/s]

  0%|          | 0/151942 [00:00<?, ?it/s]

In [13]:
all_df[all_df['n_flags'] > 0]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,n_flags,bitmap_label,readable_label
11,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0,0.133333,1,1 0 0 0 0 0,toxic
15,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0,0.133333,1,1 0 0 0 0 0,toxic
41,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,0.533333,4,1 0 1 0 1 1,toxic obscene insult identity_hate
42,001956c382006abd,I'm Sorry \n\nI'm sorry I screwed around with ...,1,0,0,0,0,0,0.133333,1,1 0 0 0 0 0,toxic
48,001dc38a83d420cf,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0,0.266667,2,1 0 1 0 0 0,toxic obscene
...,...,...,...,...,...,...,...,...,...,...,...,...
151866,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1,0.533333,4,1 0 1 0 1 1,toxic obscene insult identity_hate
151886,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0,0.266667,2,1 0 0 0 1 0,toxic insult
151912,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0,0.400000,3,1 0 1 0 1 0,toxic obscene insult
151917,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0,0.266667,2,1 0 0 0 1 0,toxic insult


In [None]:
all_df.to_csv(DATASET_DIR / 'train_no_leak_expanded.csv', index=False)

In [10]:
valid_df

Unnamed: 0,less_toxic,more_toxic,certainty
0,"I think people say its not so absurd, because ...",Re Vandalism. . . . that is because the user ...,0.666667
1,You must have the lowest IQ of anyone on wikip...,Wha's a reliable source mike? Something that c...,1.000000
2,"Thanks for removing more of his anonymous, un...","To Bad\nyou dont have the balls to sign in, I ...",1.000000
3,"""\n\nActually, it clearly does not involve pow...",", 18 November 2006 (UTC)\n\nWell comrade. Look...",0.666667
4,You are not cool \n\nYou are the most hated f...,"""== Hey Fuck-head==\nHey, just wanted to say ""...",1.000000
...,...,...,...
10103,Heeeeeeeeeeeeeeyyyyyyyyyyyy dude! Sup!,you deleted my page \nand i am extrememly mad ...,1.000000
10104,ATTENTION:''''''\n\nANYONE WHO OPPOSES MY OPIN...,suck dick u disrespectful swearing wank!!,1.000000
10105,"Oh yah, and Bayerischermann is another guy try...",Thanks. Also thanks for freezing the article. ...,1.000000
10106,I had to chop out a section specifically compa...,"""\n\nHorrifyingly enough, """"ritualistic penis ...",0.666667


In [11]:
# Get all the label strings for `less_toxic` and `more_toxic` from the leaked part of the train set.

LABEL_NOT_FOUND = '<not_found>'

def assign_label_to_comment(df: pd.DataFrame, labels_df: pd.DataFrame) -> pd.DataFrame:
    """Attach readable and bitmap labels to both comments of every pair in `df`.

    Each `less_toxic` / `more_toxic` text (converted with `str`) is matched by exact
    equality against `labels_df['comment_text']`. When several rows match, the first
    one is used. Texts without a match get `LABEL_NOT_FOUND` for both labels.

    Args:
        df: Pairs with `less_toxic` and `more_toxic` columns.
        labels_df: Labelled comments with a `comment_text` column and the class
            columns read by `build_readable_label` / `build_bitmap_label`.

    Returns:
        A new frame with one row per input pair and the columns `less_toxic`,
        `less_toxic_readable_label`, `less_toxic_bitmap_label`, `more_toxic`,
        `more_toxic_readable_label` and `more_toxic_bitmap_label`.
    """
    # Index the first occurrence of every text once, so that each lookup is a dict
    # hit instead of a full scan of `labels_df`.
    first_position_by_text: t.Dict[t.Any, int] = {}
    for position, text in enumerate(labels_df['comment_text']):
        first_position_by_text.setdefault(text, position)

    def _labels_for(comment_text: str) -> t.Tuple[str, str]:
        """Return (readable_label, bitmap_label) for a text, or the not-found marker twice."""
        position = first_position_by_text.get(comment_text)
        if position is None:
            return LABEL_NOT_FOUND, LABEL_NOT_FOUND
        label_row = labels_df.iloc[position]
        return build_readable_label(label_row), build_bitmap_label(label_row)

    result_row_list = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        less_toxic_comment_text = str(row['less_toxic'])
        more_toxic_comment_text = str(row['more_toxic'])
        less_toxic_readable_label, less_toxic_bitmap_label = _labels_for(less_toxic_comment_text)
        more_toxic_readable_label, more_toxic_bitmap_label = _labels_for(more_toxic_comment_text)
        result_row_list.append({
            'less_toxic': less_toxic_comment_text,
            'less_toxic_readable_label': less_toxic_readable_label,
            'less_toxic_bitmap_label': less_toxic_bitmap_label,
            'more_toxic': more_toxic_comment_text,
            'more_toxic_readable_label': more_toxic_readable_label,
            'more_toxic_bitmap_label': more_toxic_bitmap_label,
        })
    return pd.DataFrame(result_row_list)

In [12]:
valid_with_labels_df = assign_label_to_comment(df=valid_df, labels_df=all_with_leak_df)

  0%|          | 0/10108 [00:00<?, ?it/s]

In [16]:
# Cross-tabulate whether a label was found for the less / more toxic side of each pair.
less_found = valid_with_labels_df['less_toxic_readable_label'] != LABEL_NOT_FOUND
more_found = valid_with_labels_df['more_toxic_readable_label'] != LABEL_NOT_FOUND
print('Both found:', len(valid_with_labels_df[less_found & more_found]))
print('Less not found:', len(valid_with_labels_df[~less_found & more_found]))
print('More not found:', len(valid_with_labels_df[less_found & ~more_found]))
print('Both not found:', len(valid_with_labels_df[~less_found & ~more_found]))

Both found: 2817
Less not found: 2573
More not found: 2541
Both not found: 2177


In [17]:
analyze_cls_label_df = valid_with_labels_df[(valid_with_labels_df['less_toxic_readable_label'] != LABEL_NOT_FOUND) & (valid_with_labels_df['more_toxic_readable_label'] != LABEL_NOT_FOUND)]

In [18]:
analyze_cls_label_df

Unnamed: 0,less_toxic,less_toxic_readable_label,less_toxic_bitmap_label,more_toxic,more_toxic_readable_label,more_toxic_bitmap_label
1,You must have the lowest IQ of anyone on wikip...,toxic insult,1 0 0 0 1 0,Wha's a reliable source mike? Something that c...,toxic severe_toxic obscene insult,1 1 1 0 1 0
3,"""\n\nActually, it clearly does not involve pow...",,0 0 0 0 0 0,", 18 November 2006 (UTC)\n\nWell comrade. Look...",obscene insult,0 0 1 0 1 0
5,"""\n\n=========================================...",toxic,1 0 0 0 0 0,"OH, GET REAL, WILL YA'!! (or, is THAT a totall...",toxic,1 0 0 0 0 0
8,"""\n\n""""Self-hating Jew"""" is with very rare exc...",,0 0 0 0 0 0,Why not? The man (Bkonrad) is an asshole. Loo...,toxic obscene insult,1 0 1 0 1 0
10,VaNdAlIsM tee hee. You have been vandalized,,0 0 0 0 0 0,"""\n \nPlease actually read what I'm saying. Yo...",,0 0 0 0 0 0
...,...,...,...,...,...,...
10086,""" 2009 (UTC)\n\nWould it be possible to sugges...",,0 0 0 0 0 0,Hey wanna go suck a candy cane dick?,toxic severe_toxic obscene,1 1 1 0 0 0
10090,"""\n\nPity that it is, (it was really hard to f...",,0 0 0 0 0 0,{unblock|anal leakage ban hammer!! User: Ned S...,toxic,1 0 0 0 0 0
10099,"""\n\nStormie is a loon who pops up every now a...",,0 0 0 0 0 0,Indian people say this is just British hogwash.,,0 0 0 0 0 0
10100,"This man is an imposter, for I am the prime mi...",toxic,1 0 0 0 0 0,"Be nice and don't be a dick, or I will kick yo...",toxic obscene threat insult,1 0 1 1 1 0


In [20]:
len(analyze_cls_label_df[(analyze_cls_label_df['less_toxic_readable_label'] == '') & (analyze_cls_label_df['more_toxic_readable_label'] == '')]) / len(analyze_cls_label_df)

0.291089811856585

In [21]:
len(analyze_cls_label_df[(analyze_cls_label_df['less_toxic_readable_label'] == '') & (analyze_cls_label_df['more_toxic_readable_label'] != '')]) / len(analyze_cls_label_df)

0.410010649627263

In [22]:
len(analyze_cls_label_df[(analyze_cls_label_df['less_toxic_readable_label'] != '') & (analyze_cls_label_df['more_toxic_readable_label'] == '')]) / len(analyze_cls_label_df)

0.0979765708200213

In [23]:
analyze_cls_label_df[(analyze_cls_label_df['less_toxic_readable_label'] != '') & (analyze_cls_label_df['more_toxic_readable_label'] == '')]

Unnamed: 0,less_toxic,less_toxic_readable_label,less_toxic_bitmap_label,more_toxic,more_toxic_readable_label,more_toxic_bitmap_label
101,well sorry about replacing bands.but you dumb ...,toxic obscene insult,1 0 1 0 1 0,"""\n\n /* Is she black? */ \n\nwhy dose it mat...",,0 0 0 0 0 0
139,"Go worship your jewish god, sub-human.",toxic insult identity_hate,1 0 0 0 1 1,""": Recently a group of I.P. addresses were use...",,0 0 0 0 0 0
261,The truth is vandalism? Only to you leftist tr...,toxic,1 0 0 0 0 0,"""\nWP:QUACK - same edits as you, IP resolves t...",,0 0 0 0 0 0
281,want a pic to masterbate over??? send me an em...,toxic obscene,1 0 1 0 0 0,"""\n\n Euthanasia \n\nPlease revert your revers...",,0 0 0 0 0 0
289,"""\n\n=Czechoslovak fan panslavist idiote\n""""Pl...",toxic,1 0 0 0 0 0,"""\n\nBiased/Uninformed Triva\n\nSmaller and li...",,0 0 0 0 0 0
...,...,...,...,...,...,...
9849,dont show me attitude i give u standing ovatio...,toxic,1 0 0 0 0 0,"""\n The only one who keeps bringing up """"Jews""...",,0 0 0 0 0 0
9898,you have a small penis,toxic insult,1 0 0 0 1 0,"So called, and what about you, A shill for you...",,0 0 0 0 0 0
9905,"""\n\n Hi nerd \n\nWould you please explain why...",obscene insult,0 0 1 0 1 0,"I DONT CARE, BAN ME IF U WANT !!! YOUR GOING T...",,0 0 0 0 0 0
9967,Crazy low-lives like you ruin Wikipedia and Br...,toxic,1 0 0 0 0 0,"You believe, sir, that I am a Nazi? Not that ...",,0 0 0 0 0 0


In [11]:
def get_label_pair_set(df: pd.DataFrame, th: t.Optional[float] = None, min_n: t.Optional[int] = None) -> t.Set[t.Tuple[str, str, float]]:
    """Derive "label A is less toxic than label B" relations from labelled pairs.

    For every two distinct readable labels, the rows where A is on the less toxic
    side and B on the more toxic side are counted against the opposite ordering.
    The dominant ordering is reported as `(less_label, more_label, certainty)`,
    where `certainty` is the share of rows supporting that ordering. Ties are
    dropped.

    A relation is also dropped when the labels of the more toxic side form a subset
    of the labels of the less toxic side. The empty label '' is treated as the empty
    set (hence a subset of everything), so "unlabeled is more toxic than X" is never
    reported.

    Args:
        df: Frame with `less_toxic_readable_label` and `more_toxic_readable_label`
            columns holding space-separated label names.
        th: If given, keep only relations whose certainty is strictly greater.
        min_n: If given, keep only relations supported by at least this many rows.

    Returns:
        Set of `(less_label, more_label, certainty)` tuples.
    """
    less_col, more_col = 'less_toxic_readable_label', 'more_toxic_readable_label'
    # One pass over the frame: number of rows per ordered (less, more) label pair.
    # This replaces re-filtering the whole frame for every label combination, so no
    # progress bar is needed any more.
    pair_counts = {
        pair: int(count)
        for pair, count in df.groupby([less_col, more_col]).size().items()
    }

    label_pair_set: t.Set[t.Tuple[str, str, float]] = set()
    for (less_label, more_label), n_forward in pair_counts.items():
        if less_label == more_label:
            continue
        n_backward = pair_counts.get((more_label, less_label), 0)
        if n_forward <= n_backward:
            # Tie, or the opposite ordering dominates and is handled on its own key.
            continue
        # `str.split()` without an argument turns '' into [], unlike `split(' ')`.
        if set(more_label.split()).issubset(less_label.split()):
            continue
        certainty = n_forward / (n_forward + n_backward)
        if th is not None and not certainty > th:
            continue
        if min_n is not None and n_forward < min_n:
            continue
        label_pair_set.add((less_label, more_label, certainty))
    return label_pair_set


In [None]:
label_pair_set = get_label_pair_set(df=analyze_cls_label_df, th=0.7, min_n=2)

In [None]:
len(label_pair_set)

In [None]:
# One row per mined relation. `is_subset` is 1 when every label of the less toxic
# side also appears on the more toxic side. `str.split()` (no argument) maps the
# empty label '' to an empty set, which is a subset of any label set; `split(' ')`
# would produce {''} and report 0 for an unlabeled less toxic side.
label_toxicity_df = pd.DataFrame([
    {
        'less_toxic': lt,
        'more_toxic': mt,
        'certainty': cert,
        'is_subset': int(set(lt.split()).issubset(mt.split())),
    }
    for lt, mt, cert in label_pair_set
])

In [None]:
label_toxicity_df

In [None]:
label_toxicity_df[label_toxicity_df['is_subset'] == 0]

In [None]:
# insult < severe_toxic
# identity_hate < severe_toxic
# toxic < obscene + insult
# insult < identity_hate
# toxic < identity_hate
# threat < obscene + insult
# insult < severe_toxic + obscene
# obscene < identity_hate

# INFERRED:
# toxic < severe_toxic
# threat < obscene + severe_toxic
# threat < obscene + identity_hate
# obscene < severe_toxic

In [10]:
class L:
    """String constants for the toxicity class names (same values as the entries of `CLS_LIST`)."""
    TOXIC = 'toxic'
    SEVERE_TOXIC = 'severe_toxic'
    INSULT = 'insult'
    OBSCENE = 'obscene'
    IDENTITY_HATE = 'identity_hate'
    THREAT = 'threat'


class MoreCondition:
    """Interface of a rule that selects candidate "more toxic" rows for a given label set."""

    def mask(self, less_label_set: t.Set[str], df: pd.DataFrame) -> t.Optional[pd.Series]:
        """Return a boolean mask over `df`, or None when the rule does not apply.

        Subclasses must override this method; the base implementation always raises.
        """
        raise NotImplementedError


class SimpleMoreCondition(MoreCondition):
    """Rule: a "more" row carries every label of the "less" sample plus at least one other label."""

    _ALL_LABEL_SET = {
        L.TOXIC,
        L.SEVERE_TOXIC,
        L.INSULT,
        L.OBSCENE,
        L.IDENTITY_HATE,
        L.THREAT,
    }

    def mask(self, less_label_set: t.Set[str], df: pd.DataFrame) -> t.Optional[pd.Series]:
        """
        Select rows flagged with all labels of `less_label_set` and with
        at least one label from `self._ALL_LABEL_SET - less_label_set`.

        If one of the two label groups is empty, only the other group's mask is
        returned; if both are empty the result is None.
        """
        # AND over the labels the "less" sample already has.
        has_all_less_labels: t.Optional[pd.Series] = None
        for label in less_label_set:
            flagged = df[label] == 1
            has_all_less_labels = flagged if has_all_less_labels is None else has_all_less_labels & flagged

        # OR over every remaining label.
        has_extra_label: t.Optional[pd.Series] = None
        for label in self._ALL_LABEL_SET - less_label_set:
            flagged = df[label] == 1
            has_extra_label = flagged if has_extra_label is None else has_extra_label | flagged

        if has_all_less_labels is None:
            # Covers both "only the extra mask exists" and "neither exists" (None).
            return has_extra_label
        if has_extra_label is None:
            return has_all_less_labels
        return has_all_less_labels & has_extra_label


class ComparisonBasedMoreCondition(MoreCondition):
    """Rule "`less_has` < `more_has`": replace the `less_has` labels of a sample by `more_has`."""

    def __init__(self, less_has: t.Set[str], more_has: t.Set[str]):
        self._less_has = less_has
        self._more_has = more_has

    def mask(self, less_label_set: t.Set[str], df: pd.DataFrame) -> t.Optional[pd.Series]:
        """Select rows flagged with `more_has` and with the sample's labels outside `less_has`.

        Returns None when `less_label_set` does not contain all of `less_has`,
        i.e. when the rule does not apply to the sample.
        """
        if not less_label_set.issuperset(self._less_has):
            return None
        required_labels = (less_label_set - self._less_has) | self._more_has
        mask: t.Optional[pd.Series] = None
        for label in required_labels:
            flagged = df[label] == 1
            mask = flagged if mask is None else mask & flagged
        # Fails only if `required_labels` is empty.
        assert mask is not None
        return mask


# Rules passed to `mine_pairs` to pick "more toxic" candidates. Each
# `ComparisonBasedMoreCondition(less_has, more_has)` applies to samples whose label set
# contains `less_has` and selects rows flagged with `more_has` plus the sample's other labels.
# Commented-out entries are rules that are currently disabled.
more_condition_list = [
    # SimpleMoreCondition(),
    # Inferred directly from the valid data.
    ComparisonBasedMoreCondition({L.INSULT}, {L.SEVERE_TOXIC}),
    ComparisonBasedMoreCondition({L.IDENTITY_HATE}, {L.SEVERE_TOXIC}),
    ComparisonBasedMoreCondition({L.TOXIC}, {L.OBSCENE, L.INSULT}),
    ComparisonBasedMoreCondition({L.INSULT}, {L.IDENTITY_HATE}),
    ComparisonBasedMoreCondition({L.TOXIC}, {L.IDENTITY_HATE}),
    ComparisonBasedMoreCondition({L.THREAT}, {L.OBSCENE, L.INSULT}),
    ComparisonBasedMoreCondition({L.INSULT}, {L.SEVERE_TOXIC, L.OBSCENE}),
    ComparisonBasedMoreCondition({L.OBSCENE}, {L.IDENTITY_HATE}),
    # Inferred from the transitivity of < operation.
    # ComparisonBasedMoreCondition({L.TOXIC}, {L.SEVERE_TOXIC}),
    # ComparisonBasedMoreCondition({L.THREAT}, {L.OBSCENE, L.SEVERE_TOXIC}),
    # ComparisonBasedMoreCondition({L.THREAT}, {L.OBSCENE, L.IDENTITY_HATE}),
    # ComparisonBasedMoreCondition({L.OBSCENE}, {L.SEVERE_TOXIC}),
    # ComparisonBasedMoreCondition({L.THREAT}, {L.SEVERE_TOXIC, L.INSULT}),
    # ComparisonBasedMoreCondition({L.THREAT}, {L.IDENTITY_HATE, L.INSULT}),
    # Inferred from the common sense.
    # ComparisonBasedMoreCondition({L.TOXIC}, {L.THREAT}),
    # ComparisonBasedMoreCondition({L.OBSCENE}, {L.THREAT}),
]


def _label_set(readable_label_str: str) -> t.Set[str]:
    return set(readable_label_str.split(' ')) if readable_label_str else set()


def mine_pairs(
        df: pd.DataFrame,
        more_condition_list: t.List[MoreCondition],
        non_toxic_ratio: float = 0.1,
        max_n_flags_distance: int = 2,
        toxic_max_pairs_per_sample: int = 3,
        non_toxic_max_pairs_per_sample: int = 3) -> pd.DataFrame:
    """Mine (less toxic, more toxic) comment pairs from a multi-label frame.

    Every row with at least one flag, plus a random share of rows without flags,
    acts as the "less toxic" side. Its "more toxic" candidates are the rows that

    * are a different row with a different `readable_label`,
    * have `n_flags` within `[less.n_flags, less.n_flags + max_n_flags_distance]`, and
    * satisfy at least one applicable condition of `more_condition_list`. For
      rows without flags, `SimpleMoreCondition` is the fallback when no listed
      condition applies.

    Rows for which no condition applies are skipped silently. Sampling relies on
    the global NumPy random state.

    Args:
        df: Frame with `comment_text`, `n_flags`, `readable_label`, `bitmap_label`
            and the class columns read by the conditions.
        more_condition_list: Rules whose masks are OR-ed together. Their masks
            are cached per label string, so they must depend only on
            `(less_label_set, df)`.
        non_toxic_ratio: Number of flag-free "less" samples, relative to the
            number of flagged rows.
        max_n_flags_distance: Largest allowed surplus of flags on the "more" side.
        toxic_max_pairs_per_sample: Max pairs drawn per flagged "less" sample.
        non_toxic_max_pairs_per_sample: Max pairs drawn per flag-free "less" sample.

    Returns:
        Frame with one row per pair: both texts, both readable/bitmap labels and
        `is_subset` (1 when the less toxic labels are contained in the more toxic ones).
    """
    simple_more_condition = SimpleMoreCondition()
    pair_row_list = []
    num_no_pairs = 0
    toxic_df = df[df['n_flags'] > 0]
    non_toxic_df = df[df['n_flags'] == 0]
    less_df = pd.concat([
        toxic_df,
        non_toxic_df.sample(frac=1.0).iloc[:int(len(toxic_df) * non_toxic_ratio)]
    ])

    # The condition mask depends only on the label set (and on whether the sample
    # is flag-free), not on the individual row, so it is built once per label
    # string instead of once per "less" sample.
    cond_mask_cache: t.Dict[t.Tuple[t.Any, bool], t.Optional[pd.Series]] = {}

    def _condition_mask(readable_label: str, is_non_toxic: bool) -> t.Optional[pd.Series]:
        """Return the OR of all applicable condition masks, or None if none applies."""
        cache_key = (readable_label, is_non_toxic)
        if cache_key in cond_mask_cache:
            return cond_mask_cache[cache_key]
        label_set = _label_set(readable_label)
        cond_mask = None
        for c in more_condition_list:
            c_mask = c.mask(label_set, df)
            if c_mask is not None:
                cond_mask = cond_mask | c_mask if cond_mask is not None else c_mask
        if cond_mask is None and is_non_toxic:
            cond_mask = simple_more_condition.mask(label_set, df)
        cond_mask_cache[cache_key] = cond_mask
        return cond_mask

    it = tqdm(less_df.iterrows(), total=len(less_df))
    for idx, less_row in it:
        less_n_flags = less_row['n_flags']
        less_readable_label = less_row['readable_label']
        cond_mask = _condition_mask(less_readable_label, bool(less_n_flags == 0))
        if cond_mask is None:
            continue
        less_label_set = _label_set(less_readable_label)
        base_mask = (df.index != idx) \
            & (df['readable_label'] != less_readable_label) \
            & (df['n_flags'] >= less_n_flags) \
            & (df['n_flags'] <= less_n_flags + max_n_flags_distance)
        more_df = df[base_mask & cond_mask]
        if len(more_df):
            max_pairs = toxic_max_pairs_per_sample if less_n_flags > 0 else non_toxic_max_pairs_per_sample
            for _, more_row in more_df.sample(n=min(len(more_df), max_pairs)).iterrows():
                pair_row_list.append({
                    'less_toxic': less_row['comment_text'],
                    'more_toxic': more_row['comment_text'],
                    'less_toxic_readable_label': less_readable_label,
                    'less_toxic_bitmap_label': less_row['bitmap_label'],
                    'more_toxic_readable_label': more_row['readable_label'],
                    'more_toxic_bitmap_label': more_row['bitmap_label'],
                    'is_subset': int(less_label_set.issubset(_label_set(more_row['readable_label']))),
                })
        else:
            num_no_pairs += 1
        it.set_description(f'num_pairs: {len(pair_row_list)}, num_no_pairs: {num_no_pairs}')
    return pd.DataFrame(pair_row_list)

In [12]:
# pair_df = mine_pairs(df=all_df, more_condition_list=more_condition_list)
pair_df = mine_pairs(
    df=all_df,
    more_condition_list=more_condition_list,
    non_toxic_ratio=0.25,
    max_n_flags_distance=3,
    toxic_max_pairs_per_sample=9,
    non_toxic_max_pairs_per_sample=3)

  0%|          | 0/15918 [00:00<?, ?it/s]

In [13]:
pair_df

Unnamed: 0,less_toxic,more_toxic,less_toxic_readable_label,less_toxic_bitmap_label,more_toxic_readable_label,more_toxic_bitmap_label,is_subset
0,Hey... what is it..\n@ | talk .\nWhat is it......,"""\n\n""""Administrators are Elitist Bastards: I ...",toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
1,Hey... what is it..\n@ | talk .\nWhat is it......,FUCK YOU TO HELL BITCH. why the fuck you one t...,toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
2,Hey... what is it..\n@ | talk .\nWhat is it......,"Fuck them all. The SPAs, corrupt administrator...",toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
3,Hey... what is it..\n@ | talk .\nWhat is it......,because you are a tweeting cunt flap!!!!!!!,toxic,1 0 0 0 0 0,toxic severe_toxic obscene insult,1 1 1 0 1 0,1
4,Hey... what is it..\n@ | talk .\nWhat is it......,Notrhbysouthbanof sucks cocks \nNotrhbysouthba...,toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
...,...,...,...,...,...,...,...
123952,"""\n\n Blocked \n\nSWATJester Son of the Defend...","""\n\n The LGBT Barnstar awarded because your ...",,0 0 0 0 0 0,identity_hate,0 0 0 0 0 1,1
123953,"""\n\n Blocked \n\nSWATJester Son of the Defend...","""\n\nShe says """"Eunuchs too served in importan...",,0 0 0 0 0 0,toxic,1 0 0 0 0 0,1
123954,""" 2012 (UTC)\n\nEveryone in your house is glad...","""\nWait, You are threatening me with WP:CIVIL ...",,0 0 0 0 0 0,toxic,1 0 0 0 0 0,1
123955,""" 2012 (UTC)\n\nEveryone in your house is glad...","""very large penis and a wise man once said, """"...",,0 0 0 0 0 0,toxic obscene,1 0 1 0 0 0,1


In [14]:
pair_df[pair_df['is_subset'] == 0]

Unnamed: 0,less_toxic,more_toxic,less_toxic_readable_label,less_toxic_bitmap_label,more_toxic_readable_label,more_toxic_bitmap_label,is_subset
12,"Bye! \n\nDon't look, come or think of comming ...",I think that what is evident is that most of t...,toxic,1 0 0 0 0 0,obscene insult,0 0 1 0 1 0,0
13,"Bye! \n\nDon't look, come or think of comming ...","""\n\n John Cena cock lover \n\n Adam Rose's l...",toxic,1 0 0 0 0 0,obscene insult,0 0 1 0 1 0,0
18,You are gay or antisemmitian? \n\nArchangel WH...,Fuck u Fuck u Fuck u Fuck uuuuuuuuuuuuuuuuuuuu...,toxic obscene insult identity_hate,1 0 1 0 1 1,toxic severe_toxic obscene insult,1 1 1 0 1 0,0
19,You are gay or antisemmitian? \n\nArchangel WH...,"ASSHOLE! \n\nI just read this article, and man...",toxic obscene insult identity_hate,1 0 1 0 1 1,toxic severe_toxic obscene insult,1 1 1 0 1 0,0
21,You are gay or antisemmitian? \n\nArchangel WH...,"Hahaha, fuck you ESA, you worthless pieces of ...",toxic obscene insult identity_hate,1 0 1 0 1 1,toxic severe_toxic obscene insult,1 1 1 0 1 0,0
...,...,...,...,...,...,...,...
114385,Your absurd edits \n\nYour absurd edits on gre...,We need to talk... \n\nDo you suck dick privat...,toxic obscene insult,1 0 1 0 1 0,toxic severe_toxic obscene,1 1 1 0 0 0,0
114390,"""\n\nHey listen don't you ever!!!! Delete my e...",WEENIE \n\npecker\n\nfuckstick!!!\n\n dnjqo dq...,toxic insult,1 0 0 0 1 0,toxic severe_toxic obscene,1 1 1 0 0 0,0
114392,"""\n\nHey listen don't you ever!!!! Delete my e...","up, HES A FRAUD, A FUCKING FRAUD!!!!!!",toxic insult,1 0 0 0 1 0,toxic severe_toxic obscene,1 1 1 0 0 0,0
114401,and i'm going to keep posting the stuff u dele...,Fuck off. You have unwarranted self importance.,toxic obscene insult,1 0 1 0 1 0,toxic severe_toxic obscene,1 1 1 0 0 0,0


In [43]:
pair_df

Unnamed: 0,less_toxic,more_toxic,less_toxic_readable_label,less_toxic_bitmap_label,more_toxic_readable_label,more_toxic_bitmap_label,is_subset
0,Hey... what is it..\n@ | talk .\nWhat is it......,"""\nTruth is not always polite. Malleus does ac...",toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
1,Hey... what is it..\n@ | talk .\nWhat is it......,You're a motherfucking homo try getting a life...,toxic,1 0 0 0 0 0,toxic severe_toxic obscene insult,1 1 1 0 1 0,1
2,Hey... what is it..\n@ | talk .\nWhat is it......,honestly ==\nyou need to crawl under a rock an...,toxic,1 0 0 0 0 0,toxic severe_toxic obscene insult,1 1 1 0 1 0,1
3,Hey... what is it..\n@ | talk .\nWhat is it......,The bastard always does it to me. An eye for a...,toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
4,Hey... what is it..\n@ | talk .\nWhat is it......,"I will do Whatever i want and it is my right, ...",toxic,1 0 0 0 0 0,toxic obscene insult,1 0 1 0 1 0,1
...,...,...,...,...,...,...,...
228880,Semitic Christian speakers of Aramaic is what ...,Do not ever send me a message again i will hun...,,0 0 0 0 0 0,toxic threat,1 0 0 1 0 0,1
228881,Semitic Christian speakers of Aramaic is what ...,Shut your mouth and stop talking. good boy,,0 0 0 0 0 0,toxic,1 0 0 0 0 0,1
228882,AndyTheGrump \n\nMy issue is with AndyTheGrump...,sore I like to be violated: bugger,,0 0 0 0 0 0,toxic obscene,1 0 1 0 0 0,1
228883,AndyTheGrump \n\nMy issue is with AndyTheGrump...,I was not blocked for calling someone a Nazi. ...,,0 0 0 0 0 0,toxic insult,1 0 0 0 1 0,1


In [None]:
label_toxicity_df.to_csv(DATASET_DIR / 'label_toxicity.csv', index=False)

In [5]:
!ls -la $DATASET_DIR

total 27938268
drwxr-xr-x 2 jovyan users        4096 Jan 13 19:12 .
drwxr-xr-x 9 jovyan users        4096 Jan 10 19:05 ..
-rw-r--r-- 1 jovyan users        1699 Jan 12 11:46 label_toxicity.csv
-rw-r--r-- 1 jovyan users    64981283 Jan 13 08:38 train_no_leak.csv
-rw-r--r-- 1 jovyan users    67448851 Jan 13 08:40 train_no_leak_expanded.csv
-rw-r--r-- 1 jovyan users    30080933 Jan  6 20:31 train_no_leak_pair.csv
-rw-r--r-- 1 jovyan users    38018364 Dec 30 18:35 train_no_leak_pair_harder_1.csv
-rw-r--r-- 1 jovyan users    38087288 Dec 30 18:35 train_no_leak_pair_harder_2.csv
-rw-r--r-- 1 jovyan users    37854841 Dec 30 18:35 train_no_leak_pair_harder_3.csv
-rw-r--r-- 1 jovyan users    37854841 Dec 30 12:17 train_no_leak_pair_harder.csv
-rw-r--r-- 1 jovyan users  3418444331 Jan 12 12:03 train_no_leak_pair_v2.csv
-rw-r--r-- 1 jovyan users 24790812945 Jan 12 12:02 train_no_leak_pair_v2_full.csv
-rw-r--r-- 1 jovyan users    28057742 Jan 13 10:30 train_no_leak_pair_v3.csv
-rw-r--r-- 1 jovyan u

In [5]:
!ls -la $DATASET_DIR

total 28091280
drwxr-xr-x  2 jovyan users        4096 Jan 14 12:03 .
drwxr-xr-x 10 jovyan users        4096 Jan 22 20:20 ..
-rw-r--r--  1 jovyan users        1699 Jan 12 11:46 label_toxicity.csv
-rw-r--r--  1 jovyan users    64981283 Jan 13 08:38 train_no_leak.csv
-rw-r--r--  1 jovyan users    67448851 Jan 13 08:40 train_no_leak_expanded.csv
-rw-r--r--  1 jovyan users    30080933 Jan  6 20:31 train_no_leak_pair.csv
-rw-r--r--  1 jovyan users    38018364 Dec 30 18:35 train_no_leak_pair_harder_1.csv
-rw-r--r--  1 jovyan users    38087288 Dec 30 18:35 train_no_leak_pair_harder_2.csv
-rw-r--r--  1 jovyan users    37854841 Dec 30 18:35 train_no_leak_pair_harder_3.csv
-rw-r--r--  1 jovyan users    37854841 Dec 30 12:17 train_no_leak_pair_harder.csv
-rw-r--r--  1 jovyan users  3418444331 Jan 12 12:03 train_no_leak_pair_v2.csv
-rw-r--r--  1 jovyan users 24790812945 Jan 12 12:02 train_no_leak_pair_v2_full.csv
-rw-r--r--  1 jovyan users    28057742 Jan 13 10:30 train_no_leak_pair_v3.csv
-rw-r--r

In [13]:
def count_not_subset(df: pd.DataFrame) -> float:
    """Return the fraction of rows in `df` whose `is_subset` value equals 0."""
    not_subset_rows = int((df['is_subset'] == 0).sum())
    return not_subset_rows / len(df)

In [12]:
train_pair_v2_df = pd.read_csv(DATASET_DIR / 'train_no_leak_pair_v2.csv')
train_pair_v3_df = pd.read_csv(DATASET_DIR / 'train_no_leak_pair_v3.csv')
train_pair_v4_df = pd.read_csv(DATASET_DIR / 'train_no_leak_pair_v4.csv')
train_pair_v5_df = pd.read_csv(DATASET_DIR / 'train_no_leak_pair_v5.csv')

In [15]:
count_not_subset(train_pair_v4_df)

0.8667001759235989

In [21]:
train_pair_v4_df

Unnamed: 0,less_toxic,more_toxic,less_toxic_readable_label,less_toxic_bitmap_label,more_toxic_readable_label,more_toxic_bitmap_label,is_subset
0,Hey... what is it..\n@ | talk .\nWhat is it......,"""\n\n Geez Hesp, as well as being a coward you...",toxic,1 0 0 0 0 0,obscene insult,0 0 1 0 1 0,0
1,Hey... what is it..\n@ | talk .\nWhat is it......,Issues with Knox \n\nKnox run things like Gala...,toxic,1 0 0 0 0 0,obscene insult,0 0 1 0 1 0,0
2,You are gay or antisemmitian? \n\nArchangel WH...,YOU ARE FUCKING CALLING MY TALKING ON MY OWN F...,toxic obscene insult identity_hate,1 0 1 0 1 1,toxic severe_toxic obscene insult,1 1 1 0 1 0,0
3,You are gay or antisemmitian? \n\nArchangel WH...,thanks for all your hard work \n\nlook here yo...,toxic obscene insult identity_hate,1 0 1 0 1 1,toxic severe_toxic obscene threat insult,1 1 1 1 1 0,0
4,You are gay or antisemmitian? \n\nArchangel WH...,Thanks for being late to the party \n\nbut I h...,toxic obscene insult identity_hate,1 0 1 0 1 1,toxic severe_toxic obscene insult,1 1 1 0 1 0,0
...,...,...,...,...,...,...,...
59680,"""\n\nIt seems that a friend that I mentioned t...",What do you mean you arsehole? Thi IP is block...,,0 0 0 0 0 0,toxic obscene,1 0 1 0 0 0,1
59681,"""==Translation issues==\nIn the original Germa...","You don't listen so good, do you? I SAID STOP ...",,0 0 0 0 0 0,toxic,1 0 0 0 0 0,1
59682,Ultimates \n\nI'll delete it every time when t...,You reporting yourself for changing my edits m...,,0 0 0 0 0 0,toxic,1 0 0 0 0 0,1
59683,Plagarism\n\nThe introduction to this page is ...,removed his bullshits. 114.179.18.37 14:05,,0 0 0 0 0 0,toxic obscene,1 0 1 0 0 0,1


In [16]:
count_not_subset(train_pair_v5_df)

0.1354785154116696

In [21]:
def _merge_pair_df_list(df_list: t.List[pd.DataFrame]) -> pd.DataFrame:
    """Stack the pair frames, keeping the first row seen for each (more_toxic, less_toxic) text pair.

    Frames are processed in list order, so rows of earlier frames win over duplicates
    in later ones. Texts are compared after conversion with `str`.
    """
    seen_pairs = set()
    unique_rows = []
    for frame in df_list:
        for _, row in tqdm(frame.iterrows(), total=len(frame)):
            pair_key = (str(row['more_toxic']), str(row['less_toxic']))
            if pair_key in seen_pairs:
                continue
            seen_pairs.add(pair_key)
            unique_rows.append(dict(row))
    return pd.DataFrame(unique_rows)

In [24]:
train_pair_v4v5_df = _merge_pair_df_list([train_pair_v4_df, train_pair_v5_df[train_pair_v5_df['is_subset'] == 1].sample(n=len(train_pair_v4_df))])

  0%|          | 0/59685 [00:00<?, ?it/s]

  0%|          | 0/59685 [00:00<?, ?it/s]

In [25]:
train_pair_v4v5_df.to_csv(DATASET_DIR / 'train_no_leak_pair_v6.csv', index=False)

In [16]:
train_pair_harder_df = pd.read_csv(DATASET_DIR / 'train_no_leak_pair_harder.csv')

In [18]:
# v7: all non-subset pairs plus a random sample of subset pairs, 1.5x as many.
not_subset_pair_df = pair_df[pair_df['is_subset'] == 0]
subset_pair_df = pair_df[pair_df['is_subset'] == 1]
train_pair_v7_df = pd.concat([
    not_subset_pair_df,
    subset_pair_df.sample(n=int(len(not_subset_pair_df) * 1.5)),
])

In [21]:
train_pair_v7_df.to_csv(DATASET_DIR / 'train_no_leak_pair_v7.csv', index=False)