In [7]:
import os
from pathlib import Path


# Resolve the data locations relative to the notebook's working directory:
# the `data` folder is looked up next to (not inside) that directory.
HOME = os.getcwd()
DATA_FOLDER = str(Path(HOME).parent / 'data')
data_path = str(Path(DATA_FOLDER) / 'filtered.tsv')

In [8]:
import pandas as pd
# Load the tab-separated dataset, using its first column as the row index.
df = pd.read_csv(
    data_path,
    sep='\t',
    index_col=0,
)
# Preview the first 20 rows.
df.head(n=20)

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131
7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.1875,0.96368,0.14871
8,"Briggs, what the hell's happening?","Briggs, what the hell is going on?",0.920373,0.0,0.159096,0.841071
9,"Another one simply had no clue what to do, so ...","another simply didn't know what to do, so when...",0.87754,0.101695,0.055371,0.930472


# Initial remarks
From the very first rows we can see that the columns are mixed up: `reference` is not always the more toxic sentence of the pair. We fix this by mapping the sentence with the higher toxicity score to `source` and the other one to `target`.

In [9]:
import numpy as np
# fix the order
def fix_order_map(row):
    """Return a copy of ``row`` with the sentence pair ordered by toxicity.

    The sentence with the strictly higher toxicity score becomes ``source``
    and the other one ``target``; the matching scores are stored in
    ``source_tox`` and ``target_tox``. When ``ref_tox`` is not strictly
    greater than ``trn_tox`` (including ties), the translation is used as
    the source.

    Parameters
    ----------
    row : pandas.Series
        A row holding the ``reference``, ``translation``, ``ref_tox`` and
        ``trn_tox`` entries.

    Returns
    -------
    pandas.Series
        A new object with every original entry plus ``source``, ``target``,
        ``source_tox`` and ``target_tox`` (appended in that order). The
        input ``row`` is left untouched.
    """
    # pandas does not support functions that mutate the object handed to
    # `DataFrame.apply`, so the new fields are written to a copy.
    fixed = row.copy()

    # Decide the orientation once so texts and scores cannot get out of sync.
    if row['ref_tox'] > row['trn_tox']:
        text_keys = ('reference', 'translation')
        score_keys = ('ref_tox', 'trn_tox')
    else:
        text_keys = ('translation', 'reference')
        score_keys = ('trn_tox', 'ref_tox')

    fixed['source'], fixed['target'] = row[text_keys[0]], row[text_keys[1]]
    fixed['source_tox'], fixed['target_tox'] = row[score_keys[0]], row[score_keys[1]]
    return fixed

# Reorder every row, then discard the original (unordered) columns.
df_fixed = df.apply(fix_order_map, axis=1).drop(
    columns=['translation', 'reference', 'ref_tox', 'trn_tox']
)

# Sanity check: every source must score strictly higher than its target.
assert (df_fixed['source_tox'] > df_fixed['target_tox']).all()

# Persist the reordered data as a comma-separated file, without the index.
df_fixed.to_csv(
    os.path.join(DATA_FOLDER, 'fixed.csv'),
    sep=',',
    index=False,
)

In [10]:
# Reload the reordered dataset written to `fixed.csv` and preview 10 rows.
df = pd.read_csv(
    os.path.join(DATA_FOLDER, 'fixed.csv'),
    sep=',',
)
df.head(n=10)

Unnamed: 0,similarity,lenght_diff,source,target,source_tox,target_tox
0,0.785171,0.010309,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t...",0.981983,0.014195
1,0.749687,0.071429,you're becoming disgusting.,Now you're getting nasty.,0.999039,0.065473
2,0.919051,0.268293,"well, we can spare your life.","Well, we could spare your life, for one.",0.985068,0.213313
3,0.664333,0.309524,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it.",0.994215,0.053362
4,0.726639,0.181818,I have orders to kill her.,I've got orders to put her down.,0.999348,0.009402
5,0.703185,0.206522,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.950956,0.035846
6,0.618866,0.230769,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.999492,0.000131
7,0.720482,0.1875,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.96368,0.14871
8,0.920373,0.0,"Briggs, what the hell is going on?","Briggs, what the hell's happening?",0.841071,0.159096
9,0.87754,0.101695,"another simply didn't know what to do, so when...","Another one simply had no clue what to do, so ...",0.930472,0.055371


## Summarization

Let's consider a simple hypothesis: Concise sentences tend to be less toxic than lengthy ones.

To elaborate, let's consider a few examples from the dataset.

In [11]:
# Add the character length of each source sentence. `.str.len()` is the
# vectorized form of `apply(len)` and yields NaN for missing values
# instead of raising.
df['len_source'] = df['source'].str.len()

# Keep the 10 rows with the longest source sentences.
df_sample = df.sort_values(by='len_source', ascending=False).iloc[:10]

# Print the source and target lengths of the 3 longest examples; slicing
# up front replaces the in-loop `i < 3` filter.
for _index, r in df_sample.iloc[:3].iterrows():
    source = r['source']
    target = r['target']
    print(len(source))
    print(len(target))
    print("#" * 100)

1401
867
####################################################################################################
1032
891
####################################################################################################
1013
838
####################################################################################################


The source sentences tend to contain filler words. Processing such sentences without altering their meaning (which is crucial for our purposes) is extremely challenging with simple, rule-based processing methods, since most stop words are in fact part of the core meaning and might be crucial for the toxicity classifier.

In [12]:
# let's see how things go here!!
#


In [13]:
# let's summarize each sentence and see how things go 
