In [1]:
import os, sys
from pathlib import Path
from typing import Dict, Tuple, List
import spacy

# Working directory the notebook is executed from.
HOME = os.getcwd()

# The 'data' folder sits next to the notebook's directory (one level up from HOME).
DATA_FOLDER = str(Path(HOME).parent / 'data')

# Location of the 'filtered.tsv' file inside the data folder.
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')

In [2]:
# Climb from the notebook's working directory until a directory that contains
# an entry named 'src' is found; that directory is used as the project root.
current = HOME
while 'src' not in os.listdir(current):
    parent = Path(current).parent
    if parent == Path(current):
        # The parent of a filesystem root is the root itself, so without this
        # guard the loop would spin forever when no ancestor contains 'src'.
        raise FileNotFoundError(
            f"could not find a directory containing 'src' at or above {HOME!r}"
        )
    current = parent

PARENT_DIR = current

# Make the project root and a few of its sub-folders importable.
# Entries already present are skipped so re-running the cell does not keep
# growing sys.path with duplicates.
for extra_path in (
    str(current),
    os.path.join(str(current), 'data_analysis'),
    os.path.join(str(current), 'evaluation'),
    os.path.join(str(current), 'text_processing'),
):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [3]:
# Project helpers used below: `pr` for text processing, `prd` for data preparation.
import src.text_processing.preprocess as pr
import src.data_preparation.prepare_data as prd

# The step that fixes the initial data is kept disabled; presumably it is what
# produced 'fixed.csv' in the first place — TODO confirm before re-enabling.
# fixed_data = prd.fix_initial_data(data_path)

# Build the dataset from the already-fixed CSV stored in the data folder.
data = prd.prepare_all_data(
    fixed_data_file=os.path.join(DATA_FOLDER, 'fixed.csv'),
    save=False,
)

  from .autonotebook import tqdm as notebook_tqdm


## Process the data: use NER to reduce the overall number of unigrams and bigrams.

In [None]:
import random
# sample = data.select(range(10 ** 4))

def process_text(text: str) -> str:
    """Clean a single text with the three `pr` helpers, applied in order.

    The text goes through ``pr.to_lower``, then ``pr.no_extra_chars`` and
    finally ``pr.no_extra_spaces``; the result of the last call is returned.
    """
    lowered = pr.to_lower(text)
    without_extra_chars = pr.no_extra_chars(lowered)
    return pr.no_extra_spaces(without_extra_chars)

def process_batch(batch: Dict, nlp) -> Dict[str, List[str]]:
    """Process the 'source' and 'target' columns of one batch.

    Each column is passed, together with ``nlp``, to ``pr.uniform_ne_batched``;
    every text it yields is then cleaned with ``process_text``.

    Args:
        batch: mapping with at least the 'source' and 'target' keys, each
            holding the texts of one batch (this is how the function is
            called through ``map(..., batched=True)`` in this notebook).
        nlp: object forwarded unchanged to ``pr.uniform_ne_batched``.

    Returns:
        A dict with exactly the keys 'source' and 'target', each mapped to the
        list of processed texts for that column.
    """
    # The random "really ?" debug print that used to live here was removed:
    # it wrote to stdout non-deterministically and consumed global RNG state
    # on every batch without affecting the returned value.
    return {
        column: [process_text(text) for text in pr.uniform_ne_batched(batch[column], nlp)]
        for column in ("source", "target")
    }

# Load the spaCy pipeline once, in advance, so every batch reuses the same object.
nlp = spacy.load("en_core_web_sm")

# Run process_batch over the dataset in batched mode.
processed_data = data.map(
    lambda batch: process_batch(batch, nlp),
    batched=True,
)

# Keep only the rows whose 'source' and 'target' are both strings.
processed_data = processed_data.filter(
    lambda row: all(isinstance(row[column], str) for column in ('source', 'target'))
)

# Write the processed rows to the data folder.
processed_data.to_csv(
    os.path.join(DATA_FOLDER, 'all_data_processed.csv'),
    index=False,
)

In [None]:
# Same filter-and-save step that ends the previous cell:
# drop every row where 'source' or 'target' is not a string ...
processed_data = processed_data.filter(
    lambda row: all(isinstance(row[column], str) for column in ('source', 'target'))
)

# ... then write the result to 'all_data_processed.csv' in the data folder.
processed_csv_path = os.path.join(DATA_FOLDER, 'all_data_processed.csv')
processed_data.to_csv(processed_csv_path, index=False)

In [4]:
from datasets import load_dataset

# Read the processed CSV back from the data folder as the 'train' split.
processed_data = load_dataset(
    "csv",
    data_files=os.path.join(DATA_FOLDER, "all_data_processed.csv"),
    split='train',
)

# Discard any row whose 'source' or 'target' did not come back as a string.
processed_data = processed_data.filter(
    lambda row: all(isinstance(row[column], str) for column in ('source', 'target'))
)

## Build the map between each unigram and its toxicity score.

In [5]:
import importlib
from src.models.baseline import n_grams as ng

# Reload the module so edits made to n_grams since it was first imported are picked up.
importlib.reload(ng)

# Build both mappings from the whole of processed_data (no sub-sampling happens here).
uni, bi = ng.build_unigram_counter(processed_data)

Map: 100%|██████████| 597519/597519 [01:05<00:00, 9179.63 examples/s]


In [7]:
from empiricaldist import Cdf

# Only the values of the two mappings are needed to build the distributions.
u_scores = [score for _, score in uni.items()]
bi_scores = [score for _, score in bi.items()]

# Empirical CDFs of the unigram and bigram values.
cdf_u = Cdf.from_seq(u_scores)
cdf_b = Cdf.from_seq(bi_scores)

# NOTE(review): in empiricaldist, Cdf.forward appears to map a quantity to its
# cumulative probability, so the values below would be P(score <= 0.4) and
# P(score <= 0.2) rather than the 0.4 / 0.2 quantiles (Cdf.inverse gives
# quantiles) — confirm which one was intended.
toxicity_threshold_u = cdf_u.forward(0.4).item()
toxicity_threshold_bi = cdf_b.forward(0.4).item()
default_toxicitiy = cdf_u.forward(0.2).item()

# Show the three values as the cell output.
(toxicity_threshold_bi, toxicity_threshold_u, default_toxicitiy)


(0.07776813343654909, 0.09865210318382524, 0.01777829421333953)

In [8]:
# Pick up any edits made to the n_grams module before trying it out.
importlib.reload(ng)

# Try the attribute extraction on a single hand-written sentence, using the
# thresholds and default value computed in the previous cell.
s = "I hate your face !!"
res = ng.get_toxicity_attributes(
    s,
    uni_threshold=toxicity_threshold_u,
    bi_threshold=toxicity_threshold_bi,
    uni_gram=uni,
    bi_gram=bi,
    default_toxicity=default_toxicitiy,
)

print(res)

{'hate', 'face'}
