In [1]:
import os, sys
from pathlib import Path
from typing import Dict, Tuple, List
import spacy

# Directory the notebook kernel was started in.
HOME = os.getcwd()
# The `data` folder sits next to the notebook's directory (one level up).
# Kept as plain strings so they can be passed to os.path.join elsewhere.
DATA_FOLDER = str(Path(HOME).parent / 'data')
# Raw filtered dataset inside the data folder.
data_path = str(Path(DATA_FOLDER) / 'filtered.tsv')

In [2]:
# Walk upwards from the working directory until a folder that contains `src`
# is found; that folder is treated as the project root.
current = HOME
while 'src' not in os.listdir(current):
    parent = Path(current).parent
    if parent == Path(current):
        # The filesystem root is its own parent: without this guard the loop
        # would never terminate when no ancestor contains `src`.
        raise FileNotFoundError(
            f"No 'src' directory found in {HOME!r} or any of its parent folders"
        )
    current = parent

PARENT_DIR = current

# Make the project root and a few sub-folders importable.
# NOTE(review): the notebook imports modules as `src.<package>...`, so confirm
# that these three folders really live directly under the project root.
for extra_path in (
    str(current),
    os.path.join(str(current), 'data_analysis'),
    os.path.join(str(current), 'evaluation'),
    os.path.join(str(current), 'text_processing'),
):
    # Skip entries that are already present so re-running the cell does not
    # keep growing sys.path with duplicates.
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [3]:
# Build the working dataset from the already-repaired CSV file.
import src.text_processing.preprocess as pr
import src.data_preparation.prepare_data as prd

# The repair step only needs to run once; it is kept here for reference:
# fixed_data = prd.fix_initial_data(data_path)
fixed_csv_path = os.path.join(DATA_FOLDER, 'fixed.csv')
# save=False is passed through to prepare_all_data unchanged.
data = prd.prepare_all_data(
    fixed_data_file=fixed_csv_path,
    save=False,
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 13797.05it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 674.22it/s]
Generating train split: 577777 examples [00:01, 468400.28 examples/s]


In [4]:
# import random
# # sample = data.select(range(10 ** 4))
# def process_text(text: str) -> str:
#     return pr.no_extra_spaces(pr.no_extra_chars(pr.to_lower(text)))

# def process_batch(batch: Dict, nlp):
#     p = random.random()
#     if p < 10 ** -5:
#         print("really ?")
#     return dict([(k, [process_text(t) for t in v]) for k, v in ({"source": pr.uniform_ne_batched(batch['source'], nlp), 
#                                                                  "target": pr.uniform_ne_batched(batch['target'], nlp)}).items()]) 

# # load the nlp object in advance
# nlp = spacy.load("en_core_web_sm")
# processed_data = data.map(lambda b: process_batch(b, nlp), batched=True)

In [5]:
# processed_data = processed_data.filter(lambda s: (isinstance(s['source'], str) and isinstance(s['target'], str)))
# # save the data
# processed_data.to_csv(os.path.join(DATA_FOLDER, 'all_data_processed.csv'), index=False)

In [6]:
from datasets import load_dataset

# Reload the preprocessed sentence pairs that were saved to CSV earlier.
processed_csv_path = os.path.join(DATA_FOLDER, "all_data_processed.csv")
processed_data = load_dataset(
    "csv",
    data_files=processed_csv_path,
    split='train',
).filter(
    # Keep only rows where both text fields are real strings.
    lambda row: (isinstance(row['source'], str) and isinstance(row['target'], str))
)

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 13486.51it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 723.28it/s]
Generating train split: 597521 examples [00:00, 790267.72 examples/s]
Filter: 100%|██████████| 597521/597521 [00:01<00:00, 552869.66 examples/s]


In [7]:
# Count n-grams over the processed dataset. The 10 ** 4-row sample is
# currently disabled (see the commented line), so the full dataset is used.
import importlib
from src.toxicity_scores import n_grams as ng

# Pick up any edits made to the n_grams module since it was first imported.
importlib.reload(ng)
# sample = processed_data.select(range(10 ** 4))

# build_unigram_counter returns two objects, unpacked here as the unigram
# and bigram counters.
uni, bi = ng.build_unigram_counter(processed_data)

stop_words = pr.standard_stop_words()
ignore_map = ng.build_ignore_toxic_map(0.1, stop_words)
# ignore_map

Map: 100%|██████████| 597519/597519 [01:05<00:00, 9102.76 examples/s]


8021 non alpha tokens


In [8]:
def _smoothed_source_target_ratio(counts):
    """Map every key of `counts` to (source + 1) / (target + 1).

    Each value of `counts` must support lookups by "source" and "target".
    Adding one to both counts avoids a division by zero when the "target"
    count is 0.
    """
    return {
        n_gram: (freq["source"] + 1) / (freq["target"] + 1)
        for n_gram, freq in counts.items()
    }


# Per-n-gram ratio scores for unigrams (`u`) and bigrams (`b`).
u = _smoothed_source_target_ratio(uni)
b = _smoothed_source_target_ratio(bi)

In [33]:
from empiricaldist import Cdf

# Ratio scores of every unigram / bigram, without their keys.
u_scores = list(u.values())
bi_scores = list(b.values())

# Empirical cumulative distributions of those scores.
cdf_u = Cdf.from_seq(u_scores)
cdf_b = Cdf.from_seq(bi_scores)


def _cdf_value_at(cdf, x):
    """Evaluate `cdf` at `x` and unwrap the result into a plain Python scalar."""
    return cdf.forward(x).item()


# NOTE(review): `Cdf.forward(x)` evaluates the CDF at x, so each value below is
# the fraction of n-grams whose score is <= x (a probability), not the score
# found at a given quantile (that would be `Cdf.inverse(p)`). Confirm that a
# probability is really what should be compared against the ratio scores.
toxicity_threshold_u = _cdf_value_at(cdf_u, 0.4)
toxicity_threshold_bi = _cdf_value_at(cdf_b, 0.4)
# The spelling `default_toxicitiy` is kept because other cells use this name.
default_toxicitiy = _cdf_value_at(cdf_u, 0.2)
toxicity_threshold_bi, toxicity_threshold_u, default_toxicitiy


(0.07776813343654909, 0.09865210318382524, 0.01777829421333953)

In [28]:
# Reload n_grams so the latest version of get_toxicity_attributes is used.
importlib.reload(ng)

# Example sentence for a quick manual check.
s = "I hate your face !!"

# Thresholds and score tables computed in the earlier cells.
toxicity_settings = {
    "uni_threshold": toxicity_threshold_u,
    "bi_threshold": toxicity_threshold_bi,
    "uni_gram": u,
    "bi_gram": b,
    "default_toxicity": default_toxicitiy,
}
res = ng.get_toxicity_attributes(s, **toxicity_settings)

print(res)

{'hate', 'face'}


In [29]:
# Lemmatize the example sentence `s`; the result is iterated item by item
# when the masked sentence is built later on.
ls = pr.lemmatize(s)
# Plan for the next cells:
#   1. replace the items of `ls` that were flagged in `res` with mask tokens;
#   2. pass the resulting masked sentence to a pretrained masked-language model.
    

In [38]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

# Checkpoint shared by the fill-mask pipeline and the tokenizer below.
ckpnt = "distilbert-base-uncased"

# Ready-made pipeline that proposes replacements for masked tokens.
mask_filler = pipeline(task="fill-mask", model=ckpnt)
# bert = AutoModelForMaskedLM.from_pretrained(ckpnt)

# The tokenizer is loaded separately to get at the checkpoint's mask token.
bert_tokenizer = AutoTokenizer.from_pretrained(ckpnt)
bert_tokenizer.mask_token

'[MASK]'

In [42]:
# Swap every item of `ls` that was flagged in `res` for the tokenizer's mask
# token, then join everything back into a single space-separated sentence.
masked_tokens = [
    bert_tokenizer.mask_token if token in res else token
    for token in ls
]
masked_ls = " ".join(masked_tokens)

# Ask for the five best candidates per masked position.
output = mask_filler(masked_ls, top_k=5)
# Show only the first entry of the pipeline result.
print(output[0])


[{'score': 0.16346710920333862, 'token': 2572, 'token_str': 'am', 'sequence': '[CLS] i am your [MASK]!! [SEP]'}, {'score': 0.06906559318304062, 'token': 2293, 'token_str': 'love', 'sequence': '[CLS] i love your [MASK]!! [SEP]'}, {'score': 0.04854947701096535, 'token': 11693, 'token_str': 'beg', 'sequence': '[CLS] i beg your [MASK]!! [SEP]'}, {'score': 0.02307771146297455, 'token': 10312, 'token_str': 'stole', 'sequence': '[CLS] i stole your [MASK]!! [SEP]'}, {'score': 0.022854004055261612, 'token': 2215, 'token_str': 'want', 'sequence': '[CLS] i want your [MASK]!! [SEP]'}]


In [9]:
# Reload n_grams so the latest helper implementations are used.
importlib.reload(ng)

# Example sentence for a quick manual check.
s = "Are you a nice person ?"

stop_words = pr.standard_stop_words()
ids = ng._prepare_sentence(s, stop_words)

# The default toxicity is hard-coded to 0.1 in this call.
ng.indices_toxicity_score(
    indices=ids,
    ignore_map=ignore_map,
    uni_gram=u,
    bi_gram=b,
    default_toxicity=0.1,
)

[0, 2.6141357727491576, 2.672540279138736]

In [10]:
from transformers import AutoTokenizer, DistilBertModel

# Load the tokenizer and the DistilBERT model from the same checkpoint.
distilbert_checkpoint = "distilbert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(distilbert_checkpoint)
bert = DistilBertModel.from_pretrained(distilbert_checkpoint)
# bert_tokenizer.convert_ids_to_tokens([6583, 2131])

In [11]:
import pickle

# Location of the pickled unigram object inside the project tree.
uni_gram_pickle_path = os.path.join(PARENT_DIR, 'src', 'toxicity_scores', 'uni_gram.pk')
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project.
with open(uni_gram_pickle_path, 'rb') as f:
    counter_loaded = pickle.load(f)