In [1]:
import os, sys
from pathlib import Path
from typing import Dict, Tuple, List
import spacy

In [2]:

HOME = os.getcwd()
current = HOME 
while 'src' not in os.listdir(current):
    current = Path(current).parent

PARENT_DIR = current

DATA_FOLDER = os.path.join(PARENT_DIR, 'src','data')
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')

sys.path.append(str(current))
sys.path.append(os.path.join(str(current), 'data_analysis'))
sys.path.append(os.path.join(str(current), 'evaluation'))
sys.path.append(os.path.join(str(current), 'text_processing'))

In [3]:
# let's first fix the data 
import src.text_processing.preprocess as pr
import src.data_preparation.prepare_data as prd 
# fixed_data = prd.fix_initial_data(data_path)
data = prd.prepare_all_data(fixed_data_file=os.path.join(DATA_FOLDER, 'fixed.csv'), save=False)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import random
def process_text(text: str) -> str:
    return pr.no_extra_spaces(pr.no_extra_chars(pr.to_lower(text)))

def process_batch(batch: Dict, nlp):
    p = random.random()
    if p < 10 ** -5:
        print("really ?")
    return dict([(k, [process_text(t) for t in v]) for k, v in ({"source": pr.uniform_ne_batched(batch['source'], nlp), 
                                                                 "target": pr.uniform_ne_batched(batch['target'], nlp)}).items()]) 

# load the nlp object in advnace
nlp = spacy.load("en_core_web_sm")
# processed_data = data.map(lambda b: process_batch(b, nlp), batched=True)

In [5]:
# processed_data = processed_data.filter(lambda s: (isinstance(s['source'], str) and isinstance(s['target'], str)))
# # save the data
# processed_data.to_csv(os.path.join(DATA_FOLDER, 'all_data_processed.csv'), index=False)

In [6]:
from datasets import load_dataset
processed_data = load_dataset("csv", data_files=os.path.join(DATA_FOLDER, "all_data_processed.csv"), split='train')
processed_data = processed_data.filter(lambda s: (isinstance(s['source'], str) and isinstance(s['target'], str)))
processed_data

Dataset({
    features: ['source', 'target'],
    num_rows: 597519
})

In [7]:
# let's take a small smaple of 10000 rows
from src.toxicity_scores import n_grams as ng
import importlib
importlib.reload(ng)
# sample = processed_data.select(range(10 ** 4))
uni, bi = ng.build_unigram_counter(processed_data, save_folder=os.getcwd())
# ignore_map = ng.build_ignore_toxic_map(0.1, pr.standard_stop_words())
# ignore_map

Map: 100%|██████████| 597519/597519 [01:06<00:00, 9034.24 examples/s]


In [14]:
u, b = dict([(k, (v["source"] + 1) / (v["target"] + 1)) for k, v in uni.items()]) , dict([(k, (v["source"] + 1) / (v["target"] + 1)) for k, v in bi.items()]) 

In [15]:
from empiricaldist import Cdf

u_scores = [v for k, v in u.items()]
bi_scores = [v for k, v in b.items()]

cdf_u = Cdf.from_seq(u_scores)
cdf_b = Cdf.from_seq(bi_scores)

toxicity_threshold_u = cdf_u.forward(0.4).item()
toxicity_threshold_bi = cdf_b.forward(0.4).item()
default_toxicitiy = cdf_u.forward(0.2).item()
toxicity_threshold_bi, toxicity_threshold_u, default_toxicitiy


(0.07776813343654909, 0.09865210318382524, 0.01777829421333953)

In [16]:
def mask_sentence(s: str, masks: List[str], mask_token):
    ls = pr.lemmatize(s)
    return " ".join([(c if c not in masks else mask_token) for c in ls])


In [30]:
import torch

def baseline_predict(sentences : str, 
                     model,
                     tokenizer,
                     uni_gram, 
                     bi_gram,  
                     ):

    mask_token = tokenizer.mask_token

    # first extract the toxicity attribued
    masks = [ng.get_toxicity_attributes(s, 
                                    uni_gram=uni_gram, 
                                    bi_gram=bi_gram) for s in sentences]
    
    masked_sentences = [mask_sentence(s, m, mask_token) for s, m in zip(sentences, masks)]

    inputs = tokenizer(masked_sentences
    , return_tensors="pt", padding=True)
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]


    logits = model(**inputs).logits
    mask_token_logits = logits[0, mask_token_index, :]
    top_tokens = torch.topk(mask_token_logits, 1, dim=1).indices.tolist()

    return [text.replace(tokenizer.mask_token, tokenizer.decode(token)) for text, token in zip(masked_sentences, top_tokens)]

In [35]:
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForMaskedLM

checkpoint = 'distilbert-base-uncased'
sentences = ['do not do that ', "you studid little bastard"]
model = AutoModelForMaskedLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

baseline_predict(sentences=sentences, model=model, tokenizer=tokenizer, uni_gram=u, bi_gram=b)

['do not do that', 'you studid . .']

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(sentences
, return_tensors="pt", padding=True)
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

In [29]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(checkpoint)
logits = model(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]
top_tokens = torch.topk(mask_token_logits, 1, dim=1).indices.tolist()

for token, text in zip(top_tokens, sentences):
    print(text.replace(tokenizer.mask_token, tokenizer.decode(token)))

Fuck off !
Please shut up you stupid !


In [None]:
from transformers import pipeline, AutoTokenizer

checkpoint = 'distilbert-base-uncased'
pipe = pipeline('fill-mask', model=checkpoint)
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
sentences = ['Fuck off nigga', "Please shut up you stupid bitch"]
baseline_predict(sentences, masked_ml_pipe=pipe, uni_gram=u, bi_gram=b, mask_token=bert_tokenizer.mask_token)

Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython.pyx", line 577, in _pydevd_bundle.pydevd_cython.PyDBFrame._handle_exception
  File "_pydevd_bundle/pydevd_cython.pyx", line 312, in _pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend
  File "/home/ayhem18/DEV/TextDetoxification/env/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 2070, in do_wait_suspend
    keep_suspended = self._do_wait_suspend(thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ayhem18/DEV/TextDetoxification/env/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 2106, in _do_wait_suspend
    time.sleep(0.01)
KeyboardInterrupt


TypeError: list indices must be integers or slices, not str

In [None]:
import n_grams as ng
importlib.reload(ng)
s = "Good boy!!"  
res = ng.get_toxicity_attributes(s, 
                                 uni_threshold=toxicity_threshold_u, 
                                 bi_threshold=toxicity_threshold_bi, 
                                 uni_gram=u, 
                                 bi_gram=b, 
                                 default_toxicity=default_toxicitiy)
print(res)

{'boy'}


In [None]:
ls = pr.lemmatize(s)
# replace them with mask tokens
# pass it to a pretrained model.
# masked sentence 
    

In [None]:
import pickle
with open(os.path.join(PARENT_DIR, 'src', 'toxicity_scores', 'uni_gram.pk'), 'rb') as f:
    counter_loaded = pickle.load(f) 

'[MASK]'

In [None]:
masked_ls = " ".join([(c if c not in res else bert_tokenizer.mask_token) for c in ls])
masked_ls
output = mask_filler([masked_ls, masked_ls], top_k=1)
output[0]

[{'score': 0.17653395235538483,
  'token': 2851,
  'token_str': 'morning',
  'sequence': 'good morning!!'}]

In [None]:
import pickle
with open(os.path.join(PARENT_DIR, 'src', 'toxicity_scores', 'uni_gram.pk'), 'rb') as f:
    counter_loaded = pickle.load(f) 

FileNotFoundError: [Errno 2] No such file or directory: '/home/ayhem18/DEV/TextDetoxification/src/toxicity_scores/uni_gram.pk'