[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aangelopoulos/conformal-prediction/blob/main/notebooks/toxic-text-outlier-detection.ipynb)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [2]:
# Load cached Detoxify model outputs on the Jigsaw dataset. See https://github.com/unitaryai/detoxify for details.
# The comments are from Wikipedia talk channels, and we are trying to perform outlier detection on them.
# Only the non-toxic data is used for calibration; toxic outliers are then identified with type-1 error control.
if not os.path.exists('../data'):
    # Download the archive, unpack it into the parent directory, then delete the archive.
    for shell_cmd in (
        'gdown 1h7S6N_Rx7gdfO3ZunzErZy6H7620EbZK -O ../data.tar.gz',
        'tar -xf ../data.tar.gz -C ../',
        'rm ../data.tar.gz',
    ):
        os.system(shell_cmd)

data = np.load('../data/toxic-text/toxic-text-detoxify.npz')
preds, toxic = data['preds'], data['labels']  # Toxicity score in [0,1]; toxic (1) or not (0)

In [3]:
# Problem setup
alpha = 0.1  # Desired type-1 error rate: the fraction of non-toxic text we tolerate flagging as an outlier
n = 10_000  # Number of non-toxic points used for calibration

In [4]:
# Separate the scores by ground-truth label; only the non-toxic ones are used for calibration
nontoxic = toxic == 0
preds_nontoxic, preds_toxic = preds[nontoxic], preds[~nontoxic]

# Split the nontoxic data: a boolean mask with exactly n True entries is shuffled,
# so n random points go to calibration and the rest to validation.
idx = np.array([True] * n + [False] * (preds_nontoxic.shape[0] - n), dtype=bool)
np.random.shuffle(idx)
cal_scores = preds_nontoxic[idx]
val_scores = preds_nontoxic[~idx]

### Conformal outlier detection happens here

In [5]:
# Use the outlier detection method to get a threshold on the toxicities:
# the ceil((n+1)*(1-alpha))/n empirical quantile of the non-toxic calibration scores.
quantile_level = np.ceil((n+1)*(1-alpha))/n
if quantile_level > 1:
    # Too few calibration points for this alpha: np.quantile rejects levels above 1,
    # and only an infinite threshold (flag nothing) keeps the type-1 error at or below alpha.
    qhat = np.inf
else:
    qhat = np.quantile(cal_scores, quantile_level)
# Perform outlier detection on the in-distribution (ind) and out-of-distribution (ood) data
outlier_ind = val_scores > qhat # We want this to be no more than alpha on average
outlier_ood = preds_toxic > qhat # We want this to be as large as possible, but it doesn't have a guarantee

In [6]:
# Type-1 error: fraction of held-out non-toxic scores flagged as outliers.
# Type-2 error: fraction of toxic scores that were not flagged.
type1 = np.mean(outlier_ind)
type2 = 1 - np.mean(outlier_ood)
print(f"The type-1 error is {type1:.4f}, the type-2 error is {type2:.4f}, and the threshold is {qhat:.4f}.")

The type-1 error is 0.0983, the type-2 error is 0.3012, and the threshold is 0.4888.


In [7]:
# Show five randomly drawn comments from each side of the threshold: unflagged first, then flagged
content = pd.read_csv('../generation-scripts/toxic_text_utils/test.csv')['content']
for header, selected in (
    ("Unflagged text examples:", preds <= qhat),
    ("\n\nFlagged text examples:", preds > qhat),
):
    print(header)
    print(list(np.random.choice(content[selected], size=(5,))))

Unflagged text examples:
['ХОТЕЛОСЬ БЫ УЗНАТЬ ,КАК В ЭТОМ ГОДУ БУДУТ ПРОИЗВОДИТЬ СПУСК ВОДЫ И БУДУТ ЛИ ЗАТОПЛЕНИЯ РЕКОЙ ИШИМ В ГОРОДЕ Г ИШИМ ТАТЬЯНА', 'Certo che Luisa, a suo tempo, non l ha messo in bibliografia ma ci ha scopiazzato abbastanza pesantemente dal Coarelli... le frasi sono uguali all 80%, ho cercato di riformularne un po , ma a volte il confronto è davvero imbarazzante.   SailKo   ', 'el pueblo que no sabe ni leer ni escribir es el pueblo facil de enganyar y mas inculcando el miedo mediante la creacion de un ser imaginario como satàn. Grandes crueldades ha cometido la iglesia i la retencion de grandes procesos cientificos, si no, que se lo digan a galileo galilei.', 'Ceci est ta réponse ??? Tu réponds à mon argumentation choc ci-dessus ? Voilà je crois que tu as tout dit, tu peux t en allez maintenant et supprimer cet article ridicule si tu as un temps soit peu d éthique ! Et pour finir, l auteur devrait lire des livres, pas la presse.', 'Здравствуйте! Спасибо за интерес 