# yo-yo-maskr
## Part 1: Regex-based anonymization

In [3]:
import dill
import regex as re
from src.utils.ano_regex import create_names_regex
from src.utils.trie import Trie

# NAMES_FROM_PICKLE: load the patterns pickled by an earlier run of this cell
#                    instead of rebuilding them from the name lists.
# USE_FULL:          read the "*_full.txt" name lists instead of the default ones.
NAMES_FROM_PICKLE = False
USE_FULL = False


def _read_names(path):
    """Return the non-empty, whitespace-stripped lines of the text file at *path*."""
    # Explicit encoding so the result does not depend on the platform's locale
    # default (assumes the name lists are stored as UTF-8 — TODO confirm).
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f.read().split('\n'))
        return [name for name in stripped if name]


def _load_pickle(path):
    """Return the object stored in the dill pickle at *path*."""
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(path, 'rb') as f:
        return dill.load(f)


def _dump_pickle(obj, path):
    """Write *obj* to *path* as a dill pickle, overwriting any existing file."""
    with open(path, 'wb') as f:
        dill.dump(obj, f)


def _build_trie(names):
    """Return a Trie containing every entry of *names*, added longest first."""
    trie = Trie()
    for name in sorted(names, key=len, reverse=True):
        trie.add(name)
    return trie


def _compile_trie_regex(trie):
    """Compile the trie's pattern, wrapped in word boundaries, into a regex."""
    return re.compile(r'\b' + trie.pattern() + r'\b')


if NAMES_FROM_PICKLE:
    # Bind the same pattern names the rebuild branch produces so that later
    # cells can use either branch.
    first_names_regex = _load_pickle('./data/first_names_regex.pkl')
    # 'last_names_regex.pkl' is the file the rebuild branch writes for this
    # pattern (the trie pattern lives in 'last_names_trie_regex.pkl').
    last_names_regex = _load_pickle('./data/last_names_regex.pkl')
    first_trie_regex = _load_pickle('./data/first_names_trie_regex.pkl')
    last_trie_regex = _load_pickle('./data/last_names_trie_regex.pkl')

    # Names this branch has always bound, kept for existing callers.
    # NOTE(review): here first_names / last_names hold the unpickled pattern
    # objects, not lists of names as in the rebuild branch; code that iterates
    # over the names (or needs the Trie objects) requires NAMES_FROM_PICKLE = False.
    first_names = first_names_regex
    last_names = last_names_regex
    first_names_trie = first_trie_regex
    last_names_trie = last_trie_regex
else:
    _suffix = '_full' if USE_FULL else ''

    # First names: plain name list -> names regex -> trie -> trie regex,
    # pickling both patterns along the way.
    first_names = _read_names(f'./data/first_names{_suffix}.txt')
    first_names_regex = create_names_regex(first_names)
    _dump_pickle(first_names_regex, './data/first_names_regex.pkl')
    first_trie = _build_trie(first_names)
    first_trie_regex = _compile_trie_regex(first_trie)
    _dump_pickle(first_trie_regex, './data/first_names_trie_regex.pkl')

    # Last names: same pipeline.
    last_names = _read_names(f'./data/last_names{_suffix}.txt')
    last_names_regex = create_names_regex(last_names)
    _dump_pickle(last_names_regex, './data/last_names_regex.pkl')
    last_trie = _build_trie(last_names)
    last_trie_regex = _compile_trie_regex(last_trie)
    _dump_pickle(last_trie_regex, './data/last_names_trie_regex.pkl')

# One sample text per line; empty lines are kept as empty strings.
with open('data/_all_orig.txt', encoding='utf-8') as f:
    sample_texts = f.read().split('\n')

In [4]:
import regex as re
from tqdm.auto import tqdm
from src.utils.ano_regex import anonymize_entities

# first_trie_regex / last_trie_regex are reused exactly as the setup cell built
# them (word-boundary-wrapped trie patterns); compiling them a second time from
# the same tries would only repeat that work.

# Anonymize every sample text, passing the trie patterns as the first-/last-name
# matchers; tqdm only adds a progress bar.
result = [
    anonymize_entities(text, by_names='NAME', first_names=first_trie_regex, last_names=last_trie_regex)
    for text in tqdm(sample_texts)
]

# Show the first 30 results for a quick visual check.
for entry in result[:30]:
    print(entry)

  0%|          | 0/2195 [00:00<?, ?it/s]

{'text': 'ggsdgg', 'replace_dict': {}}
{'text': 'rien à ajouter', 'replace_dict': {}}
{'text': 'Alles bestens abgelaufen. Sehr freundlicher Kundenkontakt.', 'replace_dict': {}}
{'text': 'Chez la bâloise on Présume coupable et fraudeur', 'replace_dict': {}}
{'text': 'Schnelle unkomplizierte Erledigung', 'replace_dict': {}}
{'text': 'Ich wurde in Mallorca bestohlen. Habe alles reibungslos mit dem Schadenexperten am #DATE_1# besprochen. Er war wirklich sehr kompetent und freundlich.  Auch die Auszahlung war nach meiner eingereichter Aufstellung super schnell und grosszügig.  Besten Dank.', 'replace_dict': {'#DATE_1#': {'matches': {'14.Mai'}, 'replacement': '14.Mai'}}}
{'text': 'Unkomplizierte und zeitgemässe Abhandlung des Schadenfalls.', 'replace_dict': {}}
{'text': 'très satisfaite de traitement de sinistre, rapide et efficace', 'replace_dict': {}}
{'text': 'Kompetent, unkompliziert und immer freundlich!  #NAME_2# #NAME_1# ist immer sehr hilfsbereit und man merkt, dass ihm seine Kunden 

In [5]:
from flashtext2 import KeywordProcessor
from tqdm.auto import tqdm

def _keyword_processor(names):
    """Return a case-sensitive KeywordProcessor with every entry of *names* added."""
    processor = KeywordProcessor(case_sensitive=True)
    for keyword in names:
        processor.add_keyword(keyword)
    return processor


def _extract_names(text):
    """Return *text* together with the first-/last-name keyword hits found in it."""
    return {
        'text': text,
        'first_names': fn.extract_keywords_with_span(text),
        'last_names': ln.extract_keywords_with_span(text),
    }


# One processor per name list.
ln = _keyword_processor(last_names)
fn = _keyword_processor(first_names)

# Run both processors over every sample text; tqdm only adds a progress bar.
result = [_extract_names(text) for text in tqdm(sample_texts)]

  0%|          | 0/2195 [00:00<?, ?it/s]