In [3]:
import json
import numpy as np
import random

class CharacterSwap:
    def __init__(self):
        pass

    def insert_misspelling(self, word, idx):
        chars = [*word]
        n_chars = len(word)
        if idx < n_chars - 1:
            temp_char = chars[idx]
            chars[idx] = chars[idx + 1]
            chars[idx + 1] = temp_char
        else:
            temp_char = chars[idx]
            chars[idx] = chars[idx - 1]
            chars[idx - 1] = temp_char
        return "".join(chars)

class CharacterDuplicate:
    def __init__(self):
        pass

    def insert_misspelling(self, word, idx):
        chars = [*word]
        n_chars = len(word)
        chars.insert(idx, chars[idx])
        return "".join(chars)

class CharacterDelete:
    def __init__(self):
        pass
    def insert_misspelling(self, word, idx):
        chars = [*word]
        n_chars = len(word)
        del chars[idx]
        return "".join(chars)


class CharacterKeyboard:
    def __new__(cls):
        if not hasattr(cls, "instance"):
            cls.instance = super(CharacterKeyboard, cls).__new__(cls)
        return cls.instance

    def __init__(self):
        with open('./keybord_es.json', 'r') as f:
            self.keyboard_misspellings = json.load(f)

    def insert_misspelling(self, word, idx):
        chars = [*word]
        n_chars = len(word)
        misspellings = self.keyboard_misspellings[chars[idx]]
        chars[idx] = random.choice(misspellings)

        return "".join(chars)

class CharacterMS:
    def __init__(self, char_max = 2, char_probs = [0.89,0.11]):
        self.char_max = char_max,
        self.char_probs = char_probs

    def get_idxes(self, word):
        [num_idxes]  = random.choices([1,2], weights= self.char_probs)
        self.chars = [*word]
        self.n_chars = len(word)
        idxes = random.sample(list(range(self.n_chars)), k = num_idxes)
        return idxes

    def generate_misspellings(self, word):
        idxs = self.get_idxes(word)
        ms_types = random.choices([1,2,3,4], k= len(idxs))
        for idx, ms_type in zip(idxs, ms_types):
            word = self.insert_misspelling(word, idx, ms_type)
        
        return word

    def insert_misspelling(self, word, idx, ms_type):
        if ms_type == 1:
            return CharacterSwap().insert_misspelling(word, idx)
        elif ms_type == 2:
            return CharacterDuplicate().insert_misspelling(word, idx)
        elif ms_type == 3:
            return CharacterDelete().insert_misspelling(word, idx)
        elif ms_type == 4:
            return CharacterKeyboard().insert_misspelling(word, idx)     

In [4]:
import numpy as np
import random
import random
import spacy
import re
from nltk.corpus import stopwords
import nltk

nlp = spacy.load('es_core_news_md')
nltk.download('stopwords')


def find_span_token(text, tokens, tok_idx):
    token = tokens[tok_idx]
    
    if tok_idx > 0:
        start_find_span = len("".join(tokens[0:tok_idx - 1]))
    else:
        start_find_span = 0
    
    match_token = re.search(token, text[start_find_span:])

    if match_token:
        span_start, span_end = match_token.span()
        return (span_start + start_find_span, span_end + start_find_span)
    
    return None

## tokenizar

def tokenize(text):
    tokens = [str(token) for token in nlp(text)]
    return tokens

class MisspellingsAug:
    def __init__(self, aug_pct = 0.5, tok_pct = 0.1):
        self.aug_pct = aug_pct
        self.tok_pct = tok_pct
        self.load_natural_misspellings()

    def load_natural_misspellings(self):
        with open('errores_ortograficos.json', "r", encoding= "utf-8") as f:
            errores_ortograficos = json.load(f)

        self.word2missp = {}

        for e in errores_ortograficos:
            word = e['palabra']
            misspellings = e['errores_ortograficos']
            self.word2missp[word] = misspellings
    
    def get_words_with_natural_misspellings(self):
        return [ k for k,v in self.word2missp.items() if len(v) > 0 ]

    def augument_texts(self, texts , stop_words):
        aug_size = np.ceil(self.aug_pct * len(texts))
        #print("aug_size:", aug_size)
        texts_choices = random.sample(texts, k = int(aug_size))
        aug_texts = []
        ## Hacer una funcion find span token y esto usar para reemplazar o para modificar el span del otro
        words_replace_log = []

        for text in texts_choices:
            tokens = tokenize(text)
            idx2token  = { str(idx):token for idx, token in enumerate(tokens) if not token in stop_words}
            num_tokens = len(idx2token)
            num_misspellings = int(np.ceil(tok_pct * num_tokens))

            words_with_natural_miss = self.get_words_with_natural_misspellings()

            idxs_tok_wnm = [ int(idx) for idx, tok  in idx2token.items() if tok in words_with_natural_miss]
            
            idxs_tok_not_wnm = [ int(idx) for idx, tok  in idx2token.items() if not tok in words_with_natural_miss]

            types_misspellings = random.choices([1,2], k = num_misspellings)
            misspelling_words_replace = []

            for type_ms in types_misspellings:
                
                if len(idxs_tok_wnm) == 0:
                    type_ms = 2

                if type_ms == 1:
                    
                    idx_elem = random.choice(list(range(len(idxs_tok_wnm ))))
                    #print("idxs_tok_wnm:", idxs_tok_wnm)
                    #print("idx_elem:", idx_elem)
                    ## Index of token 
                    idx = idxs_tok_wnm[idx_elem]
                    ## get misspelling word
                    word = tokens[idx]
                    ms_word = self.insert_natural_misspelling(word.lower())
                    
                    span_token = find_span_token(text, tokens, idx)
                    misspelling_words_replace.append({"word": word ,"ms_word": ms_word, "span_word":  span_token})

                    #tokens[idx] = self.insert_natural_misspelling(word)
                    
                    del idxs_tok_wnm[idx_elem]

                if type_ms == 2:
                    idx_tokens = idxs_tok_wnm + idxs_tok_not_wnm
                    idx_elem = random.choice(list(range(len( idx_tokens ))))
                    #print("idx_tok_wnm:", idx_tok_wnm)
                    #print("idx_elem:", idx_elem)
                    ## Index of token 
                    idx = idx_tokens[idx_elem]
                    #tokens[idx] = self.insert_synthetic_misspelling(word)
                    
                    ## get misspelling word
                    word = tokens[idx]
                    ms_word = self.insert_synthetic_misspelling(word.lower())
                    
                    span_token = find_span_token(text, tokens, idx)

                    misspelling_words_replace.append({"word": word ,"ms_word": ms_word, "span_word":  span_token})

                    if len(idxs_tok_wnm) == 0 or len(idxs_tok_wnm) <= idx_elem:
                        del idxs_tok_not_wnm[idx_elem - len(idxs_tok_wnm)]
                    else:
                        del idxs_tok_wnm[idx_elem]      

            ## Replace misspelling in text
            
            if len(misspelling_words_replace) > 0:
                misspelling_words_replace.sort(key= lambda ms: ms['span_word'][0])
                new_text = text[ : misspelling_words_replace[0]['span_word'][0]]
            else:
                continue
            
            for n, misspelling in  enumerate(misspelling_words_replace):
                ms_word = misspelling['ms_word']
                span_start_word, span_end_word  = misspelling['span_word']
                if n == (len(misspelling_words_replace) -1):
                    new_text = new_text + ms_word + text[span_end_word:]
                else:
                    new_text = new_text + ms_word + text[span_end_word : misspelling_words_replace[n + 1]['span_word'][0]]
                ## Update entities espan

            #new_text = " ".join(tokens)
            words_replace_log.append({"text": text, "aug_text": new_text , "misspelling_words_replace": misspelling_words_replace})
            aug_texts.append(new_text)
        return aug_texts, words_replace_log

    def insert_misspelling(self, word, ms_type):
        idx = random.choice(list(range(len(word))))
        
        if ms_type == 1:
            return CharacterSwap().insert_misspelling(word, idx)
        elif ms_type == 2:
            return CharacterDuplicate().insert_misspelling(word, idx)
        elif ms_type == 3:
            return CharacterDelete().insert_misspelling(word, idx)
        elif ms_type == 4:
            return CharacterKeyboard().insert_misspelling(word, idx)
        
    def insert_synthetic_misspelling(self, word , char_max = 2, char_probs = [0.89,0.11]):
        [num_idxes]  = random.choices(list(range(1,char_max + 1)), weights= char_probs, k=1)
        chars = [*word]
        n_chars = len(word)
        #idxes = random.sample(list(range(n_chars)), k = num_idxes)
        ms_char_types = random.choices([1,2,3,4], k = num_idxes)
        
        for ms_type in ms_char_types:
            word = self.insert_misspelling(word, ms_type)
        return word

    def insert_natural_misspelling(self, word):
        misspellings = self.word2missp[word]
        
        if len(misspellings) > 0:
            misspelling = random.choice(misspellings)
            return misspelling
        return word

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anthony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from utils import load_json
path_train = "./data/train_nlu.json"
train_data = load_json(path_train)

path_test = "./data/test_nlu.json"
test_data = load_json(path_test)

print("Size train data:", len(train_data))

print("Size test data:", len(test_data))


Size train data: 1272
Size test data: 319


In [18]:
from nltk.corpus import stopwords
import random
## fixed random 
random.seed(42)

## Aumentar errores otrograficos
aug_pct = 0.6 # porcentaje de numero maximo en textos escogidos
tok_pct  = 0.3 # porcentaje de numero maximo de tokens reemplado en el texto

## Obtener stopwords
stop_words = stopwords.words('spanish')

stop_words.extend(["?",".", ",","¿"])

question_words = ['qué','que','quién','quien', 'cuál','cual', 'dónde','donde', 'cuándo','cuando', 'cómo','como', 'por qué','por que','porqué','porque', 
'cuánto','cuanto','cuantos']

## remover palabras de preguntas de stopwords
stop_words = [ sw for sw in stop_words if not sw in question_words ]

## Generador de errores ortográficos

misspellingsAug = MisspellingsAug(aug_pct, tok_pct)

## Train data augumenter misspelling

intents = list(set(([sample['intent_name'] for sample in train_data])))

aug_train_data = []

for intent_name in intents:
    intent_examples = [ sample for sample in train_data if intent_name == sample['intent_name']]
    intent_texts = [ sample['text'] for sample in intent_examples]
    ## Aumentar con errores ortograficos
    aug_texts, words_replace_log = misspellingsAug.augument_texts(texts= intent_texts, stop_words = stop_words)
    intent_with_entities_examples = [ sample for sample in train_data if intent_name == sample['intent_name'] and len(sample['entities']) > 0]
    intent_with_entities_texts = [ sample['text'] for sample in intent_with_entities_examples]

    for aug_idx , aug_words_log in enumerate(words_replace_log):
        if len(aug_words_log) == 0:
            continue
        
        if aug_words_log['text'] in intent_with_entities_texts:
            idx_sample = intent_with_entities_texts.index(aug_words_log['text'])
            intent_sample  = intent_with_entities_examples[idx_sample]
            new_sample = intent_sample.copy()
            new_sample['text'] = aug_texts[aug_idx]

            entities = new_sample['entities']
            print("entities 1: ", entities)
            
            if len(entities) > 0:
                displace = 0
                for aug_log_word in aug_words_log['misspelling_words_replace']:
                    word = aug_log_word['word']
                    ms_word = aug_log_word['ms_word']
                    displace = displace + len(ms_word) - len(word)
                    #print(displace)
                    for ent in entities:
                        if word in ent['value'] and ent['span_start'] >= aug_log_word['span_word'][0] and ent['span_start'] < aug_log_word['span_word'][1]:
                            ent['span_start']  = displace + ent['span_start']
                            displace = displace + len(ms_word) - len(word) 
                            ent['span_end'] = displace + ent['span_end']
                
                for ent in entities:
                    ent['value'] = aug_texts[aug_idx][ent['span_start']:ent['span_end']]
                
                print("entities 2: ", entities)
                new_sample['entities'] = entities
            
            aug_train_data.append(new_sample)
            
        else: 
            new_sample =  {
            "text": aug_texts[aug_idx] ,
            "intent_name": intent_name,
            "entities": [],
            "text_label_entities": ""
            }
            aug_train_data.append(new_sample)

entities 1:  [{'value': 'etiro total?', 'entity': 'procedimiento', 'span_start': 44, 'span_end': 56}]
entities 2:  [{'value': 'etiro total?', 'entity': 'procedimiento', 'span_start': 44, 'span_end': 56}]
entities 1:  [{'value': 'retiro toatl', 'entity': 'procedimiento', 'span_start': 38, 'span_end': 50}]
entities 2:  [{'value': 'retiro toatl', 'entity': 'procedimiento', 'span_start': 38, 'span_end': 50}]
entities 1:  [{'value': 'retiro toal?', 'entity': 'procedimiento', 'span_start': 53, 'span_end': 65}]
entities 2:  [{'value': 'retiro toal?', 'entity': 'procedimiento', 'span_start': 53, 'span_end': 65}]
entities 1:  [{'value': 'retiro total', 'entity': 'procedimiento', 'span_start': 19, 'span_end': 31}]
entities 2:  [{'value': 'retiro total', 'entity': 'procedimiento', 'span_start': 19, 'span_end': 31}]
entities 1:  [{'value': 'retiro totla', 'entity': 'procedimiento', 'span_start': 42, 'span_end': 54}]
entities 2:  [{'value': 'retiro totla', 'entity': 'procedimiento', 'span_start': 4

In [188]:
entities

[{'value': ' retro parcia',
  'entity': 'procedimiento',
  'span_start': 44,
  'span_end': 57}]

In [19]:
aug_train_data[255]

{'text': 'Hola qu tal, me podria indicar comi puedo tramitar una constansia de estudios',
 'intent_name': 'constancia_de_estudios__procedimiento',
 'entities': [{'value': 'onstansia de estudios',
   'entity': 'constancia',
   'span_start': 56,
   'span_end': 78}],
 'text_label_entities': 'Hola que tal, me podria indicar como puedo tramitar una <constancia de estudios>[constancia]'}

In [17]:
a = "Hola qu tal, me podria indicar comi puedo tramitar una constansia de estudios"
a[56:78]

'onstansia de estudios'

In [181]:
text = "Hola que tal, que prosedimiento puedo seguir para realizar el retiro parcila?"
text[62: 76]

'retiro parcila'