# Tokenizer

In [1]:
import json
import nltk
from nltk import word_tokenize
import re

In [2]:
# Read the corpus dump: every line of the file is parsed as one JSON document.
data = []
with open('dump_small_clean.jsonln', 'r', encoding="utf8") as file:
    data.extend(map(json.loads, file))

In [3]:
def pickLowercaseWords(tokens):
    """Return the lowercased tokens made up only of regex word characters.

    Tokens that contain anything else (punctuation, hyphens, whitespace) and
    empty tokens are dropped.

    Args:
        tokens: iterable of token strings.

    Returns:
        A list with the surviving tokens, lowercased, in their original order.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    return [token.lower() for token in tokens if re.fullmatch(r'\w+', token)]

In [4]:
from tqdm import tqdm

# Flat list with every lowercase word token found in the document bodies.
all_words = []
for item in tqdm(data):
    texto = item['body']
    tokens = pickLowercaseWords(word_tokenize(texto))
    all_words.extend(tokens)

100%|███████████████████████████████████████████████████████████████████████████| 11225/11225 [00:12<00:00, 887.23it/s]


In [5]:
# Display the total number of word tokens collected in all_words.
len(all_words)

942514

In [6]:
def removeStopwords(word_list):
    """Return the words of word_list that are not stopwords.

    Both the Portuguese and the English NLTK stopword lists are applied.

    Args:
        word_list: iterable of word strings.

    Returns:
        A list with the non-stopword entries, in their original order
        (duplicates are kept).
    """
    # A set makes each membership test O(1); with a list every word of the
    # corpus was compared against the whole stopword list.
    stopwords = set(nltk.corpus.stopwords.words('portuguese'))
    stopwords.update(nltk.corpus.stopwords.words('english'))
    return [i for i in word_list if i not in stopwords]

In [7]:
from collections import Counter
# Count how often each non-stopword token occurs in all_words.
word_counts = Counter(removeStopwords(all_words))

In [8]:
# Materialize the (word, count) pairs as a list so they can be sorted.
word_counts_list = list(word_counts.items())

In [9]:
# Highest count first; equal counts are ordered alphabetically by word.
word_counts_list_sorted = sorted(word_counts_list, key=lambda x: (-x[1], x[0]))

In [10]:
vocab = word_counts_list_sorted[:10000]  # keep the 10000 most frequent words

In [11]:
vocab = dict(vocab)  # word vocabulary, as a dict built from the pairs above

In [12]:
# Alphabet used to generate insertions and substitutions.
LOWERCASE = [chr(x) for x in range(ord('a'), ord('z') + 1)]
# Accented lowercase letters used in Portuguese.  The previous definition,
# ``chr(x) for x in range(129, 164)``, assumed an extended-ASCII (code page
# 437) table, but ``chr`` takes Unicode code points: it produced the C1
# control characters and Latin-1 symbols U+0081..U+00A3 and not a single
# accented letter, so accented words could never be reached by an edit.
LOWERCASE_OTHERS = list('àáâãçéêíóôõúü')
LETTERS = LOWERCASE + LOWERCASE_OTHERS

def edit1(text):
    """Return the set of strings exactly one simple edit away from text.

    The edits are: deleting one character, inserting one character from
    LETTERS at any position, and replacing one character with a different
    character from LETTERS. The empty string is never part of the result.
    """
    size = len(text)

    # Deletions; dropping the only character of a 1-letter word is discarded.
    deletions = {text[:pos] + text[pos + 1:] for pos in range(size)}
    deletions.discard('')

    # Insertions at every position, including after the last character.
    insertions = {
        text[:pos] + letter + text[pos:]
        for pos in range(size + 1)
        for letter in LETTERS
    }

    # Substitutions by a letter different from the one already there.
    substitutions = {
        text[:pos] + letter + text[pos + 1:]
        for pos in range(size)
        for letter in LETTERS
        if letter != text[pos]
    }

    return deletions | insertions | substitutions

def edit2(text):
    """Return the set of strings reachable from text by two edit1 steps.

    Strings already produced by a single edit1 step, and text itself, are
    excluded from the result.
    """
    first_step = edit1(text)
    second_step = set()
    for variant in first_step:
        second_step |= edit1(variant)
    return second_step - first_step - {text}


In [13]:
# Norvig's trick (original short-circuit formulation)
# def candidates(word):
#     candidatos = \
#         set([i for i in [word] if i in vocab]) or \
#         set([w for w in edit1(word) if w in vocab]) or \
#         set([w for w in edit2(word) if w in vocab]) or\
#         set([word])
#     return candidatos

def candidates(word):
    """Return the candidate spellings for word, closest group first.

    The first non-empty group below is returned:
      1. [word], if word is in vocab;
      2. the vocab words one edit away (edit1);
      3. the vocab words two edits away (edit2);
      4. [word] unchanged, when nothing in vocab is close enough.

    Each group is computed only when the previous ones are empty, so the
    expensive edit2 expansion is skipped for known words and for words with
    a one-edit match.
    """
    if word in vocab:
        return [word]

    editD1 = [w for w in edit1(word) if w in vocab]
    if editD1:
        return editD1

    editD2 = [w for w in edit2(word) if w in vocab]
    if editD2:
        return editD2

    return [word]

In [14]:
V = 1e5  # constant added to N in the denominator of P


def P(word, N=sum(vocab.values())):
    """Return the add-one smoothed relative frequency of word.

    The count of word in vocab (0 when absent) plus one is divided by N + V.
    N defaults to the sum of all counts in vocab, evaluated once when the
    function is defined.
    """
    occurrences = vocab.get(word, 0)
    # TODO: ask about the smoothing technique.
    return (occurrences + 1) / (N + V)

In [15]:
def correction(word):
    """Return the candidate spelling of word with the highest probability P.

    When several candidates share the highest probability, the first one
    yielded by candidates(word) is returned.
    """
    options = candidates(word)
    return max(options, key=P)

# Quick check of correction() on a single misspelled word.
correction('cavako')

'cavalo'

In [16]:
# Returns None: the string contains a space, which \w does not match, so the
# pattern cannot cover the whole string.
re.fullmatch(r'\w+', 'Variações Varia\u00e7\u00f5es')

In [17]:
from nltk.tokenize import RegexpTokenizer

In [18]:
def tokenCorrection(text):
    """Spell-correct text token by token and return the tokens space-joined.

    The text is split into words (runs of word characters and apostrophes)
    and single punctuation marks from ``,.;!?:``; anything else is discarded
    by the tokenizer. Punctuation, all-digit tokens and Portuguese stopwords
    are kept as they are; every other token is replaced by correction(token).

    Args:
        text: the string to correct.

    Returns:
        The resulting tokens joined by single spaces, so punctuation marks
        come out surrounded by spaces.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    tokenizer = RegexpTokenizer(r"(?:[\w']+)|(?:[,.;!?:])")
    tokens = tokenizer.tokenize(text)

    # Load the stopword list once per call (it used to be reloaded and
    # scanned linearly for every token); a set gives O(1) lookups.
    stopwords = set(nltk.corpus.stopwords.words('portuguese'))

    corrected_tokens = []
    for t in tokens:
        if t in ",.;!?:" or t.isdigit() or t in stopwords:
            corrected_tokens.append(t)
        else:
            corrected_tokens.append(correction(t))

    return " ".join(corrected_tokens)

# Quick check of tokenCorrection() on a sentence with words, punctuation and numbers.
tokenCorrection("o matheus, e o thiago 1434 é efiiente 123")

'o mateus , e o tiago 1434 é eficiente 123'

In [19]:
def punctuationCorrection(text):
    """Tidy spacing around punctuation and capitalize sentence starts.

    First, a whitespace character that precedes one of ``,.;!?:`` is removed
    when that mark is followed by whitespace or the end of the text. Then the
    first word character of the text, and the first one after each ``.``,
    ``?`` or ``!``, is uppercased.
    """
    def _uppercase_match(match):
        # The whole match (mark, spaces and letter) is uppercased.
        return match.group(0).upper()

    without_gaps = re.sub(r"\s([,.;!?:](?:\s|$))", r"\1", text)
    return re.sub(r"(^|[.?!])\s*(\w)", _uppercase_match, without_gaps)

In [20]:
def textCorrection(text):
    """Run the full correction pipeline on text and return the result.

    tokenCorrection is applied first; its output is then passed through
    punctuationCorrection.
    """
    return punctuationCorrection(tokenCorrection(text))

In [23]:
# Sample inputs for an end-to-end run of textCorrection.
strings = [
    "o matheus é efiiente",
    "o trabalo está bom",
    "bom di, pesoal",
    "irei ao mercaso. quer de algo?",
    "o andré é muito bacana"
]

res = [textCorrection(sentence) for sentence in strings]
res

['O mateus é eficiente',
 'O trabalho está bom',
 'Bom di, pessoal',
 'Rei ao mercado. Quer de algo?',
 'O andré é muito banana']

In [22]:
# TODO: ask about the smoothing technique used for the word probability.
#