In [1]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 27.2 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4


In [2]:
import string
from unidecode import unidecode

class Vectorizer:
  """Minimal text vectorizer: standardize text, split it into tokens, map tokens to ids.

  Two ids are reserved in every vocabulary: 0 for the empty string "" and
  1 for the out-of-vocabulary marker "[UNK]".

  Attributes:
    vocabulary: dict mapping token -> integer id.
    inverse_vocabulary: dict mapping integer id -> token.
  """

  def __init__(self):
    """Seed the reserved entries so encode()/decode() work before make_vocabulary()."""
    self.vocabulary = {"": 0, "[UNK]": 1}
    self.inverse_vocabulary = {0: "", 1: "[UNK]"}

  def standardize(self, text):
    """Return `text` transliterated to ASCII, lowercased, with ASCII punctuation removed."""
    # Transliterate before lowercasing: unidecode can emit uppercase ASCII for
    # some characters, so lowercasing last guarantees fully lowercase output
    # and makes this method idempotent.
    text = unidecode(text).lower()
    return "".join(char for char in text if char not in string.punctuation)

  def tokenize(self, text):
    """Standardize `text` and split it on whitespace into a list of tokens."""
    return self.standardize(text).split()

  def make_vocabulary(self, texts):
    """(Re)build the vocabulary from an iterable of strings.

    Ids are assigned in first-seen order after the two reserved entries.
    Any previously built vocabulary is discarded.
    """
    self.vocabulary = {"": 0, "[UNK]": 1}
    for text in texts:
      # tokenize() already standardizes, so no separate standardize() pass.
      for token in self.tokenize(text):
        if token not in self.vocabulary:
          self.vocabulary[token] = len(self.vocabulary)
    self.inverse_vocabulary = {
      index: token for token, index in self.vocabulary.items()}

  def encode(self, text):
    """Return the list of ids for the tokens of `text`; unknown tokens map to 1 ("[UNK]")."""
    return [self.vocabulary.get(token, 1) for token in self.tokenize(text)]

  def decode(self, int_sequence):
    """Return the space-joined tokens for `int_sequence`; unknown ids become "[UNK]"."""
    return " ".join(
      self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

In [3]:
# Shared instance reused by the demo cells that follow.
vectorizer = Vectorizer()

# Standardizing lowercases the sentence and drops the trailing "!" (see output).
vectorizer.standardize("O gato comeu o rato!")

'o gato comeu o rato'

In [4]:
# tokenize() standardizes the text first, then splits it on whitespace.
vectorizer.tokenize("O gato comeu o rato!")

['o', 'gato', 'comeu', 'o', 'rato']

In [5]:
from nltk.stem.snowball import PortugueseStemmer

# Single Portuguese Snowball stemmer instance, used by stem() below.
stemmer = PortugueseStemmer()

def stem(text):
  """Tokenize `text` with the shared vectorizer and return the stem of each token.

  Relies on the module-level `vectorizer` and `stemmer` objects.
  """
  stems = []
  for word in vectorizer.tokenize(text):
    stems.append(stemmer.stem(word))
  return stems

# Plural/singular and differently conjugated forms reduce to the same stems (see output).
stem("Os gatos caçaram os ratos"), stem("O gato caçou o rato")

(['os', 'gat', 'cac', 'os', 'rat'], ['o', 'gat', 'cac', 'o', 'rat'])

In [6]:
# Downloads spaCy's Portuguese model; the log below shows it installing as
# the package pt_core_news_sm (version 2.2.5).
!python -m spacy download pt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pt_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2 MB)
[K     |████████████████████████████████| 21.2 MB 1.3 MB/s 
Building wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.2.5-py3-none-any.whl size=21186281 sha256=9b4e137e0034e2aa7a4bcaaa34fdfef8c53f4fb984cf43e40b0de7c735021a80
  Stored in directory: /tmp/pip-ephem-wheel-cache-3tuk31gq/wheels/c3/f9/0c/5c014a36941a00f5df5fc0756cb961d7c457a978e697a6ce3b
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
[38;5;2m

In [7]:
import spacy
# Load by full package name: the download log reports the model as
# 'pt_core_news_sm', and the bare 'pt' shortcut is not available in spaCy v3+.
nlp = spacy.load('pt_core_news_sm')

def lemma(text):
  """Return the spaCy lemma of every token in the standardized `text`.

  Relies on the module-level `vectorizer` and `nlp` objects.
  """
  # NOTE(review): standardize() strips accents before the model sees the text,
  # which is presumably why the demo output contains 'sao' -- confirm intended.
  cleaned = vectorizer.standardize(text)
  lemmas = []
  for token in nlp(cleaned):
    lemmas.append(token.lemma_)
  return lemmas

# Compare the lemmas of two sentences whose words differ only in inflection.
lemma("Amigos, amizade e carreira são importantes"), lemma("Amigas, amizades e carreira é importante")

(['amigo', 'amizade', 'e', 'carreira', 'sao', 'importante'],
 ['amigo', 'amizade', 'e', 'carreira', 'e', 'importante'])

In [8]:
def standardize(text):
    """Lowercase `text` and delete every character found in string.punctuation.

    Unlike Vectorizer.standardize, accented characters are kept as-is.
    """
    # Translation table whose third argument lists the characters to delete.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(drop_punctuation)
  
def tokenize(text):
    """Standardize `text` and split the result on whitespace into a list of tokens."""
    return standardize(text).split()

def make_vocabulary(texts):
    """Build token<->id mappings from an iterable of strings.

    Ids 0 and 1 are reserved for "" and "[UNK]"; every other token gets the
    next free id in first-seen order.

    Returns:
        (vocabulary, inverse_vocabulary): dict token -> id and dict id -> token.
    """
    vocabulary = {"": 0, "[UNK]": 1}
    for text in texts:
        # tokenize() already standardizes, so no separate standardize() pass.
        for token in tokenize(text):
            # len(vocabulary) is evaluated before any insertion, so a new
            # token receives the next free id; known tokens are left untouched.
            vocabulary.setdefault(token, len(vocabulary))
    inverse_vocabulary = {index: token for token, index in vocabulary.items()}

    return vocabulary, inverse_vocabulary

# Ids follow first-seen order across both sentences; "Gato" lowercases to the
# already-seen "gato" and so gets no new id (see output).
print(make_vocabulary(["O gato comeu o rato", "Gato e cachorro são animais"]))

({'': 0, '[UNK]': 1, 'o': 2, 'gato': 3, 'comeu': 4, 'rato': 5, 'e': 6, 'cachorro': 7, 'são': 8, 'animais': 9}, {0: '', 1: '[UNK]', 2: 'o', 3: 'gato', 4: 'comeu', 5: 'rato', 6: 'e', 7: 'cachorro', 8: 'são', 9: 'animais'})


In [9]:
# Only the token -> id mapping is needed here; the inverse mapping is discarded.
vocabulary,_ = make_vocabulary(["O gato comeu o rato", "Gato e cachorro são animais"])

import numpy as np
def one_hot_encode(text, vocabulary):
    """Return a (num_tokens, len(vocabulary)) array with one 1 per row.

    Row i has its 1 in the column given by vocabulary[token_i]; tokens missing
    from `vocabulary` use column 1, the id make_vocabulary reserves for "[UNK]".
    """
    tokens = tokenize(text)
    vectors = np.zeros((len(tokens), len(vocabulary)))
    # Iterating a 2-D array yields row views, so writing to `row` fills
    # `vectors` in place.
    for row, token in zip(vectors, tokens):
        row[vocabulary.get(token, 1)] = 1
    return vectors

# One row per token; the two occurrences of "o" produce identical rows (see output).
one_hot_encode("O gato comeu o rato", vocabulary)

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])