<a href="https://colab.research.google.com/github/candido05/ML_Projects/blob/main/Tradu%C3%A7%C3%A3o_ingles_portugues_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def download_kaggle_dataset(dataset_url, output_path='.', kaggle_username=None, kaggle_key=None):
    """
    Baixa um dataset do Kaggle diretamente no Colab usando o link do repositório, sem interação do usuário.

    Parâmetros:
    dataset_url (str): URL do dataset no Kaggle (ex: https://www.kaggle.com/datasets/username/dataset-name)
    output_path (str): Caminho onde os arquivos serão salvos (padrão é o diretório atual)
    kaggle_username (str): Seu username do Kaggle
    kaggle_key (str): Sua chave API do Kaggle
    """
    !pip install -q kaggle

    try:
        dataset_path = '/'.join(dataset_url.split('/')[-2:]).split('?')[0]  # Remove parâmetros extras da URL
    except:
        raise ValueError("URL inválida. Use o formato: https://www.kaggle.com/datasets/username/dataset-name")

    import os
    if kaggle_username is None or kaggle_key is None:
        raise ValueError("Forneça seu kaggle_username e kaggle_key como argumentos.")

    os.makedirs('/root/.kaggle', exist_ok=True)
    with open('/root/.kaggle/kaggle.json', 'w') as f:
        f.write(f'{{"username":"{kaggle_username}","key":"{kaggle_key}"}}')
    !chmod 600 /root/.kaggle/kaggle.json

    os.makedirs(output_path, exist_ok=True)

    print(f"Baixando dataset: {dataset_path}")
    !kaggle datasets download -d {dataset_path} -p {output_path}

    import zipfile
    zip_files = [f for f in os.listdir(output_path) if f.endswith('.zip')]
    if zip_files:
        for zip_file in zip_files:
            with zipfile.ZipFile(f"{output_path}/{zip_file}", 'r') as zip_ref:
                zip_ref.extractall(output_path)
            print(f"Arquivo {zip_file} descompactado em {output_path}")
            os.remove(f"{output_path}/{zip_file}")
    else:
        print("Download concluído, mas nenhum arquivo .zip foi encontrado. Verifique se o dataset foi baixado corretamente.")

kaggle_username = ""
kaggle_key = ""
download_kaggle_dataset(
    "https://www.kaggle.com/datasets/nageshsingh/englishportuguese-translation",
    "/content/englishportuguese-translation",
    kaggle_username=kaggle_username,
    kaggle_key=kaggle_key
)

Baixando dataset: nageshsingh/englishportuguese-translation
Dataset URL: https://www.kaggle.com/datasets/nageshsingh/englishportuguese-translation
License(s): unknown
Downloading englishportuguese-translation.zip to /content/englishportuguese-translation
 92% 5.00M/5.41M [00:01<00:00, 5.53MB/s]
100% 5.41M/5.41M [00:01<00:00, 3.72MB/s]
Arquivo englishportuguese-translation.zip descompactado em /content/englishportuguese-translation


In [2]:
!pip install chart-studio

Collecting chart-studio
  Downloading chart_studio-1.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting retrying>=1.3.3 (from chart-studio)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, chart-studio
Successfully installed chart-studio-1.1.0 retrying-1.3.4


In [3]:
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import time
import string

import chart_studio.plotly
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

In [4]:
file_path = '/content/englishportuguese-translation/por.txt'
lines = open(file_path, encoding='UTF-8').read().strip().split('\n')
lines[5000:5010]

['Will it rain?\tSerá que chove?\tCC-BY 2.0 (France) Attribution: tatoeba.org #8918600 (CK) & #8930552 (JGEN)',
 'Wish me luck.\tDeseje-me sorte.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2254917 (CK) & #872788 (alexmarcelo)',
 "Won't you go?\tVocê não vai?\tCC-BY 2.0 (France) Attribution: tatoeba.org #241051 (CK) & #6212788 (bill)",
 'Write in ink.\tEscreva à tinta.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3258764 (CM) & #7351595 (alexmarcelo)',
 'Write in ink.\tEscreva a tinta.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3258764 (CM) & #7351606 (alexmarcelo)',
 'Write to Tom.\tEscreva para o Tom.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2240357 (CK) & #5985551 (Ricardo14)',
 'Years passed.\tPassaram os anos.\tCC-BY 2.0 (France) Attribution: tatoeba.org #282197 (CK) & #977841 (alexmarcelo)',
 'Years passed.\tAnos se passaram.\tCC-BY 2.0 (France) Attribution: tatoeba.org #282197 (CK) & #2324530 (Matheus)',
 'You amuse me.\tVocê me diverte.\tCC-BY 2.0 (France) Attributio

In [5]:
print("numero total de registros: ", len(lines))

numero total de registros:  168903


In [6]:
exclude = set(string.punctuation)
remove_digits = str.maketrans('', '', string.digits)

### Função para pré-processamento de sentenças em inglês

In [7]:
def preprocess_eng_sentence(sent):
    sent = sent.lower()
    sent = re.sub("'", '', sent)
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.translate(remove_digits)
    sent = sent.strip()
    sent = re.sub(" +", " ", sent)
    sent = '<start> ' + sent + ' <end>'
    return sent

### Função para pré-processamento de sentenças em português

In [8]:
def preprocess_port_sentence(sent):
    sent = re.sub("'", '', sent)
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.strip()
    sent = re.sub(" +", " ", sent)
    sent = '<start> ' + sent + ' <end>'
    return sent

### Geração de pares de senteças limpas com tokens de início e fim

In [9]:
sent_pairs = []
for line in lines:
    sent_pair = []
    eng, port = line.rstrip().split('\t')[:2]
    eng = preprocess_eng_sentence(eng)
    port = preprocess_port_sentence(port)
    sent_pair.extend([eng, port])
    sent_pairs.append(sent_pair)

sent_pairs[5000:5010]

[['<start> will it rain <end>', '<start> Será que chove <end>'],
 ['<start> wish me luck <end>', '<start> Desejeme sorte <end>'],
 ['<start> wont you go <end>', '<start> Você não vai <end>'],
 ['<start> write in ink <end>', '<start> Escreva à tinta <end>'],
 ['<start> write in ink <end>', '<start> Escreva a tinta <end>'],
 ['<start> write to tom <end>', '<start> Escreva para o Tom <end>'],
 ['<start> years passed <end>', '<start> Passaram os anos <end>'],
 ['<start> years passed <end>', '<start> Anos se passaram <end>'],
 ['<start> you amuse me <end>', '<start> Você me diverte <end>'],
 ['<start> you are late <end>', '<start> Você está atrasado <end>']]

### Criação de uma classe para mapear cada palavra de um índice para qualquer vocabulário fornecido e vice-versa.

In [10]:
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()

    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))
        self.vocab = sorted(self.vocab)
        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1
        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [11]:
def max_length(tensor):
    return max(len(t) for t in tensor)

### Tokenização e Padding

In [12]:
def load_dataset(pairs, num_examples):
    inp_lang = LanguageIndex(en for en, ma in pairs)
    targ_lang = LanguageIndex(ma for en, ma in pairs)

    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, ma in pairs]
    target_tensor = [[targ_lang.word2idx[s] for s in ma.split(' ')] for en, ma in pairs]

    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp, padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar, padding='post')

    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [13]:
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))

### Criação dos dados de treinamento e validação

In [14]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.1, random_state=101
)

len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(152012, 152012, 16891, 16891)

In [15]:
# Define hiperparâmetros e prepara o dataset de treinamento para um modelo de sequência.
# BUFFER_SIZE: tamanho do buffer para embaralhar os dados (total de exemplos).
# BATCH_SIZE: número de exemplos por lote (64).
# N_BATCH: número total de lotes por época.
# embedding_dim: dimensão dos vetores de embedding (256).
# units: número de unidades em camadas do modelo (1024).
# vocab_inp_size e vocab_tar_size: tamanhos dos vocabulários de entrada e alvo.
# O dataset é criado a partir dos tensores de entrada e alvo, embaralhado e dividido em lotes de 64 exemplos.

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

### Construção do modelo Seq2Seq com API funcional

In [16]:
def build_seq2seq_model(vocab_inp_size, vocab_tar_size, embedding_dim, units):
    # Encoder
    encoder_inputs = tf.keras.Input(shape=(None,), name="encoder_inputs")
    enc_emb = tf.keras.layers.Embedding(vocab_inp_size, embedding_dim)(encoder_inputs)
    enc_outputs, state_h, state_c = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)(enc_emb)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = tf.keras.Input(shape=(None,), name="decoder_inputs")
    dec_emb = tf.keras.layers.Embedding(vocab_tar_size, embedding_dim)(decoder_inputs)
    dec_lstm = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)
    dec_outputs, _, _ = dec_lstm(dec_emb, initial_state=encoder_states)

    # Atenção
    attention = tf.keras.layers.Attention()
    context = attention([dec_outputs, enc_outputs])
    dec_outputs = tf.keras.layers.Concatenate(axis=-1)([dec_outputs, context])

    # Camada densa
    outputs = tf.keras.layers.Dense(vocab_tar_size, activation='softmax')(dec_outputs)

    # Modelo
    model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)
    return model


In [17]:
model = build_seq2seq_model(vocab_inp_size, vocab_tar_size, embedding_dim, units)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
model.summary()

### Preparação dos dados do decoder (teacher forcing)

In [19]:
decoder_input_data = np.zeros_like(target_tensor_train)
decoder_input_data[:, :-1] = target_tensor_train[:, 1:]  # Desloca para a esquerda
decoder_input_data[:, -1] = 0  # Adiciona padding no final

### Treinamento do modelo

In [20]:
# Habilite precisão mista
from tensorflow.keras.mixed_precision import set_global_policy
set_global_policy('mixed_float16')

In [21]:
# Reduza o batch_size e o número de unidades
BATCH_SIZE = 32
units = 512

# Treinamento do modelo
EPOCHS = 3
with tf.device('/GPU:0'):
    history = model.fit(
        [input_tensor_train, decoder_input_data],
        target_tensor_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=0.2,
        callbacks=[tf.keras.callbacks.ModelCheckpoint(
            '/content/checkpoints/model_{epoch:02d}.weights.h5',
            save_weights_only=True,
            save_best_only=True,
            monitor='val_loss'
        )]
    )

Epoch 1/3
[1m3801/3801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1380s[0m 361ms/step - accuracy: 0.9075 - loss: 0.6729 - val_accuracy: 0.9850 - val_loss: 0.1169
Epoch 2/3
[1m3801/3801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1405s[0m 363ms/step - accuracy: 0.9895 - loss: 0.0718 - val_accuracy: 0.9918 - val_loss: 0.0675
Epoch 3/3
[1m3801/3801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1414s[0m 366ms/step - accuracy: 0.9965 - loss: 0.0180 - val_accuracy: 0.9930 - val_loss: 0.0599
