# one

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
#Dataset
input_texts = [
    "How are you?",
    "What's your name?",
    "What do you do?",
    "Ali?"
]

output_texts = [
     "I'm fine, thank you.",
    "My name is John.",
    "i'm engineer",
    "Hello"
]

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# global max_len
max_len= max(max(len(seq) for seq in input_texts), max(len(seq) for seq in output_texts))

class Translator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Translator, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.encoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input_seq):

        embedded = self.embedding(input_seq)
        encoder_output, (encoder_hidden, encoder_cell) = self.encoder(embedded)
        
        decoder_output, _ = self.decoder(encoder_output, (encoder_hidden, encoder_cell))

        output = self.fc(decoder_output)
        output = self.softmax(output)

        return output


class TranslationDataset(Dataset):
    def __init__(self, input_texts, output_texts,max_len):
        self.input_texts = input_texts
        self.output_texts = output_texts
        self.max_len=max_len
    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, index):
        input_text = self.input_texts[index]
        output_text = self.output_texts[index]
        input_seq = text_to_tensor(input_text).squeeze(0)
        output_seq = text_to_tensor(output_text).squeeze(0)
        input_seq = pad_sequences([input_seq], max_len=self.max_len).squeeze(0)
        output_seq = pad_sequences([output_seq], max_len=self.max_len).squeeze(0)

        return input_seq, output_seq



def tensor_to_text(tensor):
    text = ''.join([chr(c.item()-1) if 0 <= c.item()-1 <= 0x10FFFF else '' for c in tensor])
    return text


def text_to_tensor(text, max_len=None):
    seq = [ord(c) + 1 for c in text]
    tensor = torch.tensor(seq)
    tensor = pad_sequences([tensor], max_len=max_len).squeeze(0)
    return tensor.unsqueeze(0)

def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    padded_sequences = torch.full((len(sequences), max_len), padding_value)
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = torch.tensor(seq)
    return padded_sequences


def collate_fn(batch):
    # Extracting inputs and outputs from batch
    input_seqs = [item[0] for item in batch]
    target_seqs = [item[1] for item in batch]

    # Find sequences with longest sequence length
    input_seqs = pad_sequences(input_seqs, max_len=max_len)
    target_seqs = pad_sequences(target_seqs, max_len=max_len)

    return input_seqs, target_seqs



input_size = 128
hidden_size = 256
output_size = 128
learning_rate = 0.001
num_epochs = 150


translator = Translator(input_size, hidden_size, output_size)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(translator.parameters(), lr=learning_rate)

dataset = TranslationDataset(input_texts, output_texts,max_len=max_len)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True,drop_last=True,collate_fn=collate_fn)

for epoch in range(num_epochs):
    for input_seq, target_seq in dataloader:
        optimizer.zero_grad()

        output = translator(input_seq)
        loss = criterion(output.view(-1, output_size), target_seq.view(-1))

        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

Epoch [10/150], Loss: 1.991115927696228
Epoch [20/150], Loss: 0.8571838140487671
Epoch [30/150], Loss: 0.9578377604484558
Epoch [40/150], Loss: 0.221352219581604
Epoch [50/150], Loss: 0.10890562832355499
Epoch [60/150], Loss: 0.04212278500199318
Epoch [70/150], Loss: 0.26683688163757324
Epoch [80/150], Loss: 0.010468418709933758
Epoch [90/150], Loss: 0.19642940163612366
Epoch [100/150], Loss: 0.013356350362300873
Epoch [110/150], Loss: 0.011202382855117321
Epoch [120/150], Loss: 0.015830423682928085
Epoch [130/150], Loss: 0.004458738956600428
Epoch [140/150], Loss: 0.007111417595297098
Epoch [150/150], Loss: 0.0032454091124236584


In [5]:
input="Ali?"
test_input_seq = text_to_tensor(input,max_len=max_len)
translator.eval()
with torch.no_grad():
    output = translator(test_input_seq)

    k = 1  # تعداد بهترین پیش‌بینی‌ها که می‌خواهید انتخاب کنید
    _, topk_indices = torch.topk(output, k, dim=2)
    predicted_output_seq = topk_indices.squeeze(0)
      
    predicted_output_text = tensor_to_text(predicted_output_seq.squeeze())

    print("Input: {}".format(input))
    print("Predicted Output:", predicted_output_text)

Input: Ali?
Predicted Output: Hello


# Two

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
input_texts = [
    "چطوری؟",
    "اسمت چیه؟",
    "چکار می‌کنی؟",
    "علی؟"
]

output_texts = [
    "من خوبم، ممنون.",
    "اسم من جان است.",
    "من مهندس هستم.",
    "سلام"
]

# سایر بخش‌های کد



In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# global max_len
max_len = max(max(len(seq) for seq in input_texts), max(len(seq) for seq in output_texts))

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

class Translator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Translator, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.encoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input_seq):
        # print(input_seq.shape)
        embedded = self.embedding(input_seq)
        encoder_output, (encoder_hidden, encoder_cell) = self.encoder(embedded)
        
        decoder_output, _ = self.decoder(encoder_output, (encoder_hidden, encoder_cell))

        output = self.fc(decoder_output)
        output = self.softmax(output)

        return output


class TranslationDataset(Dataset):
    def __init__(self, input_texts, output_texts, max_len):
        self.input_texts = input_texts
        self.output_texts = output_texts
        self.max_len = max_len
    
    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, index):
        input_text = self.input_texts[index]
        output_text = self.output_texts[index]
        input_seq = text_to_tensor(input_text, self.max_len).squeeze(0)
        output_seq = text_to_tensor(output_text, self.max_len).squeeze(0)

        return input_seq, output_seq


def text_to_tensor(text, max_len=None):
    encoded = tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len)
    tensor = torch.tensor(encoded)
    return tensor.unsqueeze(0)

def tensor_to_text(tensor):
    text = tokenizer.decode(tensor)
    text = text.replace("[CLS]", "").replace("[SEP]", "").replace("[PAD]", "")
    return text



def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    padded_sequences = torch.full((len(sequences), max_len), padding_value)
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = torch.tensor(seq)
    return padded_sequences


def collate_fn(batch):
    # Extracting inputs and outputs from batch
    input_seqs = [item[0] for item in batch]
    target_seqs = [item[1] for item in batch]

    # Find sequences with longest sequence length
    input_seqs = pad_sequences(input_seqs, max_len=max_len)
    target_seqs = pad_sequences(target_seqs, max_len=max_len)

    return input_seqs, target_seqs


input_size = tokenizer.vocab_size
hidden_size = 256
output_size = tokenizer.vocab_size
learning_rate = 0.001
num_epochs = 150


translator = Translator(input_size, hidden_size, output_size)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(translator.parameters(), lr=learning_rate)

dataset = TranslationDataset(input_texts, output_texts,max_len=max_len)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True,drop_last=True,collate_fn=collate_fn)

for epoch in range(num_epochs):
    for input_seq, target_seq in dataloader:
        optimizer.zero_grad()

        output = translator(input_seq)
        loss = criterion(output.view(-1, output_size), target_seq.view(-1))

        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

Epoch [10/150], Loss: 1.3462210893630981
Epoch [20/150], Loss: 0.7009117007255554
Epoch [30/150], Loss: 0.6859925389289856
Epoch [40/150], Loss: 0.48846742510795593
Epoch [50/150], Loss: 0.4069752097129822
Epoch [60/150], Loss: 0.04743042215704918
Epoch [70/150], Loss: 0.21529562771320343
Epoch [80/150], Loss: 0.15566202998161316
Epoch [90/150], Loss: 0.24217753112316132
Epoch [100/150], Loss: 0.013216380029916763
Epoch [110/150], Loss: 0.04095226526260376
Epoch [120/150], Loss: 0.007901791483163834
Epoch [130/150], Loss: 0.006592517718672752
Epoch [140/150], Loss: 0.005630132742226124
Epoch [150/150], Loss: 0.01746489852666855


In [8]:
input="چکار می‌کنی؟"
test_input_seq = text_to_tensor(input,max_len=max_len)
translator.eval()
with torch.no_grad():
    output = translator(test_input_seq)

    k = 1  # تعداد بهترین پیش‌بینی‌ها که می‌خواهید انتخاب کنید
    _, topk_indices = torch.topk(output, k, dim=2)
    predicted_output_seq = topk_indices.squeeze(0)
      
    predicted_output_text = tensor_to_text(predicted_output_seq.squeeze())

    print("Input: {}".format(input))
    print("Predicted Output:", predicted_output_text)

Input: چکار می‌کنی؟
Predicted Output:  من مهندس هستم.          
