In [None]:
import nltk
# Download the NLTK 'punkt' package into the local nltk_data directory; when it
# is already present NLTK only reports that the package is up to date.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# NOTE(review): TreebankWordDetokenizer is not referenced in the cells visible
# here — confirm whether a later cell needs it.
from nltk.tokenize.treebank import TreebankWordDetokenizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import torch
from torch import nn, optim

class Tokenizer:
    """Character-level tokenizer over the contents of a UTF-8 text file."""

    def __init__(self, data_path):
        """Load ``data_path`` and keep its text with every newline removed.

        Args:
            data_path: Path of the text file to read (decoded as UTF-8).
        """
        with open(data_path, mode='r', encoding='utf8') as source_file:
            raw_text = source_file.read()
        # Lines are concatenated directly: no separator replaces the newline.
        self.data = ''.join(raw_text.split('\n'))

    def tokenize(self):
        """Return the loaded text as a list of one-character strings."""
        return [character for character in self.data]

class Vocab:
    """Bidirectional token <-> integer-id mapping with occurrence counts.

    ``'<PAD>'`` and ``'<UNK>'`` are registered first, so they receive ids 0
    and 1; every other token gets the next free id the first time it is seen.
    """

    def __init__(self, tokens):
        """Build the vocabulary from an iterable of tokens.

        Args:
            tokens: Iterable of hashable tokens; repeated tokens only bump
                their frequency count.
        """
        self.token_to_index = {}
        self.index_to_token = {}
        self.token_frequency = {}

        # Special tokens go in before the data so their ids are fixed.
        for special_token in ('<PAD>', '<UNK>'):
            self.add_token(special_token)

        for item in tokens:
            self.add_token(item)

    def add_token(self, token):
        """Register ``token`` if it is new, otherwise increment its count."""
        if token in self.token_to_index:
            self.token_frequency[token] += 1
            return

        # The next free id equals the number of tokens registered so far.
        next_id = len(self.token_to_index)
        self.token_to_index[token] = next_id
        self.index_to_token[next_id] = token
        self.token_frequency[token] = 1

    def lookup_token(self, token):
        """Return the id of ``token``, or the id of ``'<UNK>'`` if unknown."""
        try:
            return self.token_to_index[token]
        except KeyError:
            return self.token_to_index['<UNK>']

    def lookup_index(self, index):
        """Return the token stored under ``index``, or ``'<UNK>'`` if absent."""
        return self.index_to_token.get(index, '<UNK>')

class SimpleLanguageModel(nn.Module):
    """Embedding -> single GRU -> linear projection back onto the vocabulary."""

    def __init__(self, vocab_size, embedding_size, hidden_size):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and
                output logits per position).
            embedding_size: Width of each token embedding.
            hidden_size: Width of the GRU hidden state.
        """
        super().__init__()

        # Layers are created in embedding/rnn/fc order; keep it, since the
        # order determines how a seeded RNG initialises the weights.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

        # Sizes are kept on the instance; train_model reads ``vocab_size``.
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

    def forward(self, input, hidden=None):
        """Run the network over a batch of token ids.

        Args:
            input: Integer tensor of token ids, batch dimension first
                (the GRU is built with ``batch_first=True``).
            hidden: Optional GRU hidden state to continue from.

        Returns:
            ``(output, hidden)``: the per-position vocabulary scores produced
            by the final linear layer, and the GRU's new hidden state.
        """
        recurrent_out, hidden = self.rnn(self.embedding(input), hidden)
        return self.fc(recurrent_out), hidden

def train_model(model, train_data, num_epochs, learning_rate):
    """Fit ``model`` with Adam and cross-entropy loss, printing progress.

    Args:
        model: Module returning ``(output, hidden)`` when called on an input
            batch and exposing a ``vocab_size`` attribute.
        train_data: Iterable of ``(input, target)`` tensor pairs; it is
            iterated once per epoch.
        num_epochs: Number of passes over ``train_data``.
        learning_rate: Learning rate handed to ``optim.Adam``.

    Returns:
        The same ``model`` object, updated in place.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch_number in range(1, num_epochs + 1):
        # Sum (not mean) of the per-batch losses seen in this epoch.
        total_loss = 0
        for batch_inputs, batch_targets in train_data:
            model.zero_grad()

            scores, _ = model(batch_inputs)
            # Flatten every position into one row so the loss sees
            # (positions, vocab_size) scores against (positions,) targets.
            flat_scores = scores.view(-1, model.vocab_size)
            flat_targets = batch_targets.view(-1)
            batch_loss = loss_fn(flat_scores, flat_targets)

            batch_loss.backward()
            adam.step()

            total_loss += batch_loss.item()

        print('Epoch:', epoch_number, 'Loss:', total_loss)

    return model

def predict(model, input_text, vocab, tokenizer, max_length=1000):
    """Greedily generate a continuation of ``input_text``.

    The prompt is split into single characters, matching how
    ``Tokenizer.tokenize`` builds the training tokens. Splitting it into
    words instead would map almost every prompt token to ``'<UNK>'`` in a
    character-level vocabulary, so the prompt would not condition the output.

    Args:
        model: Callable ``model(input_tensor, hidden)`` returning
            ``(output, hidden)`` with one row of vocabulary scores per
            input position.
        input_text: Non-empty prompt string.
        vocab: Object providing ``lookup_token`` and ``lookup_index``.
        tokenizer: Unused; kept so existing calls keep working.
        max_length: Maximum number of tokens to generate (default 1000,
            the previous hard-coded limit).

    Returns:
        The generated tokens joined by single spaces. If ``'<EOS>'`` is
        produced it is included as the last token.

    Raises:
        ValueError: If ``input_text`` is empty.
    """
    input_tokens = list(input_text)
    if not input_tokens:
        # An empty prompt would feed a zero-length sequence to the model.
        raise ValueError('input_text must contain at least one character')

    input_token_ids = [vocab.lookup_token(token) for token in input_tokens]
    input_tensor = torch.LongTensor([input_token_ids])

    hidden = None
    output_tokens = []
    with torch.no_grad():
        while len(output_tokens) < max_length:
            output, hidden = model(input_tensor, hidden)
            # Scores for the last position of the single batch item. Softmax
            # is monotonic, so argmax over the raw scores picks the same id.
            next_token_id = torch.argmax(output[0, -1]).item()
            output_tokens.append(next_token_id)
            if vocab.lookup_index(next_token_id) == '<EOS>':
                break
            # After the prompt, only the newest token is fed back; earlier
            # context is carried by ``hidden``.
            input_tensor = torch.LongTensor([[next_token_id]])

    return ' '.join(vocab.lookup_index(token_id) for token_id in output_tokens)

In [None]:
# Run configuration: tune these values instead of editing the calls below
DATA_PATH = '/content/data.txt'
EMBEDDING_SIZE = 50
HIDDEN_SIZE = 100
NUM_EPOCHS = 500
LEARNING_RATE = 0.001
SEED = 0

# Seed PyTorch so weight initialisation (and therefore training) is repeatable
torch.manual_seed(SEED)

# Read the training text
tokenizer = Tokenizer(DATA_PATH)
tokens = tokenizer.tokenize()
vocab = Vocab(tokens)
# Convert the data to integer ids
token_ids = [vocab.lookup_token(token) for token in tokens]
if len(token_ids) < 2:
    # With fewer than two tokens the shifted input/target pair would be empty
    raise ValueError('training text must contain at least two tokens')
# Next-token objective: the target is the input shifted by one position
input_sequence = torch.LongTensor(token_ids[:-1]).unsqueeze(0)
target_sequence = torch.LongTensor(token_ids[1:]).unsqueeze(0)
# Define the model and train it
model = SimpleLanguageModel(len(vocab.token_to_index), EMBEDDING_SIZE, HIDDEN_SIZE)
model = train_model(model, [(input_sequence, target_sequence)], NUM_EPOCHS, LEARNING_RATE)



Epoch: 55 Loss: 1.3982572555541992
Epoch: 56 Loss: 1.3685566186904907
Epoch: 57 Loss: 1.3390238285064697
Epoch: 58 Loss: 1.3096495866775513
Epoch: 59 Loss: 1.2804265022277832
Epoch: 60 Loss: 1.2513537406921387
Epoch: 61 Loss: 1.2224376201629639
Epoch: 62 Loss: 1.193691372871399
Epoch: 63 Loss: 1.1651310920715332
Epoch: 64 Loss: 1.1367707252502441
Epoch: 65 Loss: 1.1086207628250122
Epoch: 66 Loss: 1.0806893110275269
Epoch: 67 Loss: 1.052986979484558
Epoch: 68 Loss: 1.0255272388458252
Epoch: 69 Loss: 0.9983252286911011
Epoch: 70 Loss: 0.9713958501815796
Epoch: 71 Loss: 0.9447532892227173
Epoch: 72 Loss: 0.9184126853942871
Epoch: 73 Loss: 0.8923919200897217
Epoch: 74 Loss: 0.8667120933532715
Epoch: 75 Loss: 0.8413950800895691
Epoch: 76 Loss: 0.8164600133895874
Epoch: 77 Loss: 0.791924238204956
Epoch: 78 Loss: 0.767803430557251
Epoch: 79 Loss: 0.744114875793457
Epoch: 80 Loss: 0.7208753824234009
Epoch: 81 Loss: 0.6981005668640137
Epoch: 82 Loss: 0.6758043169975281
Epoch: 83 Loss: 0.6539982

In [None]:
# Predict the continuation of a simple prompt
import re
from collections import Counter

input_text = 'alireza nansouri work as1'
predicted_text = predict(model, input_text, vocab, tokenizer)

# predict() puts one separator space before every token after the first.
# Drop that separator and keep the character that follows it, so space tokens
# survive and literal commas in the generated text are left untouched.
a = re.sub(r' (.)', r'\1', predicted_text, flags=re.DOTALL)

sentences = a.split('.')

# Count how many times each sentence occurs
sentence_counts = Counter(sentences)
# Sentences that occur more than once
repeated_sentences = [sentence for sentence in sentences if sentence_counts[sentence] > 1]

# Sentences without duplicates, in generation order (so ties between equally
# long sentences are resolved deterministically), keeping only those that
# contain the prompt
unique_sentences = [sentence for sentence in dict.fromkeys(sentences) if input_text in sentence]
# default='' prints an empty line instead of raising when nothing matches
print(max(unique_sentences, key=len, default=''))

balireza nansouri work as1 ai enginieering a
