In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import unicodedata
import string

# Set random seed for PyTorch CPU operations
torch.manual_seed(42)


# Preprocessing functions
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_text(text):
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = re.sub("(\\W)"," ",text) 
    text = re.sub('\S*\d\S*\s*','', text)
    text =  "<sos> " +  text + " <eos>"
    
    return text
    
    

# Custom Dataset class
class DialogDataset(Dataset):
    def __init__(self, questions, answers, src_vocab, tgt_vocab):
        self.questions = questions
        self.answers = answers
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.tokenizer = get_tokenizer('basic_english')

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        src = [self.src_vocab[token] for token in self.tokenizer(self.questions[idx])]
        tgt = [self.tgt_vocab[token] for token in self.tokenizer(self.answers[idx])]
        return torch.tensor(src), torch.tensor(tgt)

    @staticmethod
    def collate_fn(batch):
        src_batch, tgt_batch = [], []
        for src_item, tgt_item in batch:
            src_batch.append(torch.cat([torch.tensor([src_vocab["<sos>"]]), src_item, torch.tensor([src_vocab["<eos>"]])], dim=0))
            tgt_batch.append(torch.cat([torch.tensor([tgt_vocab["<sos>"]]), tgt_item, torch.tensor([tgt_vocab["<eos>"]])], dim=0))
        src_batch = pad_sequence(src_batch, padding_value=src_vocab["<pad>"]).transpose(0, 1)
        tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_vocab["<pad>"]).transpose(0, 1)
        return src_batch, tgt_batch

# Load and preprocess data
data = pd.read_csv("./dialogs.txt", sep='\t', header=None, names=['question', 'answer'])
data["question"] = data.question.apply(clean_text)
data["answer"] = data.answer.apply(clean_text)

# Split data
train_data, val_data = train_test_split(data, test_size=0.2)

# Build vocabularies
tokenizer = get_tokenizer('basic_english')

def build_vocab(data):
    vocab = build_vocab_from_iterator(map(tokenizer, data), specials=["<pad>", "<sos>", "<eos>"])
    vocab.set_default_index(vocab["<pad>"])
    return vocab


src_vocab = build_vocab(train_data['question'])
tgt_vocab = build_vocab(train_data['answer'])

# Create datasets
train_dataset = DialogDataset(train_data['question'].tolist(), train_data['answer'].tolist(), src_vocab, tgt_vocab)
val_dataset = DialogDataset(val_data['question'].tolist(), val_data['answer'].tolist(), src_vocab, tgt_vocab)

# DataLoader
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=DialogDataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=DialogDataset.collate_fn)


# checking dimension batch from DataLoader
for src, tgt in train_loader:
    print("Batch shapes:", src.shape, tgt.shape)
    break
from torch import nn

# Set random seed for PyTorch CPU operations
torch.manual_seed(42)
device = torch.device('cpu') 
print('Device:',device)

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, enc_units, batch_first=True)

    def forward(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, hidden)
        return output, state

    def initialize_hidden_state(self, batch_size):
        return torch.zeros((1, batch_size, self.enc_units))


# Parameters for the model
embedding_dim = 256
units = 1024
vocab_inp_size = len(src_vocab)
vocab_tar_size = len(tgt_vocab)
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE).to(device)

import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(units, units)
        self.W2 = nn.Linear(units, units)
        self.V = nn.Linear(units, 1)

    def forward(self, query, values):
        query_with_time_axis = query.unsqueeze(1)  # Shape: [batch_size, 1, hidden_size]
        query_layer = self.W1(query_with_time_axis)  # Shape: [batch_size, 1, hidden_size]
        values_layer = self.W2(values)  # Shape: [batch_size, max_len, hidden_size]

        #print("query_layer shape:", query_layer.shape)
        #print("values_layer shape:", values_layer.shape)

        # Broadcasting query_layer to match the shape of values_layer
        query_layer = query_layer.expand_as(values_layer)

        # Calculate the score
        score = self.V(torch.tanh(query_layer + values_layer))  # Shape: [batch_size, max_length, 1]
        attention_weights = F.softmax(score, dim=1)

        context_vector = attention_weights * values
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector, attention_weights


# Initialize the attention layer
attention_layer = BahdanauAttention(units)

class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim + dec_units, dec_units, batch_first=True)
        self.fc = nn.Linear(dec_units, vocab_size)

        # used for attention
        self.attention = BahdanauAttention(dec_units)

    def forward(self, x, hidden, enc_output):
        # Ensure hidden state is 2D [batch_size, hidden_size]
        if hidden.dim() == 3:
            hidden = hidden.squeeze(0)  # Removes the first dimension if it's of size 1

        # Attention layer
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # Concatenate context vector and x
        x = torch.cat((context_vector.unsqueeze(1), x), -1)

        # Passing the concatenated vector to the GRU
        output, state = self.gru(x, hidden.unsqueeze(0))

        # output shape == (batch_size, 1, hidden_size)
        output = output.reshape(-1, output.size(2))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state.squeeze(0), attention_weights

    
# Initialize the decoder

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE).to(device)

import torch
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm



# Optimizer
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

# Custom Loss Function
def loss_function(real, pred):
    # Mask for non-zero tokens in the target
    mask = real.ne(0)
    loss = F.cross_entropy(pred, real, reduction='none')
    loss = loss * mask
    return loss.mean()

# Training Step Function
def train_step(inp, targ, enc_hidden):
    # Move data to the device
    inp, targ, enc_hidden = inp.to(device), targ.to(device), enc_hidden.to(device)

    loss = 0
    optimizer.zero_grad()

    current_batch_size = inp.size(0)
    enc_hidden = enc_hidden[:, :current_batch_size, :]

    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden

    sos_token_index = tgt_vocab['<sos>']
    dec_input = torch.full((current_batch_size, 1), sos_token_index, dtype=torch.long, device=inp.device)

    for t in range(1, targ.size(1)):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss += loss_function(targ[:, t], predictions.squeeze(1))
        dec_input = targ[:, t].unsqueeze(1)

    batch_loss = loss / int(targ.size(1))
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

# Training Loop
EPOCHS = 40
for epoch in tqdm(range(EPOCHS)):
    total_loss = 0

    for inp, targ in train_loader:
        current_batch_size = inp.size(0)
        # Initialize hidden state with the correct current batch size
        enc_hidden = encoder.initialize_hidden_state(current_batch_size).to(device)

        
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    if epoch % 4 == 0:
        print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {total_loss / len(train_loader)}')

        
        
#save the model 
torch.save(encoder.state_dict(), 'encoder.pth')
torch.save(decoder.state_dict(), 'decoder.pth')


Batch shapes: torch.Size([64, 20]) torch.Size([64, 16])
Device: cpu


  2%|█                                        | 1/40 [02:04<1:20:54, 124.48s/it]

Epoch 1/40, Loss: 2.271122564660742


 12%|█████▏                                   | 5/40 [10:40<1:15:56, 130.19s/it]

Epoch 5/40, Loss: 1.0798931451553995


 22%|█████████▏                               | 9/40 [23:22<1:32:11, 178.43s/it]

Epoch 9/40, Loss: 0.16374219906456927


 32%|█████████████                           | 13/40 [33:38<1:08:14, 151.66s/it]

Epoch 13/40, Loss: 0.025988549865940784


 42%|█████████████████▊                        | 17/40 [39:45<40:39, 106.05s/it]

Epoch 17/40, Loss: 0.017777425613492093


 52%|██████████████████████▌                    | 21/40 [45:54<30:15, 95.57s/it]

Epoch 21/40, Loss: 0.014749614383153458


 62%|██████████████████████████▉                | 25/40 [51:58<23:05, 92.35s/it]

Epoch 25/40, Loss: 0.013067327241631264


 72%|███████████████████████████████▏           | 29/40 [58:03<16:46, 91.46s/it]

Epoch 29/40, Loss: 0.01263407063967687


 82%|█████████████████████████████████▊       | 33/40 [1:04:08<10:40, 91.44s/it]

Epoch 33/40, Loss: 0.012071249828218146


 92%|█████████████████████████████████████▉   | 37/40 [1:10:16<04:35, 91.85s/it]

Epoch 37/40, Loss: 0.011943061900463826


100%|████████████████████████████████████████| 40/40 [1:14:50<00:00, 112.27s/it]


In [7]:
max_length_targ = max(len(t.split()) for t in train_data['answer'])

def evaluate(sentence):
    sentence = clean_text(sentence)

    inputs = [src_vocab[token] for token in sentence.split(' ')]
    inputs = torch.tensor([inputs]).to(device)

    result = ''

    # Initialize the hidden state with zeros
    hidden = torch.zeros((1, 1, units)).to(device)  # Modify the shape according to your GRU layer
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = torch.tensor([[tgt_vocab['<sos>']]], dtype=torch.long).to(device)

    for t in range(max_length_targ):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = torch.argmax(predictions[0]).item()

        # Reverse lookup function
        def index_to_word(vocab, index):
            return vocab.get_itos()[index]

        if index_to_word(tgt_vocab, predicted_id) == '<eos>':
            break

        result += index_to_word(tgt_vocab, predicted_id) + ' '

        # The predicted ID is fed back into the model
        dec_input = torch.tensor([[predicted_id]], dtype=torch.long).to(device)

    return result, sentence


# Example usage
def ask(sentence):
    result, sentence = evaluate(sentence)

    print('Question: %s' % (sentence))
    print('Predicted answer: {}'.format(result))

# Load questions and answers from a file
questions = []
answers = []
with open("./dialogs.txt", 'r') as f:
    for line in f:
        line = line.split('\t')
        questions.append(line[0])
        answers.append(line[1])

print(len(questions) == len(answers))




True
Question: <sos> good luck with school <eos>
Predicted answer: <sos> i think you should go to go 


In [14]:
print(answers[15])
print(questions[15])

i'm actually in school right now.

i've actually been pretty good. you?


In [17]:
# Example usage with a specific question
print(ask(questions[20]))
print(answers[20])

Question: <sos> it is not bad there are a lot of people there <eos>
Predicted answer: <sos> good luck with that 
None
good luck with that.



In [18]:
# Function to interactively ask questions and get answers
def interact_with_model():
    while True:
        # Get user input
        user_input = input("Type your question (or 'exit' to quit): ")

        # Check if the user wants to exit
        if user_input.lower() == 'exit':
            break

        # Get the model's answer
        answer = evaluate(user_input)

        # Display the model's answer
        print("Model's answer:", answer)
        print("\n")

# Start the interactive loop
interact_with_model()


Type your question (or 'exit' to quit): How are you?
Model's answer: ('<sos> i am fine how about yourself ', '<sos> how are you <eos>')


Type your question (or 'exit' to quit): who are you?
Model's answer: ('<sos> i am not sure ', '<sos> who are you <eos>')


Type your question (or 'exit' to quit): can you help me ?
Model's answer: ('<sos> what time does mi mean ', '<sos> can you help me  <eos>')


Type your question (or 'exit' to quit): Hi, there are a lot of people here.
Model's answer: ('<sos> there are lots of police ', '<sos> hi there are a lot of people here <eos>')


Type your question (or 'exit' to quit): exit
