In [2]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
# Metrics
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
# Text box
import ipywidgets as widgets
from IPython.display import display

In [3]:
# Read-in movie lines and conversations datasets
movie_lines_path = 'movie_lines.txt'
movie_conversations_path = 'movie_conversations.txt'

In [4]:
lines = open(movie_lines_path, encoding='utf-8', errors='ignore').read().split('\n')
convers = open(movie_conversations_path, encoding='utf-8', errors='ignore').read().split('\n')

**Text Preprocessing**

In [5]:
class TextPreprocessor:
    def __init__(self, max_len=13):
        self.max_len = max_len
        self._compile_regex()

    def _compile_regex(self):
        self.regex_patterns = {
            r"i'm": "i am",
            r"he's": "he is",
            r"she's": "she is",
            r"that's": "that is",
            r"what's": "what is",
            r"where's": "where is",
            r"\'ll": " will",
            r"\'ve": " have",
            r"\'re": " are",
            r"\'d": " would",
            r"won't": "will not",
            r"can't": "cannot",
            r"[^\w\s]": "",
        }
        self.compiled_patterns = {re.compile(pattern): repl for pattern, repl in self.regex_patterns.items()}

    def clean_text(self, txt):
        txt = txt.lower()
        for pattern, repl in self.compiled_patterns.items():
            txt = pattern.sub(repl, txt)
        return txt

    def preprocess_data(self, convers, lines):
        exchange = [conver.split(' +++$+++ ')[-1][1:-1].replace("'", " ").replace(",", "").split() for conver in convers]
        diag = {line.split(' +++$+++ ')[0]: line.split(' +++$+++ ')[-1] for line in lines}
        questions, answers = self._extract_questions_answers(exchange, diag)
        return questions, answers

    def _extract_questions_answers(self, exchange, diag):
        questions, answers = [], []
        for conver in exchange:
            for i in range(len(conver) - 1):
                questions.append(diag.get(conver[i], ''))
                answers.append(diag.get(conver[i + 1], ''))
        sorted_ques = [q for q in questions if len(q.split()) < self.max_len]
        sorted_ans = [a for q, a in zip(questions, answers) if len(q.split()) < self.max_len]
        return sorted_ques, sorted_ans

In [6]:
# Apply preprocessing and cleaning
max_len = 13
preprocessor = TextPreprocessor(max_len=max_len)
sorted_ques, sorted_ans = preprocessor.preprocess_data(convers, lines)
clean_ques = [preprocessor.clean_text(q) for q in sorted_ques]
clean_ans = [preprocessor.clean_text(a) for a in sorted_ans]

# Trimming answers and lists
clean_ans = [' '.join(ans.split()[:11]) for ans in clean_ans]
clean_ans = clean_ans[:30000]
clean_ques = clean_ques[:30000]

In [7]:
# Before cleaning
print(sorted_ques[:5])
print(sorted_ans[:5])

["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "You're asking me out.  That's so cute. What's your name again?", 'Cameron.', 'Why?']
['Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?", 'Forget it.', "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.", 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.']


In [8]:
# After cleaning
print(clean_ques[:5])
print(clean_ans[:5])

['well i thought we would start with pronunciation if that is okay with you', 'not the hacking and gagging and spitting part  please', 'you are asking me out  that is so cute what is your name again', 'cameron', 'why']
['not the hacking and gagging and spitting part please', 'okay then how bout we try out some french cuisine saturday', 'forget it', 'the thing is cameron i am at the mercy of a', 'unsolved mystery she used to be really popular when she started']


In [9]:
# Counting word occurrences and creating vocab based on threshold
word2count = {}
for line in clean_ques + clean_ans:
    for word in line.split():
        word2count[word] = word2count.get(word, 0) + 1

# Creating the vocabulary based on threshold
thresh = 5
vocab = {word: num for num, (word, count) in enumerate(word2count.items()) if count >= thresh}

In [10]:
# Adding special tokens to the vocabulary
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    vocab[token] = len(vocab)

# Creating the inverse vocabulary
inv_vocab = {num: word for word, num in vocab.items()}

# Encoding the questions and answers
encoder_inp = [[vocab.get(word, vocab['<OUT>']) for word in line.split()] for line in clean_ques]
decoder_inp = [[vocab.get(word, vocab['<OUT>']) for word in ('<SOS> ' + line + ' <EOS>').split()] for line in clean_ans]

# Sample output for verification
encoder_inp[:5], decoder_inp[:5]


([[0, 1, 2, 3, 4, 5, 6, 4088, 8, 9, 10, 11, 6, 12],
  [13, 14, 4088, 16, 4088, 16, 4088, 19, 20],
  [12, 21, 22, 23, 24, 9, 10, 25, 26, 27, 10, 28, 29, 30],
  [31],
  [32]],
 [[4089, 13, 14, 4088, 16, 4088, 16, 4088, 19, 20, 4087],
  [4089, 11, 93, 56, 1129, 3, 738, 24, 361, 283, 4088, 141, 4087],
  [4089, 166, 159, 4087],
  [4089, 14, 140, 10, 31, 1, 103, 133, 14, 3364, 81, 38, 4087],
  [4089, 4088, 8415, 79, 271, 75, 125, 430, 945, 365, 79, 1107, 4087]])

In [11]:
# Convert lists to PyTorch tensors
encoder_inp_tensors = [torch.LongTensor(seq) for seq in encoder_inp]
decoder_inp_tensors = [torch.LongTensor(seq) for seq in decoder_inp]

# Padding
max_seq_len = max(max(len(seq) for seq in encoder_inp_tensors), max(len(seq) for seq in decoder_inp_tensors))
encoder_inp_tensors_padded = [torch.cat([seq, torch.LongTensor([vocab['<PAD>']] * (max_seq_len - len(seq)))]) for seq in encoder_inp_tensors]
decoder_inp_tensors_padded = [torch.cat([seq, torch.LongTensor([vocab['<PAD>']] * (max_seq_len - len(seq)))]) for seq in decoder_inp_tensors]

# Convert lists of tensors to 2D tensors
encoder_inp_padded = torch.stack(encoder_inp_tensors_padded, dim=0)
decoder_inp_padded = torch.stack(decoder_inp_tensors_padded, dim=0)
decoder_final_output = decoder_inp_padded[:, 1:]

In [12]:
print(encoder_inp_padded.shape)
print(decoder_inp_padded.shape)
print(decoder_final_output.shape)


torch.Size([30000, 15])
torch.Size([30000, 15])
torch.Size([30000, 14])


In [13]:
decoder_final_output.shape

torch.Size([30000, 14])

In [14]:
# Verify
encoder_inp_padded.shape, decoder_inp_padded.shape, decoder_final_output.shape

(torch.Size([30000, 15]), torch.Size([30000, 15]), torch.Size([30000, 14]))

In [15]:
encoder_inp_padded[22]

tensor([  93,    9,   10,   94,   12,   95,   75,   96, 4086, 4086, 4086, 4086,
        4086, 4086, 4086])

In [16]:
test_batch = torch.tensor(encoder_inp_padded[2].to(dtype=torch.long))
test_batch2 = torch.tensor(encoder_inp_padded[1].to(dtype=torch.long))
batch_tester = torch.stack([test_batch.T,test_batch2])

  test_batch = torch.tensor(encoder_inp_padded[2].to(dtype=torch.long))
  test_batch2 = torch.tensor(encoder_inp_padded[1].to(dtype=torch.long))
  batch_tester = torch.stack([test_batch.T,test_batch2])


**Encoder, Decoder and Seq2Seq**

In [17]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu(), enforce_sorted=False)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [18]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
            
    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [19]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self , attn_model , embedding , hidden_size , output_size , n_layers = 1 , dropout = 0.1):
        super(LuongAttnDecoderRNN , self).__init__()
        self.attn_model = attn_model 
        self.hidden_size = hidden_size
        self.output_size =output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding 
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size , hidden_size , n_layers , dropout = 0 , )
        self.concat = nn.Linear(hidden_size*2 , hidden_size)
        self.out = nn.Linear(hidden_size , output_size)
        self.attn = Attn(attn_model , hidden_size)
    def forward(self , input_step , last_hidden , encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output , hidden = self.gru(embedded , last_hidden)
        attn_weights = self.attn(rnn_output , encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0,1)) 
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output , context) , 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output , dim = 1)
        return output , hidden  


input_size = len(vocab)
src_vocab_size = len(vocab)
trg_vocab_size = len(vocab)
hidden_size = 200 

encoder = Encoder(input_size, hidden_size)

test_input = encoder_inp_padded[1].unsqueeze(1)
test_input_lengths = torch.tensor([len(test_input)]) 

# Double checking sequence length:
assert test_input.shape[0] >= torch.max(test_input_lengths), "Declared sequence length exceeds actual length"

# Encoder
output, hidden = encoder(test_input, test_input_lengths)

In [20]:
batch_size = 2

In [21]:
attn = Attn("general", 200)

In [22]:
embedding = nn.Embedding(len(vocab), hidden_size)

In [23]:
Decoder = LuongAttnDecoderRNN(attn, embedding,  hidden_size, 100)

In [24]:
enc = Encoder(1, hidden_size)

In [25]:
input_size = len(vocab)
src_vocab_size = len(vocab)
trg_vocab_size = len(vocab)
hidden_size = 200 

encoder = Encoder(input_size, hidden_size)

test_input = encoder_inp_padded[1].unsqueeze(1)
test_input_lengths = torch.tensor([len(test_input)]) 

# Double checking sequence length:
assert test_input.shape[0] >= torch.max(test_input_lengths), "Declared sequence length exceeds actual length"

# Encoder
output, hidden = encoder(batch_tester.T, torch.tensor([15,15]))

In [26]:
decoder_hidden = hidden[:Decoder.n_layers]


In [27]:
decoder_hidden.squeeze().shape

torch.Size([2, 200])

In [28]:
hidden.shape

torch.Size([2, 2, 200])

In [29]:
vocab['<SOS>']

4089

In [30]:
decoder_input = torch.LongTensor([[vocab['<SOS>'] for _ in range(200)]])

In [31]:
decoder_hidden

tensor([[[-0.3228, -0.1422, -0.5230,  0.4568,  0.3626,  0.2242, -0.1720,
          -0.0801,  0.1019, -0.1879, -0.4286,  0.3959,  0.2633,  0.6614,
           0.1586,  0.2969,  0.1604, -0.0280,  0.1978, -0.2136,  0.1204,
           0.0258,  0.2063, -0.1129, -0.1605, -0.0709, -0.1737,  0.2445,
           0.6190, -0.4989,  0.3018,  0.2556,  0.3671, -0.2218, -0.3314,
           0.1250, -0.3679, -0.2191, -0.1800,  0.0756, -0.1976,  0.2722,
          -0.3354, -0.4069,  0.2706, -0.0576,  0.3223,  0.0678, -0.0725,
          -0.0770,  0.3896,  0.0561,  0.2765,  0.2332, -0.1978, -0.1619,
          -0.1649,  0.3653, -0.6702, -0.2230,  0.5320, -0.2824,  0.0691,
          -0.3573,  0.3368, -0.1237,  0.1509,  0.3799, -0.3058,  0.2999,
           0.3328,  0.2259, -0.1222, -0.1998,  0.0750,  0.1757, -0.1996,
          -0.0266,  0.2411, -0.2798,  0.2200,  0.0684, -0.2340,  0.3991,
           0.1005, -0.3144, -0.1802,  0.4687, -0.3278, -0.4043, -0.5034,
           0.1587,  0.6539, -0.0572,  0.0345,  0.43

In [32]:
decoder_output, decoder_hidden = Decoder(decoder_input, decoder_hidden, output)

RuntimeError: Expected hidden size (1, 200, 200), got [1, 2, 200]

In [160]:
batch_tester.shape

torch.Size([2, 15])

In [161]:
hidden.shape

torch.Size([2, 2, 200])

In [87]:
class TransformerSeq2Seq(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers):
        super(TransformerSeq2Seq, self).__init__()

        # Encoder
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers=num_encoder_layers
        )

        # Decoder
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, nhead),
            num_layers=num_decoder_layers
        )

        # Embeddings
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)

        # Output layer
        self.output_layer = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        # Embed source and target sequences
        src_embedded = self.src_embedding(src)
        trg_embedded = self.trg_embedding(trg)

        # Encode source sequence
        encoder_output = self.encoder(src_embedded)

        # Decode target sequence with encoder output
        decoder_output = self.decoder(trg_embedded, encoder_output)

        # Final prediction
        output = self.output_layer(decoder_output)

        return output

In [20]:
EPOCHS = 2
LR = 0.001

# Defining parameters
d_model = 512
nhead = 8
num_encoder_layers = 6
num_decoder_layers = 6

model = TransformerSeq2Seq(src_vocab_size, trg_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)

criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    data_loader = DataLoader(
        list(zip(encoder_inp_padded, decoder_inp_padded, decoder_final_output)),
        batch_size=3,
        shuffle=True
    )
    for i, (encoder_input, decoder_input, decoder_output) in enumerate(data_loader):
        optimizer.zero_grad()
        
        if (encoder_input >= src_vocab_size).any() or (decoder_input >= trg_vocab_size).any():
            continue

        outputs = model(encoder_input, decoder_input)
        
        loss = criterion(outputs[:, 1:].contiguous().view(-1, outputs.shape[-1]), decoder_output.view(-1))
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f"Epoch [{epoch + 1}/{EPOCHS}], Loss: {loss.item():.4f}")

Epoch [1/2], Loss: 6.1925
Epoch [1/2], Loss: 5.1615
Epoch [1/2], Loss: 5.8120
Epoch [1/2], Loss: 4.7357
Epoch [1/2], Loss: 5.3043
Epoch [1/2], Loss: 4.6702
Epoch [1/2], Loss: 5.9605
Epoch [1/2], Loss: 5.8685
Epoch [1/2], Loss: 5.3909
Epoch [1/2], Loss: 5.7470
Epoch [1/2], Loss: 5.3402
Epoch [1/2], Loss: 5.0852
Epoch [1/2], Loss: 5.5482
Epoch [1/2], Loss: 5.4563
Epoch [1/2], Loss: 5.8816
Epoch [1/2], Loss: 5.0328
Epoch [1/2], Loss: 4.5945
Epoch [1/2], Loss: 6.1544
Epoch [1/2], Loss: 5.7120
Epoch [1/2], Loss: 4.5364
Epoch [1/2], Loss: 5.5490
Epoch [1/2], Loss: 5.0505
Epoch [1/2], Loss: 5.0775
Epoch [1/2], Loss: 5.9997
Epoch [1/2], Loss: 5.6600
Epoch [1/2], Loss: 4.6193
Epoch [2/2], Loss: 5.2666
Epoch [2/2], Loss: 4.7714
Epoch [2/2], Loss: 5.0579
Epoch [2/2], Loss: 4.8656
Epoch [2/2], Loss: 4.3671
Epoch [2/2], Loss: 5.0789
Epoch [2/2], Loss: 5.1490
Epoch [2/2], Loss: 4.7253
Epoch [2/2], Loss: 5.2326
Epoch [2/2], Loss: 5.7042
Epoch [2/2], Loss: 4.2576
Epoch [2/2], Loss: 6.0411
Epoch [2/2],

In [24]:
# Save the model
torch.save(model.state_dict(), 'transformer_model.pth')

In [None]:
from flask import Flask, request, jsonify
import torch

app = Flask(__name__)

# Load the saved model
model = TransformerSeq2Seq(src_vocab_size, trg_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
model.load_state_dict(torch.load('transformer_model.pth'))
model.eval()

@app.route('/predict', methods=['POST'])
def predict():
    input_data = request.json  # Assuming you send data in JSON format
    # Preprocess input_data if needed
    # Perform inference using the model
    # Return the model's predictions

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8080
 * Running on http://192.168.0.205:8080
Press CTRL+C to quit
