<a href="https://colab.research.google.com/github/marveltimothyy/Generative-Chatbot-/blob/main/Main_Code_Chatbot_With_PyTorch%5BUndergraduate_Thesis%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data and Library preparation

##Import Library and init

In [None]:
#Lib for Preprocessing and load data 
import os
from io import open
import csv
import random
import re
import unicodedata
import itertools
from sklearn.model_selection import train_test_split
import time

#Lib for Modeling
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

#Bleu Evaluation
from nltk.translate.bleu_score import sentence_bleu

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Seed PyTorch's RNG for repeatable runs. Kept as the cell's last expression
# so the returned Generator is still what the cell displays.
torch.manual_seed(1)

<torch._C.Generator at 0x7f4465707870>

In [None]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive; force_remount=True remounts even when
# a mount is already present.
drive.mount(
    "/content/drive",
    force_remount=True,
)

Mounted at /content/drive


## Load Dataset

In [None]:
import os
# Name of the dataset folder; the same string is later passed on as the
# vocabulary's name.
corpus_name = "cornell movie-dialogs corpus"
# Folder that holds the dataset on the mounted Google Drive.
corpus = os.path.join(
    "/content/drive/MyDrive/Skripsi/Code/data",
    corpus_name,
)

In [None]:
# File of tab-separated sentence pairs, read later by TextPreprocessing.readVocs.
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
# Output folder on the mounted drive (os.path.join with a single argument
# returns that argument unchanged, so the plain string is equivalent).
save_dir = "/content/drive/MyDrive/Skripsi"

#Text Preprocessing

## Vocabulary 

In [None]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary that maps words to integer indices and back.

    Indices 0-2 are reserved for the PAD, SOS and EOS tokens; every new word
    receives the next free index in order of first appearance.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, name):
        """Create an empty vocabulary.

        Args:
            name: Label stored on the instance as ``self.name``.
        """
        self.name = name
        self.word2index = {}  # word -> index
        self.word2count = {}  # word -> number of times it was added
        self.index2word = {PAD_token: "PAD", 
                           SOS_token: "SOS",
                           EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Add every single-space-separated word of ``sentence``."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``, or bump its count when it is already known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Previously repeated words were ignored, leaving every count at 1.
            self.word2count[word] += 1

## Text Preprocessing

In [None]:
MAX_LENGTH = 10  # Maximum sentence length to consider
class TextPreprocessing:
  """Namespace of text-cleaning and corpus-loading helpers.

  All helpers are static: they are called as ``TextPreprocessing.name(...)``
  and keep no state on the class.
  """

  @staticmethod
  def unicodeToAscii(s):
      """Strip combining marks (Unicode category ``Mn``) after NFD-normalizing ``s``."""
      return ''.join(
          c for c in unicodedata.normalize('NFD', s)
          if unicodedata.category(c) != 'Mn')

  @staticmethod
  def normalizeString(s):
      """Lowercase ``s``, drop accents, and keep only a-z words separated by single spaces."""
      s = TextPreprocessing.unicodeToAscii(s.lower().strip())
      s = re.sub(r"[^a-z]+", r" ", s)
      s = re.sub(r"\s+", r" ", s).strip()
      return s

  @staticmethod
  def readVocs(datafile, corpus_name):
      """Read ``datafile`` and return ``(voc, pairs)``.

      Each line is split on tabs and every field is normalized. The returned
      ``Voc`` is empty; it is filled by ``loadPrepareData``.
      """
      # Context manager so the file handle is closed deterministically.
      with open(datafile, encoding='utf-8') as f:
          lines = f.read().strip().split('\n')
      pairs = [[TextPreprocessing.normalizeString(s) for s in l.split('\t')] for l in lines]
      voc = Voc(corpus_name)
      return voc, pairs

  @staticmethod
  def filterPair(p):
      """Return True when both sentences of ``p`` have fewer than MAX_LENGTH words.

      Rows with fewer than two fields (e.g. a blank line or an empty file)
      are rejected instead of raising IndexError.
      """
      return (len(p) >= 2
              and len(p[0].split(' ')) < MAX_LENGTH
              and len(p[1].split(' ')) < MAX_LENGTH)

  @staticmethod
  def filterPairs(pairs):
      """Keep only the pairs accepted by ``filterPair``."""
      return [pair for pair in pairs if TextPreprocessing.filterPair(pair)]

  @staticmethod
  def dropNull(pairs):
      """Drop pairs where either sentence is empty (or a field is missing)."""
      return [pair for pair in pairs
              if len(pair) >= 2 and pair[0] != '' and pair[1] != '']

  @staticmethod
  def loadPrepareData(corpus_name, datafile):
      """Load, filter and index the corpus.

      Returns:
          ``(voc, pairs)``: the populated vocabulary and the surviving pairs.
      """
      voc, pairs = TextPreprocessing.readVocs(datafile, corpus_name)
      pairs = TextPreprocessing.filterPairs(pairs)
      pairs = TextPreprocessing.dropNull(pairs)
      for pair in pairs:
          voc.addSentence(pair[0])
          voc.addSentence(pair[1])
      return voc, pairs
# Build the vocabulary and the filtered sentence pairs from the corpus file.
# The result is bound to `pairs` because both the split below and the training
# cell read that name; previously `pairs` was reset to an empty list, so the
# split received no samples.
voc, pairs = TextPreprocessing.loadPrepareData(corpus_name, datafile)
pair = pairs  # alias kept for code that still refers to the old name

# Preview the first ten pairs, one per line (printing `pair` inside the loop
# dumped the entire list on every iteration).
for sentences in pairs[:10]:
    print(sentences)

# Hold out 10% of the pairs; fixed random_state keeps the split repeatable.
pairs_train, pairs_test = train_test_split(pairs, test_size=0.1, random_state=42)

['gosh if only we could find kat a boyfriend', 'let me see what i can do']
['c esc ma tete this is my head', 'right see you re ready for the quiz']
['that s because it s such a nice one', 'forget french']
['there', 'where']
['you have my word as a gentleman', 'you re sweet']
['hi', 'looks like things worked out tonight huh']
['you know chastity', 'i believe we share an art instructor']
['have fun tonight', 'tons']
['well no', 'then that s all you had to say']
['then that s all you had to say', 'but']


#Data adjustment

In [None]:
import itertools
class DataAdjustment:
  """Namespace of helpers that turn sentence pairs into padded index tensors.

  All helpers are static and are called as ``DataAdjustment.name(...)``.
  """

  @staticmethod
  def tokenization(voc, sentence):
      """Map each single-space-separated word to its index and append EOS_token.

      Raises:
          KeyError: if a word is missing from ``voc.word2index``.
      """
      return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

  @staticmethod
  def zeroPadding(l, fillvalue=PAD_token):
      """Transpose the index lists in ``l``, padding short ones with ``fillvalue``.

      Returns a list of tuples: one tuple per time step, one entry per sequence.
      """
      return list(itertools.zip_longest(*l, fillvalue=fillvalue))

  @staticmethod
  def dataVar(l, voc, con = True):
      """Tokenize and pad the sentences in ``l`` into a LongTensor.

      Returns:
          ``(padVar, lengths)`` when ``con`` is true, where ``lengths`` is a
          tensor of per-sentence token counts (EOS included); otherwise
          ``(padVar, max_target_len)`` with the longest token count as an int.
      """
      indexes_batch = [DataAdjustment.tokenization(voc, sentence) for sentence in l]
      padList = DataAdjustment.zeroPadding(indexes_batch)
      padVar = torch.LongTensor(padList)
      if con:
        lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
        return padVar, lengths
      else:
        max_target_len = max([len(indexes) for indexes in indexes_batch])
        return padVar, max_target_len

  @staticmethod
  def adjustBatchData(voc, pair_batch):
      """Convert a list of ``[input, output]`` sentence pairs into batch tensors.

      Pairs are ordered by input length, longest first, before padding.
      The caller's list is left untouched (a sorted copy is used).

      Returns:
          ``(inp, lengths, output, max_target_len)``.
      """
      # sorted() is stable like list.sort(), so the resulting order is the
      # same as before, but the argument is no longer mutated.
      ordered = sorted(pair_batch, key=lambda x: len(x[0].split(" ")), reverse=True)
      input_batch = [pair[0] for pair in ordered]
      output_batch = [pair[1] for pair in ordered]
      inp, lengths = DataAdjustment.dataVar(input_batch, voc)
      output, max_target_len = DataAdjustment.dataVar(output_batch, voc, False)
      return inp, lengths, output, max_target_len

  @staticmethod
  def batching(batch_size, iterable):
      """Lazily yield consecutive chunks of ``batch_size`` items from ``iterable``.

      The final chunk is shorter when the item count is not a multiple of
      ``batch_size``. ``None`` is used internally as the fill value, so
      ``None`` items in ``iterable`` are dropped.
      """
      # The same iterator repeated batch_size times makes zip_longest pull
      # batch_size consecutive items per output tuple.
      args = [iter(iterable)] * batch_size
      return ([e for e in t if e is not None] for t in itertools.zip_longest(*args))

# small_batch_size = 5
# batches = DataAdjustment.batch2TrainData(voc, [random.choice(pairs) for _ in 
#                                 range(small_batch_size)])
# input_variable, lengths, target_variable, mask, max_target_len = batches

# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)

# Sequence-to-sequence Architecture

## ENCODER

In [None]:
class EncoderGRU(nn.Module):
    """Bidirectional GRU encoder over embedded, length-packed input sequences.

    The GRU's input size equals ``hidden_size``, so ``embedding`` must produce
    vectors of that width.
    """

    def __init__(self, hidden_size, embedding):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: Padded index tensor fed to the embedding.
            input_lengths: Per-sequence lengths handed to
                ``pack_padded_sequence`` (default settings, so they are
                expected in decreasing order).
            hidden: Optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is the element-wise sum of
            the forward and backward direction outputs, and ``hidden`` is the
            GRU's final hidden state.
        """
        packed_input = nn.utils.rnn.pack_padded_sequence(
            self.embedding(input_seq), input_lengths)
        packed_output, hidden = self.gru(packed_input, hidden)
        padded, _ = nn.utils.rnn.pad_packed_sequence(packed_output)
        # The last dimension holds [forward | backward] features; add the halves.
        width = self.hidden_size
        forward_part = padded[:, :, :width]
        backward_part = padded[:, :, width:]
        return forward_part + backward_part, hidden

## Decoder + Attention implementation

In [None]:
class AttnDecoderGRU(nn.Module):
    """Single-step GRU decoder with dot-product attention over encoder outputs.

    Each call to ``forward`` consumes one decoder input step and returns a
    probability distribution over ``output_size`` classes.
    """

    def __init__(self, embedding, hidden_size, 
                 output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        # Layers (created in this order so parameter initialisation is stable)
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=False)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input_step: Index tensor for the current step, fed to the embedding.
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs attended over.

        Returns:
            ``(output, hidden)``: softmax probabilities (not log-probabilities)
            over the output classes, and the new GRU hidden state.
        """
        step_embedding = self.embedding(input_step)

        # One step through the unidirectional GRU
        rnn_out, hidden = self.gru(step_embedding, last_hidden)

        # Dot-product score between the GRU output and every encoder output,
        # transposed so the softmax runs along dim 1.
        scores = (rnn_out * encoder_outputs).sum(dim=2).t()
        weights = F.softmax(scores, dim=1).unsqueeze(1)

        # Attention-weighted sum of the encoder outputs
        context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)

        # Combine GRU output and context, then project to class probabilities
        combined = torch.cat((rnn_out.squeeze(0), context), 1)
        attentional = torch.tanh(self.concat(combined))
        probabilities = F.softmax(self.out(attentional), dim=1)
        return probabilities, hidden

#Train-Test Step

## Train step

In [None]:
def train(input_variable, lengths, target_variable, max_target_len, 
          encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, 
          batch_size, clip, max_length=MAX_LENGTH):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        input_variable: Padded input index tensor, one column per sequence.
        lengths: Per-sequence input lengths (moved to CPU for packing).
        target_variable: Padded target index tensor, indexed by time step.
        max_target_len: Number of decoding steps to run.
        encoder, decoder: Models to train.
        embedding: Unused here; kept for interface compatibility.
        encoder_optimizer, decoder_optimizer: Optimizers stepped after backward.
        batch_size: Nominal batch size. The decoder's start row is sized from
            the actual batch instead, so a shorter batch also works.
        clip: Max gradient norm applied to each model separately.
        max_length: Unused here; kept for interface compatibility.

    Returns:
        The mean of the per-step loss values (a Python float).

    Relies on the module-level ``criterion``, ``device`` and ``SOS_token``.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    lengths = lengths.to("cpu")

    loss = 0
    print_losses = []

    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Size the SOS row from the real batch width (dim 1 of the padded input)
    # rather than the nominal batch_size, which is wrong for a partial batch
    # and made the decoder GRU fail on mismatched batch dimensions.
    actual_batch_size = input_variable.size(1)
    decoder_input = torch.full((1, actual_batch_size), SOS_token,
                               dtype=torch.long, device=device)

    # The decoder GRU is unidirectional with one layer, so only the first
    # slice of the bidirectional encoder's hidden state is passed on.
    decoder_hidden = encoder_hidden[:1]

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        # Teacher forcing: next input is current target
        decoder_input = target_variable[t].view(1, -1)
        # The decoder returns probabilities; criterion expects log-probabilities.
        # NOTE(review): log of a softmax output can be -inf if a probability
        # underflows to 0 — consider having the decoder emit log_softmax.
        decoder_output = torch.log(decoder_output)
        mask_loss = criterion(decoder_output,
                                        target_variable[t])
        loss += mask_loss
        print_losses.append(mask_loss.item())

    loss.backward()

    # Clip gradients in place to limit exploding gradients
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses)/max_target_len

##Test Step **EVALUATION**

In [None]:
def test(input_variable, lengths, target_variable, max_target_len, 
          encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, 
          batch_size, clip, max_length=MAX_LENGTH):
    """Compute the mean per-step loss of a batch under greedy decoding.

    No parameters are updated. Both models are switched to eval mode for the
    whole forward pass and put back into train mode before returning (also
    when an exception is raised).

    Args:
        input_variable: Padded input index tensor, one column per sequence.
        lengths: Per-sequence input lengths (moved to CPU for packing).
        target_variable: Padded target index tensor, indexed by time step.
        max_target_len: Number of decoding steps to run.
        encoder, decoder: Models to evaluate.
        embedding, clip, max_length: Unused here; kept for interface
            compatibility.
        encoder_optimizer, decoder_optimizer: Only their gradients are
            cleared, as before; no step is taken.
        batch_size: Nominal batch size. The decoder's start row is sized from
            the actual batch instead.

    Returns:
        The mean of the per-step loss values (a Python float).

    Relies on the module-level ``criterion``, ``device`` and ``SOS_token``.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    lengths = lengths.to("cpu")

    print_losses = []

    encoder.eval()
    decoder.eval()
    try:
        # The encoder pass now also runs in eval mode and without autograd
        # tracking; previously it ran before eval() and built a graph that
        # was never used.
        with torch.no_grad():
            encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

            # Size the SOS row from the real batch width (dim 1 of the input).
            actual_batch_size = input_variable.size(1)
            decoder_input = torch.full((1, actual_batch_size), SOS_token,
                                       dtype=torch.long, device=device)
            decoder_hidden = encoder_hidden[:1]

            for t in range(max_target_len):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden, encoder_outputs
                )
                # Greedy decoding: feed back the most probable token of each
                # sequence. topk(1) yields one index per row, so a reshape
                # replaces the former per-element Python loop.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.view(1, -1)
                # The decoder returns probabilities; criterion expects
                # log-probabilities.
                decoder_output = torch.log(decoder_output)
                mask_loss = criterion(decoder_output,
                                      target_variable[t])
                print_losses.append(mask_loss.item())
    finally:
        encoder.train()
        decoder.train()
    return sum(print_losses) / max_target_len



---
#Result


---



##Save State for (1536-128-15-0.0001)

In [None]:
# Hyper-parameters for this run (hidden size - batch size - epochs - learning rate)
hidden_size = 1536
batch_size = 128
epoch = 15
learning_rate = 0.0001
clip = 50.0
teacher_forcing_ratio = 1.0  # not read by train(), which always teacher-forces
loss_list = []  # one loss value per training iteration, across all epochs
start_iteration = 1
print_loss = 0

torch.manual_seed(1)

print('Building encoder and decoder ...')
# Initialize word embeddings (shared by the encoder and the decoder)
embedding = nn.Embedding(voc.num_words, hidden_size)

# Initialize encoder & decoder models
encoder = EncoderGRU(hidden_size, embedding)
decoder = AttnDecoderGRU(embedding, 
                         hidden_size, 
                         voc.num_words)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)

encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

# Padding positions do not contribute to the loss.
criterion = nn.NLLLoss(ignore_index=PAD_token, reduction='mean')

# NOTE(review): batches are built from `pairs` (the full set), not
# `pairs_train` — confirm whether the held-out pairs should be excluded.
train_batch = list(DataAdjustment.batching(batch_size, pairs))
# Keep only full batches: train() is called with a fixed batch_size. Unlike
# the previous "drop the last chunk" rule, a final chunk that happens to be
# full is kept.
training_batches = [DataAdjustment.adjustBatchData(voc, list(batch))
                    for batch in train_batch
                    if len(batch) == batch_size]
num_batches = len(training_batches)

for ep in range(epoch):
  random.shuffle(training_batches)
  print("Epoch {}".format(ep+1))
  #data train session 
  # Iterations are 1-based and now cover every prepared batch; the former
  # upper bound stopped one short, silently skipping a batch each epoch.
  for iteration in range(start_iteration, num_batches + 1):
            training_batch = training_batches[iteration - 1]

            input_variable, lengths, target_variable, max_target_len = training_batch
            
            loss = train(input_variable, lengths, target_variable, 
                        max_target_len, encoder, decoder, embedding, 
                        encoder_optimizer, decoder_optimizer, batch_size, clip)
            # Record the history instead of overwriting the list with a float.
            loss_list.append(loss)
            print("\tIteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / num_batches * 100, loss))

# Save checkpoint
# directory = os.path.join(save_dir)
# if not os.path.exists(directory):
#     os.makedirs(directory)
# torch.save({
#     'en': encoder.state_dict(),
#     'de': decoder.state_dict(),
#     'en_opt': encoder_optimizer.state_dict(),
#     'de_opt': decoder_optimizer.state_dict(),
#     'voc_dict': voc.__dict__,
#     'embedding': embedding.state_dict()
# }, os.path.join(directory, '3{}-{}-{}-{}.tar'.format(hidden_size, batch_size, epoch, learning_rate)))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
	Iteration: 256; Percent complete: 39.0%; Average loss: 2.2811
	Iteration: 257; Percent complete: 39.1%; Average loss: 2.4702
	Iteration: 258; Percent complete: 39.3%; Average loss: 2.4026
	Iteration: 259; Percent complete: 39.4%; Average loss: 2.4296
	Iteration: 260; Percent complete: 39.6%; Average loss: 2.3145
	Iteration: 261; Percent complete: 39.7%; Average loss: 2.2819
	Iteration: 262; Percent complete: 39.9%; Average loss: 2.1498
	Iteration: 263; Percent complete: 40.0%; Average loss: 2.7428
	Iteration: 264; Percent complete: 40.2%; Average loss: 2.4293
	Iteration: 265; Percent complete: 40.3%; Average loss: 2.3843
	Iteration: 266; Percent complete: 40.5%; Average loss: 1.8575
	Iteration: 267; Percent complete: 40.6%; Average loss: 2.1292
	Iteration: 268; Percent complete: 40.8%; Average loss: 2.2363
	Iteration: 269; Percent complete: 40.9%; Average loss: 3.0284
	Iteration: 270; Percent complete: 41.1%; Average los

###Response

In [None]:
class DecoderPredict(nn.Module):
    """Greedy decoder: at each step, pick the most probable token and feed it back.

    The start token is a 1x1 tensor, so a single sequence is decoded per call.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for ``input_seq``.

        Returns:
            ``(all_tokens, all_scores)``: the chosen token index and its
            decoder output value for every step, each as a 1-D tensor.
            Decoding always runs for ``max_length`` steps; it does not stop
            at an end-of-sentence token.
        """
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Only the first slice of the encoder's hidden state seeds the decoder.
        decoder_hidden = encoder_hidden[:1]
        decoder_input = torch.full((1, 1), SOS_token,
                                   dtype=torch.long, device=device)

        # Start from empty tensors so max_length == 0 still yields empty results.
        token_chunks = [torch.zeros([0], device=device, dtype=torch.long)]
        score_chunks = [torch.zeros([0], device=device)]
        for _step in range(max_length):
            step_output, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            best_score, best_token = torch.max(step_output, dim=1)
            token_chunks.append(best_token)
            score_chunks.append(best_score)
            # Restore the leading step dimension for the next decoder call.
            decoder_input = best_token.unsqueeze(0)

        all_tokens = torch.cat(token_chunks, dim=0)
        all_scores = torch.cat(score_chunks, dim=0)
        return all_tokens, all_scores

In [None]:
# Switch both models to inference mode and wrap them in the greedy searcher.
encoder.eval()
decoder.eval()
searcher = DecoderPredict(encoder, decoder)
class InputProcessing:
  """Namespace of helpers that turn an input sentence into a bot response.

  All helpers are static and are called as ``InputProcessing.name(...)``.
  """

  @staticmethod
  def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode ``sentence`` with ``searcher`` and return the output words.

    Args:
        encoder, decoder: Unused here (``searcher`` already wraps them);
            kept for interface compatibility.
        searcher: Callable taking ``(input_batch, lengths, max_length)`` and
            returning ``(tokens, scores)``.
        voc: Vocabulary used to index the input and look up output words.
        sentence: Already-normalized input sentence.
        max_length: Number of tokens to decode.

    Returns:
        List of decoded words, including any ``EOS``/``PAD`` markers.

    Raises:
        KeyError: if ``sentence`` contains a word unknown to ``voc``.
    """
    indexes_batch = [DataAdjustment.tokenization(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose so the single sentence becomes one column (time-major layout).
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words

  @staticmethod
  def response_only(input_sentence):
      """Normalize ``input_sentence`` and return the bot's reply as one string.

      The reply is cut at the first ``EOS`` or ``PAD`` marker. Uses the
      module-level ``encoder``, ``decoder``, ``searcher`` and ``voc``.
      """
      input_sentence = TextPreprocessing.normalizeString(input_sentence)
      output_words = InputProcessing.evaluate(encoder, decoder, searcher, voc, input_sentence)
      outword = []
      for i in output_words:
        if i == 'EOS' or i =='PAD':
          break
        else:
          outword.append(i)
      return ' '.join(outword)

In [None]:
# (pattern, replacement) rules that re-attach contraction suffixes which
# normalization split off as separate words (e.g. "you re" -> "you're").
# The lookahead accepts a following space OR the end of the string, so a
# suffix at the very end of a reply is restored too; the previous patterns
# required a trailing space and missed that case.
_CONTRACTION_RULES = (
    (r" ll(?= |$)", "'ll"),
    (r" t(?= |$)", "'t"),
    (r" d(?= |$)", "'d"),
    (r" re(?= |$)", "'re"),
    (r" s(?= |$)", "'s"),
    (r" m(?= |$)", " am"),
    (r" ve(?= |$)", "'ve"),
)


def _restore_contractions(text):
    """Apply every contraction rule to ``text``, in order, and return the result."""
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    return text


def set_response():
    """Print the bot's reply to a fixed list of sample prompts.

    Each prompt is normalized, decoded with the module-level ``encoder``,
    ``decoder``, ``searcher`` and ``voc``, truncated at the first ``EOS``
    (``PAD`` markers are skipped), and printed with contractions restored.
    """
    input_list = ['can play music?','Hello','how are you?','can you smile?', 'good morning', 'what is your name?', 'are you okay?', 'thanks', 'can you help me?', 'do you love me?', 'what are you doing?', 'i love you', 'good night', 'bye']
    out_dict = {}
    for prompt in input_list:
      input_sentence = TextPreprocessing.normalizeString(prompt)
      output_words = InputProcessing.evaluate(encoder, decoder, searcher, voc, input_sentence)

      outword = []
      for word in output_words:
        if word == 'EOS':
          break
        elif word != 'PAD':
          outword.append(word)
      out_dict[prompt] = _restore_contractions(' '.join(outword))

    for prompt in input_list:
      print("Human :", prompt)
      print("Bot   :", out_dict[prompt])
set_response()

Human : can play music?
Bot   : sure
Human : Hello
Bot   : hello
Human : how are you?
Bot   : i am fine
Human : can you smile?
Bot   : why yes he is he does he
Human : good morning
Bot   : morning neighbors morning
Human : what is your name?
Bot   : my name is sir robin of camelot
Human : are you okay?
Bot   : i am fine
Human : thanks
Bot   : you're welcome
Human : can you help me?
Bot   : how can i help you
Human : do you love me?
Bot   : you mean i love you
Human : what are you doing?
Bot   : i am trying to slash my wrists
Human : i love you
Bot   : i love you too
Human : good night
Bot   : good night
Human : bye
Bot   : bye
