<a href="https://colab.research.google.com/github/aanchal0431/chatbot/blob/main/SEP_728_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chatbot

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
print(tf.__version__)

2.6.0


Git Commands to clone repository, pull and push data

In [38]:
#!git clone https://github.com/aanchal0431/chatbot.git
#!git pull
#%cd chatbot/
#!git config --global user.name "aanchal0431"
#!git config --global user.email "aanchal0431@gmail.com"
#!git remote add aanchal0431 https://github.com/aanchal0431/chatbot.git
#!git --help
#!git remote -v
#!git init
#%ls
#!git add 
#!git status 
#!git commit -m 'New Commit'

^C


### Data Preprocessing

*   Load datasets
*   Append question and answer datasets
*   Remove duplicate questions
*   Convert data to lower case
*   Split into train and test
*   Drop irrelvant columns






In [4]:
cur_path = 'Data/Question_Answer_Dataset_v1.2/'
data_s8 = pd.read_csv(cur_path + 'S08/question_answer_pairs.txt', delimiter="\t")
data_s9 = pd.read_csv(cur_path + 'S09/question_answer_pairs.txt', delimiter="\t")
data_s10 = pd.read_csv(cur_path + 'S10/question_answer_pairs.txt', delimiter="\t")
print("Shape s8:", data_s8.shape)
print("Shape s9:", data_s9.shape)
print("Shape s10:", data_s10.shape)
data_s8.head()

Shape s8: (1715, 6)
Shape s9: (825, 6)
Shape s10: (1458, 6)


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,data/set3/a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,data/set3/a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,data/set3/a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,data/set3/a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,data/set3/a4


In [5]:
#append all questions into one data set
data_all = data_s8.append(data_s9.append(data_s10))
#data_all = data_s8.append(data_s9)
print("Shape:", data_all.shape)
data_all.head()


Shape: (3998, 6)


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,data/set3/a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,data/set3/a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,data/set3/a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,data/set3/a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,data/set3/a4


In [6]:
#remove duplicate questions
data_all = data_all.drop_duplicates(subset=['Question'])
print("Shape:", data_all.shape)
data_all.head()



Shape: (2457, 6)


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,data/set3/a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,data/set3/a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,data/set3/a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,data/set3/a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,data/set3/a4


In [7]:
# convert text to lower case
data_all = data_all.apply(lambda x: x.astype(str).str.lower())
data_all.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,abraham_lincoln,was abraham lincoln the sixteenth president of...,yes,easy,easy,data/set3/a4
2,abraham_lincoln,did lincoln sign the national banking act of 1...,yes,easy,medium,data/set3/a4
4,abraham_lincoln,did his mother die of pneumonia?,no,easy,medium,data/set3/a4
6,abraham_lincoln,how many long was lincoln's formal education?,18 months,medium,easy,data/set3/a4
8,abraham_lincoln,when did lincoln begin his political career?,1832,medium,easy,data/set3/a4


In [39]:
#Tokenization Example for Questions
# import APIs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# segregating questions and answers into different lists
questions = list(data_all['Question'])
answers = list(data_all['Answer'])

#initialize the tokenizer
tokenizer = Tokenizer(oov_token="<oov>") #oov: out of vocabulary token => used to handle newly encountered words in the training dataset, new

#creating word index
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1

#create sequences using tokenizer
sequences = tokenizer.texts_to_sequences(questions)

#padding the sequences to make sure length matches for all the sequences
padded_sequences_questions = pad_sequences(sequences, padding='pre')

print("Vocabulary size for questions:", vocab_size)
#print(word_index)
#print(sequences)
print(padded_sequences_questions.shape)


Vocabulary size for questions: 4604
(2457, 99)


In [40]:
#Tokenization Example for Answers

#creating word index
tokenizer.fit_on_texts(answers)
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1

#create sequences using tokenizer
sequences = tokenizer.texts_to_sequences(answers)

#padding the sequences to make sure length matches for all the sequences
padded_sequences_answers = pad_sequences(sequences, padding='pre')

print("Vocabulary size for questions:", vocab_size)
#print(word_index)
#print(sequences)
print(padded_sequences_answers.shape)


Vocabulary size for questions: 5817
(2457, 156)


In [10]:
# divide into train and test
X_train, X_test, y_train, y_test = train_test_split(data_s8['Question'], data_s8['Answer'],
          shuffle=True, test_size=0.1, random_state=5)


In [11]:
#Format for simpleT5
train = pd.DataFrame({'source_text': X_train, 'target_text': y_train})
# Add prefix for simpleT5
train['source_text']    = "answer question:" + train['source_text']
test = pd.DataFrame({'source_text': X_test, 'target_text': y_test}) 
test['source_text']    = "answer question:" + test['source_text']
train.head()

Unnamed: 0,source_text,target_text
1214,answer question:Are otters playful animals?,yes
123,answer question:Did the scientific community n...,yes
1084,answer question:How many municipalities are wi...,6.
917,answer question:What information did he record...,He wrote descriptions of events and impression...
823,"answer question:What does ""Era of Good Feeling...","Monroe allowed his political base to decay, wh..."


### Train a Simple Model
A pretrained t5 model is used to test the question/answer process. No tokenization or context is required for this model.

In [12]:
pip install --upgrade simplet5



In [20]:
# import
from simplet5 import SimpleT5


# instantiate
model = SimpleT5()

# load (supports t5, mt5, byT5 models)
model.from_pretrained("t5","t5-base")

# train
model.train(train_df=train.applymap(str), # pandas dataframe with 2 columns: source_text & target_text
            eval_df=test.applymap(str), # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 64, #Issue: not sure of max len - 512
            target_max_token_len = 32, #Issue: not sure of max len - 128
            batch_size = 8,
            max_epochs = 1,
            use_gpu = False,
            #outputdir = '/model/simpleT5',
            early_stopping_patience_epochs = 0,
            )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 42
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [22]:
#Issue:Unable to load and predict with model
# load trained T5 model
model.load_model("t5",'outputs/simplet5-epoch-0-train-loss-3.0084', use_gpu=False)
# for each test data perform prediction
model.predict("Did Lincoln start his political career in 1832?")

['nan']