# **Custom ChatBot**

In [None]:
# Import packages

import json
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load data
with open('intents.json') as file:
    data = json.load(file)

In [None]:
# Visualize data
data

{'intents': [{'tag': 'greeting',
   'patterns': ['Hi there',
    'How are you',
    'Is anyone there?',
    'Hey',
    'Hola',
    'Hello',
    'Good day'],
   'responses': ['Hello, thanks for asking',
    'Good to see you again',
    'Hi there, how can I help?'],
   'context': ['']},
  {'tag': 'goodbye',
   'patterns': ['Bye',
    'See you later',
    'Goodbye',
    'Nice chatting to you, bye',
    'Till next time'],
   'responses': ['See you!', 'Have a nice day', 'Bye! Come back again soon.'],
   'context': ['']},
  {'tag': 'thanks',
   'patterns': ['Thanks',
    'Thank you',
    "That's helpful",
    'Awesome, thanks',
    'Thanks for helping me'],
   'responses': ['Happy to help!', 'Any time!', 'My pleasure'],
   'context': ['']},
  {'tag': 'noanswer',
   'patterns': [],
   'responses': ["Sorry, can't understand you",
    'Please give me more info',
    'Not sure I understand'],
   'context': ['']},
  {'tag': 'options',
   'patterns': ['How you could help me?',
    'What you can do

In [None]:
# Load items
training_sentences = []
training_labels = []
labels = []
responses = []
for intent in data['intents']:
    for patterns in intent['patterns']:
        training_sentences.append(patterns)
        training_labels.append(intent['tag'])
    responses.append(intent['responses'])

    if intent['tag'] not in labels:
        labels.append(intent['tag'])


In [None]:
training_sentences

['Hi there',
 'How are you',
 'Is anyone there?',
 'Hey',
 'Hola',
 'Hello',
 'Good day',
 'Bye',
 'See you later',
 'Goodbye',
 'Nice chatting to you, bye',
 'Till next time',
 'Thanks',
 'Thank you',
 "That's helpful",
 'Awesome, thanks',
 'Thanks for helping me',
 'How you could help me?',
 'What you can do?',
 'What help you provide?',
 'How you can be helpful?',
 'What support is offered',
 'How to check Adverse drug reaction?',
 'Open adverse drugs module',
 'Give me a list of drugs causing adverse behavior',
 'List all drugs suitable for patient with adverse reaction',
 'Which drugs dont have adverse reaction?',
 'Open blood pressure module',
 'Task related to blood pressure',
 'Blood pressure data entry',
 'I want to log blood pressure results',
 'Blood pressure data management',
 'I want to search for blood pressure result history',
 'Blood pressure for patient',
 'Load patient blood pressure result',
 'Show blood pressure results for patient',
 'Find blood pressure results by

In [None]:
training_labels

['greeting',
 'greeting',
 'greeting',
 'greeting',
 'greeting',
 'greeting',
 'greeting',
 'goodbye',
 'goodbye',
 'goodbye',
 'goodbye',
 'goodbye',
 'thanks',
 'thanks',
 'thanks',
 'thanks',
 'thanks',
 'options',
 'options',
 'options',
 'options',
 'options',
 'adverse_drug',
 'adverse_drug',
 'adverse_drug',
 'adverse_drug',
 'adverse_drug',
 'blood_pressure',
 'blood_pressure',
 'blood_pressure',
 'blood_pressure',
 'blood_pressure',
 'blood_pressure_search',
 'blood_pressure_search',
 'blood_pressure_search',
 'blood_pressure_search',
 'blood_pressure_search',
 'pharmacy_search',
 'pharmacy_search',
 'pharmacy_search',
 'pharmacy_search',
 'pharmacy_search',
 'hospital_search',
 'hospital_search',
 'hospital_search',
 'hospital_search',
 'hospital_search']

In [None]:
responses

[['Hello, thanks for asking',
  'Good to see you again',
  'Hi there, how can I help?'],
 ['See you!', 'Have a nice day', 'Bye! Come back again soon.'],
 ['Happy to help!', 'Any time!', 'My pleasure'],
 ["Sorry, can't understand you",
  'Please give me more info',
  'Not sure I understand'],
 ['I can guide you through Adverse drug reaction list, Blood pressure tracking, Hospitals and Pharmacies',
  'Offering support for Adverse drug reaction, Blood pressure, Hospitals and Pharmacies'],
 ['Navigating to Adverse drug reaction module'],
 ['Navigating to Blood Pressure module'],
 ['Please provide Patient ID', 'Patient ID?'],
 ['Loading Blood pressure result for Patient'],
 ['Please provide pharmacy name'],
 ['Loading pharmacy details'],
 ['Please provide hospital name or location'],
 ['Please provide hospital type'],
 ['Loading hospital details']]

In [None]:
labels

['greeting',
 'goodbye',
 'thanks',
 'noanswer',
 'options',
 'adverse_drug',
 'blood_pressure',
 'blood_pressure_search',
 'search_blood_pressure_by_patient_id',
 'pharmacy_search',
 'search_pharmacy_by_name',
 'hospital_search',
 'search_hospital_by_params',
 'search_hospital_by_type']

In [None]:
# Encoder training labels
encoder= LabelEncoder()
encoder.fit(training_labels)
training_labels = encoder.transform(training_labels)

In [None]:
training_labels

array([4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 5, 5,
       5, 5, 5])

In [None]:
# Encode sentence
vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>" # Out of Vocabulary

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
sequences

[[36, 24],
 [10, 37, 4],
 [25, 38, 24],
 [39],
 [40],
 [41],
 [42, 43],
 [26],
 [44, 4, 45],
 [46],
 [47, 48, 6, 4, 26],
 [49, 50, 51],
 [14],
 [52, 4],
 [53, 27],
 [54, 14],
 [14, 5, 55, 11],
 [10, 4, 56, 28, 11],
 [15, 4, 29, 57],
 [15, 28, 4, 58],
 [10, 4, 29, 59, 27],
 [15, 60, 25, 61],
 [10, 6, 62, 8, 63, 16],
 [30, 8, 12, 31],
 [64, 11, 32, 17, 33, 12, 65, 8, 66],
 [17, 67, 12, 68, 5, 7, 69, 8, 16],
 [70, 12, 71, 72, 8, 16],
 [30, 2, 3, 31],
 [73, 74, 6, 2, 3],
 [2, 3, 18, 75],
 [19, 20, 6, 76, 2, 3, 21],
 [2, 3, 18, 77],
 [19, 20, 6, 22, 5, 2, 3, 34, 78],
 [2, 3, 5, 7],
 [79, 7, 2, 3, 34],
 [80, 2, 3, 21, 5, 7],
 [23, 2, 3, 21, 81, 82],
 [23, 11, 32, 13],
 [23, 13],
 [17, 33, 83, 84],
 [85, 13],
 [22, 13],
 [35, 5, 9],
 [86, 5, 9, 6, 87, 7],
 [19, 20, 6, 22, 9, 18],
 [9, 35, 5, 7],
 [88, 89, 9, 90]]

In [None]:
# Align sequences
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)
padded_sequences

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0, 36, 24],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 10, 37,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 25, 38, 24],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 39],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 40],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 41],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0, 42, 43],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 26],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 44,  4, 45],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 46],
       [ 0,  0,  0,  0,  0,  0

In [None]:
# Buil model
num_classes = 9
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 16)            16000     
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 9)                 153       
                                                                 
Total params: 16697 (65.22 KB)
Trainable params: 16697 (65.22 KB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [None]:
epochs = 500
history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [None]:
!pip install colorama

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [None]:
import colorama
colorama.init()
from colorama import Fore, Style, Back
import time
import random
import pickle

with open("intents.json") as file:
    data = json.load(file)


def chat():

    # parameters
    max_len = 20

    while True:
        print(Fore.LIGHTBLUE_EX + "User: " + Style.RESET_ALL, end="")
        inp = input()
        if inp.lower() == "quit":
            break

        result = model.predict(keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences([inp]),
                                             truncating='post', maxlen=max_len), verbose=0)
        tag = encoder.inverse_transform([np.argmax(result)])

        for i in data['intents']:
            if i['tag'] == tag:
                seed = int(time.time())
                np.random.seed(seed)
                print(Fore.GREEN + "ChatBot:" + Style.RESET_ALL , np.random.choice(i['responses']))

        # print(Fore.GREEN + "ChatBot:" + Style.RESET_ALL,random.choice(responses))

print(Fore.YELLOW + "Start messaging with the bot (type quit to stop)!" + Style.RESET_ALL)


Start messaging with the bot (type quit to stop)!


In [None]:
training_sentences

['Hi there',
 'How are you',
 'Is anyone there?',
 'Hey',
 'Hola',
 'Hello',
 'Good day',
 'Bye',
 'See you later',
 'Goodbye',
 'Nice chatting to you, bye',
 'Till next time',
 'Thanks',
 'Thank you',
 "That's helpful",
 'Awesome, thanks',
 'Thanks for helping me',
 'How you could help me?',
 'What you can do?',
 'What help you provide?',
 'How you can be helpful?',
 'What support is offered',
 'How to check Adverse drug reaction?',
 'Open adverse drugs module',
 'Give me a list of drugs causing adverse behavior',
 'List all drugs suitable for patient with adverse reaction',
 'Which drugs dont have adverse reaction?',
 'Open blood pressure module',
 'Task related to blood pressure',
 'Blood pressure data entry',
 'I want to log blood pressure results',
 'Blood pressure data management',
 'I want to search for blood pressure result history',
 'Blood pressure for patient',
 'Load patient blood pressure result',
 'Show blood pressure results for patient',
 'Find blood pressure results by

In [None]:
chat()

User: hello
ChatBot: Hello, thanks for asking
User: hello
ChatBot: Good to see you again
User: sarch pharmacy
ChatBot: Please provide pharmacy name
User: i want to search hospital to transfer patient
ChatBot: Please provide hospital name or location
User: good bye
ChatBot: Have a nice day
User: good bye
ChatBot: See you!
User: good bye
ChatBot: Have a nice day
User: 

# **ChatBot with Encoder and Decoder**

In [None]:
# Import packages

import json
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [None]:
# Load data
with open('intents.json') as file:
    data = json.load(file)

In [None]:
# Visualize data
data

{'intents': [{'tag': 'greeting',
   'patterns': ['Hi there',
    'How are you',
    'Is anyone there?',
    'Hey',
    'Hola',
    'Hello',
    'Good day'],
   'responses': ['Hello, thanks for asking',
    'Good to see you again',
    'Hi there, how can I help?'],
   'context': ['']},
  {'tag': 'goodbye',
   'patterns': ['Bye',
    'See you later',
    'Goodbye',
    'Nice chatting to you, bye',
    'Till next time'],
   'responses': ['See you!', 'Have a nice day', 'Bye! Come back again soon.'],
   'context': ['']},
  {'tag': 'thanks',
   'patterns': ['Thanks',
    'Thank you',
    "That's helpful",
    'Awesome, thanks',
    'Thanks for helping me'],
   'responses': ['Happy to help!', 'Any time!', 'My pleasure'],
   'context': ['']},
  {'tag': 'noanswer',
   'patterns': [],
   'responses': ["Sorry, can't understand you",
    'Please give me more info',
    'Not sure I understand'],
   'context': ['']},
  {'tag': 'options',
   'patterns': ['How you could help me?',
    'What you can do

In [None]:
# Load items
items = []
for intent in data['intents']:
    for patterns in intent['patterns']:
        for responses in intent['responses']:
            items.append((patterns, responses))

items

[('Hi there', 'Hello, thanks for asking'),
 ('Hi there', 'Good to see you again'),
 ('Hi there', 'Hi there, how can I help?'),
 ('How are you', 'Hello, thanks for asking'),
 ('How are you', 'Good to see you again'),
 ('How are you', 'Hi there, how can I help?'),
 ('Is anyone there?', 'Hello, thanks for asking'),
 ('Is anyone there?', 'Good to see you again'),
 ('Is anyone there?', 'Hi there, how can I help?'),
 ('Hey', 'Hello, thanks for asking'),
 ('Hey', 'Good to see you again'),
 ('Hey', 'Hi there, how can I help?'),
 ('Hola', 'Hello, thanks for asking'),
 ('Hola', 'Good to see you again'),
 ('Hola', 'Hi there, how can I help?'),
 ('Hello', 'Hello, thanks for asking'),
 ('Hello', 'Good to see you again'),
 ('Hello', 'Hi there, how can I help?'),
 ('Good day', 'Hello, thanks for asking'),
 ('Good day', 'Good to see you again'),
 ('Good day', 'Hi there, how can I help?'),
 ('Bye', 'See you!'),
 ('Bye', 'Have a nice day'),
 ('Bye', 'Bye! Come back again soon.'),
 ('See you later', 'See

In [None]:
len(items)

91

In [None]:
# Get input and output
input_texts = [i[0] for i in items]
output_texts = [i[1] for i in items]

In [None]:
input_texts[:20]

['Hi there',
 'Hi there',
 'Hi there',
 'How are you',
 'How are you',
 'How are you',
 'Is anyone there?',
 'Is anyone there?',
 'Is anyone there?',
 'Hey',
 'Hey',
 'Hey',
 'Hola',
 'Hola',
 'Hola',
 'Hello',
 'Hello',
 'Hello',
 'Good day',
 'Good day']

In [None]:
output_texts[:20]

['Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again']

In [None]:
# Create input encoder, output encoder (add start token), output decoder (add end token)
input_encoder = input_texts
input_decoder = ['start '+text for text in output_texts]
output_decoder = [text+ ' end' for text in output_texts]
input_decoder[:10]

['start Hello, thanks for asking',
 'start Good to see you again',
 'start Hi there, how can I help?',
 'start Hello, thanks for asking',
 'start Good to see you again',
 'start Hi there, how can I help?',
 'start Hello, thanks for asking',
 'start Good to see you again',
 'start Hi there, how can I help?',
 'start Hello, thanks for asking']

In [None]:
output_decoder[:10]

['Hello, thanks for asking end',
 'Good to see you again end',
 'Hi there, how can I help? end',
 'Hello, thanks for asking end',
 'Good to see you again end',
 'Hi there, how can I help? end',
 'Hello, thanks for asking end',
 'Good to see you again end',
 'Hi there, how can I help? end',
 'Hello, thanks for asking end']

In [None]:
# Create tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( input_encoder + input_decoder + output_decoder )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 112


In [None]:
tokenizer.word_index

{'start': 1,
 'end': 2,
 'you': 3,
 'to': 4,
 'blood': 5,
 'pressure': 6,
 'for': 7,
 'adverse': 8,
 'reaction': 9,
 'provide': 10,
 'drug': 11,
 'please': 12,
 'patient': 13,
 'help': 14,
 'can': 15,
 'i': 16,
 'see': 17,
 'again': 18,
 'thanks': 19,
 'how': 20,
 'module': 21,
 'id': 22,
 'pharmacies': 23,
 'there': 24,
 'hospitals': 25,
 'and': 26,
 'navigating': 27,
 'name': 28,
 'hi': 29,
 'hello': 30,
 'good': 31,
 'bye': 32,
 'hospital': 33,
 'pharmacy': 34,
 'asking': 35,
 'day': 36,
 'nice': 37,
 'time': 38,
 'list': 39,
 'support': 40,
 'a': 41,
 'have': 42,
 'come': 43,
 'back': 44,
 'soon': 45,
 'happy': 46,
 'any': 47,
 'my': 48,
 'pleasure': 49,
 'guide': 50,
 'through': 51,
 'tracking': 52,
 'offering': 53,
 'or': 54,
 'location': 55,
 'me': 56,
 'what': 57,
 'is': 58,
 'helpful': 59,
 'results': 60,
 'drugs': 61,
 'want': 62,
 'search': 63,
 'result': 64,
 'find': 65,
 'are': 66,
 'anyone': 67,
 'hey': 68,
 'hola': 69,
 'later': 70,
 'goodbye': 71,
 'chatting': 72,
 'til

In [None]:
tokenizer.index_word

{1: 'start',
 2: 'end',
 3: 'you',
 4: 'to',
 5: 'blood',
 6: 'pressure',
 7: 'for',
 8: 'adverse',
 9: 'reaction',
 10: 'provide',
 11: 'drug',
 12: 'please',
 13: 'patient',
 14: 'help',
 15: 'can',
 16: 'i',
 17: 'see',
 18: 'again',
 19: 'thanks',
 20: 'how',
 21: 'module',
 22: 'id',
 23: 'pharmacies',
 24: 'there',
 25: 'hospitals',
 26: 'and',
 27: 'navigating',
 28: 'name',
 29: 'hi',
 30: 'hello',
 31: 'good',
 32: 'bye',
 33: 'hospital',
 34: 'pharmacy',
 35: 'asking',
 36: 'day',
 37: 'nice',
 38: 'time',
 39: 'list',
 40: 'support',
 41: 'a',
 42: 'have',
 43: 'come',
 44: 'back',
 45: 'soon',
 46: 'happy',
 47: 'any',
 48: 'my',
 49: 'pleasure',
 50: 'guide',
 51: 'through',
 52: 'tracking',
 53: 'offering',
 54: 'or',
 55: 'location',
 56: 'me',
 57: 'what',
 58: 'is',
 59: 'helpful',
 60: 'results',
 61: 'drugs',
 62: 'want',
 63: 'search',
 64: 'result',
 65: 'find',
 66: 'are',
 67: 'anyone',
 68: 'hey',
 69: 'hola',
 70: 'later',
 71: 'goodbye',
 72: 'chatting',
 73: 

In [None]:
# Encode input encoder
tokenized_input_encoder = tokenizer.texts_to_sequences(input_encoder)
maxlen_input_encoder = max([len(x) for x in input_encoder ])
padded_input_encoder = tf.keras.preprocessing.sequence.pad_sequences( tokenized_input_encoder, maxlen = maxlen_input_encoder, padding = 'post')
encoder_input_data = np.array(padded_input_encoder)
print(encoder_input_data.shape, maxlen_input_encoder)

(91, 57) 57


In [None]:
# Encode input decoder
tokenized_input_decoder = tokenizer.texts_to_sequences(input_decoder)
maxlen_input_decoder = max([len(x) for x in input_decoder ])
padded_input_decoder = tf.keras.preprocessing.sequence.pad_sequences( tokenized_input_decoder, maxlen = maxlen_input_decoder, padding = 'post')
decoder_input_data = np.array(padded_input_decoder)
print(decoder_input_data.shape, maxlen_input_decoder)

(91, 107) 107


In [None]:
# Encode output decoder
tokenized_output_decoder = tokenizer.texts_to_sequences( output_decoder )
for i in range(len(tokenized_output_decoder)) :
    tokenized_output_decoder[i] = tokenized_output_decoder[i][1:]
padded_output_decoder = tf.keras.preprocessing.sequence.pad_sequences( tokenized_output_decoder , maxlen=maxlen_input_decoder , padding='post' )
onehot_output_decoder = tf.keras.utils.to_categorical( padded_output_decoder , VOCAB_SIZE )
decoder_output_data = np.array( onehot_output_decoder )
print( decoder_output_data.shape )

(91, 107, 112)


In [None]:
decoder_output_data[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
decoder_output_data[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [None]:
# Build Model
encoder_inputs = tf.keras.layers.Input(shape=( maxlen_input_encoder , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( maxlen_input_decoder ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax )
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 57)]                 0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 107)]                0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 57, 200)              22400     ['input_5[0][0]']             
                                                                                                  
 embedding_3 (Embedding)     (None, 107, 200)             22400     ['input_6[0][0]']             
                                                                                            

In [None]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, epochs=100, shuffle=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7c80228a8040>

In [None]:
# Make model for inference
def make_inference_models():

    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))

    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)

    decoder_states = [state_h, state_c]

    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)

    return encoder_model , decoder_model

In [None]:
# Test
import re
def tokenize(sentence):
    # Convert the sentence to lowercase
    sentence = sentence.lower()

    # Remove non-alphabetic characters and replace them with spaces
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Split the sentence into tokens
    tokens = sentence.split()

    return tokens

def str_to_tokens( sentence : str ):


    tokenized_sentence = tokenize(sentence)
    tokens_list = []
    for word in tokenized_sentence:
        tokens_list.append( tokenizer.word_index[ word ])

    return tf.keras.preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_input_encoder , padding='post')


enc_model , dec_model = make_inference_models()

In [None]:
input_texts

['Hi there',
 'Hi there',
 'Hi there',
 'How are you',
 'How are you',
 'How are you',
 'Is anyone there?',
 'Is anyone there?',
 'Is anyone there?',
 'Hey',
 'Hey',
 'Hey',
 'Hola',
 'Hola',
 'Hola',
 'Hello',
 'Hello',
 'Hello',
 'Good day',
 'Good day',
 'Good day',
 'Bye',
 'Bye',
 'Bye',
 'See you later',
 'See you later',
 'See you later',
 'Goodbye',
 'Goodbye',
 'Goodbye',
 'Nice chatting to you, bye',
 'Nice chatting to you, bye',
 'Nice chatting to you, bye',
 'Till next time',
 'Till next time',
 'Till next time',
 'Thanks',
 'Thanks',
 'Thanks',
 'Thank you',
 'Thank you',
 'Thank you',
 "That's helpful",
 "That's helpful",
 "That's helpful",
 'Awesome, thanks',
 'Awesome, thanks',
 'Awesome, thanks',
 'Thanks for helping me',
 'Thanks for helping me',
 'Thanks for helping me',
 'How you could help me?',
 'How you could help me?',
 'What you can do?',
 'What you can do?',
 'What help you provide?',
 'What help you provide?',
 'How you can be helpful?',
 'How you can be help

In [None]:
# Chat

while  True:
    user_input = input()
    if user_input.lower() == "quit":
        break
    states_values = enc_model.predict( str_to_tokens(user_input) , verbose=0)
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    before = None
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values, verbose=0)
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index and word!="end" and word!=before:
                decoded_translation += ' {}'.format( word )
                sampled_word = word


        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_input_encoder or sampled_word == None:
            stop_condition = True
        before = sampled_word

        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    print(f'CHATBOT: {decoded_translation}' )

# **ChatBot with GPT**

In [None]:
# Install transformer package
! pip install transformers




In [None]:
# Load packages
import os
import csv
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import numpy as np

In [None]:
# Download Cornell Movie Dialogs Corpus
! wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip

--2023-10-05 13:50:15--  http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9916637 (9.5M) [application/zip]
Saving to: ‘cornell_movie_dialogs_corpus.zip’


2023-10-05 13:50:16 (14.1 MB/s) - ‘cornell_movie_dialogs_corpus.zip’ saved [9916637/9916637]



In [None]:
# Unzip file
! unzip /content/cornell_movie_dialogs_corpus.zip

Archive:  /content/cornell_movie_dialogs_corpus.zip
   creating: cornell movie-dialogs corpus/
  inflating: cornell movie-dialogs corpus/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/cornell movie-dialogs corpus/
  inflating: __MACOSX/cornell movie-dialogs corpus/._.DS_Store  
  inflating: cornell movie-dialogs corpus/chameleons.pdf  
  inflating: __MACOSX/cornell movie-dialogs corpus/._chameleons.pdf  
  inflating: cornell movie-dialogs corpus/movie_characters_metadata.txt  
  inflating: cornell movie-dialogs corpus/movie_conversations.txt  
  inflating: cornell movie-dialogs corpus/movie_lines.txt  
  inflating: cornell movie-dialogs corpus/movie_titles_metadata.txt  
  inflating: cornell movie-dialogs corpus/raw_script_urls.txt  
  inflating: cornell movie-dialogs corpus/README.txt  
  inflating: __MACOSX/cornell movie-dialogs corpus/._README.txt  


In [None]:
# Load dataset
with open(os.path.join('/content/cornell movie-dialogs corpus',
          'movie_lines.txt'), 'r', encoding='iso-8859-1') as f:
    lines = f.readlines()

In [None]:
# Print the list
lines[:30]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n',
 'L868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".\n',
 'L867 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ What good stuff?\n',
 "L866 +++$+++ u2 +++$+++ m0 +++$+++ CAME

In [None]:
# Select only id and texts
conversations = []
for line in lines:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 5:
                conv_id = parts[0]
                line_text = parts[4]
                conversations.append((conv_id, line_text))
conversations[:30]

[('L1045', 'They do not!'),
 ('L1044', 'They do to!'),
 ('L985', 'I hope so.'),
 ('L984', 'She okay?'),
 ('L925', "Let's go."),
 ('L924', 'Wow'),
 ('L872', "Okay -- you're gonna need to learn how to lie."),
 ('L871', 'No'),
 ('L870',
  'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?'),
 ('L869', 'Like my fear of wearing pastels?'),
 ('L868', 'The "real you".'),
 ('L867', 'What good stuff?'),
 ('L866', "I figured you'd get to the good stuff eventually."),
 ('L865',
  'Thank God!  If I had to hear one more story about your coiffure...'),
 ('L864', "Me.  This endless ...blonde babble. I'm like, boring myself."),
 ('L863', 'What crap?'),
 ('L862', 'do you listen to this crap?'),
 ('L861', 'No...'),
 ('L860',
  'Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."'),
 ('L699', 'You always been this selfish?'),
 ('L698', 'But'),
 ('L697', "Then that's all you had to say."),
 ('L696', 'Well, no...'),


In [None]:
# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  124439808 
 er)                                                             
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 124439808 (474.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Tokenize text in conversation
# Tokenizer use a id (int) to represent each token in the corpus
input_ids = []
output_ids = []
for i in range(len(conversations) - 1):
    input_text = conversations[i][1]
    output_text = conversations[i + 1][1]
    input_tokenized = tokenizer.encode(input_text,
            add_special_tokens=False, truncation=True, max_length=512)
    output_tokenized = tokenizer.encode(output_text,
            add_special_tokens=False, truncation=True, max_length=512)
    input_ids.append(input_tokenized)
    output_ids.append(output_tokenized)

In [None]:
# Show input
input_ids[:10]

[[2990, 466, 407, 0],
 [2990, 466, 284, 0],
 [40, 2911, 523, 13],
 [3347, 8788, 30],
 [5756, 338, 467, 13],
 [22017],
 [16454, 1377, 345, 821, 8066, 761, 284, 2193, 703, 284, 6486, 13],
 [2949],
 [40,
  1101,
  26471,
  13,
  220,
  921,
  760,
  703,
  3360,
  345,
  655,
  1716,
  428,
  366,
  6259,
  64,
  13984,
  220,
  843,
  345,
  836,
  470,
  760,
  703,
  284,
  11238,
  30],
 [7594, 616, 3252, 286, 5762, 1613, 1424, 30]]

In [None]:
# Show output
output_ids[:10]

[[2990, 466, 284, 0],
 [40, 2911, 523, 13],
 [3347, 8788, 30],
 [5756, 338, 467, 13],
 [22017],
 [16454, 1377, 345, 821, 8066, 761, 284, 2193, 703, 284, 6486, 13],
 [2949],
 [40,
  1101,
  26471,
  13,
  220,
  921,
  760,
  703,
  3360,
  345,
  655,
  1716,
  428,
  366,
  6259,
  64,
  13984,
  220,
  843,
  345,
  836,
  470,
  760,
  703,
  284,
  11238,
  30],
 [7594, 616, 3252, 286, 5762, 1613, 1424, 30],
 [464, 366, 5305, 345, 1911]]

In [None]:
# Pad the input/output pairs to the same length
max_length = max(len(ids) for ids in input_ids + output_ids)
input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=max_length, padding='post')
output_ids = tf.keras.preprocessing.sequence.pad_sequences(output_ids,
        maxlen=max_length, padding='post')

In [None]:
a = model.predict(input_ids[:2])



In [None]:
a.keys()

odict_keys(['logits', 'past_key_values'])

In [None]:
a['logits'].shape

(2, 1024, 50257)

In [None]:
tokenizer.vocab_size

50257

In [None]:
max_length

1024

In [None]:
input_ids[:10]

array([[ 2990,   466,   407, ...,     0,     0,     0],
       [ 2990,   466,   284, ...,     0,     0,     0],
       [   40,  2911,   523, ...,     0,     0,     0],
       ...,
       [ 2949,     0,     0, ...,     0,     0,     0],
       [   40,  1101, 26471, ...,     0,     0,     0],
       [ 7594,   616,  3252, ...,     0,     0,     0]], dtype=int32)

In [None]:
input_ids.shape

(304445, 1024)

In [None]:
output_ids[:10]

array([[ 2990,   466,   284, ...,     0,     0,     0],
       [   40,  2911,   523, ...,     0,     0,     0],
       [ 3347,  8788,    30, ...,     0,     0,     0],
       ...,
       [   40,  1101, 26471, ...,     0,     0,     0],
       [ 7594,   616,  3252, ...,     0,     0,     0],
       [  464,   366,  5305, ...,     0,     0,     0]], dtype=int32)

In [None]:
# Define the training parameters
batch_size = 5
epochs = 10

# Create a generator
class CustomDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, input_ids, output_ids, batch_size, shuffle=True):
        self.input_ids = input_ids
        self.output_ids = output_ids
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(input_ids))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        return int(np.ceil(len(self.input_ids) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_input_ids = self.input_ids[batch_indices]
        batch_output_ids = self.output_ids[batch_indices]

        return (batch_input_ids, batch_output_ids)

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

# Create the custom data generator
data_generator = CustomDataGenerator(input_ids, output_ids, batch_size)



In [None]:
# Create an optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Compile the model
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy")



In [None]:
# Train the model
model.fit(data_generator, batch_size=batch_size,epochs=epochs)

# Save the trained model
model.save_pretrained('chatbot_model')

Epoch 1/10


In [None]:
def test():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("chatbot_model")

    while True:
        input_text = input("User: ")
        input_tokenized = tokenizer.encode(input_text, add_special_tokens=False)
        input_ids = tf.keras.preprocessing.sequence.pad_sequences([input_tokenized], maxlen=max_length, padding="post")
        output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print("Bot:", output_text)

# **ChatBot with GPT tiny version**

In [None]:
# Install transformer package
! pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
Insta

In [None]:
# Load packages
import os
import csv
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

# Import packages

import json
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load data
with open('intents.json') as file:
    data = json.load(file)


In [None]:
# Load items
items = []
for intent in data['intents']:
    for patterns in intent['patterns']:
        for responses in intent['responses']:
            items.append((patterns, responses))

items

[('Hi there', 'Hello, thanks for asking'),
 ('Hi there', 'Good to see you again'),
 ('Hi there', 'Hi there, how can I help?'),
 ('How are you', 'Hello, thanks for asking'),
 ('How are you', 'Good to see you again'),
 ('How are you', 'Hi there, how can I help?'),
 ('Is anyone there?', 'Hello, thanks for asking'),
 ('Is anyone there?', 'Good to see you again'),
 ('Is anyone there?', 'Hi there, how can I help?'),
 ('Hey', 'Hello, thanks for asking'),
 ('Hey', 'Good to see you again'),
 ('Hey', 'Hi there, how can I help?'),
 ('Hola', 'Hello, thanks for asking'),
 ('Hola', 'Good to see you again'),
 ('Hola', 'Hi there, how can I help?'),
 ('Hello', 'Hello, thanks for asking'),
 ('Hello', 'Good to see you again'),
 ('Hello', 'Hi there, how can I help?'),
 ('Good day', 'Hello, thanks for asking'),
 ('Good day', 'Good to see you again'),
 ('Good day', 'Hi there, how can I help?'),
 ('Bye', 'See you!'),
 ('Bye', 'Have a nice day'),
 ('Bye', 'Bye! Come back again soon.'),
 ('See you later', 'See

In [None]:
# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  124439808 
 er)                                                             
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 124439808 (474.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Make a test with the pretrained model**

In [None]:
tokenizer.eos_token


'<|endoftext|>'

In [None]:
tokenizer.pad_token = tokenizer.eos_token

text_list = ["i am a man", "the boy is here"]

# Tokenize the list of text
tokenized_texts = tokenizer.batch_encode_plus(
    text_list,
    padding=True,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)

# Access the tokenized inputs and attention masks
input_ids = tokenized_texts["input_ids"]
attention_mask = tokenized_texts["attention_mask"]


In [None]:
input_ids

<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[  72,  716,  257,  582],
       [1169, 2933,  318,  994]], dtype=int32)>

In [None]:
output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Decode and print the generated text
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

i am a man of God.

I am a man of God. I am a man of God. I am a man of God. I am a man of God. I am a man of God. I am a man of God


In [None]:
generated_text = tokenizer.decode(output_ids[1], skip_special_tokens=True)
print(generated_text)

the boy is here, and he's going to be a good kid," said the boy's mother, who asked not to be named.

The boy's mother said she was shocked by the incident.

"I'm shocked. I


In [None]:
# Tokenize text in conversation
# Tokenizer use a id (int) to represent each token in the corpus
input_ids = []
output_ids = []
for item in items:
    input_ids.append(item[0])
    output_ids.append(item[1])

In [None]:
# Show input
input_ids[:10]

['Hi there',
 'Hi there',
 'Hi there',
 'How are you',
 'How are you',
 'How are you',
 'Is anyone there?',
 'Is anyone there?',
 'Is anyone there?',
 'Hey']

In [None]:
# Show output
output_ids[:10]

['Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking']

In [None]:
len(output_ids)

91

In [None]:
# Tokenize the list of text
tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer.batch_encode_plus(
    input_ids,
    padding='max_length',
    max_length=20,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)
input_ids = input_ids["input_ids"]
output_ids = tokenizer.batch_encode_plus(
    output_ids,
    padding='max_length',
    max_length=20,
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)
output_ids = output_ids["input_ids"]

In [None]:
tokenizer.vocab_size

50257

In [None]:
input_ids[:10]

<tf.Tensor: shape=(10, 20), dtype=int32, numpy=
array([[17250,   612, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [ 2437,   389,   345, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [ 2437,   389,   345, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [ 2437,   389,   345, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [None]:
input_ids.shape

TensorShape([91, 20])

In [None]:
output_ids[:10]

<tf.Tensor: shape=(10, 20), dtype=int32, numpy=
array([[15496,    11,  5176,   329,  4737, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [10248,   284,   766,   345,   757, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612,    11,   703,   460,   314,  1037,    30, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [15496,    11,  5176,   329,  4737, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [10248,   284,   766,   345,   757, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612,    11,   703,   460,   314,  1037,    30, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [None]:
# Define the training parameters
batch_size = 16

# Create a generator
class CustomDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, input_ids, output_ids, batch_size, shuffle=True):
        self.input_ids = input_ids
        self.output_ids = output_ids
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(input_ids))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        return int(np.ceil(len(self.input_ids) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_input_ids = self.input_ids[batch_indices]
        batch_output_ids = self.output_ids[batch_indices]

        return (batch_input_ids, batch_output_ids)

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

# Create the custom data generator
data_generator = CustomDataGenerator(input_ids, output_ids, batch_size)



In [None]:
# Create an optimizer
num_epochs=5
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(
    5e-4,
    decay_steps=len(data_generator) * num_epochs,
    end_learning_rate=0.0,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

In [None]:
# Train the model
model.fit(input_ids, output_ids, batch_size=batch_size,epochs=num_epochs)

# Save the trained model
model.save_pretrained('chatbot_model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
a = tokenizer.batch_encode_plus(
    ["hello"],
    padding=True,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)['input_ids']

In [None]:
output_ids = model.generate(a, max_length=50, pad_token_id=tokenizer.eos_token_id, no_repeat_ngram_size=2)

In [None]:
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'how are you can provide you provide Adverseverse'

In [None]:
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side="left")
#model = TFGPT2LMHeadModel.from_pretrained("chatbot_model")
#tokenizer.pad_token = tokenizer.eos_token

def test():
    while True:
        input_text = input("User: ")
        input_tokenized =  tokenizer.batch_encode_plus(
                [input_text],
                padding=True,        # Pad the sequences to the same length
                truncation=True,     # Truncate sequences to a maximum length
                return_tensors="tf"  # Return tf
        )
        input_ids = input_tokenized['input_ids']

        # Calculate the maximum length for generation based on the length of input_ids


        output_ids = model.generate(input_ids, max_length=50, pad_token_id=tokenizer.eos_token_id, no_repeat_ngram_size=2)

        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print("Bot:", output_text)


In [None]:
test()

User: hello
Bot: helloHello
User: Hi there
Bot: Hi there to provide you provide Adverseverse
User: how are you
Bot: how are you can provide you provide Adverseverse
User: no thanks
Bot: no thanks!!
User: how you could help me
Bot: how you could help me provide Adverseverse Ad Ad


KeyboardInterrupt: ignored

https://mrmaheshrajput.medium.com/how-to-build-an-intelligent-qa-chatbot-on-your-data-with-llm-or-chatgpt-d0009d256dce

# **GPT2 Text Generation**

In [None]:
!pip install git+https://github.com/keras-team/keras-nlp.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.7/440.7 kB[0m [31m35.4 MB/s[0m eta [36m0

In [None]:
import os
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"
import keras_nlp
import tensorflow as tf
import keras_core as keras
import time

Using JAX backend.


In [None]:
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [None]:
# Generate a text
output = gpt2_lm.generate("The goal of apple company", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
The goal of apple company, Inc. was to make a better apple.

"I don't know how you would describe it, but the apple is the best apple we ever tasted, so we're very pleased with it," said Steve Hirsch, Apple's president and CEO. "It is one of the best apples ever."

The company is now working with the U.S. Department of Agriculture to develop and sell the apple.

The company is currently testing its product on apples grown by farmers in the Midwest, and is also working with the U.S. Department of Agriculture to test its products on apples grown in the Midwest.

Apple's apple has been the focus of controversy since its introduction in 2007 and its reputation has been tarnished by its poor quality.

Apple's reputation has been tarnished by a series of lawsuits that it has filed against the U.S. government, which is investigating the company for fraud.

Apple is


In [None]:
# Generate a text
output = gpt2_lm.generate("steave jobs", max_length=200)
print("\nGPT-2 output:")
print(output)

## Fine-tune GPT2

In [None]:
import tensorflow_datasets as tfds

reddit_ds = tfds.load("reddit_tifu", split="train", as_supervised=True)

Downloading and preparing dataset 639.54 MiB (download: 639.54 MiB, generated: 141.46 MiB, total: 781.00 MiB) to /root/tensorflow_datasets/reddit_tifu/short/1.1.2...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/79740 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/reddit_tifu/short/1.1.2.incompleteXTPWMX/reddit_tifu-train.tfrecord*...:  …

Dataset reddit_tifu downloaded and prepared to /root/tensorflow_datasets/reddit_tifu/short/1.1.2. Subsequent calls will reuse this data.


In [None]:
reddit_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>

In [None]:
for document, title in reddit_ds:
    print(document.numpy())
    print(title.numpy())
    break

b"me and a friend decided to go to the beach last sunday. we loaded up and headed out. we were about half way there when i decided that i was not leaving till i had seafood. \n\nnow i'm not talking about red lobster. no friends i'm talking about a low country boil. i found the restaurant and got directions. i don't know if any of you have heard about the crab shack on tybee island but let me tell you it's worth it. \n\nwe arrived and was seated quickly. we decided to get a seafood sampler for two and split it. the waitress bought it out on separate platters for us. the amount of food was staggering. two types of crab, shrimp, mussels, crawfish, andouille sausage, red potatoes, and corn on the cob. i managed to finish it and some of my friends crawfish and mussels. it was a day to be a fat ass. we finished paid for our food and headed to the beach. \n\nfunny thing about seafood. it runs through me faster than a kenyan \n\nwe arrived and walked around a bit. it was about 45min since we a

In [None]:
train_ds = (
    reddit_ds.map(lambda document, _: document)
    .batch(32)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
train_ds = train_ds.take(500)
num_epochs = 1

# Linearly decaying learning rate.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs)

In [None]:
output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

In [None]:
# SEARCH FOR TOP-K
# Use a string identifier.
gpt2_lm.compile(sampler="top_k")
output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

# Use a `Sampler` instance. `GreedySampler` tends to repeat itself,
greedy_sampler = keras_nlp.samplers.GreedySampler()
gpt2_lm.compile(sampler=greedy_sampler)

output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

## Finetune on Chinese Poem Dataset

In [None]:
!git clone https://github.com/chinese-poetry/chinese-poetry.git

In [None]:
import os
import json

poem_collection = []
for file in os.listdir("chinese-poetry/全唐诗"):
    if ".json" not in file or "poet" not in file:
        continue
    full_filename = "%s/%s" % ("chinese-poetry/全唐诗", file)
    with open(full_filename, "r") as f:
        content = json.load(f)
        poem_collection.extend(content)

paragraphs = ["".join(data["paragraphs"]) for data in poem_collection]

In [None]:
print(paragraphs[0])

In [None]:
train_ds = (
    tf.data.Dataset.from_tensor_slices(paragraphs)
    .batch(16)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

# Running through the whole dataset takes long, only take `500` and run 1
# epochs for demo purposes.
train_ds = train_ds.take(500)
num_epochs = 1

learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-4,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs)

In [None]:
output = gpt2_lm.generate("昨夜雨疏风骤", max_length=200)
print(output)

# **Question Answering**

https://huggingface.co/docs/transformers/tasks/question_answering

In [None]:
! pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 

In [None]:
from datasets import load_dataset
squad = load_dataset("squad", split="train[:5000]")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [None]:
squad = squad.train_test_split(test_size=0.2)

In [None]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [None]:
squad["train"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 4000
})

In [None]:
squad["train"][0]

{'id': '5733ae924776f41900661016',
 'title': 'University_of_Notre_Dame',
 'context': 'Notre Dame is known for its competitive admissions, with the incoming class enrolling in fall 2015 admitting 3,577 from a pool of 18,156 (19.7%). The academic profile of the enrolled class continues to rate among the top 10 to 15 in the nation for national research universities. The university practices a non-restrictive early action policy that allows admitted students to consider admission to Notre Dame as well as any other colleges to which they were accepted. 1,400 of the 3,577 (39.1%) were admitted under the early action plan. Admitted students came from 1,311 high schools and the average student traveled more than 750 miles to Notre Dame, making it arguably the most representative university in the United States. While all entering students begin in the College of the First Year of Studies, 25% have indicated they plan to study in the liberal arts or social sciences, 24% in engineering, 24% in b

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1000
    })
})

In [None]:
tokenized_squad["train"]["input_ids"][0]

[101,
 2054,
 7017,
 1997,
 2493,
 2012,
 10289,
 8214,
 4194,
 1999,
 1996,
 2220,
 2895,
 2565,
 1029,
 102,
 10289,
 8214,
 2003,
 2124,
 2005,
 2049,
 6975,
 20247,
 1010,
 2007,
 1996,
 14932,
 2465,
 25612,
 2075,
 1999,
 2991,
 2325,
 17927,
 1017,
 1010,
 5401,
 2581,
 2013,
 1037,
 4770,
 1997,
 2324,
 1010,
 16734,
 1006,
 2539,
 1012,
 1021,
 1003,
 1007,
 1012,
 1996,
 3834,
 6337,
 1997,
 1996,
 8302,
 2465,
 4247,
 2000,
 3446,
 2426,
 1996,
 2327,
 2184,
 2000,
 2321,
 1999,
 1996,
 3842,
 2005,
 2120,
 2470,
 5534,
 1012,
 1996,
 2118,
 6078,
 1037,
 2512,
 1011,
 25986,
 2220,
 2895,
 3343,
 2008,
 4473,
 4914,
 2493,
 2000,
 5136,
 9634,
 2000,
 10289,
 8214,
 2004,
 2092,
 2004,
 2151,
 2060,
 6667,
 2000,
 2029,
 2027,
 2020,
 3970,
 1012,
 1015,
 1010,
 4278,
 1997,
 1996,
 1017,
 1010,
 5401,
 2581,
 1006,
 4464,
 1012,
 1015,
 1003,
 1007,
 2020,
 4914,
 2104,
 1996,
 2220,
 2895,
 2933,
 1012,
 4914,
 2493,
 2234,
 2013,
 1015,
 1010,
 23532,
 2152,
 2816,
 1998

In [None]:
# create batch
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
from transformers import create_optimizer

batch_size = 16
num_epochs = 2
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
from transformers import TFAutoModelForQuestionAnswering
model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

In [None]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
import tensorflow as tf
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(
    output_dir="my_model",
    tokenizer=tokenizer,
)

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a20303b2320>

In [None]:
# Save the trained model
model.save_pretrained('qa_model')

In [None]:
tokenizer.save_pretrained("qa_tokenizer")

('qa_tokenizer/tokenizer_config.json',
 'qa_tokenizer/special_tokens_map.json',
 'qa_tokenizer/vocab.txt',
 'qa_tokenizer/added_tokens.json',
 'qa_tokenizer/tokenizer.json')

In [None]:
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.pipelines")
question_answerer = pipeline("question-answering", model="qa_model", tokenizer="qa_tokenizer")

Some layers from the model checkpoint at qa_model were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_199']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at qa_model and are newly initialized: ['dropout_219']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or

In [None]:
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.pipelines")
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [None]:
question = "What is the capitale of France?"
context = "The capitale of french is Paris, but he capitale of Spain is Madrid."
question_answerer(question=question, context=context)

{'score': 0.1956658810377121, 'start': 26, 'end': 31, 'answer': 'Paris'}

In [None]:

question = "Where is the Eiffel Tower located?"
context = "The Eiffel Tower is one of the most iconic landmarks in the world. Located in Paris, France, it was completed in 1889 and stands at a height of 324 meters. Designed by engineer Gustave Eiffel, the tower was initially met with mixed reactions, but it has since become a symbol of France and a popular tourist attraction."
question_answerer(question=question, context=context)

{'score': 0.261226087808609, 'start': 78, 'end': 91, 'answer': 'Paris, France'}

In [None]:

question = "who designed it?"
context = "The Eiffel Tower is one of the most iconic landmarks in the world. Located in Paris, France, it was completed in 1889 and stands at a height of 324 meters. Designed by engineer Gustave Eiffel, the tower was initially met with mixed reactions, but it has since become a symbol of France and a popular tourist attraction."
question_answerer(question=question, context=context)

{'score': 0.16046275198459625,
 'start': 177,
 'end': 191,
 'answer': 'Gustave Eiffel'}

In [None]:
question="In which year was the Eiffel Tower created?"
context = "The Eiffel Tower is one of the most iconic landmarks in the world. Located in Paris, France, it was completed in 1889 and stands at a height of 324 meters. Designed by engineer Gustave Eiffel, the tower was initially met with mixed reactions, but it has since become a symbol of France and a popular tourist attraction."
question_answerer(question=question, context=context)

{'score': 0.328779011964798, 'start': 113, 'end': 117, 'answer': '1889'}