# **Custom Chatbot**

## **Packages**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.layers import Dense
import json
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import unicodedata
from sklearn.model_selection import train_test_split

## **Load datasets**

In [None]:
# Parse the tab-separated dialog file: column 0 is the prompt, column 1 the reply.
question = []
answer = []
# Explicit encoding so the result does not depend on the platform's default locale.
with open("dialogs.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.split('\t')
        # Skip blank or malformed lines that have no reply column instead of
        # crashing with IndexError.
        if len(fields) < 2:
            continue
        question.append(fields[0])
        # strip() removes the trailing '\n'
        answer.append(fields[1].strip())

In [None]:
len(question)

3725

In [None]:
len(answer)

3725

In [None]:
question[:10]

['hi, how are you doing?',
 "i'm fine. how about yourself?",
 "i'm pretty good. thanks for asking.",
 'no problem. so how have you been?',
 "i've been great. what about you?",
 "i've been good. i'm in school right now.",
 'what school do you go to?',
 'i go to pcc.',
 'do you like it there?',
 "it's okay. it's a really big campus."]

In [None]:
answer[:10]

["i'm fine. how about yourself?",
 "i'm pretty good. thanks for asking.",
 'no problem. so how have you been?',
 "i've been great. what about you?",
 "i've been good. i'm in school right now.",
 'what school do you go to?',
 'i go to pcc.',
 'do you like it there?',
 "it's okay. it's a really big campus.",
 'good luck with school.']

## **Transform data to pandas dataframe**

In [None]:
# Pair each prompt with its reply in a two-column DataFrame and preview it.
data = pd.DataFrame(dict(question=question, answer=answer))
data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


## **Pre-processing the dataset**

In [None]:
def unicode_to_ascii(s):
    """Strip diacritics from the string ``s``.

    The text is decomposed with NFD normalization so that accents become
    separate combining characters; every character whose Unicode category
    is 'Mn' (non-spacing mark) is then dropped. All other characters are
    kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


In [None]:
def clean_text(text):
    """Normalize a raw utterance and wrap it in "<sos>" / "<eos>" markers.

    The text is lower-cased, stripped and passed through unicode_to_ascii;
    common English contractions are expanded; punctuation and words that
    contain digits are removed; finally the start/end tokens are added.

    Args:
        text: Raw sentence as a string.

    Returns:
        The cleaned sentence in the form "<sos> ... <eos>".
    """
    text = unicode_to_ascii(text.lower().strip())

    # Drop carriage returns.
    text = re.sub(r"\r", "", text)

    # Expand "to be" contractions.
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    # Fixed: "what's" used to be expanded to "that is".
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)

    # Expand generic suffix contractions (the duplicated 're rule was removed).
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)

    # Irregular negations must run before the generic "n't" rule.
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)

    # Restore a dropped final g ("goin'" -> "going"). The lookahead keeps
    # possessives such as "john's" from being turned into "johngs".
    text = re.sub(r"n'(?!\w)", "ng", text)

    # Expand clipped words.
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)

    # Delete all ASCII punctuation in one pass. string.punctuation already
    # contains every character the former explicit character class listed,
    # so that separate regex was redundant.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Replace any remaining non-word character with a space.
    text = re.sub(r"\W", " ", text)

    # Remove words containing digits (raw string avoids invalid-escape warnings).
    text = re.sub(r"\S*\d\S*\s*", "", text)

    # Add start and end tokens.
    return "<sos> " + text + " <eos>"


In [None]:
# Normalize every question in place, then preview the first rows.
data["question"] = data["question"].apply(clean_text)
data["question"].head()

0                  <sos> hi how are you doing <eos>
1          <sos> i am fine how about yourself <eos>
2    <sos> i am pretty good thanks for asking <eos>
3       <sos> no problem so how have you been <eos>
4      <sos> i have been great what about you <eos>
Name: question, dtype: object

In [None]:
# Normalize every answer in place, then preview the first rows.
data["answer"] = data["answer"].apply(clean_text)
data["answer"].head()

0             <sos> i am fine how about yourself <eos>
1       <sos> i am pretty good thanks for asking <eos>
2          <sos> no problem so how have you been <eos>
3         <sos> i have been great what about you <eos>
4    <sos> i have been good i am in school right no...
Name: answer, dtype: object

In [None]:
# Back to plain Python lists of cleaned strings for the tokenizer.
question = data["question"].tolist()
answer = data["answer"].tolist()

## **Tokenize the data**

In [None]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and return padded id sequences.

    Args:
        lang: List of sentences (strings).

    Returns:
        A tuple ``(tensor, lang_tokenizer)``: the integer sequences
        zero-padded at the end to a common length, and the fitted Tokenizer.
    """
    # filters='' disables character filtering, so "<sos>"/"<eos>" stay intact.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, lang_tokenizer

In [None]:
# Vectorize the cleaned questions; inp_lang is the tokenizer fitted on them.
input_tensor , inp_lang  =  tokenize(question)

In [None]:
# Vectorize the cleaned answers; targ_lang is the tokenizer fitted on them.
target_tensor , targ_lang  =  tokenize(answer)

In [None]:
# Padded sequence lengths: the second axis of each padded array.
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [None]:
max_length_targ

22

In [None]:
max_length_inp

22

In [None]:
# Hold out 20% of the pairs for validation. A fixed random_state makes the
# split (and therefore the training run) reproducible across executions.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

## **Make batch**

In [None]:
BUFFER_SIZE = len(input_tensor_train)  # Shuffle buffer as large as the whole training set.
BATCH_SIZE = 64  # Number of examples per training batch.
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE  # Full batches per epoch (the remainder is dropped below).
embedding_dim = 256  # Size of each word-embedding vector.
units = 1024  # Hidden size passed to the encoder and decoder GRU layers.
vocab_inp_size = len(inp_lang.word_index) + 1  # +1 leaves room for id 0, which the padding uses.
vocab_tar_size = len(targ_lang.word_index) + 1  # +1 leaves room for id 0, which the padding uses.

# Build a dataset of (input, target) pairs and shuffle it with a buffer of BUFFER_SIZE.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)

# Group into batches of BATCH_SIZE; drop_remainder=True discards a final short batch
# so every batch has exactly BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Pull one batch to use as sample input in the shape checks further down.
example_input_batch, example_target_batch = next(iter(dataset))

# Print the shapes of the example input and target batches.
print("Example Input Batch Shape:", example_input_batch.shape)
print("Example Target Batch Shape:", example_target_batch.shape)

Example Input Batch Shape: (64, 22)
Example Target Batch Shape: (64, 22)


## **Create our Encoder**

In [None]:
# Define a custom class called "Encoder" that inherits from the tf.keras.Model class.
class Encoder(tf.keras.Model):
    """GRU encoder: embeds token ids and runs them through a single GRU layer."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        # Kept so initialize_hidden_state can build a matching zero state.
        self.batch_sz = batch_sz
        self.enc_units = enc_units

        # Token id -> dense vector of size embedding_dim.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # return_sequences / return_state: emit the output at every time step
        # as well as the final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode ``x`` starting from ``hidden``.

        Returns:
            A tuple ``(output, state)``: the per-step GRU outputs and the
            final hidden state.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero hidden state of shape (batch_sz, enc_units)."""
        return tf.zeros((self.batch_sz, self.enc_units))


In [None]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Shape check: push the example batch through the encoder from a zero state.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print(f'Encoder output shape: (batch size, sequence length, units) {sample_output.shape}')
print(f'Encoder Hidden state shape: (batch size, units) {sample_hidden.shape}')

Encoder output shape: (batch size, sequence length, units) (64, 22, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


## **Create attention layer**

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention over a sequence of encoder outputs."""

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)  # projects the query
        self.W2 = tf.keras.layers.Dense(units)  # projects the values
        self.V = tf.keras.layers.Dense(1)       # reduces each position to one score

    def call(self, query, values):
        """Compute the attention context for ``query`` over ``values``.

        Args:
            query: Hidden state, shape (batch_size, hidden size).
            values: Encoder outputs, shape (batch_size, max_len, hidden size).

        Returns:
            A tuple ``(context_vector, attention_weights)`` with shapes
            (batch_size, hidden_size) and (batch_size, max_length, 1).
        """
        # (batch_size, hidden size) -> (batch_size, 1, hidden size) so the
        # addition below broadcasts along the time axis.
        expanded_query = tf.expand_dims(query, 1)

        # Before self.V: (batch_size, max_length, units); after: (batch_size, max_length, 1).
        projected = self.W1(expanded_query) + self.W2(values)
        score = self.V(tf.nn.tanh(projected))

        # Normalize the scores across the time axis.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values over time -> (batch_size, hidden_size).
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [None]:
# Shape check for the attention layer using the encoder's sample outputs.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print(f"Attention result shape: (batch size, units) {attention_result.shape}")
print(f"Attention weights shape: (batch_size, sequence_length, 1) {attention_weights.shape}")

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 22, 1)


## **Create Decoder**

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        # Projects each GRU output to one score per vocabulary entry.
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Current input token ids, shape (batch_size, 1).
            hidden: State used as the attention query.
            enc_output: Encoder outputs, shape (batch_size, max_length, hidden_size).

        Returns:
            A tuple ``(x, state, attention_weights)`` where ``x`` has shape
            (batch_size, vocab) and ``state`` is the GRU's final state.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # (batch_size, 1) -> (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # Prepend the context: (batch_size, 1, embedding_dim + hidden_size)
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE(review): `hidden` only feeds the attention; the GRU is called
        # without initial_state. Confirm this is intended.
        output, state = self.gru(gru_input)

        # (batch_size, 1, hidden_size) -> (batch_size * 1, hidden_size)
        flattened = tf.reshape(output, (-1, output.shape[2]))

        # (batch_size, vocab)
        scores = self.fc(flattened)

        return scores, state, attention_weights

In [None]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Shape check with one step of dummy input. The Embedding layer looks tokens
# up by integer id, so sample valid integer ids instead of floats in [0, 1).
sample_decoder_output, _, _ = decoder(
    tf.random.uniform((BATCH_SIZE, 1), maxval=vocab_tar_size, dtype=tf.int32),
    sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 2347)


## **Create optimizer**

In [None]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer has no activation.
# reduction='none' keeps one loss value per example so loss_function can
# mask padded positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Return the mean cross-entropy with padding positions zeroed out.

    Args:
        real: Target token ids for one time step; id 0 marks padding.
        pred: Predicted scores for the same step.

    Returns:
        Scalar mean of the per-example losses. Padded entries contribute
        zero but still count in the mean's denominator.
    """
    per_example = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    not_padding = tf.math.not_equal(real, 0)
    per_example *= tf.cast(not_padding, dtype=per_example.dtype)

    return tf.reduce_mean(per_example)

## **Training function**

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimization step on a single batch.

    Relies on the module-level encoder, decoder, optimizer, targ_lang and
    BATCH_SIZE.

    Args:
        inp: Batch of input token ids passed to the encoder.
        targ: Batch of target token ids; column t holds the token at step t.
        enc_hidden: Initial hidden state for the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden

        # First decoder input is '<sos>' for every row. This hard-codes
        # BATCH_SIZE rows, so each batch must contain exactly that many.
        dec_input = tf.expand_dims([targ_lang.word_index['<sos>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input.
        # Starts at 1: column 0 is the '<sos>' token already fed above.
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Length-normalized value, used for reporting only.
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    # Gradients are taken of the summed loss, not of batch_loss.
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

## **Train the model**

In [None]:
EPOCHS = 40

for epoch in range(1, EPOCHS + 1):
    # The same all-zero encoder state is passed to every batch of the epoch.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    # Report the mean batch loss every fourth epoch.
    if epoch % 4 == 0:
        mean_loss = total_loss / steps_per_epoch
        print(f'Epoch:{epoch:3d} Loss:{mean_loss:.4f}')

Epoch:  4 Loss:1.5763
Epoch:  8 Loss:1.3437
Epoch: 12 Loss:1.1768
Epoch: 16 Loss:1.0126
Epoch: 20 Loss:0.8451
Epoch: 24 Loss:0.6584
Epoch: 28 Loss:0.4661
Epoch: 32 Loss:0.2697
Epoch: 36 Loss:0.1305
Epoch: 40 Loss:0.0562


## **Evaluate the model**

In [None]:
def remove_tags(sentence, start_token="<sos>", end_token="<eos>"):
    """Strip the sequence markers from ``sentence``.

    clean_text wraps text in "<sos>" / "<eos>", so those are the defaults;
    the previous hard-coded "<start>" / "<end>" never matched and left the
    markers in the output.

    Args:
        sentence: Text that may contain the start and/or end token.
        start_token: Marker preceding the payload.
        end_token: Marker following the payload.

    Returns:
        The text between the last start token and the next end token, with
        surrounding whitespace removed. Text without markers is returned
        stripped but otherwise unchanged.
    """
    payload = sentence.split(start_token)[-1].split(end_token)[0]
    return payload.strip()

In [None]:
def evaluate(sentence):
    """Greedily decode a reply for ``sentence`` with the trained encoder/decoder.

    Args:
        sentence: Raw user text; it is normalized with clean_text first.

    Returns:
        A tuple ``(result, sentence)``: the predicted reply and the cleaned
        input, both passed through remove_tags.
    """
    sentence = clean_text(sentence)

    # split() without an argument ignores repeated spaces (which would give
    # empty tokens), and words unseen during training are skipped instead of
    # raising KeyError.
    inputs = [inp_lang.word_index[word] for word in sentence.split()
              if word in inp_lang.word_index]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    predicted_words = []

    # Batch of one, starting from an all-zero encoder state.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<sos>']], 0)

    for _step in range(max_length_targ):
        predictions, dec_hidden, _attention = decoder(dec_input,
                                                      dec_hidden,
                                                      enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()

        # Id 0 is the padding slot and has no entry in index_word; treat it
        # as the end of the reply rather than raising KeyError.
        word = targ_lang.index_word.get(predicted_id)
        if word is None:
            break

        predicted_words.append(word)
        if word == '<eos>':
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    # Same layout as before: every word followed by a single space.
    result = ''.join(word + ' ' for word in predicted_words)

    return remove_tags(result), remove_tags(sentence)

In [None]:
# Re-read the raw (uncleaned) dialog pairs to use as test prompts.
questions = []
answers = []
# Explicit encoding so the result does not depend on the platform's default locale.
with open("dialogs.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.split('\t')
        # Skip blank or malformed lines that have no reply column instead of
        # crashing with IndexError.
        if len(fields) < 2:
            continue
        questions.append(fields[0])
        answers.append(fields[1].strip())

In [None]:
def ask(sentence):
    """Print the processed question and the model's predicted answer."""
    predicted, processed = evaluate(sentence)

    print(f'Question: {processed}')
    print(f'Predicted answer: {predicted}')

In [None]:
questions[12]

"i'm doing well. how about you?"

In [None]:
ask(questions[12])

Question: <sos> i am doing well how about you <eos>
Predicted answer: never better thanks <eos> 


In [None]:
questions[100]

'i believe so.'

In [None]:
ask(questions[100])

Question: <sos> i believe so <eos>
Predicted answer: good i hope it does not cool off this weekend <eos> 


In [None]:
ask('Hello how are you!')

Question: <sos> hello how are you <eos>
Predicted answer: i am so full i am going to burst <eos> 


# **GPT2 Text Generation**

In [None]:
# WARNING: recursively deletes every non-hidden file and directory in the
# current working directory, including the dialogs.txt read by earlier cells.
# Run only in a disposable environment.
! rm -r *

In [None]:
!pip install git+https://github.com/keras-team/keras-nlp.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import os
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"
import keras_nlp
import tensorflow as tf
import keras_core as keras
import time

Using JAX backend.


In [None]:
# Load the "gpt2_base_en" preset: a preprocessor configured with
# sequence_length=128, and the causal language model that uses it.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

## **GPT2 Demo**

In [None]:
# Generate a continuation of the prompt; max_length=200 bounds the output length.
output = gpt2_lm.generate("The goal of apple company", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
The goal of apple company, Inc. was to make a better apple.

"I don't know how you would describe it, but the apple is the best apple we ever tasted, so we're very pleased with it," said Steve Hirsch, Apple's president and CEO. "It is one of the best apples ever."

The company is now working with the U.S. Department of Agriculture to develop and sell the apple.

The company is currently testing its product on apples grown by farmers in the Midwest, and is also working with the U.S. Department of Agriculture to test its products on apples grown in the Midwest.

Apple's apple has been the focus of controversy since its introduction in 2007 and its reputation has been tarnished by its poor quality.

Apple's reputation has been tarnished by a series of lawsuits that it has filed against the U.S. government, which is investigating the company for fraud.

Apple is


In [None]:
# Generate a continuation of the prompt; max_length=200 bounds the output length.
output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
I like basketball so much that I don't think I can be a part of it. I'm a good basketball player, and I love basketball. I'm going to be a great basketball player, so that's all I can do.

I don't have to be a great basketball player to do this kind of stuff, but I'm going to get to be an awesome basketball player, because that's my job.

It's been a long time coming since I was a kid. I was in the NBA at 16 and I was playing for the New Jersey Nets. And I was just a kid. I was playing against my dad, my brother. And I was just like, "What are you doing? What is this going to be like?" I was like, "I'm just going to go play. I'm going to do this."

I'm going to do that. I'm going to go out and play, and then I'll be like


In [None]:
# Generate a continuation of the prompt; max_length=200 bounds the output length.
output = gpt2_lm.generate("What is basketball?", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
What is basketball?

It is a sport that involves two teams competing in an event, usually in a series of matches, with the goal of reaching a championship game. The NBA, on the other hand, has two teams competing in an event, usually at the same time, which means that each team has its own unique game plan.

The NBA's two teams have different rules for the game, but it is generally agreed that the teams are to play in the same game, but the rules are different depending on the game. The rules for basketball are:

Each team has its own rules for a particular game

Each team plays the same game, but has its own rules for the next game

Each team plays a particular game, but has its own rules for the next game

The game rules for basketball have a number of variations. For example, if the team plays a game that is called "Finals," the other team will play "Final


In [None]:
# Generate a continuation of the prompt; max_length=200 bounds the output length.
output = gpt2_lm.generate("What is football?", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
What is football? Football is about winning, winning, winning. Winning is winning, winning, winning.

The game is about winning. Winning is winning – win. Winning is winning. Winning is winning.

It's not a game of football. Football is a game of winning. Winning is winning. Winning is winning, win. Win.

The game is about winning. Winning is win, win. Winning is winning, win.

And so we are.

We are winning. We are winning. We are winning.

We are not just winning – we are winning. We are winning. We are winning.

We are winning. We are winning. We are winning.

We are winning. We are winning.

We are winning. We are winning.

We are winning. We are winning.

It's not a game of football. Football is a game of winning. Winning is winning – win


In [None]:
# Generate a continuation of the prompt; max_length=200 bounds the output length.
output = gpt2_lm.generate("What is Machine Learning?", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
What is Machine Learning?

Machine learning is one of the key concepts in Artificial Intelligence. It is used to solve problems in a way that is not possible with conventional AI. It is also one of the most widely used concepts in Artificial Intelligence. Machine Learning is a set of concepts which are often used interchangeably by many different companies. Machine learning is an approach to solving problems in a way that is not possible with conventional AI. This is the key to the development and application of Machine Learning techniques.

Why does Machine Learning Matter?

Machine Learning is an approach to solving problems in a way that is not possible with traditional AI. It is also one of the most widely used concepts in Artificial Intelligence. Machine Learning is an approach to solving problems in a way that is not possible with traditional AI.

Why Does Machine Learning Matter?

Machine Learning is one of the key concepts in Artificial Intelligence. It is used t

## **Fine-tune GPT2 on the Reddit TIFU dataset https://www.tensorflow.org/datasets/catalog/reddit_tifu**

Now that you know how to use the GPT-2 model from KerasNLP, you can go one step further and fine-tune it so that it generates text in a specific style: short or long, strict or casual. In this tutorial, we use the Reddit TIFU dataset as an example.

In [None]:
import tensorflow_datasets as tfds
# Load the train split of "reddit_tifu"; with as_supervised=True each element
# is a 2-tuple, unpacked below as (document, title).
reddit_ds = tfds.load("reddit_tifu", split="train", as_supervised=True)

In [None]:
reddit_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>

In [None]:
# Peek at the first (document, title) pair only.
for document, title in reddit_ds.take(1):
    print(document.numpy())
    print(title.numpy())

b"me and a friend decided to go to the beach last sunday. we loaded up and headed out. we were about half way there when i decided that i was not leaving till i had seafood. \n\nnow i'm not talking about red lobster. no friends i'm talking about a low country boil. i found the restaurant and got directions. i don't know if any of you have heard about the crab shack on tybee island but let me tell you it's worth it. \n\nwe arrived and was seated quickly. we decided to get a seafood sampler for two and split it. the waitress bought it out on separate platters for us. the amount of food was staggering. two types of crab, shrimp, mussels, crawfish, andouille sausage, red potatoes, and corn on the cob. i managed to finish it and some of my friends crawfish and mussels. it was a day to be a fat ass. we finished paid for our food and headed to the beach. \n\nfunny thing about seafood. it runs through me faster than a kenyan \n\nwe arrived and walked around a bit. it was about 45min since we a

In [None]:
# Keep only the document text (the second tuple element is discarded), then
# batch 32 documents at a time, cache the batches and prefetch.
train_ds = (
    reddit_ds.map(lambda document, _: document)
    .batch(32)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Train on at most 500 batches for a single epoch.
train_ds = train_ds.take(500)
num_epochs = 1

# Linearly decaying learning rate: from 5e-5 down to 0.0 over every
# training step (batches per epoch * number of epochs).
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
# from_logits=True: the loss is given unnormalized scores.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs)

[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 1s/step - accuracy: 0.3190 - loss: 3.3643


<keras_core.src.callbacks.history.History at 0x7d80b4abb790>

In [None]:
# Same prompt as before fine-tuning, to compare the generated style.
output = gpt2_lm.generate("What is basketball?", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
What is basketball?

let's start with the basics. basketball is a basketball game where you play the ball, which means that you can shoot it, and you can shoot it. 

you have to score a basket, and you can't shoot the ball, so you're basically shooting the ball. 

you shoot it, you shoot it and you get the basket, and you score a free throw, and then you get the basket back, and then you get the basket back, and you get the ball back, and you get the free throw back, and then you get the free throw back and you get the basket back, and then you get the free throw


In [None]:
# Top-k sampling: select the sampler by its string identifier.
gpt2_lm.compile(sampler="top_k")
output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

# Greedy search: pass a `Sampler` instance instead of a string.
# `GreedySampler` tends to repeat itself, as the second output below shows.
greedy_sampler = keras_nlp.samplers.GreedySampler()
gpt2_lm.compile(sampler=greedy_sampler)

output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
I like basketball and it's a great sport and it's a great opportunity for me to be part of the team and be involved in the program.

the other day a guy was talking about how he wanted to play a basketball game with his family. he had a friend who was in the team and wanted to play a basketball game with his family so he asked the friend for a game.

"what's your favorite basketball game?"

"that's what i play."

"you're not a bad person!"

"you're not."

"i don't play basketball."

"well

GPT-2 output:
I like basketball, but i don't like to play it. 

so i was playing basketball at my local high school, and i was playing with my friends. 

i was playing with my friends, and one of them was a girl. 

she was a girl, and she was a pretty good basketball player. 

so i was playing with my friends, and one of them was a girl. 

she was a girl, and she was a pretty good basketball player. 

so i was playing with my friends, and one of them was a girl. 

so i was playing with

**Don't forget to save the fine-tuned model for future tests**

## **Finetune on Chinese Poem Dataset**

In [None]:
# Load the "gpt2_base_en" preset again and rebind gpt2_lm, so the poem
# fine-tuning below does not continue from the Reddit-tuned model object.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [None]:
! git clone https://github.com/chinese-poetry/chinese-poetry.git

In [None]:
import os
import json

poem_dir = "chinese-poetry/全唐诗"
poem_collection = []
# sorted() makes the load order deterministic; os.listdir returns entries in
# arbitrary order, which would change which poems land in the first batches.
for file in sorted(os.listdir(poem_dir)):
    # Only files whose name contains both ".json" and "poet" are loaded.
    if ".json" not in file or "poet" not in file:
        continue
    full_filename = os.path.join(poem_dir, file)
    # The files hold Chinese text: decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(full_filename, "r", encoding="utf-8") as f:
        content = json.load(f)
        poem_collection.extend(content)

# Join each poem's list of lines into a single string.
paragraphs = ["".join(poem["paragraphs"]) for poem in poem_collection]

In [None]:
print(paragraphs[0])

In [None]:
# One poem string per element, batched 16 at a time, cached and prefetched.
train_ds = (
    tf.data.Dataset.from_tensor_slices(paragraphs)
    .batch(16)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

# Running through the whole dataset takes long, so take only 500 batches and
# run 1 epoch for demo purposes.
train_ds = train_ds.take(500)
num_epochs = 1

# Learning rate decays from 5e-4 (ten times the Reddit run's 5e-5) to 0.0
# over every training step.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-4,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs)

In [None]:
# Generate from a Chinese prompt with the model fine-tuned above.
output = gpt2_lm.generate("昨夜雨疏风骤", max_length=200)
print(output)