# **Visualize attention with BertViz**

Ce lien permet d'avoir accés au notebook de bertviz et de visualiser le mécanismisme d'auto-attention: [BertViz](https://colab.research.google.com/github/davidarps/2022_course_embeddings_and_transformers/blob/main/Visualizing_Attention_with_BertViz.ipynb)

# **Tokenization with tiktoken**

**tiktoken est un tokenizer [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) utilisé par les modèles d'OpenAI.**

In [None]:
! pip install tiktoken

In [3]:
import tiktoken

In [None]:
# Import tiktoken, une bibliothèque de Huggingface utile pour avoir une idée de la taille d'un modèle avant de l'utiliser.
import tiktoken

# Récupérer le tokenizer de GPT-3
encoding = tiktoken.encoding_for_model("davinci")

# Donner un texte en exemple et l'afficher
text = "We need to stop anthropomorphizing ChatGPT."
print(f"text: {text}")

# Tokenizer le texte et afficher la taille du vocabulaire sur lequel GPT-3 est entraîné
token_integers = encoding.encode(text)
print(f"total number of tokens: {encoding.n_vocab}")

# Afficher la liste des tokens (ids) et les tokens correspondants (mots)
print(f"token integers: {token_integers}")
token_strings = [encoding.decode_single_token_bytes(token) for token in token_integers]
print(f"token strings: {token_strings}")
print(f"number of tokens in text: {len(token_integers)}")

# Décoder les tokens (ids) pour reconstruire la phrase de départ (avant tokenization)
encoded_decoded_text = encoding.decode(token_integers)
print(f"encoded-decoded text: {encoded_decoded_text}")

text: We need to stop anthropomorphizing ChatGPT.
total number of tokens: 50257
token integers: [1135, 761, 284, 2245, 17911, 25831, 2890, 24101, 38, 11571, 13]
token strings: [b'We', b' need', b' to', b' stop', b' anthrop', b'omorph', b'izing', b' Chat', b'G', b'PT', b'.']
number of tokens in text: 11
encoded-decoded text: We need to stop anthropomorphizing ChatGPT.


# **T5**

## **Inference**

Cette approche permet de tester de manière directe le modèle T5 générique.

In [None]:
# Installer la bibliothèque transformers de HuggingFace pour avoir accés à T5
! pip install transformers

In [None]:
# Installer sentencepiece pour la tokenization (ici on utilise pas BPE)
! pip install sentencepiece

In [None]:
# Importer le tokiner et le modèle de T5
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Charger le tokenizer pré-entraîné
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Charger le modèle pré-entraîné
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Spécifier la tâche à exécuter (ici, on fait de la traduction d'anglais vers l'allemand)
task_prefix = "translate English to German: "

# Donner deux phrases à traduire en guise d'exemple
sentences = ["The house is wonderful.", "I like to work in NYC."]

# Tokeniser les deux phrases en concaténant avec la tâche cible (translate English to German)
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

In [11]:
# Afficher la sortie de la tokenisation
inputs

{'input_ids': tensor([[13959,  1566,    12,  2968,    10,    37,   629,    19,  1627,     5,
             1,     0,     0],
        [13959,  1566,    12,  2968,    10,    27,   114,    12,   161,    16,
         13465,     5,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
# Afficher la taille de la matrice des ids obtenue après tokenisation
inputs['input_ids'].shape

torch.Size([2, 13])

In [21]:
# Tester l'encodeur du modèle T5
encoder_output = model.encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
print(encoder_output)
print(encoder_output.keys())
print(encoder_output["last_hidden_state"].shape)

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.0154,  0.1263,  0.0301,  ..., -0.0117,  0.0373,  0.1015],
         [-0.1926, -0.1285,  0.0228,  ..., -0.0339,  0.0535,  0.1575],
         [ 0.0109, -0.0210,  0.0022,  ...,  0.0008, -0.0056, -0.0393],
         ...,
         [ 0.0756, -0.0119, -0.0273,  ..., -0.0044, -0.0505,  0.0554],
         [ 0.0373, -0.1201,  0.2142,  ...,  0.1950,  0.0546, -0.1554],
         [ 0.0321, -0.1309,  0.2183,  ...,  0.1683,  0.0588, -0.1607]],

        [[ 0.0164,  0.1266,  0.0305,  ..., -0.0114,  0.0363,  0.0988],
         [-0.0324, -0.0596, -0.0139,  ...,  0.0319, -0.0131,  0.0450],
         [ 0.0098, -0.0204,  0.0016,  ...,  0.0017, -0.0040, -0.0397],
         ...,
         [-0.0303, -0.3878,  0.1376,  ..., -0.0516,  0.0796, -0.3034],
         [ 0.0241, -0.1246,  0.0031,  ...,  0.0801, -0.2018, -0.0541],
         [ 0.0946,  0.0095, -0.0317,  ..., -0.0046, -0.0563,  0.0309]]],
       grad_fn=<MulBackward0>), past_key_values=None, hi

In [25]:
# Tester le décodeur du modèle T5 (cette méthode permet uniquement de prendre le décodeur pas la couche de logits pour la prédiction)
decoder_output = model.decoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
print(decoder_output)
print(decoder_output.keys())
print(decoder_output["last_hidden_state"].shape)

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 4.0756e-02,  2.2054e-01, -9.8648e-02,  ...,  1.0998e-01,
           4.5049e-05, -1.1511e-01],
         [-4.2975e-02,  8.5212e-02, -2.6601e-02,  ...,  9.1395e-02,
           2.6582e-04, -2.4325e-01],
         [ 4.9926e-02,  2.8985e-02,  3.7887e-03,  ...,  1.1895e-01,
           5.1045e-04,  3.9288e-02],
         ...,
         [-6.6533e-03, -6.9466e-03,  8.2814e-02,  ...,  8.2662e-02,
           1.5332e-04, -1.0807e-01],
         [-2.9056e-02,  4.2412e-02,  2.6476e-02,  ...,  6.4245e-02,
           1.1994e-04,  6.3035e-02],
         [-2.5118e-02,  3.7915e-02,  3.1097e-02,  ...,  6.8023e-02,
           1.1328e-04,  4.0840e-02]],

        [[ 4.0756e-02,  2.2054e-01, -9.8648e-02,  ...,  1.0998e-01,
           4.5049e-05, -1.1511e-01],
         [-4.2975e-02,  8.5212e-02, -2.6601e-02,  ...,  9.1395e-02,
           2.6582e-04, -2.4325e-01],
         [ 4.9926e-02,  2.8985e-02,  3.7887e-03,  ...,  1.1895e-01,
           5.104

In [27]:
# Utiliser l'encodeur et le décodeur (le transformer) pour traduire.
output_sequences = model.generate(

    input_ids=inputs["input_ids"],

    attention_mask=inputs["attention_mask"],

    do_sample=False,  # disable sampling to test if batching affects output

)

# Afficher la sortie (des tokens en ids)
print(output_sequences)
print(output_sequences.shape)



tensor([[    0,   644,  4598,   229, 19250,     5,     1,     0,     0,     0],
        [    0,  1674,     3,  5269,    15,  5462,    16, 13465,     5,     1]])
torch.Size([2, 10])


In [13]:
# Traduire les tokens (ids) en mots
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']


In [16]:
# Test pour obtenir le token de l'id 0
tokenizer.decode(0)

'<pad>'

In [17]:
# Test pour obtenir le token de l'id 644
tokenizer.decode(644)

'Das'

## **Training**

Cette partie permet d'orienter le modèle dans un domaine spécifique

### **Supervised method: Translation example**

In [19]:
# Importer le tokenizer et modèle T5
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Charger le tokenizer et le modèle T5 pré-entraîné
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Tokenizer le texte à traduire (on retourne seulement input_ids, pas attention_mask)
input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids

# Tokenizer la sortie escompter: la bonne traduction
labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids

# Traduire le texte
output = model(input_ids=input_ids, labels=labels)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# Type de sotie retourner par model
type(output)

transformers.modeling_outputs.Seq2SeqLMOutput

In [23]:
# Les clés du dictionnaire de sortie
output.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])

In [24]:
# Les logits sont les sorties de prédictions, qui correspond à une séquence avec des vecteurs de taille, la taille du vocabulaire
output.logits.shape

torch.Size([1, 6, 32128])

In [28]:
# La loss (entropy dans ce cas)
output.loss

tensor(0.2542, grad_fn=<NllLossBackward0>)

In [None]:
# Exemple d'entrainement du modèle (code complet)
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Charger le tokenizer et le modèle
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Créer un jeu de données
source_texts = ["translate English to German: The house is wonderful.", ...]
target_texts = ["Das Haus ist wunderbar.", ...]

# Tokenizer les données
tokenized_inputs = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True)
tokenized_labels = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True)

# Récupérer les ids
input_ids = tokenized_inputs["input_ids"]
labels = tokenized_labels["input_ids"]

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Entraîner le modèle
num_epochs = 3
for epoch in range(num_epochs):
    model.train()

    # Forward pass
    outputs = model(input_ids=input_ids, labels=labels)
    loss = outputs.loss

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# Sauvegarder le modèle
model.save_pretrained("trained_t5_model")


### **Unsupervised method with mask**

In [29]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")

model = T5ForConditionalGeneration.from_pretrained("t5-small")

input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids

labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids

# the forward function automatically creates the correct decoder_input_ids

loss = model(input_ids=input_ids, labels=labels).loss

loss.item()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


3.7837319374084473

# **GPT-1**

Pour tester le modèle, on peut utiliser l'application avec gradio: [GPT-1](https://huggingface.co/instruct-generalize/gpt-1)

# **GPT-2**

Tester le modèle sur, l'application gradio: [GPT-2](https://huggingface.co/gpt2)

In [None]:
# Installer la bibliothèque de HuggingFace transformers
! pip install transformers

Successfully installed huggingface-hub-0.17.3 safetensors-0.4.0 tokenizers-0.14.1 transformers-4.35.0


## **Inference**

In [34]:
# Importer la bibliothèque pipeline pour tester en inférence des modèles. set_seed permet d'avoir les mêmes résultats
from transformers import pipeline, set_seed

# Charger le modèle gpt-2
generator = pipeline('text-generation', model='gpt2')

# Fixer l'aléatoire
set_seed(42)

# Générer 5 textes, de taille maximale égale à 30, en prenant pour contexte  "Hello, I'm a language model,"
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, but what I'm really doing is making a human-readable document. There are other languages, but those are"},
 {'generated_text': "Hello, I'm a language model, not a syntax model. That's why I like it. I've done a lot of programming projects.\n"},
 {'generated_text': "Hello, I'm a language model, and I'll do it in no time!\n\nOne of the things we learned from talking to my friend"},
 {'generated_text': "Hello, I'm a language model, not a command line tool.\n\nIf my code is simple enough:\n\nif (use (string"},
 {'generated_text': "Hello, I'm a language model, I've been using Language in all my work. Just a small example, let's see a simplified example."}]

## **Training**

Cette approche permet aux développeur d'orienter le modèle vers un domaine précis.

In [43]:
# Importer le modèle GPT-2 et son tokenizer
from transformers import GPT2Tokenizer, GPT2Model

# Charger le modèle GPT-2 et le tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

In [30]:
# Prendre un texte pour exemple
text = "Replace me by any text you'd like."

In [31]:
# Tokenizer le texte d'exemple, pour les développeur tensorflow, on peut remplacer 'pt' par 'tf' pour avoir un tf.Tensor en sortie
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

**input_ids :**<br>
    Il s'agit d'une séquence d'identifiants numériques qui représentent les mots ou les tokens dans le texte d'entrée. Chaque mot ou token est encodé sous forme d'un identifiant unique dans le vocabulaire du modèle. Dans votre exemple, la séquence d'input_ids est [3041, 5372, 502, 416, 597, 2420, 345, 1549, 588, 13].

**attention_mask :**<br>
    Il s'agit d'un masque binaire qui indique au modèle quels tokens dans la séquence d'entrée doivent être pris en compte et quels tokens doivent être ignorés lors du calcul des représentations. Un "1" dans le masque signifie que le token correspondant dans la séquence d'input_ids doit être pris en compte, tandis qu'un "0" signifie que le token doit être ignoré. Dans votre exemple, le masque d'attention est [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], ce qui signifie que tous les tokens de la séquence d'input_ids doivent être pris en compte.

In [None]:
# Afficher la taille du vocabulaire d'entraînement
tokenizer.vocab_size

50257

In [34]:
# Encoder le texte
output = model(**encoded_input)

In [45]:
# Voir les clés du dictionnaire
output.keys()

odict_keys(['last_hidden_state', 'past_key_values'])

In [35]:
# Voir la taille du dernier bloc du decoder
output.last_hidden_state.shape

torch.Size([1, 10, 768])

Tout le modèle ne peuvent pas utiliser generate, pour utiliser generate avec cette approche, ça ne va pas marche, il faut utiliser un modèle qui a une tête de prédiction (**lm_head**) avec par exemple: GPT2Tokenizer, GPT2LMHeadModel (voir le code en bas.

In [53]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Charger le modèle GPT-2 et le tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Texte d'entrée
input_text = "It is your,"

# Tokenization (tokenizer.encoder retourne uniquement input_ids, alors tokenizer retourne inputs_ids et attention_mask)
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Génération
output = model.generate(input_ids, max_new_tokens=100, do_sample=True, temperature=0.8, pad_token_id=model.config.eos_token_id)


# Décodage de la sortie
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

# Afficher le résultat
print(decoded_output)


It is your, it is yours. But to do the same for me, I need more than that. I need you to trust me, and I need you to find and let me know that you are mine. I have always been a man and a woman, and to have your support is a huge honor. Do not ever allow yourself to be treated like it is your own fault that I did not have a chance to live up to my name!

And I want you to be able to tell me


In [46]:
# On doit nécessaire avoir logits sur les keys pour pouvoir exploiter generate
model(input_ids).keys()

odict_keys(['logits', 'past_key_values'])

# **GPT-3 and GPT-4**

A ma connaissance, le code n'est pas encore disponible sur des plateformes comme HuggingFace, mais pour faire des tests avec DALL-E (Génération d'images à partir de textes) et GPT-4 (communiquer en donnant en entrée du texte et/ou de l'image), on peut se connecter sur : [GPT](https://platform.openai.com/docs/overview)

# **Other Chatbots to test**

On peut tester des chatbots autre que ChatGPT: [chatbots](https://24pm.com/intelligence-artificielle/ia-generative/965-les-llms-open-source-alternatifs-a-chatgpt-gpt)

# **Make ChatBot with GPT-2**

In [None]:
# Install transformer package
! pip install transformers

In [55]:
# Importer les packages nécessaires
import os
import csv
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

import json
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

## **Load dataset**

In [None]:
# Charger le jeu de données
with open('intents.json') as file:
    data = json.load(file)


In [None]:
# Pour chaque question, on donne la réponse correspondante
items = []
for intent in data['intents']:
    for patterns in intent['patterns']:
        for responses in intent['responses']:
            items.append((patterns, responses))

items

[('Hi there', 'Hello, thanks for asking'),
 ('Hi there', 'Good to see you again'),
 ('Hi there', 'Hi there, how can I help?'),
 ('How are you', 'Hello, thanks for asking'),
 ('How are you', 'Good to see you again'),
 ('How are you', 'Hi there, how can I help?'),
 ('Is anyone there?', 'Hello, thanks for asking'),
 ('Is anyone there?', 'Good to see you again'),
 ('Is anyone there?', 'Hi there, how can I help?'),
 ('Hey', 'Hello, thanks for asking'),
 ('Hey', 'Good to see you again'),
 ('Hey', 'Hi there, how can I help?'),
 ('Hola', 'Hello, thanks for asking'),
 ('Hola', 'Good to see you again'),
 ('Hola', 'Hi there, how can I help?'),
 ('Hello', 'Hello, thanks for asking'),
 ('Hello', 'Good to see you again'),
 ('Hello', 'Hi there, how can I help?'),
 ('Good day', 'Hello, thanks for asking'),
 ('Good day', 'Good to see you again'),
 ('Good day', 'Hi there, how can I help?'),
 ('Bye', 'See you!'),
 ('Bye', 'Have a nice day'),
 ('Bye', 'Bye! Come back again soon.'),
 ('See you later', 'See

## **Load model**

In [93]:
# Charger GPT-2 et son tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left') # padding_side = "left" est obligatoire si on utilise padding sur le tokenizer
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [63]:
# get_lm_head donne l'information que le modèle à une tête de prédiction et on peut utiliser generate
model.get_lm_head

<bound method TFPreTrainedModel.get_lm_head of <transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel object at 0x7e68e40c6c80>>

In [64]:
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  124439808 
 er)                                                             
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 124439808 (474.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## **Inference**

In [65]:
# token spécial qui permet d'arrêter la génération
tokenizer.eos_token


'<|endoftext|>'

In [94]:
# Mettre le tokenizer de padding à eos
tokenizer.pad_token = tokenizer.eos_token

# Créer deux textes pour exemples
text_list = ["i am a man", "the boy is here but,"]

# Tokenizer les deux textes avec tensorflow
tokenized_texts = tokenizer.batch_encode_plus(
    text_list,
    padding=True,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)


input_ids = tokenized_texts["input_ids"]
attention_mask = tokenized_texts["attention_mask"]


In [95]:
# [50256, 50256,    72,   716,   257,   582] est complété avec 50256
input_ids

<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[50256, 50256,    72,   716,   257,   582],
       [ 1169,  2933,   318,   994,   475,    11]], dtype=int32)>

In [96]:
# 0 pour ne pas tenir en compte les deux premiers tokens du premier vecteur
attention_mask

<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[0, 0, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1]], dtype=int32)>

In [74]:
# Taille du vocabulaire
tokenizer.vocab_size

50257

In [97]:
# Prédire les ids avec generate
output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

In [98]:
output_ids

<tf.Tensor: shape=(2, 50), dtype=int32, numpy=
array([[50256, 50256,    72,   716,   257,   582,   508,   468,   587,
          257,   636,   286,   262,   995,   329,   625,  1542,   812,
           13,   314,   423,   587,   257,   636,   286,   262,   995,
          329,   625,  1542,   812,    13,   314,   423,   587,   257,
          636,   286,   262,   995,   329,   625,  1542,   812,    13,
          314,   423,   587,   257,   636],
       [ 1169,  2933,   318,   994,   475,    11,   314,  1101,   407,
         1654,   611,   339,   338,  1016,   284,   307,  1498,   284,
          466,   340,    13,   314,  1101,   407,  1654,   611,   339,
          338,  1016,   284,   307,  1498,   284,   466,   340,    13,
          314,  1101,   407,  1654,   611,   339,   338,  1016,   284,
          307,  1498,   284,   466,   340]], dtype=int32)>

In [99]:
# Decoder la génération du premier texte
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

i am a man who has been a part of the world for over 30 years. I have been a part of the world for over 30 years. I have been a part of the world for over 30 years. I have been a part


In [100]:
# Decoder la génération du deuxième texte
generated_text = tokenizer.decode(output_ids[1], skip_special_tokens=True)
print(generated_text)

the boy is here but, I'm not sure if he's going to be able to do it. I'm not sure if he's going to be able to do it. I'm not sure if he's going to be able to do it


## **Test training**

In [101]:
tokenizer.pad_token = tokenizer.eos_token

text_list = ["i am a man", "the boy is here"]


tokenized_texts = tokenizer.batch_encode_plus(
    text_list,
    padding=True,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)

input_ids = tokenized_texts["input_ids"]
attention_mask = tokenized_texts["attention_mask"]

In [102]:
input_ids

<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[  72,  716,  257,  582],
       [1169, 2933,  318,  994]], dtype=int32)>

In [103]:
dec_output = model(input_ids)

In [104]:
dec_output.keys()

odict_keys(['logits', 'past_key_values'])

In [105]:
dec_output['logits'].shape

TensorShape([2, 4, 50257])

## **Create the chatbot**

In [None]:
# Séparer les inputs et outputs
input_ids = []
output_ids = []
for item in items:
    input_ids.append(item[0])
    output_ids.append(item[1])

In [None]:
# Afficher input
input_ids[:10]

['Hi there',
 'Hi there',
 'Hi there',
 'How are you',
 'How are you',
 'How are you',
 'Is anyone there?',
 'Is anyone there?',
 'Is anyone there?',
 'Hey']

In [None]:
# Afficher output
output_ids[:10]

['Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking',
 'Good to see you again',
 'Hi there, how can I help?',
 'Hello, thanks for asking']

In [None]:
len(output_ids)

91

In [None]:
# Tokonizer les inputs et outputs
tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer.batch_encode_plus(
    input_ids,
    padding='max_length',
    max_length=20,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)
input_ids = input_ids["input_ids"]
output_ids = tokenizer.batch_encode_plus(
    output_ids,
    padding='max_length',
    max_length=20,
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)
output_ids = output_ids["input_ids"]

In [None]:
# Afficher le vocabulaire
tokenizer.vocab_size

50257

In [None]:
input_ids[:10]

<tf.Tensor: shape=(10, 20), dtype=int32, numpy=
array([[17250,   612, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [ 2437,   389,   345, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [ 2437,   389,   345, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [ 2437,   389,   345, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [None]:
input_ids.shape

TensorShape([91, 20])

In [None]:
output_ids[:10]

<tf.Tensor: shape=(10, 20), dtype=int32, numpy=
array([[15496,    11,  5176,   329,  4737, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [10248,   284,   766,   345,   757, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612,    11,   703,   460,   314,  1037,    30, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [15496,    11,  5176,   329,  4737, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [10248,   284,   766,   345,   757, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256],
       [17250,   612,    11,   703,   460,   314,  1037,    30, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [None]:
# Définir la taille du batch
batch_size = 16

# Créer un générateur
class CustomDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, input_ids, output_ids, batch_size, shuffle=True):
        self.input_ids = input_ids
        self.output_ids = output_ids
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(input_ids))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        return int(np.ceil(len(self.input_ids) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_input_ids = self.input_ids[batch_indices]
        batch_output_ids = self.output_ids[batch_indices]

        return (batch_input_ids, batch_output_ids)

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

# Initialiser le générateur
data_generator = CustomDataGenerator(input_ids, output_ids, batch_size)



In [None]:
# Définir les paramètres d'apprentissage
num_epochs = 5
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(
    5e-4,
    decay_steps=len(data_generator) * num_epochs,
    end_learning_rate=0.0,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

In [None]:
# Entraîner le modèle
model.fit(input_ids, output_ids, batch_size=batch_size,epochs=num_epochs)

# Sauvegarder le modèle
model.save_pretrained('chatbot_model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
a = tokenizer.batch_encode_plus(
    ["hello"],
    padding=True,        # Pad the sequences to the same length
    truncation=True,     # Truncate sequences to a maximum length
    return_tensors="tf"  # Return tf
)['input_ids']

In [None]:
output_ids = model.generate(a, max_length=50, pad_token_id=tokenizer.eos_token_id, no_repeat_ngram_size=2)

In [None]:
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'how are you can provide you provide Adverseverse'

In [None]:
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side="left")
#model = TFGPT2LMHeadModel.from_pretrained("chatbot_model")
#tokenizer.pad_token = tokenizer.eos_token

def test():
    while True:
        input_text = input("User: ")
        input_tokenized =  tokenizer.batch_encode_plus(
                [input_text],
                padding=True,        # Pad the sequences to the same length
                truncation=True,     # Truncate sequences to a maximum length
                return_tensors="tf"  # Return tf
        )
        input_ids = input_tokenized['input_ids']

        # Calculate the maximum length for generation based on the length of input_ids


        output_ids = model.generate(input_ids, max_length=50, pad_token_id=tokenizer.eos_token_id, no_repeat_ngram_size=2)

        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print("Bot:", output_text)


In [None]:
test()

User: hello
Bot: helloHello
User: Hi there
Bot: Hi there to provide you provide Adverseverse
User: how are you
Bot: how are you can provide you provide Adverseverse
User: no thanks
Bot: no thanks!!
User: how you could help me
Bot: how you could help me provide Adverseverse Ad Ad


KeyboardInterrupt: ignored

https://mrmaheshrajput.medium.com/how-to-build-an-intelligent-qa-chatbot-on-your-data-with-llm-or-chatgpt-d0009d256dce

# **Fine-tune GPT2 for specific domain and language**

In [None]:
!pip install git+https://github.com/keras-team/keras-nlp.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.7/440.7 kB[0m [31m35.4 MB/s[0m eta [36m0

In [None]:
import os
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"
import keras_nlp
import tensorflow as tf
import keras_core as keras
import time

Using JAX backend.


In [None]:
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [None]:
# Generate a text
output = gpt2_lm.generate("The goal of apple company", max_length=200)
print("\nGPT-2 output:")
print(output)


GPT-2 output:
The goal of apple company, Inc. was to make a better apple.

"I don't know how you would describe it, but the apple is the best apple we ever tasted, so we're very pleased with it," said Steve Hirsch, Apple's president and CEO. "It is one of the best apples ever."

The company is now working with the U.S. Department of Agriculture to develop and sell the apple.

The company is currently testing its product on apples grown by farmers in the Midwest, and is also working with the U.S. Department of Agriculture to test its products on apples grown in the Midwest.

Apple's apple has been the focus of controversy since its introduction in 2007 and its reputation has been tarnished by its poor quality.

Apple's reputation has been tarnished by a series of lawsuits that it has filed against the U.S. government, which is investigating the company for fraud.

Apple is


In [None]:
# Generate a text
output = gpt2_lm.generate("steave jobs", max_length=200)
print("\nGPT-2 output:")
print(output)

## **Fine-tune GPT2**

In [None]:
import tensorflow_datasets as tfds

reddit_ds = tfds.load("reddit_tifu", split="train", as_supervised=True)

Downloading and preparing dataset 639.54 MiB (download: 639.54 MiB, generated: 141.46 MiB, total: 781.00 MiB) to /root/tensorflow_datasets/reddit_tifu/short/1.1.2...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/79740 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/reddit_tifu/short/1.1.2.incompleteXTPWMX/reddit_tifu-train.tfrecord*...:  …

Dataset reddit_tifu downloaded and prepared to /root/tensorflow_datasets/reddit_tifu/short/1.1.2. Subsequent calls will reuse this data.


In [None]:
reddit_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>

In [None]:
for document, title in reddit_ds:
    print(document.numpy())
    print(title.numpy())
    break

b"me and a friend decided to go to the beach last sunday. we loaded up and headed out. we were about half way there when i decided that i was not leaving till i had seafood. \n\nnow i'm not talking about red lobster. no friends i'm talking about a low country boil. i found the restaurant and got directions. i don't know if any of you have heard about the crab shack on tybee island but let me tell you it's worth it. \n\nwe arrived and was seated quickly. we decided to get a seafood sampler for two and split it. the waitress bought it out on separate platters for us. the amount of food was staggering. two types of crab, shrimp, mussels, crawfish, andouille sausage, red potatoes, and corn on the cob. i managed to finish it and some of my friends crawfish and mussels. it was a day to be a fat ass. we finished paid for our food and headed to the beach. \n\nfunny thing about seafood. it runs through me faster than a kenyan \n\nwe arrived and walked around a bit. it was about 45min since we a

In [None]:
train_ds = (
    reddit_ds.map(lambda document, _: document)
    .batch(32)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
train_ds = train_ds.take(500)
num_epochs = 1

# Linearly decaying learning rate.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs)

In [None]:
output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

In [None]:
# SEARCH FOR TOP-K
# Use a string identifier.
gpt2_lm.compile(sampler="top_k")
output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

# Use a `Sampler` instance. `GreedySampler` tends to repeat itself,
greedy_sampler = keras_nlp.samplers.GreedySampler()
gpt2_lm.compile(sampler=greedy_sampler)

output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

## **Finetune on Chinese Poem Dataset**

In [None]:
!git clone https://github.com/chinese-poetry/chinese-poetry.git

In [None]:
import os
import json

poem_collection = []
for file in os.listdir("chinese-poetry/全唐诗"):
    if ".json" not in file or "poet" not in file:
        continue
    full_filename = "%s/%s" % ("chinese-poetry/全唐诗", file)
    with open(full_filename, "r") as f:
        content = json.load(f)
        poem_collection.extend(content)

paragraphs = ["".join(data["paragraphs"]) for data in poem_collection]

In [None]:
print(paragraphs[0])

In [None]:
train_ds = (
    tf.data.Dataset.from_tensor_slices(paragraphs)
    .batch(16)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

# Running through the whole dataset takes long, only take `500` and run 1
# epochs for demo purposes.
train_ds = train_ds.take(500)
num_epochs = 1

learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-4,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs)

In [None]:
output = gpt2_lm.generate("昨夜雨疏风骤", max_length=200)
print(output)