# Text generation in antiquity style

In this notebook we attempt to fine-tune a transformer model to generate text in the style of travel logs dating from antiquity.

In [None]:
pip install transformers



In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import transformers

from sklearn.model_selection import train_test_split

In [None]:
pip install datasets



In [None]:
from datasets import load_dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the corpus paths used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and Split Data

In [None]:
# Read the corpus with the line-per-row "text" loader. `data_files` is given as an
# explicit split-name -> path mapping (the previous version passed a one-element
# set literal), so the "train" split indexed later is stated rather than implied.
dataset = load_dataset(
    "text",
    data_files={"train": "/content/drive/MyDrive/Bird is the word/muslim_traveler/combined.txt"},
)

Using custom data configuration default-d4f33d043c6cf52d


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-d4f33d043c6cf52d/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-d4f33d043c6cf52d/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=10):
    """Render a random sample of rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call; the earlier
    # draw-and-retry loop needed more and more retries as num_examples
    # approached len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Replace integer class ids with their names for readability.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Hold out 30% of the lines for evaluation. A fixed seed keeps the split, and so
# the reported validation loss, reproducible across kernel restarts.
datasets = dataset["train"].train_test_split(test_size=0.3, seed=42)

In [None]:
# Spot-check a random sample of rows from the held-out split.
show_random_elements(datasets["test"])

Unnamed: 0,text
0,"Bilad Bakr and its surroundings, leaving behind the pilgrims"
1,"My House, oppression never enter thee!"
2,forth fresh water. They divide into branches and flow in channels
3,"description, only indicating what lies beyond; for it does not"
4,
5,Then came the night of the 29th. The reader of the final
6,
7,enjoyed there. It was he who built the cathedral
8,"fecled to difplay their Rhetoric, but efpecially when"
9,"First our billeting officer, entering the city with our horse-tail standards, proceeded to the Sharia court where the notables of the province told him, ‘Your pasha holed up in Erzurum intending to become a Celali. The suspicion he aroused then has now proved true. He has raised an army of 10,000 men and has leagued himself in rebellion with Varvar Ali Pasha. We cannot allow you to occupy a walled city belonging to the Padishah.’"


---
## Load and Tokenize Data

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same "gpt2" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the notebook-level ``tokenizer``.

    Truncation is enabled when calling the tokenizer.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize both splits in batches using 4 worker processes; the raw "text"
# column is removed so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=["text"],
    num_proc=4,
    batched=True,
)

        

#1:   0%|          | 0/11 [00:00<?, ?ba/s]

#0:   0%|          | 0/11 [00:00<?, ?ba/s]

#2:   0%|          | 0/11 [00:00<?, ?ba/s]

#3:   0%|          | 0/11 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Inspect the columns and row count of the tokenized training split.
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 41580
})

### Concatenate and break into blocks of fixed size

In [None]:
block_size = 128  # Number of tokens in each fixed-length block; read as a global by group_texts

In [None]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size blocks.

    Args:
        examples: Mapping of column name (e.g. ``"input_ids"``) to a list of
            per-example token lists, as supplied by a batched ``map`` call.
        chunk_size: Number of tokens per block. When omitted, the notebook-level
            ``block_size`` is looked up at call time, so existing callers are
            unaffected.

    Returns:
        A dict with the same columns, each a list of ``chunk_size``-long lists,
        plus a ``"labels"`` column that is a copy of ``"input_ids"``. Tokens
        after the last full block are dropped.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size
    # Concatenate all texts. chain.from_iterable flattens in linear time, whereas
    # sum(list_of_lists, []) re-copies the growing accumulator on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, though you could add padding instead if the
    # model supports it.
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into consecutive chunks of chunk_size tokens.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

CausalLM models in the 🤗 Transformers library automatically apply right-shifting to the inputs, so we don't need to do it manually.

In [None]:
# Regroup both tokenized splits into fixed-size blocks with group_texts.
# group_texts sees 1000 rows at a time, so the remainder it drops is per batch.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

        

#0:   0%|          | 0/11 [00:00<?, ?ba/s]

#1:   0%|          | 0/11 [00:00<?, ?ba/s]

#2:   0%|          | 0/11 [00:00<?, ?ba/s]

#3:   0%|          | 0/11 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Decode one training block back to text to sanity-check the grouping.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])


'cities, such as Tabriz and Sultinl)*a, had risen intomiddt of which there are waterwheels like revolvinga party of the “ Young Brother ” Chelebi, who invitedsailing before this ship on its course; and thereupon we took298NOTESwho sent us a hospitality gift,®^ but we did not visithave been annotated by scholars familiar with theConteft, to make us believe there are fome ■and a mountain at whose summit is a towering fortress.thoufand Dinars, or fifteen thousand Piftoles French, insent for me'

## Initialize Model

In [None]:
from transformers import TFAutoModelForCausalLM

# Load the pretrained "gpt2" checkpoint as a TensorFlow causal language model.
model = TFAutoModelForCausalLM.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# NOTE(review): create_optimizer is imported here but not used in this cell.
from transformers import create_optimizer, AdamWeightDecay
# Adam with weight decay, using a constant learning rate of 2e-5.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

All 🤗 Transformers models are capable of computing an appropriate loss for their task internally (for example, a CausalLM model will use a cross-entropy loss). To do this, the labels must be provided in the input dict (or equivalently, in the columns argument to to_tf_dataset()), so that they are visible to the model during the forward pass.

This is quite different from the standard Keras way of handling losses, where labels are passed separately and not visible to the main body of the model, and loss is handled by a function that the user passes to compile(), which uses the model outputs and the label to compute a loss value.

The approach we take is that if the user does not pass a loss to compile(), the model will assume you want the internal loss. If you are doing this, you should make sure that the labels column(s) are included in the input dict or in the columns argument to to_tf_dataset.

In [None]:
# No loss argument is passed, so the model's internal loss is used (the labels
# therefore have to be part of the model inputs).
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


## Convert HuggingFace Dataset to TF

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

# Settings shared by both splits; only shuffling differs between them.
tf_dataset_kwargs = dict(
    columns=["attention_mask", "input_ids", "labels"],
    batch_size=16,
    collate_fn=data_collator,
)

# Training batches are shuffled; evaluation batches keep their order.
train_set = lm_datasets["train"].to_tf_dataset(shuffle=True, **tf_dataset_kwargs)
test_set = lm_datasets["test"].to_tf_dataset(shuffle=False, **tf_dataset_kwargs)

## Train model

In [None]:
# Fine-tune for one epoch, evaluating on the held-out split.
model.fit(train_set, validation_data=test_set, epochs=1)




## Text Generation from Prompt

In [None]:
from transformers import pipeline

In [None]:
# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

In [None]:
# Generate a continuation of a short prompt with the fine-tuned model.
generator('the bird is the word')

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[{'generated_text': 'the bird is the word ‘taj. That word is ‘sarah, for it is written with the letter of the night. The eagle of God is also the term Çaşşe. The great white eagle, the king of Kings, stands beside this bird and the white eagle is the word ‘the eagle of the Prophet, for he lives in the world of Islam’"’ he adds, as he is the crow\'s nest. The white crows are the birds of Jyberd. To the east of the town are many small trees as tallas of three cubits. On the one side there is a mosque, and the mosque is a dome. As in Jyberd, the mosque is surrounded by a mosque which is like a mosque but it has no windows. When the Muslims go to the mosque and eat, they eat their fish and eggs. The carpets and tiles of the mosque are so white that it can'}]

---
---
# OLD STUFF

---
## Old Stuff

In [None]:
## Load Text and Combine into single string separated by eos_token

from pathlib import Path

# Sort the matches: glob order is not guaranteed, and the order decides how the
# corpus is concatenated and later sliced into blocks.
paths = sorted(str(x) for x in Path("/content/drive/MyDrive/Bird is the word/").glob('**/*.txt'))

# Each file's text is followed by eos_token. Joining once avoids rebuilding the
# accumulated string on every loop iteration.
combined_string = ''.join(
    Path(filename).read_text(encoding='utf-8') + tokenizer.eos_token
    for filename in paths
)

combined_tokenized = tokenizer.encode(combined_string)

Token indices sequence length is longer than the specified maximum sequence length for this model (208761 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Preview only the first token ids; printing the whole encoded corpus floods the
# notebook output and bloats the saved file.
print(combined_tokenized[:50])

[17688, 642, 11, 5544, 1227, 286, 3931, 11, 1110, 1467, 11, 262, 1110, 286, 12928, 286, 31164, 321, 403, 11, 262, 15624, 286, 262, 25663, 286, 262, 10857, 286, 1703, 403, 11, 4453, 286, 20902, 12, 1659, 12, 1169, 12, 7571, 12, 43, 1746, 11, 284, 21207, 30605, 329, 262, 1049, 15581, 21405, 286, 34717, 12, 3041, 11, 2677, 286, 15391, 11, 543, 318, 2402, 262, 7850, 290, 685, 271, 1444, 60, 34717, 12, 7220, 12, 258, 13, 198, 198, 2202, 262, 1110, 286, 616, 10325, 379, 11818, 271, 11, 262, 1295, 810, 2439, 437, 274, 290, 30765, 321, 403, 389, 11, 314, 2921, 606, 262, 4596, 20981, 286, 34717, 12, 3041, 11, 2677, 286, 15391, 13, 1119, 550, 1100, 606, 503, 878, 606, 290, 484, 531, 25, 366, 40, 481, 466, 11, 314, 481, 466, 355, 34717, 12, 3041, 11, 2677, 286, 15391, 11, 674, 15876, 468, 531, 526, 198, 198, 40, 9658, 1566, 262, 5544, 1227, 286, 3931, 287, 11818, 271, 13, 3244, 2439, 437, 274, 290, 30765, 321, 403, 1908, 502, 572, 351, 262, 4074, 338, 10654, 337, 3540, 11181, 11, 290, 314, 1816, 

In [None]:
## Slice text data into 512 token intervals

# NOTE: this rebinds block_size, which was set to 128 earlier in the notebook.
block_size = 512
BATCH_SIZE = 12
BUFFER_SIZE = 1000 # Size of queued data to sample during shuffle

# Non-overlapping windows of block_size tokens; a trailing partial window is skipped.
examples = [
    combined_tokenized[start:start + block_size]
    for start in range(0, len(combined_tokenized) - block_size + 1, block_size)
]

## Inputs drop the last token and labels drop the first, so each label is the
## token that follows the corresponding input position.
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]
# All-ones mask with the same length as each input sequence.
attention_mask = [np.ones(len(window[:-1])).astype(int) for window in examples]

In [None]:
# One row per block: model inputs, shifted labels and the attention mask.
data_df = pd.DataFrame(
    {'inputs': inputs, 'labels': labels, 'attention_mask': attention_mask}
)

data_df.head()

Unnamed: 0,inputs,labels,attention_mask
0,"[17688, 642, 11, 5544, 1227, 286, 3931, 11, 11...","[642, 11, 5544, 1227, 286, 3931, 11, 1110, 146...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[12431, 11, 339, 14448, 284, 534, 4074, 13, 48...","[11, 339, 14448, 284, 534, 4074, 13, 48293, 25...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[6, 1701, 679, 1816, 290, 1297, 340, 284, 262,...","[1701, 679, 1816, 290, 1297, 340, 284, 262, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[6365, 11, 290, 484, 550, 587, 49975, 656, 511...","[11, 290, 484, 550, 587, 49975, 656, 511, 3650...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[290, 1535, 11, 484, 561, 407, 423, 1908, 777,...","[1535, 11, 484, 561, 407, 423, 1908, 777, 1243...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# (number of blocks, number of columns)
data_df.shape

(407, 3)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the blocks. random_state is fixed so the same rows land in each
# partition on every run.
train_df, test_df = train_test_split(data_df, test_size=0.20, random_state=42)

In [None]:
from transformers import TFGPT2LMHeadModel
EPOCHS = 4
# NOTE(review): the name is misspelled ("INITAL"); the training cell further down
# reads it too, so any rename has to cover both places.
INITAL_LEARNING_RATE = 0.001

# Decreasing learning rate scheduler
# (starts at INITAL_LEARNING_RATE, with decay_rate=0.7 and decay_steps=500)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

# initialize model, use_cache=False important! else wrong shape at loss calc
# NOTE: this rebinds `model` (and `optimizer` below), replacing the objects
# created in the first half of the notebook.
# NOTE(review): presumably the stock "gpt2" tokenizer has no pad token, in which
# case pad_token_id here would be None - confirm.
model = TFGPT2LMHeadModel.from_pretrained(
    "gpt2",
    use_cache=False,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Size the embedding matrix to the tokenizer's current vocabulary length.
model.resize_token_embeddings(len(tokenizer))
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# The model's own compute_loss is passed explicitly as the Keras loss.
model.compile(optimizer = optimizer, loss = model.compute_loss)
model.summary()

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


Model: "tfgpt2lm_head_model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124440576 
 r)                                                              
                                                                 
Total params: 124,440,576
Trainable params: 124,440,576
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Turn the per-row columns of the training frame into arrays.
train_inputs = np.array(train_df['inputs'].tolist())
train_labels = np.array(train_df['labels'].tolist())
train_mask = np.array(train_df['attention_mask'].tolist())

# Features are a dict keyed by model input name; labels are the second tuple element.
train_features = {'input_ids': train_inputs, 'attention_mask': train_mask}
train_tf = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

In [None]:
# Turn the per-row columns of the test frame into arrays.
test_inputs = np.array(test_df['inputs'].tolist())
test_labels = np.array(test_df['labels'].tolist())
test_mask = np.array(test_df['attention_mask'].tolist())

# Features are a dict keyed by model input name; labels are the second tuple element.
test_features = {'input_ids': test_inputs, 'attention_mask': test_mask}
test_tf = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

---
## Load GPT-2 Model
Note: Consider using GPT-Neo or GPT-J, open-source models from EleutherAI that attempt to match GPT-3.

In [None]:
# Use a shuffle buffer as large as the training set.
BUFFER_SIZE = len(train_df)

# Shuffle first, then batch; drop_remainder=True discards a final short batch.
shuffled_train = train_tf.shuffle(BUFFER_SIZE)
train_ds = shuffled_train.batch(BATCH_SIZE, drop_remainder=True)

test_ds = test_tf.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Show the batched training dataset (element spec) as a sanity check.
train_ds

<BatchDataset element_spec={'input_ids': TensorSpec(shape=(12, 511), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(12, 511), dtype=tf.int64, name=None)}>

In [None]:
# Show the batched test dataset (element spec) as a sanity check.
test_ds

<BatchDataset element_spec={'input_ids': TensorSpec(shape=(12, 511), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(12, 511), dtype=tf.int64, name=None)}>

In [None]:
%%time
# Train Model

steps_per_epoch = int(BUFFER_SIZE // BATCH_SIZE)
print(
    f"Model Params:\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
    f"Initial Learning rate: {INITAL_LEARNING_RATE}"
)

hist = model.fit(
    train_ds,
    validation_data= test_ds,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

Model Params:
batch_size: 12
Epochs: 4
Step p. Epoch: 27
Initial Learning rate: 0.001
Epoch 1/4


TypeError: ignored

In [None]:
from transformers import TFGPT2LMHeadModel

# Reload the pretrained "gpt2" weights (rebinding `model`) with use_cache disabled.
model = TFGPT2LMHeadModel.from_pretrained(
    "gpt2",
    use_cache=False,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Size the embedding matrix to the tokenizer's current vocabulary length
# (original note: "Because we added some tokens").
model.resize_token_embeddings(len(tokenizer))

adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(loss=model.compute_loss, optimizer=adam)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Print the layer and parameter summary of the reloaded model.
model.summary()

Model: "tfgpt2lm_head_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124440576 
 r)                                                              
                                                                 
Total params: 124,440,576
Trainable params: 124,440,576
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Re-compile without a loss argument, so the model's internal loss is used.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5)
)

model.resize_token_embeddings(len(tokenizer))

# NOTE(review): `dataset` is not assigned anywhere in this section; the only
# visible binding is the object returned by load_dataset near the top of the
# notebook - confirm what this cell was actually trained on.
model.fit(dataset, epochs=6)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fc3b609d810>

In [None]:
# Later cells use `data`, so the read is live code rather than commented out.
# Read the combined corpus one line per row: 'delimiter' is a separator string
# not expected to occur in the text, so each line stays in a single column.
# engine='python' is explicit because a multi-character separator is not handled
# by the C parser and otherwise triggers a fallback warning.
path = "/content/drive/MyDrive/Bird is the word/combined.txt"

data = pd.read_csv(path, sep='delimiter', header=None, engine='python')

  return func(*args, **kwargs)


In [None]:
# Preview the first rows of the line-per-row corpus frame.
data.head()

Unnamed: 0,0
0,ANABASIS
1,BOOK I
2,I.
3,Darius and Parysatis had two sons: the elder w...
4,"(1) Parrhasia, a district and town in the sout..."


In [None]:
# 70/30 train/validation split of the corpus lines (this rebinds train_df).
# random_state is fixed so the split is the same on every run.
train_df, val_df = train_test_split(data, train_size=0.7, random_state=42)

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
# Convert the pandas splits into Hugging Face datasets.
# NOTE: this rebinds train_ds, which earlier in the notebook held a batched tf.data pipeline.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

In [None]:
# Show the columns and row count of the converted training dataset.
print(train_ds)

Dataset({
    features: ['0'],
    num_rows: 1661
})


In [None]:
def tokenize_function(examples):
    """Tokenize column ``'0'`` of a batch, padding to max length and truncating.

    NOTE: this redefines ``tokenize_function`` from earlier in the notebook,
    which reads the ``"text"`` column instead; whichever cell ran last is the
    one in effect.
    """
    rows = examples['0']
    return tokenizer(rows, padding="max_length", truncation=True)

In [None]:
# Apply tokenize_function to both splits in batches; the original '0' column is kept.
tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_val_ds = val_ds.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Show the columns and row count after tokenization.
print(tokenized_train_ds)

Dataset({
    features: ['0', 'input_ids', 'attention_mask'],
    num_rows: 1661
})


In [None]:
# Drop the raw text column '0' and set the output format of both datasets to "tensorflow".
tf_train_ds = tokenized_train_ds.remove_columns(["0"]).with_format("tensorflow")
tf_val_ds = tokenized_val_ds.remove_columns(["0"]).with_format("tensorflow")