# Text Generation Models Using `Transformers`

In [1]:
# Import packages
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import tensorflow as tf
import sklearn
import re
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [2]:
# Set device preference to use GPU when available
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Sets option to display all text in pandas dataframes
pd.set_option('display.max_colwidth', None)

# Specify File Locations
RawDat = '../data_raw/'
ClnDat = '../data_clean/'

# Specify file to be input
InFile = 'wiki_sentence.csv'

# Set seed for reproducible results
RandSeed = 25

# Read in csv file
if re.match('wiki', InFile):
    TextDF = pd.read_csv(ClnDat+InFile).drop(columns = ['same'])
else:
    TextDF = pd.read_csv(ClnDat+InFile)

In [4]:
NormText = [i for i in TextDF['normal']]
SimpText = [i for i in TextDF['simple']]

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
encoded_text = tokenizer(NormText, 
                         SimpText, 
                         return_token_type_ids = True, 
                         max_length = 1600,
                         truncation = True)

model = GPT2LMHeadModel.from_pretrained('distilgpt2')

In [None]:
# Split data into test and train data sets
Text_X = TextDF['normal']
Text_y = TextDF['simple']

train_X, test_X, train_y, test_y = train_test_split(Text_X, Text_y, test_size=.2, random_state = RandSeed)

In [7]:
# From transformers example
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

#Fine-tuning T5 to summarize a question into a title.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Split data into test and train data sets
Text_X = TextDF['normal']
Text_y = TextDF['simple']

train_X, test_X, train_y, test_y = train_test_split(Text_X, Text_y, test_size=.2, random_state = RandSeed)

#Training loop
for epoch in range(2):
  model.train()
  for X_i,y_i in tqdm(zip(train_X,train_y),total=len(train_X)):
    #Encode the titles (the prefix 'summarize: ' tells it to summarize)
    X_i = tokenizer.encode('summarize: '+X_i, return_tensors="pt",max_length=512,pad_to_max_length=True, truncation = True).to(device)
    y_i = tokenizer.encode(y_i, return_tensors="pt",max_length=512,pad_to_max_length=True).to(device)
    #Foward pass through the network
    loss = model(input_ids=X_i, lm_labels=y_i)[0]
    #Backward pass to compute the gradients
    loss.backward()
    #Update the parameters with gradient descent
    optimizer.step()
    optimizer.zero_grad()
    model.zero_grad()

  #Switch to evaluation mode
  model.eval()

  #Print some example titles, to gauge how well it's doing (needs many more training examples to produce reasonable results)
  for X_i,y_i in zip(train_X,train_y):
    X_i = tokenizer.encode(X_i, return_tensors="pt",max_length=512).to(device)
    summary = model.generate(X_i,max_length=200,num_beams=4,no_repeat_ngram_size=3)[0]
    summary = tokenizer.decode(summary, skip_special_tokens=True)
    print(summary)

HBox(children=(FloatProgress(value=0.0, max=94361.0), HTML(value='')))

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (




KeyboardInterrupt: 

In [None]:
train_X2 = [i for i in train_X]

In [None]:
train_X2

In [None]:
# Initialize tokenizer and language model
# tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Encode train and test data
train_encodings = tokenizer(train_X2, truncation=True, padding=True)
# val_encodings = tokenizer(val_texts, truncation=True, padding=True)
# test_encodings = tokenizer(test_texts, truncation=True, padding=True)


# optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# model.compile(optimizer=optimizer, loss=model.compute_loss) # can also use any keras loss fn
# model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)

In [None]:
train_encodings

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

#Fine-tuning T5 to summarize a question into a title.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# model = model.to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)

In [None]:
model.train()
for X_i, y_i