# HW09: Transformers

Remember that this homework is graded on completion. **You can skip one section of this homework.**

In [None]:
import pandas as pd
import nltk
df = pd.read_csv('train.csv')

# Rename the three columns and turn the numeric class ids into readable labels.
df.columns = ["label", "title", "lead"]
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
    """Return the topic name for the numeric class id `x` (KeyError if unknown)."""
    return label_map[x]
df["label"] = df["label"].apply(replace_label)
# The model input is the headline followed by the lead paragraph.
df["text"] = df["title"] + " " + df["lead"]
# Only use 10K datapoints. The sample is seeded so that Restart & Run All
# always selects the same rows, and capped so a smaller file does not raise.
df = df.sample(n=min(10000, len(df)), random_state=42)
df.head()

## Hugging Face Transformers

In [None]:
from transformers import TFDistilBertForSequenceClassification, DistilBertConfig, DistilBertTokenizerFast
import tensorflow as tf

# Tokenizer matching the pretrained DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Load the checkpoint's configuration with a 4-way classification head
# (one output per news category), then build the classifier from it.
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=4)
transformer_model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    config=config,
)

In [None]:
def out_id_mask(data, length):
    """Tokenize every text in `data` and collect token ids and attention masks.

    Uses the module-level `tokenizer`. Texts longer than `length` tokens are
    explicitly truncated; shorter texts are NOT padded, so their arrays are
    shorter than `length` and the caller has to deal with that.

    Args:
        data: iterable of strings to tokenize.
        length: maximum number of tokens kept per text.

    Returns:
        A pair `(input_ids, input_masks)` of lists aligned with `data`; each
        element is the `.numpy()` value of the first (only) row returned by
        the tokenizer for that text.
    """
    input_ids, input_masks = [], []
    for text in data:
        # truncation=True makes the cut-off at `length` explicit instead of
        # relying on the tokenizer's implicit fallback (which warns per call).
        encoded = tokenizer(text, return_tensors='tf',
                            truncation=True, max_length=length)
        input_ids.append(encoded["input_ids"][0].numpy())
        input_masks.append(encoded["attention_mask"][0].numpy())

    return input_ids, input_masks

In [None]:
##TODO split the sample into a training and a test set 
##TODO prepare the dataset for tensorflow.

from sklearn.model_selection import train_test_split

MAX_LEN = 32      # number of tokens per example fed to the model
SPLIT_SEED = 42   # fixes the train/test partition across runs


def keep_full_length(ids, masks, labels, length):
    """Drop every example whose token sequence is shorter than `length`.

    The three lists are parallel (same index = same example); they are
    filtered together so they stay aligned. Returns three new lists.
    """
    kept = [(i, m, y) for i, m, y in zip(ids, masks, labels) if len(i) >= length]
    if not kept:
        return [], [], []
    kept_ids, kept_masks, kept_labels = (list(column) for column in zip(*kept))
    return kept_ids, kept_masks, kept_labels


# Labels need to be integers, so encode the string labels as class ids.
df['target'] = pd.factorize(df['label'])[0]

# Seeded so that the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(df['text'].tolist(),
                                                    df['target'].tolist(),
                                                    test_size=0.3,
                                                    random_state=SPLIT_SEED)

# `out_id_mask` reads the global `tokenizer`; (re)bind it to the DistilBERT
# tokenizer here so this cell does not depend on what ran before it.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

input_ids, input_masks = out_id_mask(X_train, MAX_LEN)
input_ids_test, input_masks_test = out_id_mask(X_test, MAX_LEN)

# Sequences are not padded, so keep only the examples that fill all MAX_LEN
# positions; this gives every remaining example the same length.
input_ids, input_masks, y_train = keep_full_length(
    input_ids, input_masks, y_train, MAX_LEN)
input_ids_test, input_masks_test, y_test = keep_full_length(
    input_ids_test, input_masks_test, y_test, MAX_LEN)

In [None]:
##TODO build a transformer model to do sequence classification with the goal to predict the label from the text
# Two int32 inputs of 32 positions each: the token ids and the attention mask.
in_ids, in_masks_ids = (
    tf.keras.layers.Input(shape=(32,), name=layer_name, dtype='int32')
    for layer_name in ('input_token', 'masked_token')
)
# Feed both inputs through the pretrained classifier and wrap it as a Keras model.
X = transformer_model(in_ids, in_masks_ids)
model = tf.keras.Model(
    inputs=[in_ids, in_masks_ids],
    outputs=X,
)

In [None]:
##TODO print the summary of the model
# Show the Keras summary of the `model` assembled in the previous cell.
model.summary()

In [None]:
##TODO compile the model
# The classifier outputs raw logits (softmax is only applied later, at
# prediction time), so the loss must be built with from_logits=True; the
# plain 'sparse_categorical_crossentropy' string would treat the logits as
# probabilities. A small learning rate is used because the default Adam rate
# (1e-3) is too aggressive for fine-tuning pretrained transformer weights.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              metrics=['accuracy'])

In [None]:
# Training set: features are keyed by the names of the model's Input layers
# and paired with the integer labels.
dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_token': input_ids, 'masked_token': input_masks}, y_train)
)
# Group the examples into mini-batches of 8.
dataset = dataset.batch(8)

In [None]:
##TODO fit the model and print the obtained accuracy
# One pass over the batched training set; the 'accuracy' metric requested at
# compile time is what gets reported for this run.
model.fit(dataset, epochs = 1)

**Hint:** All the vectorized pieces of text must have the same length (which will be equal to the input size). You have two options to ensure this:

1. Set the maximum length equal to the length of the shortest vectorized text
2. Choose the maximum length and then exclude all the data points that have vectors shorter than that length

**Hint:** Tensorflow requires your labels to be integers, not strings

In [None]:
# Test set: same feature keys as the training set, but without labels since
# it is only used for prediction.
dataset_test = tf.data.Dataset.from_tensor_slices(
    {'input_token': input_ids_test, 'masked_token': input_masks_test}
)
dataset_test = dataset_test.batch(8)

In [None]:
# Run the fine-tuned classifier over the batched test features.
predictions = model.predict(dataset_test)

In [None]:
import numpy as np
# Turn the raw model outputs into softmax scores.
# NOTE(review): this rebinds `predictions`, so re-running the cell applies
# softmax a second time; re-run the predict cell first if you need to repeat it.
predictions = tf.nn.softmax(predictions)

In [None]:
# Predicted class per test example: index of the highest score in each row
# of the first element of `predictions`.
pred = list(map(np.argmax, predictions[0]))

In [None]:
# print accuracy
from sklearn.metrics import accuracy_score
# scikit-learn expects the ground truth first: accuracy_score(y_true, y_pred).
accuracy_score(y_test, pred)

## Generating Text

In [None]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load the pretrained GPT-2 tokenizer and language model for text generation.
# NOTE(review): this rebinds the global names `tokenizer` and `model`, which
# until here referred to the DistilBERT tokenizer and the Keras classifier;
# cells above this point must not be re-run afterwards without reloading them.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# The end-of-sequence token id is reused as the padding token id.
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

##TODO Pick one snippet for each label and generate some text starting from its first 4-5 words
## pick the generating model that looks best to you (and explain why) and set the length of each generated document to 50

In [None]:
# Encode one short prompt per news category (sport, business, sci/tech, world)
# as PyTorch tensors; each prompt is the start of a headline.
input_ids_sport, input_ids_business, input_ids_tech, input_ids_world = (
    tokenizer.encode(prompt, return_tensors='pt')
    for prompt in (
        'Woods cruises to eight shot',
        'FedEx raises first quarter',
        'Space Station Crew Blast',
        'Plane Crashes in China',
    )
)

In [None]:
# Sport prompt, beam search: 4 beams, at most 50 tokens, no 2-gram may repeat.
beam_output = model.generate(
    input_ids_sport,
    num_beams=4,
    early_stopping=True,
    no_repeat_ngram_size=2,
    max_length=50,
)
# Decode the first returned sequence back to text.
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
# Sport prompt, top-p sampling (top_p=0.90, top-k filtering disabled).
# Sampling is random, so seed PyTorch first: the printed text is then the
# same on every run and can be compared with the other decoding strategies.
import torch

torch.manual_seed(42)
sample_output = model.generate(
    input_ids_sport, 
    do_sample=True, 
    max_length=50, 
    top_p=0.90, 
    top_k=0
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# Business prompt, top-p sampling (top_p=0.90, top-k filtering disabled).
# Sampling is random, so seed PyTorch first to make the output reproducible.
import torch

torch.manual_seed(42)
sample_output = model.generate(
    input_ids_business, 
    do_sample=True, 
    max_length=50, 
    top_p=0.90, 
    top_k=0
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# I think the beam approach generates more comprehensive text in this case

In [None]:
# Business prompt, beam search: 4 beams, at most 50 tokens, no 2-gram may repeat.
beam_output = model.generate(
    input_ids_business,
    num_beams=4,
    early_stopping=True,
    no_repeat_ngram_size=2,
    max_length=50,
)
# Decode the first returned sequence back to text.
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
# World prompt, top-p sampling (top_p=0.90, top-k filtering disabled).
# Sampling is random, so seed PyTorch first to make the output reproducible.
import torch

torch.manual_seed(42)
sample_output = model.generate(
    input_ids_world, 
    do_sample=True, 
    max_length=50, 
    top_p=0.90, 
    top_k=0
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# even though the top_p = 0.95, this seems quite ad-hoc
# it seems that in general it doesn't perform that well on this snippet no matter the method chosen

In [None]:
# Sci/tech prompt, top-p sampling with top_p=0.95 (top-k filtering disabled).
# Sampling is random, so seed PyTorch first: differences between this cell
# and the other top_p settings then come from the setting, not from chance.
import torch

torch.manual_seed(42)
sample_output = model.generate(
    input_ids_tech, 
    do_sample=True, 
    max_length=50, 
    top_p=0.95, 
    top_k=0
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# Sci/tech prompt, top-p sampling with top_p=0.99 (top-k filtering disabled).
# Sampling is random, so seed PyTorch first: differences between this cell
# and the other top_p settings then come from the setting, not from chance.
import torch

torch.manual_seed(42)
sample_output = model.generate(
    input_ids_tech, 
    do_sample=True, 
    max_length=50, 
    top_p=0.99, 
    top_k=0
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# Sci/tech prompt, beam search: 6 beams, at most 50 tokens, no 2-gram may repeat.
beam_output = model.generate(
    input_ids_tech,
    num_beams=6,
    early_stopping=True,
    no_repeat_ngram_size=2,
    max_length=50,
)
# Decode the first returned sequence back to text.
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
# Sci/tech prompt, plain sampling: no top_p is passed and top-k filtering is
# disabled. Sampling is random, so seed PyTorch first to make the output
# reproducible.
import torch

torch.manual_seed(42)
sample_output = model.generate(
    input_ids_tech, 
    do_sample=True, 
    max_length=50, 
    top_k=0
)

print(tokenizer.decode(sample_output[0], skip_special_tokens=True))