In [4]:
import torch
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from datasets import Dataset
from tqdm import tqdm
import pandas as pd
import os
import pdb

In [6]:
os.environ["WANDB_DISABLED"] = "true"

df = pd.read_csv('/h/brandon/internship/Uncover_implicit_bias/gpt2_training/data/ROCStories_winter2017.csv')
df = df.head(20)

In [7]:
df

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.
5,4d7b022e-25d2-4300-a9b0-24ab35f4045b,Melody's trip to the aquarium.,Melody's parents surprised her with a trip to ...,Melody took a nap during the two hour car ride...,"When they arrived, Melody was energetic and ex...","At the aquarium Melody saw sharks, tropical fi...","After five hours at the aquarium, Melody and h..."
6,8036c905-f23e-4976-83a1-85d679b5e0c2,Pop Quiz,The math teacher announced a pop quiz as class...,"While some students complained, he began passi...",I took out my pencil and began to work.,"About 5 minutes later, I finished.",I stood up feeling confident and turned it in.
7,77338898-07d4-4143-8451-284540c8b082,My first girlfriend,My first girlfriend i met on the internet.,She lives about 4 hours away from me.,Finally after 2 years we met each other.,She stayed with me for a week or two.,We decided we couldn't be apart so she moved i...
8,110fafd1-2bb7-4ffe-aac7-475706165d41,Charlie Horse,I got Charlie Horse when I was four years old.,"He's a brown stuffed horse, and at 35 I still ...","He was my best friend, and always laid at the ...","I laid him next to me, smelling his soft fur e...",I liked to listen to my radio as I fell asleep...
9,13573c2e-5eed-40eb-bbe5-ed259b5c76a6,Corn,Laura loved corn.,So she decided to grow some in her backyard.,The whole process of growing them made her ver...,But she realized that they required too much w...,So Laura quickly abandoned her corn garden idea.


In [8]:
# Define the function to concatenate the story title with the sentences
def concatenate_story_elements(row):
    title = row['storytitle']
    sentences = ' '.join(row[['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']].tolist())
    return f'{title} [SEP] {sentences}'

# Apply the function to each row
df['story'] = df.apply(concatenate_story_elements, axis=1)

stories_data = list(df['story'])


# Convert list of strings to a dictionary with a key
stories_dict = {'story': stories_data}

# Convert the dictionary to a Hugging Face Dataset
dataset = Dataset.from_dict(stories_dict)

In [9]:
dataset[0]

{'story': "David Drops the Weight [SEP] David noticed he had put on a lot of weight recently. He examined his habits to try and figure out the reason. He realized he'd been eating too much fast food lately. He stopped going to burger places and started a vegetarian diet. After a few weeks, he started to feel much better."}

In [10]:
# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /h/brandon/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /h/brandon/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /h/brandon/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca

In [12]:
tokenizer.add_special_tokens({'sep_token': '[SEP]'})
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

token_id = tokenizer.convert_tokens_to_ids('[SEP]')  # Replace <TITLE> with your token
print(f"The id for [SEP] is: {token_id}")

# Convert id to token
token_id = 50257
token = tokenizer.convert_ids_to_tokens(token_id)
print(f"The token for id {token_id} is: {token}")

# random init for added tokens
model.resize_token_embeddings(len(tokenizer))


# Define a function to apply to each example
def preprocess_function(example):
    # Modify examples here (e.g., tokenize text)
    return tokenizer(example["story"]) 

# Apply the function to the dataset
processed_dataset = dataset.map(preprocess_function)

def check_token_ids(dataset, tokenizer):
    # print('Checking token ids')
    for sample in processed_dataset:
        print(sample)
        input_ids = sample["input_ids"]
        if max(input_ids) >= len(tokenizer):
            print(f"Invalid token ID found: {max(input_ids)}")



# Function to decode the token IDs
def decode_sample(token_ids):
    return tokenizer.decode(token_ids, clean_up_tokenization_spaces=True, skip_special_tokens=False)

# Visualize the first few samples from train_dataset
print("Some samples from the training set:\n")
for i in range(5):  # let's print out the first 5 samples
    token_ids = processed_dataset[i]['input_ids']  # get the token IDs for the i-th sample
    text = decode_sample(token_ids)  # decode the token IDs to text
    print(f"Sample {i + 1}:\n{text}\n")





Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer


The id for [SEP] is: 50257
The token for id 50257 is: [SEP]


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some samples from the training set:

Sample 1:
David Drops the Weight [SEP] David noticed he had put on a lot of weight recently. He examined his habits to try and figure out the reason. He realized he'd been eating too much fast food lately. He stopped going to burger places and started a vegetarian diet. After a few weeks, he started to feel much better.

Sample 2:
Frustration [SEP] Tom had a very short temper. One day a guest made him very angry. He punched a hole in the wall of his house. Tom's guest became afraid and left quickly. Tom sat on his couch filled with regret about his actions.

Sample 3:
Marcus Buys Khakis [SEP] Marcus needed clothing for a business casual event. All of his clothes were either too formal or too casual. He decided to buy a pair of khakis. The pair he bought fit him perfectly. Marcus was happy to have the right clothes for the event.

Sample 4:
Different Opinions [SEP] Bobby thought Bill should buy a trailer and haul it with his car. Bill thought a truck

In [13]:
## Train

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False 
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',         # Output directory
    overwrite_output_dir=True,      # Overwrite the content of the output directory
    num_train_epochs=1,             # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,   # Batch size for evaluation
    eval_steps=400,                 # Evaluation step
    save_steps=800,                 # After # steps model is saved
    warmup_steps=500,               # Warmup steps
    logging_dir='./logs'
)

#Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: story. If story are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=99.2841552734375, metrics={'train_runtime': 0.3372, 'train_samples_per_second': 59.311, 'train_steps_per_second': 14.828, 'total_flos': 675684864000.0, 'train_loss': 99.2841552734375, 'epoch': 1.0})