In [3]:
import torch
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from datasets import Dataset
from tqdm import tqdm
import pandas as pd
import os
import pdb

# Switch off the Weights & Biases integration before any Trainer is created.
os.environ["WANDB_DISABLED"] = "true"

# Load the ROCStories CSV and keep only its first 20 rows for a quick run.
df = pd.read_csv(
    '/h/brandon/internship/Uncover_implicit_bias/gpt2_training/data/ROCStories_winter2017.csv'
).head(20)

# Define the function to concatenate the story title with the sentences
def concatenate_story_elements(row):
    """Build a single training string from one story row.

    Args:
        row: A label-indexable record (e.g. a pandas Series) holding the
            ``storytitle`` field and ``sentence1`` .. ``sentence5``.

    Returns:
        The title, the literal separator `` [SEP] ``, and the five
        sentences joined by single spaces.
    """
    story_title = row['storytitle']
    sentence_columns = [f'sentence{number}' for number in range(1, 6)]
    story_body = ' '.join(row[column] for column in sentence_columns)
    return f'{story_title} [SEP] {story_body}'

# Build one "<title> [SEP] <sentences>" string per row and store it in a new column
df['story'] = df.apply(concatenate_story_elements, axis=1)

# Pull the stories out as a plain Python list
stories_data = df['story'].tolist()

# Wrap the list in a single-column mapping, the layout Dataset.from_dict takes
stories_dict = {'story': stories_data}

# Create the Hugging Face Dataset with one "story" column
dataset = Dataset.from_dict(stories_dict)


# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Register [SEP] (the marker concatenate_story_elements places between the
# title and the sentences) as a special token, then report the id it received.
tokenizer.add_special_tokens({'sep_token': '[SEP]'})
token_id = tokenizer.convert_tokens_to_ids('[SEP]')
print(f"The id for [SEP] is: {token_id}")

# Round-trip the id back to its token as a sanity check. The id that was just
# looked up is reused instead of a hard-coded number, so the check stays valid
# if the base vocabulary size ever differs.
token = tokenizer.convert_ids_to_tokens(token_id)
print(f"The token for id {token_id} is: {token}")

# Register a padding token so batches of unequal length can be padded.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Grow the model's embedding matrix to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))


# Define a function to apply to each example
def preprocess_function(example):
    """Tokenize the ``story`` text of one dataset example.

    Args:
        example: Mapping with a ``"story"`` entry holding the text.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    story_text = example["story"]
    return tokenizer(story_text)

# Run preprocess_function over the dataset to add the tokenizer outputs.
# NOTE(review): the original "story" text column is kept alongside them; the
# run log shows the Trainer ignoring it — confirm that is intended.
processed_dataset = dataset.map(preprocess_function)

def check_token_ids(dataset, tokenizer):
    """Print each sample and flag any whose token ids exceed the vocabulary.

    A sample is flagged when its largest ``input_ids`` entry is greater
    than or equal to ``len(tokenizer)``.

    Args:
        dataset: Iterable of mappings, each with an ``"input_ids"``
            sequence of integers.
        tokenizer: Sized object; ``len(tokenizer)`` is used as the
            exclusive upper bound for valid token ids.

    Returns:
        None. Results are reported on standard output.
    """
    vocab_size = len(tokenizer)
    # Iterate the dataset that was passed in, not a module-level global.
    for sample in dataset:
        print(sample)
        input_ids = sample["input_ids"]
        # An empty sequence has no maximum; skip it rather than let max() raise.
        if not input_ids:
            continue
        largest_id = max(input_ids)
        if largest_id >= vocab_size:
            print(f"Invalid token ID found: {largest_id}")



# Function to decode the token IDs
def decode_sample(token_ids):
    """Convert a sequence of token ids back into text.

    Special tokens are skipped and tokenization spaces are cleaned up,
    using the module-level ``tokenizer``.
    """
    decode_options = {
        "clean_up_tokenization_spaces": True,
        "skip_special_tokens": True,
    }
    return tokenizer.decode(token_ids, **decode_options)

# Decode and print the first five tokenized samples as a quick sanity check
print("Some samples from the training set:\n")
for sample_number in range(1, 6):
    # sample_number is 1-based for display; the dataset index is 0-based
    token_ids = processed_dataset[sample_number - 1]['input_ids']
    text = decode_sample(token_ids)
    print(f"Sample {sample_number}:\n{text}\n")


## Train

# Collator that batches the tokenized samples for the Trainer.
# mlm=False turns off masked-language-model masking (causal LM objective).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False 
)

# Define training arguments
# NOTE(review): eval_steps and per_device_eval_batch_size are set, but no
# eval dataset is handed to the Trainer below, so they presumably have no
# effect in this run — confirm.
# NOTE(review): warmup_steps=500 is far larger than the 5 total optimization
# steps the run log reports for 20 examples, so the learning rate presumably
# never leaves warm-up — confirm this is intended for the smoke test.
training_args = TrainingArguments(
    output_dir='./results',         # Output directory
    overwrite_output_dir=True,      # Overwrite the content of the output directory
    num_train_epochs=1,             # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,   # Batch size for evaluation
    eval_steps=400,                 # Evaluation step
    save_steps=800,                 # After # steps model is saved
    warmup_steps=500,               # Warmup steps
    logging_dir='./logs'
)

# Initialize the Trainer with the model, arguments, collator and training data
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed_dataset,
)


# Train the model
trainer.train()





loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /h/brandon/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /h/brandon/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /h/brandon/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca

The id for [SEP] is: 50257
The token for id 50257 is: [SEP]


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: story. If story are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5


Some samples from the training set:

Sample 1:
David Drops the WeightDavid noticed he had put on a lot of weight recently. He examined his habits to try and figure out the reason. He realized he'd been eating too much fast food lately. He stopped going to burger places and started a vegetarian diet. After a few weeks, he started to feel much better.

Sample 2:
FrustrationTom had a very short temper. One day a guest made him very angry. He punched a hole in the wall of his house. Tom's guest became afraid and left quickly. Tom sat on his couch filled with regret about his actions.

Sample 3:
Marcus Buys KhakisMarcus needed clothing for a business casual event. All of his clothes were either too formal or too casual. He decided to buy a pair of khakis. The pair he bought fit him perfectly. Marcus was happy to have the right clothes for the event.

Sample 4:
Different OpinionsBobby thought Bill should buy a trailer and haul it with his car. Bill thought a truck would be better for what he

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=99.2841552734375, metrics={'train_runtime': 0.3435, 'train_samples_per_second': 58.223, 'train_steps_per_second': 14.556, 'total_flos': 675684864000.0, 'train_loss': 99.2841552734375, 'epoch': 1.0})