In [1]:
import torch
import gc
import math
import pandas as pd
from datasets import Dataset
from torch.utils.data import random_split, DataLoader
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM

In [None]:
import os
os.getcwd()

In [2]:
torch.cuda.empty_cache()
gc.collect()

16

In [3]:
model_checkpoint = "distilgpt2"

In [None]:
model_checkpoint = "D:/_Coding/Python/AI/Text Generators/AI Text and Code Generation with GPT Neo and Python/Transformers/gpt neo 125M"

In [4]:

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).cuda()
tokenizer.pad_token = tokenizer.eos_token

In [5]:
descriptions = pd.read_csv("descriptions_2.csv")
description_list = list(descriptions["Description"])
print(description_list[1])
print(tokenizer.encode(description_list[1]))


MIME buffer overflow in email clients, e.g. Solaris mailtool and Outlook.
[44, 12789, 11876, 30343, 287, 3053, 7534, 11, 304, 13, 70, 13, 12347, 271, 6920, 25981, 290, 30096, 13]


In [None]:
max_length = max([len(tokenizer.encode(description)) for description in description_list])
print(max_length)

In [None]:
sum_length = sum([len(tokenizer.encode(description)) for description in description_list])
average = sum_length/len(description_list)
print(average)

In [6]:
shorter_descriptions = []
for description in description_list:
    if len(tokenizer.encode(description)) < 100:
        shorter_descriptions.append(description)

Token indices sequence length is longer than the specified maximum sequence length for this model (1048 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
print(len(shorter_descriptions))
print(len(description_list))

161453
187938


In [None]:
print(tokenizer.encode(description_list[0]))

In [8]:
class DescriptionDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [9]:
dataset = DescriptionDataset(shorter_descriptions, tokenizer, max_length=100)

In [10]:
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
print(len(dataset))
print(len(train_dataset))
print(len(val_dataset))

In [None]:
DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
           batch_sampler=None, num_workers=0, collate_fn=None,
           pin_memory=False, drop_last=False, timeout=0,
           worker_init_fn=None)

In [None]:
print(list(torch.utils.data.DataLoader(dataset)))

In [None]:
print(list(torch.utils.data.DataLoader(dataset[0][0])))
print(len(list(torch.utils.data.DataLoader(dataset[0][0]))))

In [None]:
print(type(train_dataset[0][0]))

In [None]:
# block_size = tokenizer.model_max_length
block_size = 128

In [None]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
def group_dataset(examples):
    concatenated_examples = examples[0][0]
    print(len(examples))
    print(examples[0][0])
    for i in range(1, len(examples)-1):
        # if i == 0:
        #     continue
        # else:
        concatenated_examples = torch.cat((concatenated_examples, examples[i][0]), 0)
    return concatenated_examples


In [None]:
concatinated_train_dataset = group_dataset(train_dataset)
print(concatinated_train_dataset)
print(len(concatinated_train_dataset))

In [None]:
torch.save(concatinated_train_dataset, 'concatinated_train_dataset.pt')

In [None]:
concatinated_train_dataset = torch.load('concatinated_train_dataset.pt')

In [None]:
# block_size = tokenizer.model_max_length
block_size = 128
dataset_length = len(concatinated_train_dataset)
print(dataset_length)
total_length = (dataset_length // block_size) * block_size
print(total_length)

In [None]:
split_train_dataset = torch.split(concatinated_train_dataset, 128)
print(split_train_dataset)

In [None]:
print(len(split_train_dataset[-1]))
print(len(split_train_dataset[-2]))
print(len(split_train_dataset[0]))

In [11]:
model_name = "distilgpt2"

training_args = TrainingArguments(
    f"{model_name}_finetuned_hacks",
    num_train_epochs=10,
    evaluation_strategy = "epoch",
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

In [12]:
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})

d:\_Coding\Python\AI\Text Generators\AI Text and Code Generation with GPT Neo and Python\distilgpt2_finetuned_hacks is already a clone of https://huggingface.co/ChronicTronic/distilgpt2_finetuned_hacks. Make sure you pull the latest changes with `repo.git_pull()`.


In [13]:
trainer.train()

***** Running training *****
  Num examples = 145307
  Num Epochs = 10
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 484360
  Number of trainable parameters = 81912576


  0%|          | 0/484360 [00:00<?, ?it/s]

Saving model checkpoint to distilgpt2_finetuned_hacks\checkpoint-500
Configuration saved in distilgpt2_finetuned_hacks\checkpoint-500\config.json


{'loss': 2.0169, 'learning_rate': 1.9979354199355853e-05, 'epoch': 0.01}


Model weights saved in distilgpt2_finetuned_hacks\checkpoint-500\pytorch_model.bin
Saving model checkpoint to distilgpt2_finetuned_hacks\checkpoint-1000


{'loss': 1.8083, 'learning_rate': 1.9958708398711704e-05, 'epoch': 0.02}


Configuration saved in distilgpt2_finetuned_hacks\checkpoint-1000\config.json
Model weights saved in distilgpt2_finetuned_hacks\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to distilgpt2_finetuned_hacks\checkpoint-1500
Configuration saved in distilgpt2_finetuned_hacks\checkpoint-1500\config.json


{'loss': 1.6984, 'learning_rate': 1.9938062598067556e-05, 'epoch': 0.03}


Model weights saved in distilgpt2_finetuned_hacks\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to distilgpt2_finetuned_hacks\checkpoint-2000
Configuration saved in distilgpt2_finetuned_hacks\checkpoint-2000\config.json


{'loss': 1.678, 'learning_rate': 1.9917416797423407e-05, 'epoch': 0.04}


Model weights saved in distilgpt2_finetuned_hacks\checkpoint-2000\pytorch_model.bin


NotADirectoryError: [WinError 267] The directory name is invalid: 'C:\\Users\\anali\\AppData\\Local\\Temp\\tmpmf8y_pwb\\lfs_progress'

In [None]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
trainer.push_to_hub()