In [1]:
!pip install transformers -q
!pip install wandb -q

[0m

In [2]:
# Optional explicit Weights & Biases login / run setup. Every line below is
# commented out, so executing this cell currently does nothing.
#import wandb
#wandb.login()
#wandb.init(project="gpt-2-fine-tuning", entity="bilalcelebi23")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbilalcelebi23[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import pandas as pd

# Location of the ideas CSV inside the Kaggle input directory.
data_path = '/kaggle/input/business-ideas-generated-with-gpt3/ideas.csv'

# Read the file and label its only column 'content' (the assignment raises if
# the CSV does not have exactly one column).
ideas_frame = pd.read_csv(data_path)
ideas_frame.columns = ['content']

# Keep only the distinct entries; `data` is the resulting array of values.
data = ideas_frame['content'].unique()
len(data)

31536

In [4]:
# Ordered 80/20 split: the first 80% of the entries are used for training and
# the remainder for evaluation (no shuffling is applied here).
train_size = int(len(data) * 0.8)
train_data, test_data = data[:train_size], data[train_size:]
(len(train_data), len(test_data))

(25228, 6308)

In [5]:
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset

# Only the tokenizer is loaded at this point. The GPT-2 model itself is
# instantiated in the following cell, once the special tokens are registered
# and the matching config is built; loading it here as well would only
# download and allocate a copy that is replaced before it is ever used.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [6]:
# Marker strings used to delimit and pad the training samples.
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

# Register the markers with the tokenizer; the return value is the number of
# tokens that were newly added to its vocabulary.
special_tokens_dict = dict(eos_token=eos, bos_token=bos, pad_token=pad)
num_add_tokens = tokenizer.add_special_tokens(special_tokens_dict)

# Build a GPT-2 config whose special-token ids mirror the tokenizer's.
token_id_overrides = {
    'bos_token_id': tokenizer.bos_token_id,
    'eos_token_id': tokenizer.eos_token_id,
    'pad_token_id': tokenizer.pad_token_id,
}
config = AutoConfig.from_pretrained('gpt2',
                                    output_hidden_states=False,
                                    **token_id_overrides)

# Load the pretrained weights with that config, then resize the embedding
# matrix so it has one row per tokenizer entry (including the new tokens).
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [7]:
def prepare_data(data, bos_token=None, eos_token=None):
    """Wrap every entry of ``data`` in begin/end-of-sequence markers.

    Parameters
    ----------
    data : iterable
        Items to wrap; each one is converted with ``str()``.
    bos_token, eos_token : str, optional
        Markers placed before / after each item. When omitted, the
        notebook-level ``bos`` / ``eos`` variables are used, which matches
        the behaviour of the original single-argument version.

    Returns
    -------
    list of str
        One ``"<bos> <text> <eos>"`` string per input item, in input order.
    """
    # Resolve defaults lazily: the notebook globals are only touched when the
    # caller did not pass explicit markers.
    start = bos if bos_token is None else bos_token
    end = eos if eos_token is None else eos_token

    return [' '.join((start, str(item), end)) for item in data]

In [8]:
# Add the sequence markers to both splits. NOTE: the results are stored under
# the same names as the inputs, so running this cell a second time would wrap
# the already-wrapped strings again.
train_data, test_data = (prepare_data(split) for split in (train_data, test_data))

# Show the first test sample and the first training sample as a sanity check.
test_data[0], train_data[0]

('<|endoftext|> A startup that wants to help people make money off their pets. It’s in early stages, but has already raised $145,000 in seed funding from AngelList. <|EOS|>',
 '<|endoftext|> A 3D printing platform that helps business owners create 3D-printed products <|EOS|>')

In [9]:
# Put each list of prepared strings into a one-column DataFrame named 'content'.
train_data = pd.DataFrame({'content': train_data})
test_data = pd.DataFrame({'content': test_data})

In [10]:
# Convert both frames to Hugging Face `Dataset` objects, selecting only the
# 'content' column, and display the resulting datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame[['content']]) for frame in (train_data, test_data)
)
train_dataset, test_dataset

(Dataset({
     features: ['content'],
     num_rows: 25228
 }),
 Dataset({
     features: ['content'],
     num_rows: 6308
 }))

In [11]:
def tokenize_func(example):
    """Tokenize the 'content' field of a batch of examples.

    Uses the notebook-level ``tokenizer``. The texts are padded to the longest
    sequence of the batch passed in, and truncated to the tokenizer's maximum
    length so that an unusually long entry cannot exceed the number of
    positions the model supports.

    Parameters
    ----------
    example : mapping
        Batch with a 'content' entry holding the texts to tokenize.

    Returns
    -------
    The tokenizer's output for the batch.
    """
    return tokenizer(example['content'], padding = True, truncation = True)

# Both splits are tokenized with identical settings: batched calls, five worker
# processes, and the raw 'content' column removed from the result.
map_options = dict(batched = True, num_proc = 5, remove_columns = ['content'])

tokenized_train_dataset = train_dataset.map(tokenize_func, **map_options)

tokenized_test_dataset = test_dataset.map(tokenize_func, **map_options)

       

#0:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/6 [00:00<?, ?ba/s]

#4:   0%|          | 0/6 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

   

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#4:   0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:
# Directory used for the Trainer output, the logs and the final saved model.
model_save_path = '/kaggle/working/fine_tuned_model'

# Training hyper-parameters. `report_to` is deliberately not passed (the
# explicit 'wandb' value stays disabled), so the library default applies.
training_options = dict(
    output_dir=model_save_path,
    logging_dir=model_save_path,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    weight_decay=0.01,
    save_steps=10000,
    prediction_loss_only=True,
)
training_args = TrainingArguments(**training_options)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, arguments, collator and both tokenized splits into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

In [13]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as this cell's result.
trainer.train()

***** Running training *****
  Num examples = 25228
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 7890
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,8.5865
1000,2.4868
1500,2.3908
2000,2.292
2500,2.2549
3000,2.1957
3500,2.1466
4000,2.1237
4500,2.07
5000,2.0538




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7890, training_loss=2.5375726465338837, metrics={'train_runtime': 10377.84, 'train_samples_per_second': 24.309, 'train_steps_per_second': 0.76, 'total_flos': 3.1874767981056e+16, 'train_loss': 2.5375726465338837, 'epoch': 10.0})

In [14]:
# Persist the fine-tuned model (weights + config) to the Trainer's output
# directory, then store the tokenizer files in the same directory so the
# model and the tokenizer with its added special tokens can be reloaded together.
trainer.save_model()
tokenizer.save_pretrained(model_save_path)

Saving model checkpoint to /kaggle/working/fine_tuned_model
Configuration saved in /kaggle/working/fine_tuned_model/config.json
Model weights saved in /kaggle/working/fine_tuned_model/pytorch_model.bin
tokenizer config file saved in /kaggle/working/fine_tuned_model/tokenizer_config.json
Special tokens file saved in /kaggle/working/fine_tuned_model/special_tokens_map.json
added tokens file saved in /kaggle/working/fine_tuned_model/added_tokens.json


('/kaggle/working/fine_tuned_model/tokenizer_config.json',
 '/kaggle/working/fine_tuned_model/special_tokens_map.json',
 '/kaggle/working/fine_tuned_model/vocab.json',
 '/kaggle/working/fine_tuned_model/merges.txt',
 '/kaggle/working/fine_tuned_model/added_tokens.json')

In [15]:
# Evaluate on the dataset given to the Trainer as eval_dataset; the metrics
# dictionary is displayed as this cell's result.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 6308
  Batch size = 16


{'eval_loss': 2.634995460510254,
 'eval_runtime': 53.5724,
 'eval_samples_per_second': 117.747,
 'eval_steps_per_second': 7.373,
 'epoch': 10.0}

In [56]:
# Reload the fine-tuned model and its tokenizer from disk so that generation
# uses exactly what was saved.
my_model = GPT2LMHeadModel.from_pretrained(model_save_path)
my_tokenizer = GPT2Tokenizer.from_pretrained(model_save_path)

# Prompt the model with nothing but the beginning-of-sequence token.
input_text = my_tokenizer.bos_token
input_ids = my_tokenizer.encode(input_text, return_tensors = 'pt')
output = my_model.generate(input_ids, min_length = 20)

# Decode with the tokenizer that was loaded alongside the model. The previous
# version decoded with the training-time `tokenizer`, which only exists while
# the training cells' state is still in the kernel.
output = my_tokenizer.decode(output[0], skip_special_tokens = True)

# Print only the text up to the first period.
real_output = str(output).split('.')
print(real_output[0] + '.')

Error in callback <function _WandbInit._resume_backend at 0x7fc35daa3f80> (for pre_run_cell):


Exception: The wandb backend process has shutdown

loading configuration file /kaggle/working/gpt2-business-ideas/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50257,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50258,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache":

 A startup that helps people find the best deals on flights, hotels, and car rentals.
Error in callback <function _WandbInit._pause_backend at 0x7fc35daa3ef0> (for post_run_cell):


Exception: The wandb backend process has shutdown