<a href="https://colab.research.google.com/github/ashishmohapatra240/textbase/blob/main/fine_tune_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.4 MB/s[0m eta [36m0:00:0

In [2]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/244.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [3]:
import warnings

warnings.filterwarnings("ignore")


In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer

In [5]:
import pandas as pd
import torch

In [6]:
dataset_path = '/content/improved_conversations_1000.csv'
conversational_df = pd.read_csv(dataset_path)

In [10]:
conversational_df.head()

Unnamed: 0,Question,Answer
0,A 20-year-old male with LOW blood pressure and...,"Based on what you've told me, I'd advocate for..."
1,A 20-year-old male with LOW blood pressure and...,"Given the provided health metrics, my recommen..."
2,A 20-year-old male with NORMAL blood pressure ...,"From the data at hand, the right medication se..."
3,A 20-year-old male with NORMAL blood pressure ...,"After analyzing the details, I'd suggest going..."
4,A 20-year-old male with HIGH blood pressure an...,"After analyzing the details, I'd suggest going..."


In [11]:
conversational_df['combined'] = conversational_df['Question'] + "\n" + conversational_df['Answer'] + "\n"
conversational_df['combined'].to_csv('conversational_data.txt', index=False, header=False)

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [13]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="conversational_data.txt",
    block_size=128
)

In [14]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [15]:
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [16]:
torch.cuda.empty_cache()

In [17]:
training_args = TrainingArguments(
    output_dir="./results_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir="./logs",
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [19]:
trainer.train

<bound method Trainer.train of <transformers.trainer.Trainer object at 0x7986ae5fd540>>

In [20]:
model.save_pretrained("./fine_tuned_gpt2/")

In [21]:
tokenizer.save_pretrained("./fine_tuned_gpt2/")

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')