## Importing Modules

In [11]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config, GPT2Tokenizer
from transformers import get_linear_schedule_with_warmup

from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

GPU is available!


In [3]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [4]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium']

In [5]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [6]:
configuration = GPT2Config.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path, config=configuration)

## Import Dataset

In [7]:
filenames = os.listdir(DATASET_PATH)
filenames

['Html.csv', 'Recipes.csv', 'Recipes_1000.csv']

In [8]:
file_path = DATASET_PATH + '\\' + filenames[0]
file_path

'D:\\Python\\LLM_Environment\\datasets\\Html.csv'

In [9]:
df = pd.read_csv(file_path)
df.shape

(6712, 2)

In [10]:
df.head()

Unnamed: 0,Bad_Practices,Good_Practices
0,<table alt=header>Title</table>,<table alt='header'>Title</table>
1,<tr>Content,<tr>Content</tr>
2,<h2 src='description'>Content,<h2 src='description'>Content</h2>
3,<table>Link,<table>Link</table>
4,<img src='description'>,<img src='description' alt=''>


In [12]:
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Load your dataset in a CSV file
dataset = load_dataset("csv", data_files=file_path)

# Split the dataset into training and validation sets
train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['Bad_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['Good_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

Generating train split: 6712 examples [00:00, 117370.64 examples/s]
Map: 100%|██████████| 6040/6040 [00:02<00:00, 2603.43 examples/s]
Map: 100%|██████████| 672/672 [00:00<00:00, 2401.37 examples/s]


In [16]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./model',
    overwrite_output_dir=True,
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./logs',
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)



In [17]:
# Fine-tune the model
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  7%|▋         | 100/1510 [00:16<03:55,  5.98it/s]

{'loss': 0.1854, 'grad_norm': 0.503948986530304, 'learning_rate': 4.668874172185431e-05, 'epoch': 0.03}



  7%|▋         | 101/1510 [00:32<1:49:45,  4.67s/it]

{'eval_loss': 0.023037223145365715, 'eval_runtime': 14.9854, 'eval_samples_per_second': 44.844, 'eval_steps_per_second': 5.605, 'epoch': 0.03}


 13%|█▎        | 200/1510 [00:49<03:41,  5.92it/s]  

{'loss': 0.0255, 'grad_norm': 0.6472672820091248, 'learning_rate': 4.337748344370861e-05, 'epoch': 0.07}


                                                  
 13%|█▎        | 201/1510 [01:04<1:43:13,  4.73s/it]

{'eval_loss': 0.019099557772278786, 'eval_runtime': 15.167, 'eval_samples_per_second': 44.307, 'eval_steps_per_second': 5.538, 'epoch': 0.07}


 20%|█▉        | 300/1510 [01:21<03:31,  5.73it/s]  

{'loss': 0.0215, 'grad_norm': 0.45021557807922363, 'learning_rate': 4.006622516556292e-05, 'epoch': 0.1}


                                                  
 20%|█▉        | 301/1510 [01:37<1:36:41,  4.80s/it]

{'eval_loss': 0.018794288858771324, 'eval_runtime': 15.2771, 'eval_samples_per_second': 43.987, 'eval_steps_per_second': 5.498, 'epoch': 0.1}


 26%|██▋       | 400/1510 [01:54<03:14,  5.72it/s]  

{'loss': 0.0217, 'grad_norm': 0.5069019794464111, 'learning_rate': 3.675496688741722e-05, 'epoch': 0.13}


                                                  
 27%|██▋       | 401/1510 [02:10<1:29:02,  4.82s/it]

{'eval_loss': 0.01784611865878105, 'eval_runtime': 15.425, 'eval_samples_per_second': 43.566, 'eval_steps_per_second': 5.446, 'epoch': 0.13}


 33%|███▎      | 500/1510 [02:27<02:55,  5.74it/s]  

{'loss': 0.0202, 'grad_norm': 0.5524856448173523, 'learning_rate': 3.3443708609271526e-05, 'epoch': 0.17}


                                                  
 33%|███▎      | 500/1510 [02:43<02:55,  5.74it/s]

{'eval_loss': 0.017505528405308723, 'eval_runtime': 15.7003, 'eval_samples_per_second': 42.802, 'eval_steps_per_second': 5.35, 'epoch': 0.17}


 40%|███▉      | 600/1510 [03:03<02:37,  5.78it/s]  

{'loss': 0.0197, 'grad_norm': 0.5654252171516418, 'learning_rate': 3.0132450331125826e-05, 'epoch': 0.2}


                                                  
 40%|███▉      | 601/1510 [03:19<1:13:00,  4.82s/it]

{'eval_loss': 0.017614027485251427, 'eval_runtime': 15.4547, 'eval_samples_per_second': 43.482, 'eval_steps_per_second': 5.435, 'epoch': 0.2}


 46%|████▋     | 700/1510 [03:36<02:23,  5.65it/s]  

{'loss': 0.0193, 'grad_norm': 0.5610722303390503, 'learning_rate': 2.6821192052980134e-05, 'epoch': 0.23}


                                                  
 46%|████▋     | 701/1510 [03:52<1:05:58,  4.89s/it]

{'eval_loss': 0.017050961032509804, 'eval_runtime': 15.6888, 'eval_samples_per_second': 42.833, 'eval_steps_per_second': 5.354, 'epoch': 0.23}


 53%|█████▎    | 800/1510 [04:10<02:07,  5.59it/s]  

{'loss': 0.0196, 'grad_norm': 0.5970497727394104, 'learning_rate': 2.3509933774834437e-05, 'epoch': 0.26}


                                                  
 53%|█████▎    | 801/1510 [04:26<58:15,  4.93s/it]

{'eval_loss': 0.01738790050148964, 'eval_runtime': 15.7897, 'eval_samples_per_second': 42.559, 'eval_steps_per_second': 5.32, 'epoch': 0.26}


 60%|█████▉    | 900/1510 [04:43<01:48,  5.61it/s]

{'loss': 0.0191, 'grad_norm': 0.45496657490730286, 'learning_rate': 2.0198675496688745e-05, 'epoch': 0.3}


                                                  
 60%|█████▉    | 901/1510 [04:59<50:31,  4.98s/it]

{'eval_loss': 0.01707989163696766, 'eval_runtime': 15.9732, 'eval_samples_per_second': 42.07, 'eval_steps_per_second': 5.259, 'epoch': 0.3}


 66%|██████▌   | 1000/1510 [05:17<01:31,  5.55it/s]

{'loss': 0.0191, 'grad_norm': 0.5989903807640076, 'learning_rate': 1.688741721854305e-05, 'epoch': 0.33}


                                                   
 66%|██████▌   | 1000/1510 [05:34<01:31,  5.55it/s]

{'eval_loss': 0.017367595806717873, 'eval_runtime': 16.582, 'eval_samples_per_second': 40.526, 'eval_steps_per_second': 5.066, 'epoch': 0.33}


 73%|███████▎  | 1100/1510 [05:55<01:18,  5.24it/s]

{'loss': 0.018, 'grad_norm': 0.45427507162094116, 'learning_rate': 1.3576158940397351e-05, 'epoch': 0.36}


                                                   
 73%|███████▎  | 1101/1510 [06:12<36:44,  5.39s/it]

{'eval_loss': 0.01676344871520996, 'eval_runtime': 17.2853, 'eval_samples_per_second': 38.877, 'eval_steps_per_second': 4.86, 'epoch': 0.36}


 79%|███████▉  | 1200/1510 [06:31<00:55,  5.54it/s]

{'loss': 0.0182, 'grad_norm': 0.471922367811203, 'learning_rate': 1.0264900662251655e-05, 'epoch': 0.4}


                                                   
 80%|███████▉  | 1201/1510 [06:48<26:31,  5.15s/it]

{'eval_loss': 0.016578366979956627, 'eval_runtime': 16.5362, 'eval_samples_per_second': 40.638, 'eval_steps_per_second': 5.08, 'epoch': 0.4}


 86%|████████▌ | 1300/1510 [07:06<00:38,  5.53it/s]

{'loss': 0.0179, 'grad_norm': 0.4868495464324951, 'learning_rate': 6.95364238410596e-06, 'epoch': 0.43}


                                                   
 86%|████████▌ | 1301/1510 [07:22<18:00,  5.17s/it]

{'eval_loss': 0.016473527997732162, 'eval_runtime': 16.593, 'eval_samples_per_second': 40.499, 'eval_steps_per_second': 5.062, 'epoch': 0.43}


 93%|█████████▎| 1400/1510 [07:41<00:19,  5.53it/s]

{'loss': 0.0181, 'grad_norm': 0.5775110125541687, 'learning_rate': 3.642384105960265e-06, 'epoch': 0.46}


                                                   
 93%|█████████▎| 1401/1510 [07:58<09:33,  5.26s/it]

{'eval_loss': 0.01639706827700138, 'eval_runtime': 16.6988, 'eval_samples_per_second': 40.242, 'eval_steps_per_second': 5.03, 'epoch': 0.46}


 99%|█████████▉| 1500/1510 [08:16<00:01,  5.36it/s]

{'loss': 0.0174, 'grad_norm': 0.6427941918373108, 'learning_rate': 3.3112582781456954e-07, 'epoch': 0.5}


                                                   
 99%|█████████▉| 1500/1510 [08:32<00:01,  5.36it/s]

{'eval_loss': 0.0163738876581192, 'eval_runtime': 16.3005, 'eval_samples_per_second': 41.226, 'eval_steps_per_second': 5.153, 'epoch': 0.5}


100%|██████████| 1510/1510 [08:39<00:00,  2.91it/s]

{'train_runtime': 519.3529, 'train_samples_per_second': 5.815, 'train_steps_per_second': 2.907, 'train_loss': 0.030614700903560943, 'epoch': 0.5}





TrainOutput(global_step=1510, training_loss=0.030614700903560943, metrics={'train_runtime': 519.3529, 'train_samples_per_second': 5.815, 'train_steps_per_second': 2.907, 'total_flos': 789101936640000.0, 'train_loss': 0.030614700903560943, 'epoch': 0.5})

In [18]:
#trainer.save_model()

In [19]:
# Inference Example
example_input = "Heading"
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Original HTML code: Heading
Corrected HTML code: Heading>List Item</htable>
