In [1]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx


from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# functions to read text from PDF and Word files

def read_pdf(file_path):
    """Return the text of every page of a PDF as a single string.

    The file at ``file_path`` is opened in binary mode and handed to
    ``PdfReader``. The ``extract_text()`` result of each page is concatenated
    in page order, with no separator inserted between pages.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Pages are extracted while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the text of a Word document, one paragraph per line.

    The document is opened with ``docx.Document``; each paragraph's ``text``
    is emitted followed by a newline character, in document order.
    """
    document = docx.Document(file_path)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)


In [3]:
# Extract the raw text of the source PDF.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the PDF before re-running the notebook.
raw_text = read_pdf(r"C:\Users\Omen\OneDrive\Desktop\Linear Regression.pdf")

data = re.sub(r'\n+', '\n', raw_text).strip() # remove excess newline characters

# Write the training corpus as explicit UTF-8. Without `encoding=`, open() uses
# the platform default codec (e.g. cp1252 on Windows), which can raise
# UnicodeEncodeError on non-ASCII characters extracted from the PDF or produce
# a file that cannot be read back as UTF-8 by the dataset loader.
with open(r"data.txt", "w", encoding="utf-8") as f:
    f.write(data)

In [4]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over a plain-text file.

    Args:
        file_path: Forwarded to ``TextDataset`` as ``file_path``.
        tokenizer: Forwarded to ``TextDataset`` as ``tokenizer``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

In [5]:
def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Forwarded to the collator as ``tokenizer``.
        mlm: Forwarded to the collator as ``mlm`` (defaults to ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [6]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 language model on a plain-text file and save it.

    Args:
        train_file_path: Path of the text file used to build the training dataset.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the initial model
            weights, and the trainer's outputs.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Checkpoint interval forwarded to ``TrainingArguments``.

    Returns:
        None. The tokenizer and model are written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer=tokenizer)
    data_collator = load_data_collator(tokenizer=tokenizer)

    # Store the tokenizer next to the model so output_dir can be loaded on its own.
    tokenizer.save_pretrained(output_dir)

    # Written before training starts, so output_dir initially holds the
    # pretrained weights; trainer.save_model() below saves the model again
    # after training.
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded, so the caller's
        # checkpoint interval was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [18]:
# Training configuration, read by the train(...) call in the next cell.

# What to train and on which text file.
model_name = "gpt2"
training_file_path = "data.txt"

# Where the tokenizer and model are written.
output_dir = "./saved_model"
overwrite_output_dir = False

# Training schedule.
num_train_epochs = 100
per_device_batch_size = 8
save_steps = 5_000

In [19]:
# Run training with the configuration values defined in the previous cell.
train(
    model_name=model_name,
    train_file_path=training_file_path,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_batch_size,
    save_steps=save_steps,
)

100%|██████████| 100/100 [00:20<00:00,  4.86it/s]


{'train_runtime': 20.5972, 'train_samples_per_second': 29.13, 'train_steps_per_second': 4.855, 'train_loss': 0.632500343322754, 'epoch': 100.0}


In [20]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` via ``from_pretrained(model_path)``."""
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` via ``from_pretrained(tokenizer_path)``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Generate a sampled continuation of ``sequence`` and print it.

    Args:
        model_path: Directory passed to both ``load_model`` and
            ``load_tokenizer``.
        sequence: Prompt text; it is encoded with the loaded tokenizer.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        None. The decoded first generated sequence is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    prompt_ids = tokenizer.encode(f'{sequence}', return_tensors='pt')

    # Sampling configuration handed to generate(); the model's EOS token id
    # is used as the padding token id.
    sampling_options = {
        "do_sample": True,
        "max_length": max_length,
        "pad_token_id": model.config.eos_token_id,
        "top_k": 50,
        "top_p": 0.95,
    }
    generated = model.generate(prompt_ids, **sampling_options)

    # Only the first returned sequence is decoded and shown.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(decoded)

In [21]:
# Generate and print one sampled continuation for a single prompt.
# NOTE(review): absolute, machine-specific path; presumably the same directory
# as the "./saved_model" output_dir used for training. Confirm before re-running.
model_path = r"D:\vs code\python\DeepLearning\LLMs\GPT-2\saved_model"
sequence1 = "[Q] What is Linear Regression?"
max_len = 50

generate_text(model_path=model_path, sequence=sequence1, max_length=max_len)

[Q] What is Linear Regression? Linear regression is a machine learning library for machine learning that makes predictions using sparse training sets. Linear regression is a machine learning library for linear regression that learns by fitting the data with gradient descent. Linear regression is
