In [77]:
import os
import re
def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text
def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text

In [78]:
train_directory = 'TrainingData'
text_data = read_documents_from_directory(train_directory)
text_data = re.sub(r'\n+', '\n', text_data).strip()

In [79]:
with open("DataSource/final_train.txt", "w") as f:
    f.write(text_data)

In [80]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [81]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [82]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

In [83]:
import torch
def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          stop_token="###",
          device="cuda" if torch.cuda.is_available() else "cpu"):

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'additional_special_tokens': [stop_token]})
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name).to(torch.device(device))

    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    trainer.save_model()

In [84]:
train_file_path = "TrainingData/final_train.txt"
model_name = 'gpt2'
output_dir = 'nmodels'
overwrite_output_dir = True
per_device_train_batch_size = 4
num_train_epochs = 1000
save_steps = 5000

In [85]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.0742
1000,0.0056


Checkpoint destination directory nmodels/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory nmodels/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [86]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [101]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model

def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer
def generate_text(model_path, sequence, max_length,stop_token="<Stop>"):

    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,

        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.encode(stop_token)[0] if stop_token else None,
        pad_token_id=model.config.eos_token_id,
        temperature=0.1,
    )
    data_gen=tokenizer.decode(final_outputs[0], skip_special_tokens=True)
    str1=""
    for x in data_gen.split("..."):
      # if x=="...":
      #   break
      # elif str1=="":
      #   str1=x
      # else:
      #   str1=str1+" "+x
        print(x)
        break

In [105]:
model_path = "nmodels"
sequence = "[Q] Why does Santa have three gardens?"
max_len = 50
generate_text(model_path, sequence, max_len)

[Q] Why does Santa have three gardens?
[A] He has a lot of water in his belly.
[Q] What's the difference between a corn husker with epilepsy and a hooker with dysentery?

