### Import libraries

In [10]:
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset
from nltk.tokenize import sent_tokenize
import os

### Prepare the text data to train the model

In [11]:
def concatenate_text_files(folder_path):
    """Concatenate the contents of every ``.txt`` file directly inside a folder.

    Files are read as UTF-8 in sorted filename order so the result is
    reproducible from run to run (``os.listdir`` returns entries in arbitrary
    order). Sub-folders are not searched, and directory entries whose name
    happens to end in ``.txt`` are skipped instead of crashing ``open``.

    Args:
        folder_path: Path of the folder to scan.

    Returns:
        One string in which each file's content is followed by a newline;
        the empty string when the folder holds no ``.txt`` files.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    pieces = []

    # Sort so the concatenation order does not depend on the filesystem
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        # Ignore folders (or other non-regular entries) that merely look like .txt files
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            pieces.append(file.read() + "\n")  # Newline separates consecutive files

    # Join once at the end rather than growing a string inside the loop
    return "".join(pieces)

In [12]:
# Path to folder containing the .txt files
folder_path = '../data/contents/'

# Concatenate text from all .txt files in the folder
all_text = concatenate_text_files(folder_path)

# Save the concatenated text to a single file; the tokenization step further
# down reads it back from disk. It lives in a sub-folder, so it is not listed
# among the input files when this cell is run again.
output_file_path = '../data/contents/full-text/full-text.txt'
# Create the target folder first: open(..., 'w') raises FileNotFoundError
# when the parent directory does not exist (e.g. on a fresh checkout)
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write(all_text)

### Initialize the tokenizer and data collator

In [13]:
# Fetch the tokenizer that belongs to the pre-trained 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')

In [14]:
# Function to load and tokenize dataset
def load_and_tokenize_data(file_path, tokenizer, block_size=128, overwrite_cache=True):
    """Build a ``TextDataset`` from a plain-text file.

    ``TextDataset`` stores the tokenized features in a cache file next to
    ``file_path`` whose name is derived from the tokenizer class, the block
    size and the file name, not from the file's content. If the text file is
    regenerated with different content, a leftover cache would silently be
    reused, so the cache is rebuilt by default.

    Args:
        file_path: Path of the text file to tokenize.
        tokenizer: Tokenizer used to encode the text.
        block_size: Number of tokens per training example.
        overwrite_cache: When True (default), re-tokenize the file even if a
            cache file exists. Pass False to reuse an existing cache for a
            file known to be unchanged.

    Returns:
        The ``TextDataset`` built from ``file_path``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

In [15]:
# Tokenize the concatenated text file written earlier; reusing
# output_file_path keeps the path defined in one place instead of
# repeating the literal here
train_dataset = load_and_tokenize_data(output_file_path, tokenizer)



In [16]:
# Collator that assembles dataset examples into training batches;
# mlm=False disables masked-language-model masking (causal LM setting)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Model building and training

In [17]:
# This function trains the model with the dataset
def train_the_model(
    dataset,
    model_name='gpt2',
    output_dir='./results',
    save_dir='./fine_tuned_model/',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10000,
    save_total_limit=2,
):
    """Fine-tune a pre-trained GPT-2 language model and save the result.

    Relies on the module-level ``data_collator`` and ``tokenizer`` objects
    defined in earlier cells; those cells must have been run first.

    Args:
        dataset: Training dataset handed to the ``Trainer``.
        model_name: Name or path of the pre-trained checkpoint to start from.
        output_dir: Folder for the Trainer's checkpoints; existing content
            may be overwritten (``overwrite_output_dir=True``).
        save_dir: Folder that receives the fine-tuned model and the tokenizer.
        num_train_epochs: Number of passes over ``dataset``.
        per_device_train_batch_size: Batch size per device.
        save_steps: Number of training steps between checkpoints.
        save_total_limit: Maximum number of checkpoints kept in ``output_dir``.

    Returns:
        None. The fine-tuned model and tokenizer are written to ``save_dir``.
    """
    # Load pre-trained model
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    # Fine-tune the model
    trainer.train()

    # Save the model together with its tokenizer so the folder can be
    # loaded on its own later
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

In [18]:
# Kick off fine-tuning on the tokenized training set
train_the_model(dataset=train_dataset)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss
500,3.3733
1000,2.9359
1500,2.6961
2000,2.5263
2500,2.4162
