In [1]:
import re
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
with open('llm_model_training_data.txt', 'r', encoding ='utf-8') as file:
    text_data = file.read()

In [4]:
def preprocess_text(input_data):
    input_data = re.sub(r'[^a-zA-Z\s£$%+=<>]', '', input_data)
    tokens = word_tokenize(input_data)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    processed_data = ' '.join(tokens)

    return processed_data

preprocessed_data = preprocess_text(text_data)

In [5]:
class CompanyDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data[idx]
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=self.max_length, truncation=True)
        inputs['input_ids'] = inputs['input_ids'].squeeze()
        return inputs


In [6]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
company_dataset = CompanyDataset(data=preprocessed_data.split('.'), tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
batch_size = 8
dataloader = DataLoader(company_dataset, batch_size=batch_size)
model = GPT2LMHeadModel.from_pretrained(model_name)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
epochs = 8
learning_rate = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [9]:
for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(**batch, labels=batch['input_ids'])  # Corrected labels handling
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [10]:
model.save_pretrained("new_fine_tuned_llm_model")
tokenizer.save_pretrained("new_fine_tuned_llm_model")

('new_fine_tuned_llm_model/tokenizer_config.json',
 'new_fine_tuned_llm_model/special_tokens_map.json',
 'new_fine_tuned_llm_model/vocab.json',
 'new_fine_tuned_llm_model/merges.txt',
 'new_fine_tuned_llm_model/added_tokens.json')

In [12]:
fine_tune_model = "new_fine_tuned_llm_model"
model = GPT2LMHeadModel.from_pretrained(fine_tune_model)

In [15]:
def generate_text(prompt, model, tokenizer, max_length=100, num_return_sequences=1, company_name=""):
    full_prompt = company_name + prompt
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, device=input_ids.device)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        do_sample=True  # Enable sampling
    )

    generated_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in output]

    return generated_texts

In [16]:
company_name = "unilever"
prompt = "What is the company culture?"
generated_texts = generate_text(prompt, model, tokenizer, max_length=200, company_name=company_name)

# Print generated text
for i, generated_text in enumerate(generated_texts):
    print(f"This text is generated for: {generated_text}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This text is generated for: unileverWhat is the company culture? Well, we're a team of innovators focused on delivering the highest quality, top quality products across all categories. From our world-class expertise in sustainable manufacturing and innovation to our award-winning workforce, our dedicated team believes in our mission to deliver the best possible experience for all of you.
