In [41]:
#importing important libraries
import re
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
with open('llm_model_training_data.txt', 'r', encoding ='utf-8') as file:
  text_data = file.read()
chars = sorted(set(text_data))
print(len(text_data))
print(chars)

1719777
['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~', '£', '©', '®', '°', 'Á', 'Ç', 'Ø', 'Ú', 'á', 'ã', 'ä', 'ç', 'è', 'é', 'í', 'ô', 'ú', 'û', 'ü', 'ć', 'č', 'Ľ', 'ś', 'ş', 'ا', 'ت', 'ج', 'د', 'ر', 'س', 'ش', 'ع', 'ف', 'ق', 'ل', 'م', 'ن', 'و', 'ک', 'ں', 'ہ', 'ی', 'ے', '\u200b', '–', '—', '‘', '’', '“', '”', '…', '⁰', '₂', '₦', '€', '™', '爱', '\ufeff']


In [None]:
#Preprocessing the data
def preprocess_text(input_data):
    input_data= re.sub(r'[^a-zA-Z\s£$%+=<>]', '', input_data)
    tokens = word_tokenize(input_data)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    processed_data = ' '.join(tokens)

    return processed_data

In [None]:
preprocessed_data = preprocess_text(text_data)

In [None]:
class CompanyDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data[idx]
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=self.max_length, truncation=True)
        return inputs


In [None]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
company_dataset = CompanyDataset(data=[preprocessed_data], tokenizer=tokenizer)


In [None]:
#fine-tuning the model
batch_size = 8
dataloader = DataLoader(company_dataset, batch_size=batch_size)


In [None]:
model = GPT2LMHeadModel.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
epochs = 3
learning_rate = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(**batch, labels=batch.input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
model.save_pretrained("fine_tuned_company_model")
tokenizer.save_pretrained("fine_tuned_company_model")

('fine_tuned_company_model/tokenizer_config.json',
 'fine_tuned_company_model/special_tokens_map.json',
 'fine_tuned_company_model/vocab.json',
 'fine_tuned_company_model/merges.txt',
 'fine_tuned_company_model/added_tokens.json')

In [None]:
fine_tune_model = "fine_tuned_company_model"
tokenizer = GPT2Tokenizer.from_pretrained(fine_tune_model)
model = GPT2LMHeadModel.from_pretrained(fine_tune_model)

In [None]:
#creating testing suite
def generate_text(prompt, model, tokenizer, max_length=100, num_return_sequences=1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, device=input_ids.device)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7
    )

    generated_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in output]

    unique_texts = []
    seen_texts = set()
    for text in generated_texts:
        if text not in seen_texts:
            unique_texts.append(text)
            seen_texts.add(text)

    return unique_texts

In [None]:
prompt = "Ask a question about the company culture."
generated_texts = generate_text(prompt, model, tokenizer, max_length=200)

for i, generated_text in enumerate(generated_texts):
    print(f"This text is generated for: {generated_text}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This text is generated for: Ask a question about the company culture.

"We're a company that's been around for a long time and we're very proud of what we do," said CEO John McAfee. "We've been doing this for over 20 years and it's a very exciting time for us. We're excited about what's coming and looking forward to working with you."
.@JohnMcAfee is a leader in the world of security and privacy. He is the founder of the leading security company in Asia. His company, Security, is dedicated to protecting the privacy of all our customers. John is also the CEO of Security Asia, a leading global security firm. Security is an integral part of our business and our mission is to provide the best security solutions for our clients.
