In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-16.0.0-cp38-cp38-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp38-cp38-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.5-cp38-cp38-win_amd64.whl.metadata (7.7 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets)
  Downloading frozenlist-1.4.1-cp38-cp38-win_amd64.whl.metadata (12 kB)
Collecting multidict<7.0,

In [1]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

In [2]:
dataset = load_dataset('csv', data_files='dataset.csv')

In [3]:
dataset['train'][:5]

{'name': ['ibm', 'tata consultancy services', 'accenture', 'us army', 'ey'],
 'keywords': ['ibm,information technology and services,new york, new york, united states,united states',
  'tata,consultancy,services,information technology and services,bombay, maharashtra, india,india',
  'accenture,information technology and services,dublin, dublin, ireland,ireland',
  'us,army,military,alexandria, virginia, united states,united states',
  'ey,accounting,london, greater london, united kingdom,united kingdom']}

In [4]:
def filter_function(example):
    # Check if either 'name' or 'keywords' are None or empty strings
    return example['name'] not in (None, '') and example['keywords'] not in (None, '')

dataset = dataset.filter(filter_function)

In [5]:
def preprocess_function(examples):
    # Concatenate 'keywords' and 'name' with a specific format for model training
    examples['text'] = examples['keywords'] + " -> " + examples['name']
    return examples

# Apply the preprocessing function to the dataset
dataset = dataset.map(preprocess_function)

Map:   0%|          | 0/7173423 [00:00<?, ? examples/s]

In [6]:
sample_dataset = dataset['train'].shuffle(seed=42).select(range(1_000))
train_test_split = sample_dataset.train_test_split(test_size=0.1)
train_test_split

DatasetDict({
    train: Dataset({
        features: ['name', 'keywords', 'text'],
        num_rows: 900
    })
    test: Dataset({
        features: ['name', 'keywords', 'text'],
        num_rows: 100
    })
})

In [7]:
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Tokenizing the text data for GPT-2 model input
    model_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
    
    # GPT-2 expects labels for calculating the loss; we use input_ids as labels for language modeling.
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# Apply tokenization and prepare the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)




Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
train_dataset[1]

{'name': 'zdravotnický holding královéhradeckého kraje a.s.',
 'keywords': 'zdravotnický,holding,královéhradeckého,kraje,a.s.,hospital & health care,hradec kralove, kralovehradecky kraj, czechia,czechia',
 'text': 'zdravotnický,holding,královéhradeckého,kraje,a.s.,hospital & health care,hradec kralove, kralovehradecky kraj, czechia,czechia -> zdravotnický holding královéhradeckého kraje a.s.',
 'input_ids': [89,
  67,
  4108,
  313,
  17172,
  127,
  121,
  11,
  19216,
  11,
  38584,
  6557,
  27086,
  2634,
  11840,
  671,
  694,
  2634,
  8873,
  11,
  74,
  430,
  18015,
  11,
  64,
  13,
  82,
  1539,
  49257,
  1222,
  1535,
  1337,
  11,
  11840,
  671,
  66,
  479,
  1373,
  659,
  11,
  479,
  1373,
  659,
  11840,
  671,
  694,
  88,
  479,
  430,
  73,
  11,
  269,
  15356,
  544,
  11,
  66,
  15356,
  544,
  4613,
  1976,
  67,
  4108,
  313,
  17172,
  127,
  121,
  4769,
  479,
  81,
  6557,
  27086,
  2634,
  11840,
  671,
  694,
  2634,
  8873,
  479,
  430,
  18015,
 

In [11]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5325,0.47204
2,0.4062,0.441925
3,0.3992,0.434958


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=675, training_loss=0.5309929900699192, metrics={'train_runtime': 3457.0803, 'train_samples_per_second': 0.781, 'train_steps_per_second': 0.195, 'total_flos': 176372121600000.0, 'train_loss': 0.5309929900699192, 'epoch': 3.0})

In [12]:
# Save the model and tokenizer
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

# Evaluate the model on the validation set
results = trainer.evaluate()
print(results)

{'eval_loss': 0.43495845794677734, 'eval_runtime': 36.13, 'eval_samples_per_second': 2.768, 'eval_steps_per_second': 0.36, 'epoch': 3.0}


In [23]:
def generate_text(prompt_text, max_length=50):
    # Encode the prompts using the tokenizer
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    
    # Generate a sequence of tokens following the prompt
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=max_length + len(encoded_prompt[0]),
        temperature=1.0,
        top_k=40,
        top_p=0.95,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )
    
    # Decode the output sequences to strings
    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()
        
        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
        
        # Remove all text after the stop token
        text = text[: text.find(tokenizer.eos_token)]
        
        generated_sequences.append(text)
    
    return generated_sequences[0]

# Example usage
prompt_text = "Innovation, Tech, Advancement, AI, DeepLearning, Machine Learning"
generated_text = generate_text(prompt_text + ' ->')
print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: Innovation, Tech, Advancement, AI, DeepLearning, Machine Learning -> ininnuine
