# Fine-tuning GPT-2 with the O*NET Dataset

In [None]:
import torch
from datasets import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    DataCollatorForLanguageModeling
)
import logging
import pandas as pd

# Send INFO-and-above records to the root handler, stamped with the time,
# level and originating logger name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Raise the transformers library's own log verbosity to INFO as well.
from transformers import logging as hf_logging
hf_logging.set_verbosity_info()

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Read the occupation table from disk and wrap it as a Hugging Face Dataset.
df = pd.read_csv('Occupation Data.csv')  # Replace with your CSV file name
dataset = Dataset.from_pandas(df)

# Sanity check: show the column names and the first three rows.
print("Dataset Column Names:", dataset.column_names)
print("\nSample Data:")
print(dataset[:3])

# Load the pretrained GPT-2 tokenizer.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a pad token, so the end-of-sequence token doubles as one.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights, then move the model onto the selected device.
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

# Keep the embedding matrix sized to the tokenizer in case tokens were added.
model.resize_token_embeddings(len(tokenizer))

# Batch preprocessing: build "prompt + description" strings and tokenize them
def preprocess_function(examples):
    """Tokenize a batch of occupation rows as prompt-plus-description texts.

    Args:
        examples: Batch mapping with the keys 'O*NET-SOC Code', 'Title' and
            'Description', each holding a sequence of per-row values.

    Returns:
        The output of the module-level ``tokenizer`` for the batch, with
        every sequence truncated or padded to exactly 512 tokens.
    """
    texts = []
    for code, title, description in zip(
        examples['O*NET-SOC Code'],
        examples['Title'],
        examples['Description'],
    ):
        # The prompt ends at "Job Description:"; the reference description
        # follows after a single space.
        prompt = f"O*NET-SOC Code: {code}\nJob Title: {title}\nJob Description:"
        texts.append(prompt + " " + description)

    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=512,  # Adjust as needed
    )

# Tokenize the whole dataset in batches, dropping the raw text columns so
# that only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=1,
    desc="Tokenizing the dataset",
    load_from_cache_file=True,
)

# Show what tokenization produced.
print("\nTokenized Dataset Columns:", tokenized_dataset.column_names)
print("\nSample Tokenized Data:")
print(tokenized_dataset[:3])

# Have the dataset return PyTorch tensors for the two model inputs.
tokenized_dataset.set_format(columns=['input_ids', 'attention_mask'], type='torch')

# Collator for causal language modeling (mlm=False disables the masked-LM objective).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Release cached GPU memory before training when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the lowest loss when training ends.
training_args = TrainingArguments(
    output_dir='./gpt2_onet_soc_model',
    run_name='gpt2_onet_soc_finetuning',
    # --- optimisation ---
    optim='adamw_torch',
    learning_rate=5e-5,
    num_train_epochs=5,             # Increase epochs for better performance
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    per_device_eval_batch_size=2,   # Adjust based on GPU memory
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step per device
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    dataloader_num_workers=4,
    # --- evaluation and checkpointing ---
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    # --- logging ---
    logging_dir='./logs',
    logging_steps=100,
    logging_first_step=True,
    report_to=["tensorboard"],
)

# Stop early once the tracked metric has failed to improve for 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Hold out 10% of the examples for evaluation; the fixed seed keeps the split
# reproducible.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Wire the model, data, collator and early-stopping callback into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)

# Run fine-tuning.
print("Starting training for O*NET-SOC Job Description Generation with GPT-2...")
trainer.train()
print("Training completed.")

# Persist the fine-tuned model and its tokenizer side by side.
final_model_dir = './gpt2_onet_soc_model'
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Generation helper for the fine-tuned model
def generate_job_description(code, title):
    """Generate a job description for an O*NET-SOC code and job title.

    The prompt uses the same layout as the training texts
    ("O*NET-SOC Code: ...\\nJob Title: ...\\nJob Description:"), and only the
    text produced after the prompt is returned.

    Args:
        code: O*NET-SOC code to place in the prompt (e.g. '11-1011.00').
        title: Job title to place in the prompt.

    Returns:
        The generated description as a string, with special tokens removed
        and surrounding whitespace stripped.
    """
    input_prefix = f"O*NET-SOC Code: {code}\nJob Title: {title}\nJob Description:"

    # Calling the tokenizer (instead of ``encode``) also returns the attention
    # mask. Since the pad token is the EOS token here, ``generate`` cannot
    # infer the mask on its own and would otherwise warn about it.
    encoded = tokenizer(
        input_prefix,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    ).to(model.device)  # follow the model's device so inputs always match it
    prompt_length = encoded['input_ids'].shape[-1]

    # Generate in eval mode so dropout is disabled, then restore whatever
    # mode the model was in so that callers are not affected.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=600,  # Total length (prompt + continuation); adjust as needed
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
        )
    finally:
        model.train(was_training)

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than splitting the decoded text on "Job Description:",
    # which returns the wrong span if the model emits that marker again.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Example usage: generate a description for the first occupation in the table.
first_row = dataset[0]
test_code = first_row['O*NET-SOC Code']
test_title = first_row['Title']
print("\nO*NET-SOC Code:", test_code)
print("Job Title:", test_title)
print("\nGenerated Job Description:")
print(generate_job_description(test_code, test_title))


Using device: cpu
Dataset Column Names: ['O*NET-SOC Code', 'Title', 'Description']

Sample Data:
{'O*NET-SOC Code': ['11-1011.00', '11-1011.03', '11-1021.00'], 'Title': ['Chief Executives', 'Chief Sustainability Officers', 'General and Operations Managers'], 'Description': ['Determine and formulate policies and provide overall direction of companies or private and public sector organizations within guidelines set up by a board of directors or similar governing body. Plan, direct, or coordinate operational activities at the highest level of management with the help of subordinate executives and staff managers.', 'Communicate and coordinate with management, shareholders, customers, and employees to address sustainability issues. Enact or oversee a corporate sustainability strategy.', 'Plan, direct, or coordinate the operations of public or private sector organizations, overseeing multiple departments or locations. Duties and responsibilities include formulating policies, managing daily o

loading file vocab.json from cache at C:\Users\swaro\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\vocab.json
loading file merges.txt from cache at C:\Users\swaro\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\swaro\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\swaro\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\tokenizer.json
loading configuration file config.json from cache at C:\Users\swaro\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "


Tokenized Dataset Columns: ['input_ids', 'attention_mask']

Sample Tokenized Data:
{'input_ids': [[46, 9, 12884, 12, 50, 4503, 6127, 25, 1367, 12, 8784, 16, 13, 405, 198, 33308, 11851, 25, 5953, 8393, 315, 1083, 198, 33308, 12489, 25, 45559, 3810, 290, 46418, 4788, 290, 2148, 4045, 4571, 286, 2706, 393, 2839, 290, 1171, 6567, 5745, 1626, 9949, 900, 510, 416, 257, 3096, 286, 13445, 393, 2092, 15030, 1767, 13, 5224, 11, 1277, 11, 393, 20435, 13919, 4568, 379, 262, 4511, 1241, 286, 4542, 351, 262, 1037, 286, 34618, 12353, 290, 3085, 11663, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5025

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

: 