In [1]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00

In [4]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Load training data
train_data = pd.read_csv("Downloads/llumno ai ass/summarize.csv")
val_data = pd.read_csv("Downloads/llumno ai ass/val_sumarize.csv")

# Step 1: Preprocess the dataset
def preprocess_text(text):
    if isinstance(text, str):  # Check if the entry is a string
        return text.lower()  # Convert to lowercase
    else:
        return ''  # Return an empty string if not a string (e.g., NaN)

# Preprocess the text in 'Input' and 'Output' columns for both train and validation data
train_data['Input'] = train_data['Input'].apply(preprocess_text)
train_data['Output'] = train_data['Output'].apply(preprocess_text)

val_data['Input'] = val_data['Input'].apply(preprocess_text)
val_data['Output'] = val_data['Output'].apply(preprocess_text)

# Ensure data is clean
print("Training Data Sample:\n", train_data.head())
print("Validation Data Sample:\n", val_data.head())

# Step 2: Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Step 3: Load GPT-2 tokenizer and model
model_name = "gpt2"  # Use GPT-2 model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)




Training Data Sample:
                                                Input  \
0  in this paper, the author investigates how to ...   
1  **summary of contributions:** this paper propo...   
2  this paper addresses the problem of moe routin...   
3  this paper discusses applications of variants ...   
4  the authors introduce the problem of telegraph...   

                                              Output  
0  this paper studies how to learn dexterous mani...  
1  this paper proposed a new family of losses for...  
2  mixture-of-expert (moe) models have demonstrat...  
3  in this work, the authors conduct experiments ...  
4  this paper presents methods for telegraphic su...  
Validation Data Sample:
                                                Input  \
0  this paper presents an approach, called estine...   
1  the paper aimed at improving the performance o...   
2  the submission shows the numerical instabiliti...   
3  this paper presents a method for training a ne...   
4  th



Map:   0%|          | 0/7692 [00:00<?, ? examples/s]

Map:   0%|          | 0/1648 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


ValueError: Expected input batch_size (2044) to match target batch_size (508).

In [None]:
# GPT-2 does not have padding token by default, so we add it
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Tokenization function
def tokenize_function(examples):
    max_input_length = 512  # Ensure the max length is consistent for both input and output
    max_output_length = 512  # Set max length to be equal for consistency

    # Tokenize inputs and outputs with the same padding strategy
    inputs = tokenizer(examples['Input'], truncation=True, padding='max_length', max_length=max_input_length)
    targets = tokenizer(examples['Output'], truncation=True, padding='max_length', max_length=max_output_length)

    # Align inputs and targets so that their lengths match
    inputs['labels'] = targets['input_ids']  # Aligning output with input as labels

    # Ignore padding tokens in labels for cross-entropy loss calculation
    inputs['labels'] = [
        label if label != tokenizer.pad_token_id else -100 for label in inputs['labels']
    ]

    return inputs

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)


# Step 4: Prepare the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=5e-5,
    save_steps=500,
    evaluation_strategy="steps",  # Evaluate during training
    eval_steps=500,               # Perform evaluation every 500 steps
    save_total_limit=2
)

# Step 5: Define Trainer and start fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Validation dataset for evaluation
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

print("Fine-tuning complete and model saved.")


Map:   0%|          | 0/7692 [00:00<?, ? examples/s]

Map:   0%|          | 0/1648 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
500,2.5595,2.624704


Step,Training Loss,Validation Loss
500,2.5595,2.624704


In [None]:
82d29125ea3763bbd2c572dd972ad93e3ebb3202