# Google MT5 trial 2 by Ayush Kumar


##### in this notebook we are trying A new LLM named MT5 which is specialy designed for text summrization part


In [None]:
# Importing the required libraries
import pandas as pd
import re
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Step 1: Load and preprocess the dataset
def preprocess_text(text):
    if isinstance(text, str):  # Check if the entry is a string
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return text.lower()  # Convert to lowercase
    else:
        return ''  # Return an empty string if not a string (e.g., NaN)

# Load training and validation datasets
train_data = pd.read_csv('/content/drive/MyDrive/LLUMO/processed_train.csv')
val_data = pd.read_csv('/content/drive/MyDrive/LLUMO/processed_validation.csv')  # Assuming validation data is available

# Preprocess the text in 'Input' and 'Output' columns
train_data['Input'] = train_data['Input'].apply(preprocess_text)
train_data['Output'] = train_data['Output'].apply(preprocess_text)
val_data['Input'] = val_data['Input'].apply(preprocess_text)
val_data['Output'] = val_data['Output'].apply(preprocess_text)

# Step 2: Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Step 3: Load MT5 tokenizer and model
model_name = "google/mt5-small"  # You can choose mt5-small, mt5-base, mt5-large, etc.
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization
def tokenize_function(examples):
    inputs = tokenizer(examples['Input'], truncation=True, padding='max_length', max_length=512)
    targets = tokenizer(examples['Output'], truncation=True, padding='max_length', max_length=128)  # Assuming summaries are shorter
    inputs['labels'] = targets['input_ids']
    return inputs

# Tokenize both training and validation datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)





Map:   0%|          | 0/7491 [00:00<?, ? examples/s]

Map:   0%|          | 0/1634 [00:00<?, ? examples/s]

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00

In [8]:
# Step 4: Prepare the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=5e-5,
    save_steps=500,
    evaluation_strategy="steps",  # Evaluate during training
    eval_steps=500,               # Perform evaluation every 500 steps
    save_total_limit=2
)

# Step 5: Define Trainer and start fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Validation dataset for evaluation
    tokenizer=tokenizer
)
for param in model.parameters():
    param.data = param.data.contiguous()

# Train the model

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_mt5')
tokenizer.save_pretrained('./fine_tuned_mt5')

print("Fine-tuning complete and model saved.")




Step,Training Loss,Validation Loss
500,6.2847,4.318887
1000,3.7359,3.146604
1500,3.1587,2.906727
2000,3.2756,2.838207
2500,3.4613,2.800581
3000,2.9205,2.779632
3500,2.9507,2.755191
4000,2.8313,2.740955
4500,3.0677,2.735096
5000,3.026,2.723583


Fine-tuning complete and model saved.


In [9]:
from transformers import pipeline, MT5ForConditionalGeneration, MT5Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = MT5ForConditionalGeneration.from_pretrained('./fine_tuned_mt5')
fine_tuned_tokenizer = MT5Tokenizer.from_pretrained('./fine_tuned_mt5')

# Create a summarization pipeline
summarizer = pipeline('summarization', model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# Define the prompt
def generate_summary(meta_review_text):
    prompt = f"""
    Summarize the following paper meta-review in a concise and informative manner:

    Meta-Review: {meta_review_text}

    Summary:
    """
    inputs = fine_tuned_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = fine_tuned_model.generate(inputs, max_length=128, num_beams=4, early_stopping=True)
    return fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage with a sample meta-review
meta_review = "The paper explores new neural network architectures for image classification but lacks proper evaluation on diverse datasets, making it hard to generalize the results."
summary = generate_summary(meta_review)
print("Generated Summary:", summary)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Generated Summary: this paper presents new neural network architectures for image classification but lacks proper evaluation on diverse datasets and lacks proper evaluation on diverse datasets for image classification but lacks proper evaluation on diverse datasets and lacks proper evaluation on diverse datasets for image classification but lacks proper evaluation on diverse datasets and lacks proper evaluation on diverse datasets for image classification but lacks proper evaluation on diverse datasets and lacks proper evaluation on diverse datasets


In [12]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [20]:
import pandas as pd
import re
import torch
from transformers import pipeline, MT5ForConditionalGeneration, MT5Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = MT5ForConditionalGeneration.from_pretrained('./fine_tuned_mt5')
fine_tuned_tokenizer = MT5Tokenizer.from_pretrained('./fine_tuned_mt5')

# Check for GPU availability
device = 0 if torch.cuda.is_available() else -1  # Use 0 for the first GPU or -1 for CPU

# Create a summarization pipeline with device specified
summarizer = pipeline('summarization', model=fine_tuned_model, tokenizer=fine_tuned_tokenizer, device=device)

# Define the preprocessing function
def preprocess_text(text):
    if isinstance(text, str):  # Check if the entry is a string
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return text.lower()  # Convert to lowercase
    else:
        return ''  # Return an empty string if not a string (e.g., NaN)

# Load your test data
test_data = pd.read_csv('/content/drive/MyDrive/LLUMO/processed_test.csv')  # Update with the path to your test data

# Select the top 20 rows from the test data
test_data_top_20 = test_data.head(500).copy()  # Create a copy to avoid SettingWithCopyWarning

# Preprocess the 'Input' column in the test data
test_data_top_20.loc[:, 'Input'] = test_data_top_20['Input'].apply(preprocess_text)

# Generate summaries for the preprocessed test data rows
test_data_top_20.loc[:, 'Generated_Summary'] = test_data_top_20['Input'].apply(generate_summary)

# Load ROUGE metric after ensuring dependencies are installed
!pip install rouge_score  # Install the ROUGE score library if not already installed

from evaluate import load  # Make sure to import load from the evaluate library

# Load ROUGE metric
rouge_metric = load("rouge")

# Calculate ROUGE scores for the top 20
rouge_scores = rouge_metric.compute(
    predictions=test_data_top_20['Generated_Summary'].tolist(),
    references=test_data_top_20['Output'].tolist()
)

# Print ROUGE scores
print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': 0.1993118095536636, 'rouge2': 0.04013189786225761, 'rougeL': 0.15473585889709227, 'rougeLsum': 0.1546420010652066}


In [16]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=09d9370c50c29b61b4ab2fe8eafee7d1aeeef483c30be7721b178f04c5f72a18
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
