In [None]:
# Loading Datasets from Google Sheets without authentication
import pandas as pd

# CSV export URLs of the two Google Sheets (read directly, no authentication)
csv_url_1 = 'https://docs.google.com/spreadsheets/d/1Nw35Dq_s_0kxizCw9yQW9CGBc7pWfybDKc9SnfGG1Os/export?format=csv&gid=924822626'
csv_url_2 = 'https://docs.google.com/spreadsheets/d/1sgIRq1UvHADKxm1Z2seZFl1d2TZTWXCC9IdpZjEN5Jg/export?format=csv&gid=0'

try:
    # Load both datasets directly as CSV
    df1 = pd.read_csv(csv_url_1)
    df2 = pd.read_csv(csv_url_2)
except Exception as e:
    # Report the failure, then re-raise: every later cell needs df1/df2, so
    # swallowing the error here would only surface later as a NameError.
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print("DataFrames loaded successfully.")
    print("First DataFrame (df1) head:")
    display(df1.head())

    print("\nSecond DataFrame (df2) head:")
    display(df2.head())

DataFrames loaded successfully.
First DataFrame (df1) head:


Unnamed: 0,text,text_length,summary,summary_length,type
0,ينظم معهد الشارقة للفنون معرضاً فنياً تحت عنوا...,246,"ينظم معهد الشارقة للفنون معرضاً فنياً بعنوان ""...",150,Culture
1,تقيم الفنانة ليتا كابيلوت معرضاً في مطلع العام...,963,تقيم الفنانة ليتا كابيلوت معرضاً في دبي مطلع ا...,471,Culture
2,تتواصل الليلة مسيرة التحدّي والمتعة والإثارة ض...,1717,تستمر الليلة منافسات برنامج شاعر المليون، حيث ...,426,Culture
3,عقدت ظهر أمس الأول في قصر الثقافة في الشارقة ا...,4488,عُقدت الجلسة الثانية والأخيرة من الملتقى الأول...,555,Culture
4,اختارت صحيفة «التايمز» جورج أورويل ليحل في الم...,1050,اختارت صحيفة «التايمز» الكاتب البريطاني جورج أ...,493,Culture



Second DataFrame (df2) head:


Unnamed: 0,text,text_length,summary,summary_length
0,يكون سعر الفاكهة والخضراوات في موسم إنباتها أق...,6034,يُعد شراء الفاكهة والخضراوات في موسمها خطوة ذك...,1154
1,الأطعمة الصحية ليست باهظة الثمن بالضرورة، بل ف...,4061,الأطعمة الصحية ليست بالضرورة مكلفة، بل غالبًا ...,1035
2,استفد من حديقتك المنزلية أو أصيص الزرع الصغير ...,3050,يمكنك تقليل تكلفة الطعام وتحسين جودته عبر استغ...,792
3,تساعدك الخطط المسبقة في كل نواحي حياتك على وضع...,2958,تساعد الخطط المسبقة في تنظيم حياتك بشكل عام، و...,906
4,نظرا لأن السبب الرئيسي لضغط العين هو أن ثقافة ...,722,يرتبط ضغط العين بشكل كبير بقضاء وقت طويل أمام ...,498


In [None]:
# Columns relevant to the summarization task; any other column present in the
# source frames is discarded.
columns_to_keep = ['text', 'summary', 'text_length', 'summary_length']

# Stack both sources, keep the relevant columns, drop rows lacking either the
# article or its summary, and force both text columns to str. Written as a
# single chain so the cell always rebuilds merged_df from df1/df2.
merged_df = (
    pd.concat([df1, df2], ignore_index=True)
    .loc[:, columns_to_keep]
    .dropna(subset=['text', 'summary'])
    .astype({'text': str, 'summary': str})
)

# Quick sanity report on the cleaned frame.
print("Data merged and cleaned.")
print("Missing values after cleaning:")
print(merged_df.isnull().sum())

print("\nDataFrame after cleaning:")
display(merged_df.head())
print("\nShape of the DataFrame after cleaning:")
print(merged_df.shape)

Data merged and cleaned.
Missing values after cleaning:
text              0
summary           0
text_length       0
summary_length    0
dtype: int64

DataFrame after cleaning:


Unnamed: 0,text,summary,text_length,summary_length
0,ينظم معهد الشارقة للفنون معرضاً فنياً تحت عنوا...,"ينظم معهد الشارقة للفنون معرضاً فنياً بعنوان ""...",246,150
1,تقيم الفنانة ليتا كابيلوت معرضاً في مطلع العام...,تقيم الفنانة ليتا كابيلوت معرضاً في دبي مطلع ا...,963,471
2,تتواصل الليلة مسيرة التحدّي والمتعة والإثارة ض...,تستمر الليلة منافسات برنامج شاعر المليون، حيث ...,1717,426
3,عقدت ظهر أمس الأول في قصر الثقافة في الشارقة ا...,عُقدت الجلسة الثانية والأخيرة من الملتقى الأول...,4488,555
4,اختارت صحيفة «التايمز» جورج أورويل ليحل في الم...,اختارت صحيفة «التايمز» الكاتب البريطاني جورج أ...,1050,493



Shape of the DataFrame after cleaning:
(14150, 4)


In [None]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the rows first, then halve that hold-out
# into validation and test. The fixed random_state makes both splits repeatable.
train_df, temp_df = train_test_split(merged_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report the size of each split.
for label, split_frame in (
    ("Shape of training set:", train_df),
    ("Shape of validation set:", val_df),
    ("Shape of test set:", test_df),
):
    print(label, split_frame.shape)

Shape of training set: (11320, 4)
Shape of validation set: (1415, 4)
Shape of test set: (1415, 4)


In [None]:
!pip install transformers datasets

Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl.metadata (16 kB)
Collecting PyArabic (from arabert)
  Downloading PyArabic-0.6.15-py3-none-any.whl.metadata (10 kB)
Collecting farasapy (from arabert)
  Downloading farasapy-0.0.14-py3-none-any.whl.metadata (8.9 kB)
Collecting emoji==1.4.2 (from arabert)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: emoji


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint used as the starting point for fine-tuning
model_name = "UBC-NLP/AraT5v2-base-1024"

# Tokenizer and seq2seq model are both pulled from the same checkpoint
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

print(f"Model and Tokenizer loaded successfully from {model_name}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Model and Tokenizer loaded successfully from UBC-NLP/AraT5v2-base-1024


In [None]:
# Define a function to tokenize the data
def tokenize_function(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides the articles under ``"text"``
            and the reference summaries under ``"summary"``.
        max_input_length: Articles longer than this many tokens are truncated.
        max_target_length: Summaries longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the articles, with an added ``"labels"`` entry
        holding the token ids of the summaries.
    """
    # Tokenize the input text (articles)
    model_inputs = tokenizer(
        examples["text"], max_length=max_input_length, truncation=True
    )

    # Tokenize the target text (summaries); text_target tells the tokenizer
    # these sequences are decoder targets rather than encoder inputs.
    labels = tokenizer(
        text_target=examples["summary"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert pandas DataFrames to Hugging Face Datasets
from datasets import Dataset

# preserve_index=False: the split frames carry a shuffled, non-default pandas
# index that would otherwise be stored as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Apply the tokenization function to the datasets (batched for speed)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

print("Data tokenization complete.")
print(tokenized_train_dataset)
print(tokenized_val_dataset)
print(tokenized_test_dataset)

Map:   0%|          | 0/11320 [00:00<?, ? examples/s]

Map:   0%|          | 0/1415 [00:00<?, ? examples/s]

Map:   0%|          | 0/1415 [00:00<?, ? examples/s]

Data tokenization complete.
Dataset({
    features: ['text', 'summary', 'text_length', 'summary_length', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 11320
})
Dataset({
    features: ['text', 'summary', 'text_length', 'summary_length', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1415
})
Dataset({
    features: ['text', 'summary', 'text_length', 'summary_length', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1415
})


In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # --- evaluation ---
    predict_with_generate=True,  # generate text during evaluation
    # --- checkpoints and logs ---
    output_dir="./results",
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # no external experiment trackers
)

print("Training arguments defined.")

Training arguments defined.


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

print("Data collator defined.")

Data collator defined.


In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, arguments, datasets and collator into a single trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

print("Trainer initialized.")

Trainer initialized.


In [None]:
# Run fine-tuning with the trainer configured above; the two prints bracket
# the call so the start and end of the run are visible in the cell output.
print("Starting model training...")
trainer.train()
print("Model training complete.")

Starting model training...


Step,Training Loss
10,27.233
20,18.7395
30,14.1932
40,9.7577
50,6.9852
60,5.9194
70,5.0794
80,4.4594
90,4.5666
100,4.293


Model training complete.


In [None]:
# Score the fine-tuned model on the held-out test split
print("Evaluating the model on the test set...")
eval_results = trainer.evaluate(tokenized_test_dataset)
print("Evaluation complete.")

# Show the metrics dictionary returned by the trainer
print(eval_results)

Evaluating the model on the test set...


Evaluation complete.
{'eval_loss': 1.8935766220092773, 'eval_runtime': 98.7984, 'eval_samples_per_second': 14.322, 'eval_steps_per_second': 3.583, 'epoch': 3.0}


In [None]:
# Directory (relative to the current working directory) that receives both
# the fine-tuned model and its tokenizer
save_directory = "./fine_tuned_arat5_new_model"

# Save the model and tokenizer side by side in the same directory
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./fine_tuned_arat5_new_model


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

from google.colab import files

!zip -r /content/fine_tuned_arat5_model.zip /content/fine_tuned_arat5_model
files.download('/content/fine_tuned_arat5_model.zip')

Mounted at /content/drive


In [None]:
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2832a82d444d2a86836953da9020f1a0d510abe59271128d59d0eec7db3c4942
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.5 rouge_score-0.1.2


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
import torch
from tqdm import tqdm

# NOTE(review): no visible cell copies the fine-tuned model to this Drive path
# (it is saved to ./fine_tuned_arat5_new_model earlier) — confirm the path
# exists and holds the intended checkpoint before running.
model_path = '/content/drive/MyDrive/model'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# test_dataset is built from test_df, whose columns are 'text' and 'summary';
# it has no 'article' / 'highlights' columns.
test_texts = [row['text'] for row in test_dataset]
reference_summaries = [row['summary'] for row in test_dataset]

generated_summaries = []
for text in tqdm(test_texts):
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors='pt').to(device)
    # Unpack the whole encoding so the attention mask is passed along with the ids.
    summary_ids = model.generate(
        **inputs,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)


class _WhitespaceTokenizer:
    """Whitespace tokenizer for rouge_score.

    rouge_score's default tokenizer keeps only lowercase ASCII letters and
    digits (and its stemmer is English-only), which throws away Arabic text.
    Splitting on whitespace keeps the Arabic words intact.
    """

    def tokenize(self, text):
        return text.split()


rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, tokenizer=_WhitespaceTokenizer())
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

# Collect the per-example F-measure for every ROUGE variant.
for ref, gen in zip(reference_summaries, generated_summaries):
    scores = scorer.score(ref, gen)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)


def _mean(values):
    """Arithmetic mean, or 0.0 for an empty list (avoids ZeroDivisionError)."""
    return sum(values) / len(values) if values else 0.0


avg_rouge1 = _mean(rouge_scores['rouge1'])
avg_rouge2 = _mean(rouge_scores['rouge2'])
avg_rougeL = _mean(rouge_scores['rougeL'])

print(f"Average ROUGE-1: {avg_rouge1*100:.2f}%")
print(f"Average ROUGE-2: {avg_rouge2*100:.2f}%")
print(f"Average ROUGE-L: {avg_rougeL*100:.2f}%")

Average ROUGE-1: 26.28%
Average ROUGE-2: 13.31%
Average ROUGE-L: 25.74%