In [1]:
from datasets import load_dataset
# Fetch the English configuration of the ILSUM-2.0 dataset
ds = load_dataset(path="ILSUM/ILSUM-2.0", name="English")

Downloading readme:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/107M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28342 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2895 [00:00<?, ? examples/s]

In [7]:
import pandas as pd

# Materialise the 'train' and 'test' splits of `ds` as pandas DataFrames
df_train, df_test = (ds[split_name].to_pandas() for split_name in ('train', 'test'))

In [8]:
# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,id,Heading,Summary,Article
0,english_2023_train_0,"Barbie Review: Greta Gerwig, Margot Robbie, Ry...",Barbie Movie Review: Ryan Gosling shines the b...,Barbie Movie Review: One mention of Barbie and...
1,english_2023_train_1,Gadar 2: Sunny Deol-Ameesha Patel Starrer's Tr...,A source close to the film told News18 exclusi...,"The highly anticipated Gadar 2, starring Sunny..."
2,english_2023_train_2,Kartik Aaryan Ditches First Class To Fly In Ec...,Kartik Aaryan was spotted flying in economy cl...,"Kartik Aaryan, who is gearing up for the relea..."
3,english_2023_train_3,"India's Anju, Now Fatima, Receives Land, Money...",Abbasi had said that it was important to make ...,"Indian woman, Anju who travelled to Pakistan t..."
4,english_2023_train_4,Himachal Pradesh Hotels Offer 50% Discount As ...,Flash floods and landslides caused by heavy do...,"Himachal Pradesh, one of the worst hit states ..."


In [9]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# Function to clean text
def clean_text(text):
    """Normalise a text string to ASCII letters, digits and single spaces.

    Every character that is not ``a-z``, ``A-Z``, ``0-9`` or whitespace is
    dropped (this includes punctuation and any non-ASCII character), then
    runs of whitespace are collapsed to one space and the ends are trimmed.

    Special characters are removed *before* whitespace is collapsed:
    deleting a character that sits between two spaces (e.g. ``"a - b"``)
    would otherwise leave a double space, and deleting a leading/trailing
    symbol would leave stray whitespace at the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; ``""`` if nothing survives the filtering.
    """
    #text = re.sub(r'<[^>]+>', '', text)  # remove HTML tags (left disabled, as in the original)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    return re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs and trim



In [10]:
# Restrict the training frame to the Heading, Summary and Article columns
df = df_train.loc[:, ['Heading', 'Summary', 'Article']]

In [11]:
# Print column dtypes and non-null counts for the selected columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28342 entries, 0 to 28341
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Heading  28342 non-null  object
 1   Summary  28342 non-null  object
 2   Article  28342 non-null  object
dtypes: object(3)
memory usage: 664.4+ KB


In [None]:

# Apply cleaning function to articles and summaries
#df_train['Article'] = df_train['Article'].apply(clean_text)
#df_train['Summary'] = df_train['Summary'].apply(clean_text)


In [12]:

# Keep only rows whose Summary is longer than 150 characters.
# The vectorised .str accessor replaces the per-row Python call of .apply(len);
# a missing Summary yields NaN, which compares False and is simply excluded
# instead of raising TypeError.
filtered_df = df[df['Summary'].str.len() > 150]

In [13]:
# Display the filtered frame
filtered_df

Unnamed: 0,Heading,Summary,Article
3,"India's Anju, Now Fatima, Receives Land, Money...",Abbasi had said that it was important to make ...,"Indian woman, Anju who travelled to Pakistan t..."
4,Himachal Pradesh Hotels Offer 50% Discount As ...,Flash floods and landslides caused by heavy do...,"Himachal Pradesh, one of the worst hit states ..."
7,Brij Bhushan Sharan Singh Claims 22 State Unit...,Brij Bhushan Sharan Singh will have 'his candi...,Outgoing Wrestling Federation of India (WFI) p...
8,"India's Tiger Population Estimate at 3,682 Aft...",The latest estimate is said to cover areas of ...,"India's tiger population is estimated to be 3,..."
9,MP: Administration Razes Houses of Two Men Acc...,"The accused, Ravindra Kumar and Atul Bhadoliya...",The local administration on Saturday demolishe...
...,...,...,...
28337,Japan approves plan to release millions of ton...,A massive earthquake and tsunami in 2011 destr...,FILE - Tanks storing treated radioactive water...
28338,"16 killed, hundreds stranded on Indonesian mou...","Over 680 tourists from France, Thailand, the N...",Villagers clear debris caused by an earthquake...
28339,Gadhafi’s son Seif al-Islam released after mor...,"Seif al-Islam, the son and one-time heir appar...","Seif al-Islam, the son and one-time heir appar..."
28340,Rescued Thai boys to be ordained in Buddhist c...,Most members of the Thai youth football team r...,Thai BoysMost members of the Thai youth footba...


In [14]:
# Print row count, column dtypes and non-null counts after filtering
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21830 entries, 3 to 28341
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Heading  21830 non-null  object
 1   Summary  21830 non-null  object
 2   Article  21830 non-null  object
dtypes: object(3)
memory usage: 682.2+ KB


In [None]:
# Hold out 20% of the filtered rows for validation; the fixed random_state
# makes the split repeatable across runs
train_data, val_data = train_test_split(
    filtered_df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Give both frames a fresh 0..n-1 index, discarding the old row labels
train_data, val_data = (
    frame.reset_index(drop=True) for frame in (train_data, val_data)
)

In [None]:
# Display the training frame
train_data

In [None]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import BartTokenizer
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the BART tokenizer for the 'facebook/bart-large' checkpoint
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

In [None]:


class CustomSummarizationDataset(Dataset):
    """Map-style dataset that tokenises (Article, Summary) rows of a DataFrame.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    article and ``labels`` for the summary, all 1-D tensors padded/truncated
    to a fixed length.

    Args:
        dataframe: Frame providing an ``'Article'`` and a ``'Summary'`` column.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Fixed token length of the article encoding.
        max_target_length: Fixed token length of the summary encoding.
            Defaults to ``max_length`` (the original behaviour); pass a
            smaller value to avoid padding short summaries to article length.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, max_target_length=None):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length
        self.max_target_length = max_length if max_target_length is None else max_target_length
        self.inputs = self.data['Article']
        self.targets = self.data['Summary']

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Tokenise and return the row at integer *position* ``index``."""
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a clean RangeIndex.
        input_text = str(self.inputs.iloc[index])
        target_text = str(self.targets.iloc[index])

        # Tokenizing the input and target texts
        input_tokens = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target_tokens = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # Padding positions in the labels are set to -100, the index ignored
        # by the cross-entropy loss, so the model is not trained to emit pads.
        labels = target_tokens['input_ids'].flatten()
        labels = labels.masked_fill(target_tokens['attention_mask'].flatten() == 0, -100)

        return {
            'input_ids': input_tokens['input_ids'].flatten(),
            'attention_mask': input_tokens['attention_mask'].flatten(),
            'labels': labels
        }



In [None]:
# Wrap the train/validation frames produced by the split above as tokenised datasets
train_dataset, val_dataset = (
    CustomSummarizationDataset(frame, tokenizer) for frame in (train_data, val_data)
)

# Batched loaders over the two datasets (8 items per batch); only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=8,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=8,
)


In [None]:
# Load the pretrained 'facebook/bart-large' weights into a conditional-generation BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [None]:
# Fine-tuning configuration
training_args = TrainingArguments(
    # Output locations for checkpoints and logs
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate on the validation set once per epoch.
    # NOTE(review): newer transformers releases spell this argument
    # `eval_strategy` - confirm which name the installed version accepts.
    evaluation_strategy='epoch',
)

# No data_collator (and no tokenizer) is handed to Trainer, so it uses its own
# fallback collator. NOTE(review): with no tokenizer passed this is presumably
# `default_data_collator` rather than DataCollatorWithPadding - confirm against
# the installed transformers version.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

trainer.train()

In [None]:
# Save the tokenizer next to the model: Trainer was constructed without it,
# so trainer.save_model alone would leave the directory without tokenizer files.
tokenizer.save_pretrained("./fine_tuned_bart_model")
# Save the fine-tuned model
trainer.save_model("./fine_tuned_bart_model")