In [26]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
from transformers import BertModel, BertTokenizer

In [27]:
class BERTSUM(torch.nn.Module):
    def __init__(self):
        super(BERTSUM, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)
        #self.linear = torch.nn.Linear(self.bert.config.hidden_size, len(filtered_sentences))

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        scores = self.linear(pooled_output)
        return scores

In [28]:
# Initialize the lemmatizer and stopword list
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

# Original sentences
original_sentences = ["The quick brown fox jumps over the lazy dog.",
                       "This is a sentence.",
                       "Another sentence here.",
                       "This is a short sentence.",
                       "The dog is lazy."]

# Preprocess the sentences
filtered_sentences = []
for sentence in original_sentences:
    # Tokenize the sentence
    words = nltk.word_tokenize(sentence.lower())
    
    # Remove stopwords and lemmatize the remaining words
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
    
    # Convert the filtered words back to a sentence and add it to the list of filtered sentences
    filtered_sentence = ' '.join(filtered_words)
    filtered_sentences.append(filtered_sentence)

In [29]:
# def bertsum_extractive_summarization(original_sentences, filtered_sentences, size=5):
    
#     # Set the random seed for PyTorch
#     torch.manual_seed(42)
#     # Tokenize original sentences
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     # input_ids = tokenizer(original_sentences, padding=True, truncation=True, return_tensors='pt').input_ids
#     # attention_mask = tokenizer(original_sentences, padding=True, truncation=True, return_tensors='pt').attention_mask
    
#     # Tokenize filtered sentences
#     input_ids_filtered = tokenizer(filtered_sentences, padding=True, truncation=True, return_tensors='pt').input_ids
#     attention_mask_filtered = tokenizer(filtered_sentences, padding=True, truncation=True, return_tensors='pt').attention_mask
    
#     # Generate BERTSUM scores for filtered sentences
#     model = BERTSUM()
#     model.eval()
#     scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))

#     # Sort scores in descending order
#     sorted_scores, sorted_indices = torch.sort(scores.view(-1), descending=True)

#     # Select top "size" sentences with highest scores
#     top_indices = sorted_indices[:size].tolist()

#     # Generate final summary
#     summary = ""
#     for index in top_indices:
#         summary += original_sentences[index] + " "
            
#     return summary

In [30]:
# summary1 = bertsum_extractive_summarization(original_sentences, filtered_sentences, 2)
# summary2 = bertsum_extractive_summarization(original_sentences, filtered_sentences, 2)
# summary3 = bertsum_extractive_summarization(original_sentences, filtered_sentences, 2)

In [31]:
# print(summary1)
# print(summary2)
# print(summary3)

In [32]:
def bertsum_extractive_summarization_batch(original_sentences, filtered_sentences, batch_size=10, size=5):
    # Set the random seed for PyTorch
    torch.manual_seed(42)

    # Tokenize original sentences
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Initialize the BERTSUM model
    model = BERTSUM()
    model.eval()

    num_batches = len(filtered_sentences) // batch_size + int(len(filtered_sentences) % batch_size > 0)
    all_scores = []

    for batch_idx in range(num_batches):
        # Tokenize filtered sentences in the current batch
        batch_start = batch_idx * batch_size
        batch_end = min((batch_idx + 1) * batch_size, len(filtered_sentences))
        input_ids_filtered = tokenizer(filtered_sentences[batch_start:batch_end], padding=True, truncation=True, return_tensors='pt').input_ids
        attention_mask_filtered = tokenizer(filtered_sentences[batch_start:batch_end], padding=True, truncation=True, return_tensors='pt').attention_mask

        # Generate BERTSUM scores for filtered sentences in the current batch
        scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))

        # Store scores for the current batch
        all_scores.extend(scores.view(-1).tolist())

    # Sort scores in descending order
    sorted_indices = sorted(range(len(all_scores)), key=lambda i: all_scores[i], reverse=True)

    # Select top "size" sentences with highest scores
    top_indices = sorted_indices[:size]
    # Generate final summary
    summary = ""
    for index in top_indices:
        summary += original_sentences[index] + " "

    return summary


In [33]:
summary = bertsum_extractive_summarization_batch(original_sentences, filtered_sentences, size = 3)
summary

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))


'This is a sentence. This is a short sentence. Another sentence here. '

In [34]:
import pandas as pd
from summarization_algorithm import *
from preprocessing_algorithms import *
from efficiency_scores import *
df = pd.read_csv("DUC\Duc_first_reference\main_dataset\Duc_dataset_first_ref_summary.csv")

In [35]:
sentences, processed_sentences =process_one_column_df(df['Original Article'],remove_stopwords=True,lemmatization=True)

100%|██████████| 49/49 [02:57<00:00,  3.63s/it]


In [36]:
list_of_summaries = []
for i in tqdm(range(49)):
    list_of_summaries.append(bertsum_extractive_summarization_batch(sentences[i],processed_sentences[i],size = 5))

  0%|          | 0/49 [00:00<?, ?it/s]Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))
  2%|▏         | 1/49 [00:0

In [37]:
from summarizer import Summarizer
modeles = Summarizer()
modeles("".join(combined_sentences[1]))

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'combined_sentences' is not defined

In [None]:
predicted_summary = "Honduras braced for potential catastrophe Tuesday as Hurricane Mitch \r\nroared through the northwest Caribbean, churning up high waves and \r\nintense rain that sent coastal residents scurrying for safer ground. In Belize, \r\na hurricane watch was in place and the government also closed schools \r\nand sent workers home early Monday. Jerry Jarrell, the weather \r\ncenter director, said Mitch was the strongest hurricane to strike \r\nthe Caribbean since 1988, when Gilbert killed more than 300 people. Maria \r\nGonzalez said she needed the gas to cook with when her firewood gets \r\nwet. In El Progreso, \r\n100 miles (160 kilometers) north of the Honduran capital of Tegucigalpa, \r\nthe army evacuated more than 5,000 people who live in low-lying banana \r\nplantations along the Ulua River, said Nolly Soliman, a resident. That meant the Honduran coast had been under hurricane \r\nconditions for more than a day. ``The hurricane has destroyed almost \r\neverything,'' said Mike Brown, a resident of Guanaja Island which \r\nwas within miles (kms) of the eye of the hurricane. At its, 4th graf pvs \r\n \r\nHurricane Mitch cut through the Honduran coast like a ripsaw Thursday, \r\nits devastating winds whirling for a third day through resort islands \r\nand mainland communities. About 10,000 residents fled to crowded shelters \r\nin schools, churches and firehouses. Only a few hotels and offices with their own \r\ngenerators had electricity. Wind-whipped waves almost buried some \r\nhouses near the shore. Police and soldiers patrolled the streets, and a few \r\npeople wandered amid the boarded-up houses. ``We couldn't go out on a boat, we couldn't go snorkeling. The U.S. \r\nAgency for International Development sent two helicopters each to \r\nBelize and Honduras to help in search, rescue and relief efforts. At least 231 people have been confirmed dead in Honduras from former-hurricane \r\nMitch, bringing the storm's death toll in the region to 357, the National \r\nEmergency Commission said Saturday. By late \r\nSunday, Mitch's winds, once near 180 mph (290 kph), had dropped to \r\nnear 30 mph (50 kph), and the storm _ now classified as a tropical \r\ndepression _ was near Tapachula, on Mexico's southern Pacific coast \r\nnear the Guatemalan border. That is in addition to least another 600 people elsewhere in the country, \r\nBolanos said. But its own regional affiliate in \r\nSan Miguel province reported 125 dead there alone. Guatemala reported \r\n100 storm-related deaths. the EU approved for the region on Friday. Pope John Paul II appealed for aid Wednesday for the Central American \r\ncountries stricken by hurricane Mitch and said he feels close to the \r\nthousands who are suffering. Among those attending the audience were six \r\nRussian cosmonauts taking a special course in Italy. President Carlos Flores declared Hurricane Mitch had \r\nset back Honduras' development by 50 years. In the capital, Tegucigalpa, Mexican rescue teams began searching \r\nfor avalanche victims. ``We have more access to places affected by the storm,'' Urbizo explained. El Salvador reported 239 dead; Guatemala said 194 of its \r\npeople had been killed. Hillary Rodham Clinton also will \r\ntravel to the region, visiting Nicaragua and Honduras on Nov. 16. The estimated \r\nnumber of homeless dropped from 580,000 to 569,000 Thursday. ``Our problem is that \r\nall of the country has been affected,'' he said. Dozens \r\nof homes were swept into the river. Foreign aid and pledges of assistance poured into Central America, \r\nbut damage to roads and bridges reduced the amount of supplies reaching \r\nhundreds of isolated communities to a trickle: only as much as could \r\nbe dropped from a helicopter, when the aircraft can get through. In many \r\nnearby villages, residents have gone days without potable water or \r\nfood. ``It's a coincidence that the ships \r\nare there but they've got men and equipment that can be put to work \r\nin an organized way,'' said International Development Secretary Clare \r\nShort."
original_summary = df.iloc[1,1]

In [None]:
efficiency_dict = calculate_efficiency(predicted_summary,original_summary)

In [38]:
df_sum = pd.DataFrame(list_of_summaries)

In [39]:
scores = rouge_scores_df(df,df_sum)

100%|██████████| 49/49 [01:16<00:00,  1.57s/it]


In [40]:
avg_scores = df_avg_by_column(scores)

In [41]:
avg_scores

rouge1_fmeasure        tensor(0.1965)
rouge1_precision       tensor(0.2541)
rouge1_recall          tensor(0.1665)
rouge2_fmeasure        tensor(0.0203)
rouge2_precision       tensor(0.0269)
rouge2_recall          tensor(0.0169)
rougeL_fmeasure        tensor(0.1114)
rougeL_precision       tensor(0.1474)
rougeL_recall          tensor(0.0931)
rougeLsum_fmeasure     tensor(0.1796)
rougeLsum_precision    tensor(0.2329)
rougeLsum_recall       tensor(0.1518)
dtype: object