In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer and stopword list
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

# Original sentences
original_sentences = ["The quick brown fox jumps over the lazy dog.",
                       "This is a sentence.",
                       "Another sentence here.",
                       "This is a short sentence.",
                       "The dog is lazy."]

# Preprocess the sentences
filtered_sentences = []
for sentence in original_sentences:
    # Tokenize the sentence
    words = nltk.word_tokenize(sentence.lower())
    
    # Remove stopwords and lemmatize the remaining words
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
    
    # Convert the filtered words back to a sentence and add it to the list of filtered sentences
    filtered_sentence = ' '.join(filtered_words)
    filtered_sentences.append(filtered_sentence)

In [15]:
import torch
from transformers import BertModel, BertTokenizer

class BERTSUM(torch.nn.Module):
    def __init__(self):
        super(BERTSUM, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        scores = self.linear(pooled_output)
        return scores

def bertsum_extractive_summarization(original_sentences, filtered_sentences, size):
    
    # Set the random seed for PyTorch
    torch.manual_seed(42)
    # Tokenize original sentences
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # input_ids = tokenizer(original_sentences, padding=True, truncation=True, return_tensors='pt').input_ids
    # attention_mask = tokenizer(original_sentences, padding=True, truncation=True, return_tensors='pt').attention_mask
    
    # Tokenize filtered sentences
    input_ids_filtered = tokenizer(filtered_sentences, padding=True, truncation=True, return_tensors='pt').input_ids
    attention_mask_filtered = tokenizer(filtered_sentences, padding=True, truncation=True, return_tensors='pt').attention_mask
    
    # Generate BERTSUM scores for filtered sentences
    model = BERTSUM()
    model.eval()
    scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))

    # Sort scores in descending order
    sorted_scores, sorted_indices = torch.sort(scores.view(-1), descending=True)

    # Select top "size" sentences with highest scores
    top_indices = sorted_indices[:size].tolist()

    # Generate final summary
    summary = ""
    for index in top_indices:
        summary += original_sentences[index] + " "
            
    return summary



In [16]:
summary1 = bertsum_extractive_summarization(original_sentences, filtered_sentences, 2)
summary2 = bertsum_extractive_summarization(original_sentences, filtered_sentences, 2)
summary3 = bertsum_extractive_summarization(original_sentences, filtered_sentences, 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))
Some weights of the model checkpoint at bert-base-uncased were no

In [17]:
print(summary2)
print(summary1)
print(summary3)

This is a sentence. This is a short sentence. 
This is a sentence. This is a short sentence. 
This is a sentence. This is a short sentence. 


In [18]:
summary1 = bertsum_extractive_summarization(sentences[0], processed_sentences[0], 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))


In [19]:
summary1

"``Some (opposition politicians) have wanted the \r\nUnited Nations to help guarantee a safe return for them.'' It was one of the main \r\nstumbling blocks in months of discord. "

In [5]:
import pandas as pd
df = pd.read_csv("DUC\Duc_first_reference\main_dataset\Duc_dataset_first_ref_summary.csv")
df = df["Original Article"]

In [6]:
from preprocessing_algorithms import *

In [7]:
sentences, processed_sentences =process_one_column_df(df)

100%|██████████| 49/49 [02:04<00:00,  2.53s/it]


In [11]:
from summarization_algorithm import *

In [12]:
df = summarize_with(list_of_filtered_articles = processed_sentences, list_of_articles=sentences,summary_algorithm =bertsum_extractive_summarization ,size = 5)

  0%|          | 0/49 [00:00<?, ?it/s]Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))
  2%|▏         | 1/49 [00:3

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 250699776 bytes.

In [21]:
list_of_summaries = []
for i in range(49):
    list_of_summaries.append(bertsum_extractive_summarization(processed_sentences[i],sentences[i],5))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scores = model(torch.tensor(input_ids_filtered), torch.tensor(attention_mask_filtered))
Some weights of the model checkpoint at bert-base-uncased were no