In [37]:
import time
import numpy as np
import torch
from exBERT import BertTokenizer
from nltk.tokenize import sent_tokenize
import torch.nn as nn
from torch.nn.init import xavier_uniform_
from exBERT import BertModel, BertConfig
from SUMM.encoder import ExtTransformerEncoder
from tokenizers.processors import TemplateProcessing


class Bert(nn.Module):
    def __init__(self, config_2 = 'config/bert_config_ex_s3.json'):
        super(Bert, self).__init__()
        # self.bert_type = bert_type
        # if bert_type == 'bertbase':
        bert_config_1 = BertConfig.from_json_file('config/bert_config.json')
        bert_config_2 = BertConfig.from_json_file(config_2)

        self.model = BertModel(bert_config_1, bert_config_2)
      
    def forward(self, x, segs, mask):
        top_vec, second_augment = self.model(x, attention_mask=mask, token_type_ids=segs)   ## top_vec = //// self.bert(src, segs, mask_src)
        print('@@@@@', top_vec)
        print('@@@@@second_augment', second_augment)
        print('@@@@@@@@ top_vec size', len(top_vec))
        print('@@@@@second_augment size', second_augment)

        if len(top_vec) == 0: #debug
            print("top_vec is empty") #debug
            print(top_vec) #debug
        else: #debug
            print("top_vec is not empty") #debug
        return top_vec

class ExtSummarizer(nn.Module):
    def __init__(self, device = 'cpu', checkpoint = None):
        super().__init__()
        self.device = device
        self.bert = Bert()
        self.load_state_dict(checkpoint, strict = False)

        
        self.ext_layer = ExtTransformerEncoder(
            self.bert.model.config.hidden_size, d_ff=2048, heads=8, dropout=0.2, num_inter_layers=2
        )
        
        # print(self) #debug
        
        # print('ext_layer', self.ext_layer) #debug
        # print('checkpoint', checkpoint)  #debug
        #self.load_state_dict(checkpoint, strict = False)
        # new_model.load_state_dict(checkpoint['state_dict'], strict=False)

        # Match and load BERT layers
        for model_layer_name, model_layer in self.bert.named_modules():
            if model_layer_name in checkpoint:
                model_layer.load_state_dict(checkpoint[model_layer_name])
        
        # Match and load ExtTransformerEncoder layers
        for model_layer_name, model_layer in self.ext_layer.named_modules():
            if model_layer_name in checkpoint:
                model_layer.load_state_dict(checkpoint[model_layer_name])
        
        # Move the entire model (including the BERT model) to the specified device
        self.to(device)
        # print('print out ExtSummarizer', self) #debug


    def forward(self, src, segs, clss, mask_src, mask_cls):
        # top_vec = self.bert(src, segs, mask_src) ###<----empty WHYYYYY!!!!! KILL ME PLEASE
        print('----->check:src', src)
        print('----->check:segs', segs)
        print('----->check:mask_src', mask_src )
        
        top_vec = self.bert(src, segs, mask_src) 
        # print('EXTTTTT top_vec', top_vec)
        print('EXTTTTT len top_vec', len(top_vec))
        print('EXTTTTT[0]len top_vec ', len(top_vec[0]))#.size())
        
        print('EXTTTTT[1] len top_vec ', len(top_vec[1]))#.size())
        print('type of top_vec', type(top_vec))
        print('type of[0] top_vec', type(top_vec[0]))
        print('type of[1] top_vec', type(top_vec[1]))
        
        print('len[0] [0] top_vec size ', len(top_vec[0][0]))
        print('len[1] [0] top_vec size ', len(top_vec[1][0]))



        # # Check if top_vec is a tuple and extract the tensor from it
        # if isinstance(top_vec, tuple):
        # # Assuming the tensor is the first element of the tuple
        #     top_vec = top_vec[0]

        sents_vec = top_vec[torch.arange(top_vec[1].size(0)).unsqueeze(1), clss]
        print('sents_vec', sents_vec)
        print('sents_vec shape', sents_vec.shape)
        sents_vec = sents_vec * mask_cls[:, :, None].float()
        sent_scores = self.ext_layer(sents_vec, mask_cls).squeeze(-1)
        return sent_scores, mask_cls
    
# class AbsSummarizer(nn.Module):
#     def __init__(self, device = 'cpu', checkpoint_pth='./results/bert_ex_s3/Best_stat_dic_exBERTe2_b32_lr1e-05.pth', bert_from_extractive=None, bert_type='bertbase') :
#         super().__init__()
#         self.device = device
#         self.bert = Bert(bert_type=bert_type)
        
#         # if bert_from_extractive is not None:
#         #     self.bert.model.load_state_dict(
#         #         dict([(n[11:], p) for n, p in bert_from_extractive.items() if n.startswith('bert.model')]), strict=True)
        
#         self.vocab_size = self.bert.model.config.vocab_size
#         tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
        
#         self.decoder = TransformerDecoder(
#             self.args.dec_layers,
#             self.args.dec_hidden_size, heads=self.args.dec_heads,
#             d_ff=self.args.dec_ff_size, dropout=self.args.dec_dropout, embeddings=tgt_embeddings)
        
#         self.generator = get_generator(self.vocab_size, self.args.dec_hidden_size, device)
#         self.generator[0].weight = self.decoder.embeddings.weight
        
#         self.load_state_dict(checkpoint_pth['model'], strict=True)
        
#         self.to(device)


In [38]:
def preprocess(source_fp):
    """
    - Remove \n
    - Sentence Tokenize
    - Add [SEP] [CLS] as sentence boundary
    """
    with open(source_fp) as source:
        raw_text = source.read().replace("\n", " ").replace("[CLS] [SEP]", " ")
    sents = sent_tokenize(raw_text)
    # print('nltk--sent_tokenize', sents) # debug
    processed_text = "[CLS] [SEP]".join(sents)
    print('processed_text', processed_text)
    return processed_text, len(sents)


def load_text(processed_text, max_pos, device):
    # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tok = BertTokenizer("data/interim/custom_bert_vocab.txt")

    # tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", tokenizer.convert_tokens_to_ids("[CLS]")),
    #         ("[SEP]", tokenizer.convert_tokens_to_ids("[SEP]")),
    #         ],
    #     )   
    # tokenizer.post_processor = post_processor

    sep_vid = tok.vocab["[SEP]"]
    # print('sep_vid', sep_vid) # debug
    cls_vid = tok.vocab["[CLS]"]
    # print('cls_vid', cls_vid) #debug

    def _process_src(raw):
        # raw = raw.strip().lower()
        # raw = raw.replace("[cls]", "[CLS]").replace("[sep]", "[SEP]")
        print('raw', raw)
        
        src_subtokens = tok.tokenize(raw)
        print('src_subtokens_1', src_subtokens)
        
        # tokens = ['[', 'cl', '##s', ']', '[', 'sep', ']']
        new_tokens = ['[CLS]' if token in ['[', 'cl', '##s', ']'] else '[SEP]' if token in ['[', 'sep', ']'] else token for token in src_subtokens]
        new_tokens = [new_tokens[0]] + [token for i, token in enumerate(new_tokens[1:], 1) if not (token == '[CLS]' and new_tokens[i - 1] == '[CLS]')]

        print('new_tokens', new_tokens)
        
        src_subtokens = ["[CLS]"] + new_tokens + ["[SEP]"]
        src_subtoken_idxs = tok.convert_tokens_to_ids(src_subtokens)
        # Original list of tokens

        print('src_subtoken_idxs', src_subtoken_idxs)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:max_pos]
        src_subtoken_idxs[-1] = sep_vid
        print('src_subtokens', src_subtokens)
        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        
        segments_ids = []
        segs = segs[:max_pos]
        for i, s in enumerate(segs):
            if i % 2 == 0:
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]
        print('segments_ids', segments_ids)
        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = ((~(src == 0)).float()).to(device)
        cls_ids = [[i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid]]
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = (~(clss == -1)).float()
        clss[clss == -1] = 0
        return src, mask_src, segments_ids, clss, mask_cls

    src, mask_src, segments_ids, clss, mask_cls = _process_src(processed_text)
    segs = torch.tensor(segments_ids)[None, :].to(device)
    src_text = [[sent.replace("[SEP]", "").strip() for sent in processed_text.split("[CLS]")]]
    return src, mask_src, segs, clss, mask_cls, src_text

def test(model, input_data, result_path, max_length, block_trigram=True):
    def _get_ngrams(n, text):
        ngram_set = set()
        text_length = len(text)
        max_index_ngram_start = text_length - n
        for i in range(max_index_ngram_start + 1):
            ngram_set.add(tuple(text[i : i + n]))
        return ngram_set

    def _block_tri(c, p):
        tri_c = _get_ngrams(3, c.split())
        for s in p:
            tri_s = _get_ngrams(3, s.split())
            if len(tri_c.intersection(tri_s)) > 0:
                return True
        return False

    with open(result_path, "w") as save_pred:
        with torch.no_grad():
            src, mask, segs, clss, mask_cls, src_str = input_data
            print('test-----src ',input_data[0])
            print('test-----index 1',input_data[1])
            print('test-----seg ',input_data[2])
            print('test-----index 3',input_data[3])
            print('test-----index 4',input_data[4])
            print('test-----index 5',input_data[5])
            
            
   
            sent_scores, mask = model(src, segs, clss, mask, mask_cls)
            print('test-----sent_scores',sent_scores)
            print('test-----mask',mask)
            print('test-----', model)
            
            
            
            sent_scores = sent_scores + mask.float()
            sent_scores = sent_scores.cpu().data.numpy()
            selected_ids = np.argsort(-sent_scores, 1)

            pred = []
            for i, idx in enumerate(selected_ids):
                _pred = []
                if len(src_str[i]) == 0:
                    continue
                for j in selected_ids[i][: len(src_str[i])]:
                    if j >= len(src_str[i]):
                        continue
                    candidate = src_str[i][j].strip()
                    if block_trigram:
                        if not _block_tri(candidate, _pred):
                            _pred.append(candidate)
                    else:
                        _pred.append(candidate)

                    if len(_pred) == max_length:
                        break

                _pred = " ".join(_pred)
                pred.append(_pred)

            for i in range(len(pred)):
                save_pred.write(pred[i].strip() + "\n")


def summarize(raw_txt_fp, result_fp, model, max_length=2, max_pos=512, return_summary=True):
    model.eval()
    processed_text, full_length = preprocess(raw_txt_fp)
    input_data = load_text(processed_text, max_pos, device="cpu")
    print('src ',input_data[0])
    print('index 1',input_data[1])
    print('seg ',input_data[2])
    print('index 3',input_data[3])
    print('index 4',input_data[4])
    print('index 5',input_data[5])

 
           # src, mask, segs, clss, mask_cls, src_str = input_data


    test(model, input_data, result_fp, max_length, block_trigram=True)
    if return_summary:
        return open(result_fp).read().strip()
        

In [39]:
# import torch
# from SUMM.model_builder import ExtSummarizer
# from ext_sum import summarize

# Load model
checkpoint_pth='./results/bert_ex_s3/Best_stat_dic_exBERTe2_b32_lr1e-05.pth'
# checkpoint_pth='/Users/camilleko/NLP_summarization/results/bert_C_ex_s3/model_step_100.pt'
# checkpoint_url = './results/bert_ex_s3/Best_stat_dic_exBERTe2_b32_lr1e-05.pth'
# checkpoint_url = './results/bertext_cnndm_transformer.pt'
device = 'cpu'
checkpoint = torch.load(checkpoint_pth, map_location=device)
# for k,v in checkpoint['opt']:
#     print(k)
model = ExtSummarizer(device=device , checkpoint = checkpoint )
# model = ExtSummarizer()x


# Run summarization
input_fp = 'input.txt'
result_fp = 'results/summary.txt'
summary = summarize(input_fp, result_fp, model, max_length=3)
print(summary)

processed_text (CNN) Over and over again in 2018, during an apology tour that took him from the halls of the US Congress to an appearance before the European Parliament, Mark Zuckerberg said Facebook had failed to "take a broad enough view of our responsibilities.[CLS] [SEP]But two years later, Zuckerberg and Facebook are still struggling with their responsibilities and how to handle one of their most famous users: President Donald Trump.[CLS] [SEP]Despite Zuckerberg having previously indicated any post that "incites violence" would be a line in the sand — even if it came from a politician — Facebook remained silent for hours Friday after Trump was accused of glorifying violence in posts that appeared on its platforms.[CLS] [SEP]At 12:53am ET on Friday morning, as cable news networks carried images of fires and destructive protests in Minneapolis, the President tweeted : "These THUGS are dishonoring the memory of George Floyd, and I won't let that happen.[CLS] [SEP]Just spoke to Govern

TypeError: arange(): argument 'end' (position 1) must be Number, not Tensor

In [None]:
import torch
import torch.nn as nn
from pytorch_transformers import BertModel, BertConfig
from SUMM.encoder import ExtTransformerEncoder  # Import your custom modules

model_name = "bert-base-uncased"  # You can choose a different pre-trained model

class CustomBertWithExtractiveSummarization(nn.Module):
    def __init__(self, bert_config_path, ext_layer_config):
        super(CustomBertWithExtractiveSummarization, self).__init__()
        
        # Load the BERT model
        self.bert = BertModel.from_pretrained(model_name)
        
        # Initialize the extractive summarization layer
        self.ext_layer = ExtTransformerEncoder(
            self.bert.config.hidden_size, d_ff=2048, heads=8, dropout=0.2, num_inter_layers=2
        )
    
    def forward(self, input_ids, attention_mask):
        # Get BERT embeddings
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        
        # Apply extractive summarization layer
        ext_sum_outputs = self.ext_layer(bert_outputs[0], attention_mask)
        
        return ext_sum_outputs

# Load the checkpoint from a file
checkpoint_pth = '/Users/camilleko/NLP_summarization/results/bert_C_ex_s3/model_step_100.pt'
device = 'cpu'  # You can change this to 'cuda' if you have a GPU

# Load the checkpoint data
checkpoint = torch.load(checkpoint_pth, map_location=device)

# Extract the 'model' from the checkpoint
model = CustomBertWithExtractiveSummarization(model_name)


ModuleNotFoundError: No module named 'models'

In [None]:
###########
from preprocess import data_builder
from models import data_loader
from models.data_loader import load_text
from models.model_builder import ExtSummarizer
from models.trainer import build_trainer
from models.data_loader import load_text
from models.model_builder import ExtSummarizer
from models.trainer import build_trainer
import argparse
import os
import json
from datetime import datetime

def load_your_data():
    # Replace this with your data loading code.
    # Your data should be in the format required by PreSumm.
    # You need to create a list of documents for summarization.
    # Each document should be a string.

    your_documents = [
        "Your first document goes here.",
        "Your second document goes here.",
        # Add more documents as needed.
    ]

    return your_documents

def extractive_summarization(your_documents):
    # Load the pretrained model checkpoint
    pretrained_path = "/path/to/bertext_cnndm_transformer"  # Replace with your actual path
    checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage)

    # Initialize the summarizer
    model = ExtSummarizer(device="cpu", checkpoint=checkpoint)

    # Prepare your data for summarization
    documents = [{"src": doc, "labels": [""]} for doc in your_documents]

    # Build data loader
    train_iter = data_loader.Dataloader(args, documents, model.vocab, device="cpu", shuffle=False)
    trainer = build_trainer(args, device_id, model, None)

    # Summarize your documents
    for batch in train_iter:
        with torch.no_grad():
            trainer.test(batch)

    # Get the extracted summaries
    all_generated_ids, _, _ = trainer.get_output_texts()

    # Print the summaries
    for i, summary_ids in enumerate(all_generated_ids):
        summary = model.decode(summary_ids, model.vocab)
        print(f"Document {i + 1} Summary: {summary}")

if __name__ == '__main__':
    your_documents = load_your_data()
    extractive_summarization(your_documents)
