# SummaRuNNer with BERT on ECT
## Usage
Run all the cells in order:
- Setup
- Write Files
    - RNN_RNN.py
    - Vocab.py
    - main.py
- Training
- Testing

To change hyperparameters:
- Edit main.py (3rd cell in Write Files)
- Or specify them as arguments in the training command

## Setup

In [None]:
!git clone https://github.com/hpzhao/SummaRuNNer
# SECURITY: a GitHub personal access token used to be embedded in the URL below.
# That token must be treated as leaked and revoked. Never commit credentials to a
# notebook; if the repository is private, authenticate through a git credential
# helper or a secret injected at runtime instead.
!git clone https://github.com/abhinav-bohra/Long-Text-Summarization.git

Cloning into 'SummaRuNNer'...
remote: Enumerating objects: 431, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 431 (delta 40), reused 39 (delta 39), pack-reused 385[K
Receiving objects: 100% (431/431), 179.31 MiB | 13.70 MiB/s, done.
Resolving deltas: 100% (222/222), done.
Checking out files: 100% (27/27), done.
Cloning into 'Long-Text-Summarization'...
remote: Enumerating objects: 8148, done.[K
remote: Counting objects: 100% (4006/4006), done.[K
remote: Compressing objects: 100% (3995/3995), done.[K
remote: Total 8148 (delta 17), reused 3990 (delta 8), pack-reused 4142[K
Receiving objects: 100% (8148/8148), 82.48 MiB | 11.23 MiB/s, done.
Resolving deltas: 100% (83/83), done.
Checking out files: 100% (11917/11917), done.


In [None]:
# Copy the prepared SummaRuNNer-format data files from the cloned
# Long-Text-Summarization repo into the SummaRuNNer data directory, then make
# the SummaRuNNer checkout the working directory for the remaining cells.
!cp /content/Long-Text-Summarization/data/reuters/summarunner/* /content/SummaRuNNer/data/
%cd /content/SummaRuNNer

/content/SummaRuNNer


In [None]:
# Install the Python dependencies imported by the files written below:
# transformers (BERT model and tokenizer), rouge and tqdm (progress bars).
!pip install -q transformers
!pip install rouge
!pip install tqdm

[K     |████████████████████████████████| 4.2 MB 4.3 MB/s 
[K     |████████████████████████████████| 84 kB 3.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 31.2 MB/s 
[K     |████████████████████████████████| 596 kB 58.4 MB/s 
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


## Write Files

### Abhinav

In [None]:
# SummaRuNNer with BERT models/RNN_RNN.py
%%writefile models/RNN_RNN.py

from .BasicModule import BasicModule
import subprocess as sp
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from transformers import BertTokenizer, BertModel

import logging
# Raise the threshold of the named logger to ERROR.
# NOTE(review): the logger name belongs to the legacy pytorch_pretrained_bert
# package, while this module imports from transformers — confirm it still has
# any effect.
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)

# Per-direction hidden size of the sentence-level GRU; the classifier layers
# below are sized 2*HH because the GRU is bidirectional.
HH = 256

class RNN_RNN(BasicModule):
    """Sentence-level extractive scorer built on BERT sentence vectors.

    Every sentence is encoded with BERT and represented by the last-layer
    hidden state of its first token ([CLS]). The sentence vectors of each
    document are fed to a bidirectional GRU and max-pooled into a document
    vector; each sentence then gets a selection probability from content,
    salience, novelty and absolute/relative position terms.
    """

    def __init__(self, args, embed=None):
        """Build the BERT encoder, the sentence GRU and the classifier.

        Args:
            args: namespace providing ``seg_num``, ``pos_num`` and ``pos_dim``
                (it is also handed to the base class).
            embed: unused; kept so callers that pass it keep working.
        """
        super(RNN_RNN, self).__init__(args)
        self.model_name = 'RNN_RNN'
        self.args = args

        S = args.seg_num
        P_V = args.pos_num
        P_D = args.pos_dim
        self.abs_pos_embed = nn.Embedding(P_V, P_D)
        self.rel_pos_embed = nn.Embedding(S, P_D)

        self.bert_m = BertModel.from_pretrained('bert-base-cased')

        # Freeze every BERT parameter tensor except the last 66, which stay
        # trainable (presumably the top encoder layers plus pooler — confirm).
        for name, param in list(self.bert_m.named_parameters())[:-66]:
            param.requires_grad = False

        self.sent_RNN = nn.GRU(
            input_size=768,
            hidden_size=HH,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(2 * HH, 2 * HH)

        # Parameters of Classification Layer
        self.content = nn.Linear(2 * HH, 1, bias=False)
        self.salience = nn.Bilinear(2 * HH, 2 * HH, 1, bias=False)
        self.novelty = nn.Bilinear(2 * HH, 2 * HH, 1, bias=False)
        self.abs_pos = nn.Linear(P_D, 1, bias=False)
        self.rel_pos = nn.Linear(P_D, 1, bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))

    def max_pool1d(self, x, seq_lens):
        """Max-pool each sequence over its first ``seq_lens[i]`` steps.

        Args:
            x: tensor indexed as [N, L, O_in].
            seq_lens: valid length of each of the N sequences.

        Returns:
            Tensor of shape [N, O_in].
        """
        out = []
        for index, t in enumerate(x):
            t = t[:seq_lens[index], :]
            t = torch.t(t).unsqueeze(0)
            out.append(F.max_pool1d(t, t.size(2)))

        out = torch.cat(out).squeeze(2)
        return out

    def avg_pool1d(self, x, seq_lens):
        """Average-pool each sequence over its first ``seq_lens[i]`` steps.

        Args:
            x: tensor indexed as [N, L, O_in].
            seq_lens: valid length of each of the N sequences.

        Returns:
            Tensor of shape [N, O_in].
        """
        out = []
        for index, t in enumerate(x):
            t = t[:seq_lens[index], :]
            t = torch.t(t).unsqueeze(0)
            out.append(F.avg_pool1d(t, t.size(2)))

        out = torch.cat(out).squeeze(2)
        return out

    def forward(self, input_ids, attention_masks, doc_lens):
        """Score every sentence of every document in the batch.

        Args:
            input_ids: token ids, one row per sentence, documents concatenated.
            attention_masks: attention mask matching ``input_ids``.
            doc_lens: number of sentences of each document, in batch order.

        Returns:
            1-D tensor with one probability per sentence, in input order.
        """
        outputs = self.bert_m(input_ids=input_ids, attention_mask=attention_masks)
        # [CLS] (first token) vector of each sentence, taken with one slice
        # instead of concatenating row by row: [N, 768].
        emb = outputs.last_hidden_state[:, 0, :]

        # pad_doc is inherited from BasicModule (not shown here); it is expected
        # to regroup the flat sentence matrix into a padded per-document batch.
        x = self.pad_doc(emb, doc_lens)
        sent_out = self.sent_RNN(x)[0]
        docs = self.max_pool1d(sent_out, doc_lens)
        del emb
        # Kept from the original code, presumably to lower peak GPU memory.
        torch.cuda.empty_cache()

        device = sent_out.device
        # Highest valid row of each position table. Positions are clamped to
        # these so long documents (doc_trunc may exceed pos_num) cannot index
        # past the embedding tables.
        max_abs_index = self.abs_pos_embed.num_embeddings - 1
        max_rel_index = self.rel_pos_embed.num_embeddings - 1

        probs = []
        for index, doc_len in enumerate(doc_lens):
            valid_hidden = sent_out[index, :doc_len, :]                      # (doc_len,2*H)
            doc = torch.tanh(self.fc(docs[index])).unsqueeze(0)
            # Running summary representation, created on the same device and
            # dtype as the GRU output.
            s = sent_out.new_zeros(1, 2 * HH)
            for position, h in enumerate(valid_hidden):
                h = h.view(1, -1)                                            # (1,2*H)
                # get position embeddings
                abs_index = torch.tensor([[min(position, max_abs_index)]],
                                         dtype=torch.long, device=device)
                abs_features = self.abs_pos_embed(abs_index).squeeze(0)

                # Relative position quantised to 0..9 (ten segments).
                rel_index = int(round((position + 1) * 9.0 / doc_len))
                rel_index = torch.tensor([[min(rel_index, max_rel_index)]],
                                         dtype=torch.long, device=device)
                rel_features = self.rel_pos_embed(rel_index).squeeze(0)

                # classification layer
                content = self.content(h)
                salience = self.salience(h, doc)
                novelty = -1 * self.novelty(h, torch.tanh(s))
                abs_p = self.abs_pos(abs_features)
                rel_p = self.rel_pos(rel_features)
                prob = torch.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
                s = s + torch.mm(prob, h)
                probs.append(prob)
        del sent_out
        del docs
        torch.cuda.empty_cache()
        # view(-1) keeps the result 1-D even when the batch holds one sentence
        # (a bare squeeze() would return a 0-dim tensor in that case).
        return torch.cat(probs).view(-1)

Overwriting models/RNN_RNN.py


In [None]:
# SummaRuNNer with BERT utils/Vocab.py
%%writefile utils/Vocab.py

import torch
from transformers import BertTokenizer, BertModel
# Module-level tokenizer shared by all Vocab methods; loaded once at import time.
# The checkpoint name matches the BERT model loaded in models/RNN_RNN.py.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

import logging
# Raise the threshold of the named logger to ERROR.
# NOTE(review): the logger name belongs to the legacy pytorch_pretrained_bert
# package, while this module imports from transformers — confirm it still has
# any effect.
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)

class Vocab():
    """Turn batches of raw documents into BERT inputs for the summarizer.

    ``embed`` and ``word2id`` are stored for interface compatibility only;
    tokenization is done by the module-level BERT ``tokenizer``.
    """

    def __init__(self,embed,word2id):
        self.embed = embed
        self.word2id = word2id

    def _encode(self, sents, max_length):
        """Tokenize sentences into fixed-length id and attention-mask tensors.

        Args:
            sents: list of sentence strings.
            max_length: every sentence is padded or truncated to this length.

        Returns:
            ``(input_ids, attention_masks)``, one row per sentence.

        Raises:
            ValueError: if ``sents`` is empty.
        """
        if not sents:
            raise ValueError('no sentences to encode')
        input_ids = []
        attention_masks = []
        for sent in sents:
            encoded_dict = tokenizer.encode_plus(
                sent,
                add_special_tokens=True,      # add '[CLS]' and '[SEP]'
                max_length=max_length,
                truncation=True,              # rows must share one length for torch.cat
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

    def make_features(self,batch,sent_trunc=50,doc_trunc=800,split_token='\n'):
        """Build model inputs and targets for a labelled batch.

        Args:
            batch: mapping with ``'doc'``, ``'labels'`` and ``'summaries'``
                entries of equal length; sentences and labels inside an entry
                are separated by ``split_token``.
            sent_trunc: unused; kept for backward compatibility.
            doc_trunc: maximum number of sentences kept per document.
            split_token: separator between sentences / labels.

        Returns:
            ``(input_ids, attention_masks, targets, summaries, doc_lens)``.
            All five describe the same documents: a document whose labels
            cannot be parsed as integers is dropped from every one of them.
        """
        sents_list, targets, doc_lens, summaries = [], [], [], []
        for doc, label, summary in zip(batch['doc'], batch['labels'], batch['summaries']):
            sents = doc.split(split_token)
            try:
                labels = [int(l) for l in label.split(split_token)]
            except ValueError:
                # Best effort: skip the malformed document but leave a trace.
                logging.warning('Skipping a document whose labels are not all integers')
                continue
            max_sent_num = min(doc_trunc, len(sents))
            sents = sents[:max_sent_num]
            labels = labels[:max_sent_num]
            sents_list += sents
            targets += labels
            doc_lens.append(len(sents))
            summaries.append(summary)

        # Tokenize exactly the sentences that were kept above, so the rows of
        # input_ids line up with targets and doc_lens.
        input_ids, attention_masks = self._encode(sents_list, max_length=64)
        targets = torch.LongTensor(targets)

        return input_ids,attention_masks,targets,summaries,doc_lens

    def make_predict_features(self, batch, sent_trunc=150, doc_trunc=300, split_token='. '):
        """Build model inputs for unlabelled documents.

        Args:
            batch: iterable of document strings.
            sent_trunc: unused; kept for backward compatibility.
            doc_trunc: maximum number of sentences kept per document.
            split_token: separator used to split a document into sentences.

        Returns:
            ``(input_ids, attention_masks, doc_lens)``.
        """
        sents_list, doc_lens = [],[]
        for doc in batch:
            sents = doc.split(split_token)
            max_sent_num = min(doc_trunc,len(sents))
            sents = sents[:max_sent_num]
            sents_list += sents
            doc_lens.append(len(sents))

        input_ids, attention_masks = self._encode(sents_list, max_length=128)

        return  input_ids,attention_masks,doc_lens

Overwriting utils/Vocab.py


In [None]:
# SummaRuNNer with BERT main.py
%%writefile main.py

#!/usr/bin/env python3

import subprocess as sp
import os
import json
import models
import utils
import argparse,random,logging,numpy,os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from torch.nn.utils import clip_grad_norm
from time import time
from tqdm import tqdm

import warnings
# Module-level configuration. Everything below runs at import time: warnings
# are silenced, logging is configured, the command line is parsed into the
# global `args`, and the random seeds are fixed.
warnings.filterwarnings("ignore")
# NOTE(review): this logger name belongs to the legacy pytorch_pretrained_bert
# package, while this file imports from transformers — confirm it has any effect.
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)

logging.basicConfig(level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
parser = argparse.ArgumentParser(description='extractive summary')
# model
parser.add_argument('-save_dir',type=str,default='checkpoints/')
# embed_dim / embed_num are overwritten with None inside train().
parser.add_argument('-embed_dim',type=int,default=100)
parser.add_argument('-embed_num',type=int,default=100)
parser.add_argument('-pos_dim',type=int,default=100)
parser.add_argument('-pos_num',type=int,default=300)
parser.add_argument('-seg_num',type=int,default=10)
parser.add_argument('-kernel_num',type=int,default=100)
parser.add_argument('-kernel_sizes',type=str,default='3,4,5')
# Name of the class looked up in the `models` package.
parser.add_argument('-model',type=str,default='RNN_RNN')
parser.add_argument('-hidden_size',type=int,default=200)
# train
parser.add_argument('-lr',type=float,default=1e-5) #discuss
parser.add_argument('-batch_size',type=int,default=32)
parser.add_argument('-epochs',type=int,default=15)
parser.add_argument('-seed',type=int,default=1)
parser.add_argument('-train_dir',type=str,default='data/train.json')
parser.add_argument('-val_dir',type=str,default='data/val.json')
# NOTE(review): -embedding and -word2id are only referenced by commented-out
# code in the functions below.
parser.add_argument('-embedding',type=str,default='data/embedding.npz')
parser.add_argument('-word2id',type=str,default='data/word2id.json')
# Number of training batches between validation runs (see train()).
parser.add_argument('-report_every',type=int,default=20)  #discuss 1350/8 = 168
parser.add_argument('-seq_trunc',type=int,default=50) #discuss
# Gradient-norm clipping threshold used in train().
parser.add_argument('-max_norm',type=float,default=1.0) #discuss
# test
parser.add_argument('-load_dir',type=str,default='checkpoints/RNN_RNN_seed_1.pt')
# parser.add_argument('-test_dir',type=str,default='data/val_ls_first100_without_scores.json')
parser.add_argument('-test_dir',type=str,default='data/test.json')
# Output directories for reference and generated summaries.
parser.add_argument('-ref',type=str,default='outputs/ref')
parser.add_argument('-hyp',type=str,default='outputs/hyp')
parser.add_argument('-filename',type=str,default='x.txt') # TextFile to be summarized
# Maximum number of sentences selected per document.
parser.add_argument('-topk',type=int,default=8) #discuss
# device
# No default: leaving -device unset means "run on CPU" (see use_gpu below).
parser.add_argument('-device',type=int)
# option
parser.add_argument('-test',action='store_true')
parser.add_argument('-debug',action='store_true')
parser.add_argument('-predict',action='store_true')
args = parser.parse_args()
# The GPU is used only when a device index was passed explicitly.
use_gpu = args.device is not None

if torch.cuda.is_available() and not use_gpu:
	print("WARNING: You have a CUDA device, should run with -device 0")

# set cuda device and seed
if use_gpu:
	torch.cuda.set_device(args.device)
torch.cuda.manual_seed(args.seed)
torch.manual_seed(args.seed)
random.seed(args.seed)
numpy.random.seed(args.seed)


def eval(net,vocab,data_iter,criterion):
	"""Return the mean loss of `net` over all batches of `data_iter`.

	The network is put in eval mode and run without gradient tracking; it is
	always switched back to train mode before returning, even if a batch fails.

	Args:
		net: model called as ``net(input_ids, attention_masks, doc_lens)``.
		vocab: object whose ``make_features(batch)`` builds the model inputs.
		data_iter: iterable of batches.
		criterion: loss called as ``criterion(probs, targets)``.

	Returns:
		The average of the per-batch loss values.

	Raises:
		ValueError: if `data_iter` yields no batches.
	"""
	total_loss = 0
	batch_num = 0
	net.eval()
	try:
		with torch.no_grad():
			for batch in data_iter:
				input_ids,attention_masks,targets,_,doc_lens = vocab.make_features(batch)
				targets = targets.float()
				if use_gpu:
					targets = targets.cuda()
					input_ids = input_ids.cuda()
					attention_masks = attention_masks.cuda()
				probs = net(input_ids,attention_masks,doc_lens)
				total_loss += criterion(probs,targets).item()
				batch_num += 1
	finally:
		# Restore training mode for the caller and release cached GPU blocks.
		net.train()
		torch.cuda.empty_cache()
	if batch_num == 0:
		raise ValueError('eval() received an empty data iterator; cannot compute a mean loss')
	return total_loss / batch_num

def train():
	"""Train the model selected by ``args.model`` with gradient accumulation.

	Reads one JSON object per line from ``args.train_dir`` and ``args.val_dir``.
	Every ``args.report_every`` batches the validation loss is computed with
	eval(); the model is saved through ``net.save()`` whenever that loss
	improves. Training stops at the start of an epoch once at least
	``patience`` consecutive evaluations brought no improvement.
	"""
	# Imported locally: the in-place variant replaces the deprecated
	# torch.nn.utils.clip_grad_norm imported at the top of this file.
	from torch.nn.utils import clip_grad_norm_

	logging.info('Loading vocab, train and val dataset. Wait a second, please')
	patience = 3
	# Number of micro-batches whose gradients are summed before one optimizer step.
	acc_steps = 16

	embed, word2id = None, None
	vocab = utils.Vocab(embed, word2id)

	with open(args.train_dir) as f:
		examples = [json.loads(line) for line in f]
	train_dataset = utils.Dataset(examples)

	with open(args.val_dir) as f:
		examples = [json.loads(line) for line in f]
	val_dataset = utils.Dataset(examples)

	# update args
	args.embed_num = None
	args.embed_dim = None
	args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]

	# build model
	net = getattr(models,args.model)(args,embed)
	if use_gpu:
		net.cuda()

	# load dataset
	train_iter = DataLoader(dataset=train_dataset,
			batch_size=args.batch_size,
			shuffle=True)
	val_iter = DataLoader(dataset=val_dataset,
			batch_size=args.batch_size,
			shuffle=False)

	# loss function
	criterion = nn.BCELoss()

	params = sum(p.numel() for p in net.parameters()) / 1e6
	print('#Params: %.1fM' % (params))

	min_loss = float('inf')
	optimizer = torch.optim.Adam(net.parameters(),lr=args.lr)
	net.train()

	def apply_step():
		# Clip the accumulated gradient once, right before it is applied.
		clip_grad_norm_(net.parameters(), args.max_norm)
		optimizer.step()
		optimizer.zero_grad()

	t1 = time()
	stale_evals = 0
	for epoch in tqdm(range(1,args.epochs+1)):
		logging.info(f"\nEpoch: {epoch}")
		# ">=" because several evaluations can happen inside one epoch, so the
		# counter may already be past the patience when it is checked here.
		if stale_evals >= patience:
			break
		optimizer.zero_grad()
		pending = 0		# micro-batches accumulated since the last optimizer step
		t_loss = 0
		s_loss = 0
		for i,batch in enumerate(train_iter):
			input_ids,attention_masks,targets,_,doc_lens = vocab.make_features(batch)
			targets = targets.float()
			if use_gpu:
				input_ids = input_ids.cuda()
				attention_masks = attention_masks.cuda()
				targets = targets.cuda()

			probs = net(input_ids,attention_masks,doc_lens)
			loss = criterion(probs,targets)
			t_loss = t_loss+loss.item()
			s_loss = s_loss+1
			# Scale so the accumulated gradient is an average over the window.
			(loss / acc_steps).backward()
			pending += 1
			# The extra step at i == report_every is kept from the original
			# schedule (it lands on the batch that triggers the first validation).
			if(((i+1) % acc_steps == 0) or (i== args.report_every)):
				apply_step()
				pending = 0
			if args.debug:
				# Logs the unscaled batch loss.
				logging.info(f'Batch ID:{i} Loss:{loss.item()}')
				continue
			if( (i % args.report_every == 0) and (i!=0)):
				cur_loss = eval(net,vocab,val_iter,criterion)
				train_loss = t_loss/s_loss
				t_loss = 0
				s_loss = 0
				if cur_loss < min_loss:
					stale_evals = 0
					min_loss = cur_loss
					net.save()
					logging.info('Model Checkpoint Saved')
				else:
					stale_evals = stale_evals+1

				logging.info('Epoch:%2d Min_Val_Loss: %f Cur_Val_Loss: %f training loss: %f'
						% (epoch,min_loss,cur_loss,train_loss))
		# Apply gradients left over from an incomplete accumulation window
		# instead of discarding them at the next epoch's zero_grad().
		if pending:
			apply_step()

	t2 = time()
	logging.info('Total Time:%f h'%((t2-t1)/3600))

def test():
	"""Write reference and predicted summaries for the test set.

	Loads the checkpoint at ``args.load_dir``, scores every sentence of every
	document in ``args.test_dir`` and, per document, keeps up to ``args.topk``
	sentences whose probability is at least 0.5. The reference summary goes to
	``args.ref`` and the selected sentences (in document order) to
	``args.hyp``, using the file names listed in data/test_files.txt.
	"""
	embed, word2id = None, None
	vocab = utils.Vocab(embed, word2id)

	#Loading Test File Names
	with open("/content/SummaRuNNer/data/test_files.txt") as f:
		file_names = [x.strip() for x in f]

	with open(args.test_dir) as f:
		examples = [json.loads(line) for line in f]
	test_dataset = utils.Dataset(examples)

	test_iter = DataLoader(dataset=test_dataset,
							batch_size=args.batch_size,
							shuffle=False)
	# NOTE(review): torch.load unpickles the stored objects (the checkpoint
	# holds an args namespace) — only load checkpoints from a trusted source.
	if use_gpu:
		checkpoint = torch.load(args.load_dir)
	else:
		checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

	# checkpoint['args']['device'] saves the device used as train time
	# if at test time, we are using a CPU, we must override device to None
	if not use_gpu:
		checkpoint['args'].device = None
	net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
	net.load_state_dict(checkpoint['model'])
	if use_gpu:
		net.cuda()
	net.eval()

	# Make sure the output directories exist before the first write.
	for out_dir in (args.ref, args.hyp):
		if out_dir:
			os.makedirs(out_dir, exist_ok=True)

	doc_num = len(test_dataset)
	time_cost = 0
	file_count = 0
	# Inference only: no autograd graph is needed.
	with torch.no_grad():
		for batch in tqdm(test_iter):
			input_ids,attention_masks,_,summaries,doc_lens  = vocab.make_features(batch)
			t1 = time()
			if use_gpu:
				input_ids = input_ids.cuda()
				attention_masks = attention_masks.cuda()
			probs = net(input_ids,attention_masks,doc_lens)
			t2 = time()
			time_cost += t2 - t1
			# Keep the scores 1-D even when the batch holds a single sentence.
			probs = probs.reshape(-1)
			start = 0
			for doc_id,doc_len in enumerate(doc_lens):
				stop = start + doc_len
				# Scores of this document only; slicing from `start` keeps the
				# top-k indices relative to the document's own sentences.
				prob = probs[start:stop]
				topk_elems = min(args.topk,doc_len)

				values, indices = prob.topk(topk_elems)

				#Consider predictions with >0.5 prob score
				topk_indices = sorted(int(i) for v, i in zip(values, indices) if v >= 0.5)

				if not topk_indices:
					print(f"No predictions with >=0.5 prob_score in file: [{file_names[file_count]}]")
					print(f"Prob Scores: {values}")

				doc = batch['doc'][doc_id].split('\n')[:doc_len]
				hyp = [doc[index] for index in topk_indices]
				ref = summaries[doc_id]
				with open(os.path.join(args.ref, file_names[file_count]), 'w') as f:
					f.write(ref)
				with open(os.path.join(args.hyp, file_names[file_count]), 'w') as f:
					f.write('\n'.join(hyp))
				start = stop
				file_count = file_count + 1

			del input_ids
			del attention_masks
			torch.cuda.empty_cache()
	if time_cost > 0:
		logging.info(f'Speed: {(doc_num / time_cost)} docs / s' )


def predict(examples):
	"""Summarize raw documents and write one output file per document.

	Args:
		examples: list of document strings; sentences are split on '. '.

	For every document, up to ``args.topk`` sentences with probability of at
	least 0.5 are kept in document order and written to
	``<args.hyp>/<n>.txt``, with n counting documents from 1.
	"""
	embed, word2id = None, None
	vocab = utils.Vocab(embed, word2id)
	pred_dataset = utils.Dataset(examples)

	pred_iter = DataLoader(dataset=pred_dataset,
							batch_size=args.batch_size,
							shuffle=False)
	# NOTE(review): torch.load unpickles the stored objects (the checkpoint
	# holds an args namespace) — only load checkpoints from a trusted source.
	if use_gpu:
		checkpoint = torch.load(args.load_dir)
	else:
		checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

	# checkpoint['args']['device'] saves the device used as train time
	# if at test time, we are using a CPU, we must override device to None
	if not use_gpu:
		checkpoint['args'].device = None
	net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
	net.load_state_dict(checkpoint['model'])

	if use_gpu:
		net.cuda()
	net.eval()

	# Make sure the output directory exists before the first write.
	if args.hyp:
		os.makedirs(args.hyp, exist_ok=True)

	doc_num = len(pred_dataset)
	time_cost = 0
	file_id = 1
	# Inference only: no autograd graph is needed.
	with torch.no_grad():
		for batch in tqdm(pred_iter):
			input_ids,attention_masks, doc_lens = vocab.make_predict_features(batch)
			t1 = time()
			if use_gpu:
				# The model lives on the GPU, so its inputs must be moved too.
				input_ids = input_ids.cuda()
				attention_masks = attention_masks.cuda()
			probs = net(input_ids,attention_masks,doc_lens)
			t2 = time()
			time_cost += t2 - t1
			# Keep the scores 1-D even when the batch holds a single sentence.
			probs = probs.reshape(-1)
			start = 0
			for doc_id,doc_len in enumerate(doc_lens):
				stop = start + doc_len
				prob = probs[start:stop]
				topk_elems = min(args.topk,doc_len)

				values, indices = prob.topk(topk_elems)

				#Consider predictions with >0.5 prob score
				topk_indices = sorted(int(i) for v, i in zip(values, indices) if v >= 0.5)

				doc = batch[doc_id].split('. ')[:doc_len]
				hyp = [doc[index] for index in topk_indices]
				with open(os.path.join(args.hyp,str(file_id)+'.txt'), 'w') as f:
					f.write('. '.join(hyp))
				start = stop
				file_id = file_id + 1
	if time_cost > 0:
		logging.info(f'Speed: {(doc_num / time_cost)} docs / s' )

if __name__=='__main__':
	# Mode selection: -test wins over -predict; training is the fallback.
	if not (args.test or args.predict):
		logging.info("TRAINING")
		train()
	elif args.test:
		logging.info("TESTING")
		test()
	else:
		logging.info("PREDICTING")
		# The whole input file is treated as a single document.
		with open(args.filename) as source:
			documents = [source.read()]
		predict(documents)


Overwriting main.py


### Rajdeep

In [None]:
# SummaRuNNer with BERT models/RNN_RNN.py
# By Rajdeep
%%writefile models/RNN_RNN.py

from .BasicModule import BasicModule
import subprocess as sp
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from transformers import BertTokenizer, BertModel

import logging
# Raise the threshold of the named logger to ERROR.
# NOTE(review): the logger name belongs to the legacy pytorch_pretrained_bert
# package, while this module imports from transformers — confirm it still has
# any effect.
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)

# Per-direction hidden size of the sentence-level GRU; the classifier layers
# below are sized 2*HH because the GRU is bidirectional.
HH = 256

class RNN_RNN(BasicModule):
	"""Sentence-level extractive scorer built on FinBERT sentence vectors.

	Every sentence is encoded with the 'ProsusAI/finbert' BERT model and
	represented by the last-layer hidden state of its first token ([CLS]).
	The sentence vectors of each document are fed to a bidirectional GRU and
	max-pooled into a document vector; each sentence then gets a selection
	probability from content, salience, novelty and position terms.
	"""

	def __init__(self, args, embed=None):
		"""Build the BERT encoder, the sentence GRU and the classifier.

		Args:
			args: namespace providing ``seg_num``, ``pos_num`` and ``pos_dim``
				(it is also handed to the base class).
			embed: unused; kept so callers that pass it keep working.
		"""
		super(RNN_RNN, self).__init__(args)

		self.model_name = 'RNN_RNN'
		self.args = args

		S = args.seg_num
		P_V = args.pos_num
		P_D = args.pos_dim
		self.abs_pos_embed = nn.Embedding(P_V, P_D)
		self.rel_pos_embed = nn.Embedding(S, P_D)

		# No BERT parameter is frozen here: the whole encoder is fine-tuned.
		self.bert_m = BertModel.from_pretrained('ProsusAI/finbert')

		self.sent_RNN = nn.GRU(
			input_size=768,
			hidden_size=HH,
			batch_first=True,
			bidirectional=True,
		)
		self.fc = nn.Linear(2*HH, 2*HH)

		# Parameters of Classification Layer
		self.content = nn.Linear(2*HH, 1, bias=False)
		self.salience = nn.Bilinear(2*HH, 2*HH, 1, bias=False)
		self.novelty = nn.Bilinear(2*HH, 2*HH, 1, bias=False)
		self.abs_pos = nn.Linear(P_D, 1, bias=False)
		self.rel_pos = nn.Linear(P_D, 1, bias=False)
		self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))

	def max_pool1d(self, x, seq_lens):
		"""Max-pool each sequence over its first ``seq_lens[i]`` steps.

		Args:
			x: tensor indexed as [N, L, O_in].
			seq_lens: valid length of each of the N sequences.

		Returns:
			Tensor of shape [N, O_in].
		"""
		out = []
		for index,t in enumerate(x):
			t = t[:seq_lens[index],:]
			t = torch.t(t).unsqueeze(0)
			out.append(F.max_pool1d(t,t.size(2)))

		out = torch.cat(out).squeeze(2)
		return out

	def avg_pool1d(self, x, seq_lens):
		"""Average-pool each sequence over its first ``seq_lens[i]`` steps.

		Args:
			x: tensor indexed as [N, L, O_in].
			seq_lens: valid length of each of the N sequences.

		Returns:
			Tensor of shape [N, O_in].
		"""
		out = []
		for index,t in enumerate(x):
			t = t[:seq_lens[index],:]
			t = torch.t(t).unsqueeze(0)
			out.append(F.avg_pool1d(t,t.size(2)))

		out = torch.cat(out).squeeze(2)
		return out

	def forward(self, input_ids, attention_masks, doc_lens):
		"""Score every sentence of every document in the batch.

		Args:
			input_ids: token ids, one row per sentence, documents concatenated.
			attention_masks: attention mask matching ``input_ids``.
			doc_lens: number of sentences of each document, in batch order.

		Returns:
			1-D tensor with one probability per sentence, in input order.
		"""
		outputs = self.bert_m(input_ids=input_ids, attention_mask=attention_masks)

		# [CLS] (first token) vector of each sentence, taken with one slice
		# instead of concatenating row by row: [N, 768].
		emb = outputs.last_hidden_state[:, 0, :]

		# make sent features (pad with zeros); pad_doc is inherited from
		# BasicModule, which is not shown here.
		x = self.pad_doc(emb, doc_lens)

		# sent level GRU
		sent_out = self.sent_RNN(x)[0]										# (B, max_doc_len, 2*H)
		docs = self.max_pool1d(sent_out, doc_lens)							# (B, 2*H)

		del emb
		# Kept from the original code, presumably to lower peak GPU memory.
		torch.cuda.empty_cache()

		device = sent_out.device
		# Highest valid row of each position table. Positions are clamped to
		# these so long documents (doc_trunc may exceed pos_num) cannot index
		# past the embedding tables.
		max_abs_index = self.abs_pos_embed.num_embeddings - 1
		max_rel_index = self.rel_pos_embed.num_embeddings - 1

		probs = []
		for index, doc_len in enumerate(doc_lens):
			valid_hidden = sent_out[index,:doc_len,:]						# (doc_len, 2*H)
			doc = torch.tanh(self.fc(docs[index])).unsqueeze(0)
			# Running summary representation, created on the same device and
			# dtype as the GRU output.
			s = sent_out.new_zeros(1, 2*HH)
			for position, h in enumerate(valid_hidden):
				h = h.view(1, -1)											# (1, 2*H)
				# get position embeddings
				abs_index = torch.tensor([[min(position, max_abs_index)]],
										 dtype=torch.long, device=device)
				abs_features = self.abs_pos_embed(abs_index).squeeze(0)

				# Relative position quantised to 0..9 (ten segments).
				rel_index = int(round((position + 1) * 9.0 / doc_len))
				rel_index = torch.tensor([[min(rel_index, max_rel_index)]],
										 dtype=torch.long, device=device)
				rel_features = self.rel_pos_embed(rel_index).squeeze(0)

				# classification layer
				content = self.content(h)
				salience = self.salience(h,doc)
				novelty = -1 * self.novelty(h,torch.tanh(s))
				abs_p = self.abs_pos(abs_features)
				rel_p = self.rel_pos(rel_features)
				prob = torch.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
				s = s + torch.mm(prob,h)
				probs.append(prob)

		del sent_out
		del docs
		torch.cuda.empty_cache()

		# view(-1) keeps the result 1-D even when the batch holds one sentence
		# (a bare squeeze() would return a 0-dim tensor in that case).
		return torch.cat(probs).view(-1)

Overwriting models/RNN_RNN.py


In [None]:
# SummaRuNNer with BERT utils/Vocab.py
# By Rajdeep
%%writefile utils/Vocab.py

import torch
from transformers import BertTokenizer, BertModel
# Module-level tokenizer shared by all Vocab methods; loaded once at import time.
# The checkpoint name matches the BERT model loaded in models/RNN_RNN.py.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

import logging
# Raise the threshold of the named logger to ERROR.
# NOTE(review): the logger name belongs to the legacy pytorch_pretrained_bert
# package, while this module imports from transformers — confirm it still has
# any effect.
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)


class Vocab():
	"""Convert batches of raw documents into BERT inputs.

	``embed`` and ``word2id`` are only stored; tokenization is done by the
	module-level ``tokenizer``.
	"""

	def __init__(self, embed, word2id):
		self.embed, self.word2id = embed, word2id

	def _tokenize(self, sentences):
		"""Encode sentences to length-64 id / attention-mask tensors, one row each."""
		id_rows, mask_rows = [], []
		for sentence in sentences:
			encoding = tokenizer.encode_plus(
				text=sentence,
				text_pair=None,
				add_special_tokens=True,
				padding='max_length',
				max_length=64,
				truncation='longest_first',
				return_token_type_ids=True,
				return_tensors="pt"
				)
			id_rows.append(encoding['input_ids'])
			mask_rows.append(encoding['attention_mask'])
		stacked_ids = torch.cat(id_rows, dim=0)
		stacked_masks = torch.cat(mask_rows, dim=0)
		return stacked_ids, stacked_masks

	def make_features(self, batch, sent_trunc=50, doc_trunc=800, split_token='\n'):
		"""Build inputs and targets for a labelled batch.

		``batch`` provides 'doc', 'labels' and 'summaries'; sentences and labels
		inside an entry are separated by ``split_token``. At most ``doc_trunc``
		sentences are kept per document. ``sent_trunc`` is not used.

		Returns (input_ids, attention_masks, targets, summaries, doc_lens).
		"""
		all_sents, all_labels, lengths = [], [], []

		# trunc document
		for document, label_str in zip(batch['doc'], batch['labels']):
			pieces = document.split(split_token)
			label_ids = [int(item) for item in label_str.split(split_token)]
			keep = min(doc_trunc, len(pieces))
			kept = pieces[:keep]
			all_sents.extend(kept)
			all_labels.extend(label_ids[:keep])
			lengths.append(len(kept))

		input_ids, attention_masks = self._tokenize(all_sents)
		targets = torch.LongTensor(all_labels)
		summaries = batch['summaries']

		return input_ids, attention_masks, targets, summaries, lengths


	def make_predict_features(self, batch, sent_trunc=50, doc_trunc=800, split_token='. '):
		"""Build inputs for unlabelled documents.

		``batch`` is an iterable of document strings split into sentences on
		``split_token``; at most ``doc_trunc`` sentences are kept per document.
		``sent_trunc`` is not used.

		Returns (input_ids, attention_masks, doc_lens).
		"""
		all_sents, lengths = [], []
		for document in batch:
			pieces = document.split(split_token)
			kept = pieces[:min(doc_trunc, len(pieces))]
			all_sents.extend(kept)
			lengths.append(len(kept))

		input_ids, attention_masks = self._tokenize(all_sents)

		return  input_ids, attention_masks, lengths

Overwriting utils/Vocab.py


In [None]:
# SummaRuNNer with BERT main.py
# By Rajdeep
%%writefile main.py

#!/usr/bin/env python3

import subprocess as sp
import os
import json
import models
import utils
import argparse,random,logging,numpy,os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from torch.nn.utils import clip_grad_norm
from time import time
from tqdm import tqdm

import warnings
# Suppress every Python warning for the rest of the run.
warnings.filterwarnings("ignore")
# NOTE(review): this logger name belongs to the older `pytorch_pretrained_bert` package, while
# this script imports from `transformers` -- confirm the line still has any effect.
logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)

logging.basicConfig(level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
# Command-line interface. Options are parsed at import time (see parse_args() below), so
# `args` and `use_gpu` are module-level globals read by eval/train/test/predict.
parser = argparse.ArgumentParser(description='extractive summary')
# model
# These values travel on `args` into the model constructor (see train()); how each one is
# consumed depends on the model class, which is not part of this script.
parser.add_argument('-save_dir',type=str,default='checkpoints/')
# parser.add_argument('-embed_dim',type=int,default=100)
# parser.add_argument('-embed_num',type=int,default=100)
parser.add_argument('-pos_dim',type=int,default=100)
parser.add_argument('-pos_num',type=int,default=300)
parser.add_argument('-seg_num',type=int,default=10)
parser.add_argument('-kernel_num',type=int,default=100)
# Comma-separated string here; train() converts it to a list of ints.
parser.add_argument('-kernel_sizes',type=str,default='3,4,5')
# Name of the class looked up on the `models` package.
parser.add_argument('-model',type=str,default='RNN_RNN')
parser.add_argument('-hidden_size',type=int,default=200)
# train
parser.add_argument('-lr',type=float,default=2e-5)
parser.add_argument('-batch_size',type=int,default=16)
parser.add_argument('-epochs',type=int,default=5)
parser.add_argument('-seed',type=int,default=42)
# Despite the `_dir` suffix these are JSON-lines files, read one JSON object per line.
parser.add_argument('-train_dir',type=str,default='data/train.json')
parser.add_argument('-val_dir',type=str,default='data/val.json')
# -embedding and -word2id are only referenced by commented-out code in this script.
parser.add_argument('-embedding',type=str,default='data/embedding.npz')
parser.add_argument('-word2id',type=str,default='data/word2id.json')
# Run validation every `report_every` training batches.
parser.add_argument('-report_every',type=int,default=16)
# parser.add_argument('-seq_trunc',type=int,default=50)
# Maximum gradient norm used for clipping in train().
parser.add_argument('-max_norm',type=float,default=1.0)
# test
# Checkpoint file loaded by test() and predict().
parser.add_argument('-load_dir',type=str,default='checkpoints/RNN_RNN_seed_42.pt')
parser.add_argument('-test_dir',type=str,default='data/test.json')
# Output directories for reference summaries (-ref) and generated summaries (-hyp).
parser.add_argument('-ref',type=str,default='outputs/ref')
parser.add_argument('-hyp',type=str,default='outputs/hyp')
parser.add_argument('-filename',type=str,default='x.txt') # TextFile to be summarized
# Upper bound on the number of sentences selected per document.
parser.add_argument('-topk',type=int,default=8)
# device
# CUDA device index; leaving it unset means the script runs on CPU.
parser.add_argument('-device',type=int)
# option
# Mode switches: -test and -predict select the entry point in __main__; -debug makes train()
# log every batch and skip validation.
parser.add_argument('-test',action='store_true')
parser.add_argument('-debug',action='store_true')
parser.add_argument('-predict',action='store_true')
args = parser.parse_args()
# GPU use is opt-in: it is enabled only when -device was passed explicitly.
use_gpu = args.device is not None

if torch.cuda.is_available() and not use_gpu:
	print("WARNING: You have a CUDA device, should run with -device 0")

# set cuda device and seed
if use_gpu:
	torch.cuda.set_device(args.device)
# Seed every random number generator this script uses with the same value.
torch.cuda.manual_seed(args.seed)
torch.manual_seed(args.seed)
random.seed(args.seed)
numpy.random.seed(args.seed)


def eval(net, vocab, data_iter, criterion):
	"""Return the mean per-batch loss of ``net`` over ``data_iter``.

	The network is switched to evaluation mode and run under
	``torch.no_grad()``. It is put back into training mode before this
	function exits, including when a batch raises.

	Args:
		net: model called as ``net(input_ids, attention_masks, doc_lens)``.
		vocab: object whose ``make_features(batch)`` returns
			``(input_ids, attention_masks, targets, summaries, doc_lens)``.
		data_iter: iterable of batches accepted by ``vocab.make_features``.
		criterion: loss callable invoked as ``criterion(probs, targets)``.

	Returns:
		The sum of the per-batch losses divided by the number of batches.

	Raises:
		ValueError: if ``data_iter`` yields no batches (previously this
			surfaced as an unexplained ``ZeroDivisionError``).
	"""
	net.eval()
	total_loss = 0
	batch_num = 0
	try:
		with torch.no_grad():
			for batch in data_iter:
				input_ids, attention_masks, targets, _, doc_lens = vocab.make_features(batch)
				# The criterion is fed float targets; make_features returns a LongTensor.
				targets = targets.float()
				if use_gpu:
					input_ids = input_ids.cuda()
					attention_masks = attention_masks.cuda()
					targets = targets.cuda()
				probs = net(input_ids, attention_masks, doc_lens)
				total_loss += criterion(probs, targets).item()
				batch_num += 1
	finally:
		# Drop references to the last batch before asking CUDA to release its cache,
		# then hand the network back in training mode.
		input_ids = attention_masks = targets = probs = None
		torch.cuda.empty_cache()
		net.train()

	if batch_num == 0:
		raise ValueError('eval() received an empty data iterator; cannot compute a mean loss')
	return total_loss / batch_num


def train():
	"""Train the model named by ``args.model`` and checkpoint on the best validation loss.

	Reads the JSON-lines files ``args.train_dir`` and ``args.val_dir``, builds
	the model from the ``models`` package, and optimises it with Adam and
	``nn.BCELoss``. Gradients are accumulated over ``acc_steps`` batches per
	optimizer step. Every ``args.report_every`` batches the model is validated
	with ``eval``; ``net.save()`` is called whenever the validation loss
	improves. With ``args.debug`` set, each batch is logged and validation is
	skipped.

	Early stopping: ``checkpp`` counts consecutive validation rounds without
	improvement and is checked at the start of every epoch.
	"""
	logging.info('Loading vocab, train and val dataset. Wait a second, please')
	# Consecutive non-improving validation rounds tolerated before stopping.
	early_stopping = 3
	# Number of batches whose gradients are accumulated per optimizer step.
	acc_steps = 2

	# No pretrained embedding / word2id mapping is loaded; Vocab and the model get None.
	embed, word2id = None, None
	vocab = utils.Vocab(embed, word2id)

	with open(args.train_dir) as f:
		examples = [json.loads(line) for line in f]
	train_dataset = utils.Dataset(examples)

	with open(args.val_dir) as f:
		examples = [json.loads(line) for line in f]
	val_dataset = utils.Dataset(examples)

	# update args: the CLI passes kernel sizes as a comma-separated string
	args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]

	# build model
	net = getattr(models, args.model)(args, embed)
	if use_gpu:
		net.cuda()

	# load dataset
	train_iter = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
	val_iter = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False)

	# loss function
	criterion = nn.BCELoss()

	# model info
	params = sum(p.numel() for p in net.parameters()) / 1e6
	print('#Params: %.1fM' % (params))

	min_loss = float('inf')
	optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
	net.train()

	def apply_accumulated_gradients():
		# Clip the fully accumulated gradient once, right before the update.
		# clip_grad_norm_ is the in-place successor of the deprecated clip_grad_norm.
		nn.utils.clip_grad_norm_(net.parameters(), args.max_norm)
		optimizer.step()
		optimizer.zero_grad()

	t1 = time()
	checkpp = 0
	for epoch in tqdm(range(1, args.epochs+1)):
		logging.info(f"\nEpoch: {epoch}")
		# `>=` rather than `==`: the counter can grow by more than one within an
		# epoch, so an equality test could be stepped over and never trigger.
		if checkpp >= early_stopping:
			break
		optimizer.zero_grad()
		t_loss = 0
		s_loss = 0
		pending = 0  # batches back-propagated since the last optimizer step
		for i, batch in enumerate(train_iter):
			input_ids, attention_masks, targets, _, doc_lens = vocab.make_features(batch)
			targets = targets.float()
			if use_gpu:
				input_ids = input_ids.cuda()
				attention_masks = attention_masks.cuda()
				targets = targets.cuda()

			probs = net(input_ids, attention_masks, doc_lens)
			loss = criterion(probs, targets)
			t_loss = t_loss + loss.item()
			s_loss = s_loss + 1
			# Scale so the accumulated gradient is an average over the group.
			loss = loss / acc_steps
			loss.backward()
			pending += 1
			if pending == acc_steps:
				apply_accumulated_gradients()
				pending = 0
			if args.debug:
				logging.info(f'Batch ID:{i} Loss:{loss.data.item()}')
				continue
			if (i+1) % args.report_every == 0:
				cur_loss = eval(net, vocab, val_iter, criterion)
				train_loss = t_loss/s_loss
				t_loss = 0
				s_loss = 0
				if cur_loss < min_loss:
					checkpp = 0
					min_loss = cur_loss
					net.save()
					logging.info('Model Checkpoint Saved')
				else:
					checkpp = checkpp+1
				logging.info(f'Epoch:{epoch} Min_Val_Loss: {min_loss} Cur_Val_Loss: {cur_loss} training loss: {train_loss}')

		# A trailing, incomplete accumulation group would otherwise be thrown
		# away by the next zero_grad(); apply it instead.
		if pending:
			apply_accumulated_gradients()

	t2 = time()
	logging.info('Total Time:%f h'%((t2-t1)/3600))


def test():
	"""Run the checkpoint at ``args.load_dir`` on the test set and write summaries to disk.

	Output file names are read from ``data/test_files.txt`` (one per line) and
	examples from the JSON-lines file ``args.test_dir``; documents are paired
	with names by position. For each document, up to ``args.topk`` sentences
	with the highest scores are considered and those scoring at least 0.5 are
	kept in document order. The reference summary goes to
	``args.ref/<name>`` and the selected sentences to ``args.hyp/<name>``.
	"""
	embed, word2id = None, None
	vocab = utils.Vocab(embed, word2id)

	# Loading test file names
	with open("data/test_files.txt") as f:
		file_names = [line.strip() for line in f]

	with open(args.test_dir) as f:
		examples = [json.loads(line) for line in f]
	test_dataset = utils.Dataset(examples)

	test_iter = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False)
	# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
	if use_gpu:
		checkpoint = torch.load(args.load_dir)
	else:
		checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

	# checkpoint['args'].device holds the device used at train time;
	# when testing on CPU it must be overridden to None
	if not use_gpu:
		checkpoint['args'].device = None
	net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
	net.load_state_dict(checkpoint['model'])
	if use_gpu:
		net.cuda()
	net.eval()

	# Create the output directories once instead of re-checking per document.
	os.makedirs(args.ref, exist_ok=True)
	os.makedirs(args.hyp, exist_ok=True)

	doc_num = len(test_dataset)
	time_cost = 0
	file_count = 0
	for batch in tqdm(test_iter):
		input_ids, attention_masks, _, summaries, doc_lens = vocab.make_features(batch)
		t1 = time()
		if use_gpu:
			input_ids = input_ids.cuda()
			attention_masks = attention_masks.cuda()
		# Inference only: no autograd graph is needed.
		with torch.no_grad():
			probs = net(input_ids, attention_masks, doc_lens)
		t2 = time()
		time_cost += t2 - t1

		# `probs` holds the scores of all documents in the batch back to back;
		# [start, stop) is the slice belonging to the current document.
		start = 0
		for doc_id, doc_len in enumerate(doc_lens):
			stop = start + doc_len
			# Slice from `start`, not from 0: otherwise every document after the
			# first in a batch would be scored against its predecessors' sentences.
			prob = probs[start:stop]
			topk_elems = min(args.topk, doc_len)
			values, indices = prob.topk(topk_elems)
			# Consider predictions with >=0.5 prob score, restored to document order.
			topk_indices = sorted(i.item() for v, i in zip(values, indices) if v >= 0.5)
			if not topk_indices:
				print(f"No predictions with >=0.5 prob_score in file: [{file_names[file_count]}]")
				print(f"Prob Scores: {values}")

			doc = batch['doc'][doc_id].split('\n')[:doc_len]
			hyp = [doc[index] for index in topk_indices]
			ref = summaries[doc_id]
			with open(os.path.join(args.ref, file_names[file_count]), 'w') as f:
				f.write(ref)
			with open(os.path.join(args.hyp, file_names[file_count]), 'w') as f:
				f.write('\n'.join(hyp))
			start = stop
			file_count = file_count + 1

		del input_ids
		del attention_masks
		del probs
		torch.cuda.empty_cache()

	# Skip the throughput line when nothing was timed (e.g. empty test set).
	if time_cost > 0:
		logging.info(f'Speed: {(doc_num / time_cost)} docs / s' )


def predict(examples):
	"""Summarise raw documents with the checkpoint at ``args.load_dir``.

	Args:
		examples: sequence of document strings. Sentences are obtained by
			splitting each document on ``'. '``.

	For each document, up to ``args.topk`` sentences with the highest scores
	are considered and those scoring at least 0.5 are kept in document order.
	They are joined with ``'. '`` and written to ``args.hyp/<n>.txt``, where
	``n`` counts documents starting at 1.
	"""
	embed, word2id = None, None
	vocab = utils.Vocab(embed, word2id)

	pred_dataset = utils.Dataset(examples)
	pred_iter = DataLoader(dataset=pred_dataset, batch_size=args.batch_size, shuffle=False)
	# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
	if use_gpu:
		checkpoint = torch.load(args.load_dir)
	else:
		checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

	# checkpoint['args'].device holds the device used at train time;
	# when predicting on CPU it must be overridden to None
	if not use_gpu:
		checkpoint['args'].device = None
	net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
	net.load_state_dict(checkpoint['model'])

	if use_gpu:
		net.cuda()
	net.eval()

	# Create the output directory once instead of re-checking per document.
	os.makedirs(args.hyp, exist_ok=True)

	doc_num = len(pred_dataset)
	time_cost = 0
	file_id = 1
	for batch in tqdm(pred_iter):
		input_ids, attention_masks, doc_lens = vocab.make_predict_features(batch)
		t1 = time()
		if use_gpu:
			input_ids = input_ids.cuda()
			attention_masks = attention_masks.cuda()
		# Inference only: no autograd graph is needed.
		with torch.no_grad():
			probs = net(input_ids, attention_masks, doc_lens)
		t2 = time()
		time_cost += t2 - t1

		# `probs` holds the scores of all documents in the batch back to back;
		# [start, stop) is the slice belonging to the current document.
		start = 0
		for doc_id, doc_len in enumerate(doc_lens):
			stop = start + doc_len
			prob = probs[start:stop]
			topk_elems = min(args.topk, doc_len)
			values, indices = prob.topk(topk_elems)
			# Consider predictions with >=0.5 prob score, restored to document order.
			topk_indices = sorted(i.item() for v, i in zip(values, indices) if v >= 0.5)

			doc = batch[doc_id].split('. ')[:doc_len]
			hyp = [doc[index] for index in topk_indices]
			with open(os.path.join(args.hyp, str(file_id) + '.txt'), 'w') as f:
				f.write('. '.join(hyp))
			start = stop
			file_id = file_id + 1

	# Skip the throughput line when nothing was timed (e.g. no input documents).
	if time_cost > 0:
		logging.info(f'Speed: {(doc_num / time_cost)} docs / s' )



if __name__=='__main__':
	# Mode selection: -test wins over -predict; with neither flag the script trains.
	if not (args.test or args.predict):
		logging.info("TRAINING")
		train()
	elif args.test:
		logging.info("TESTING")
		test()
	else:
		logging.info("PREDICTING")
		# The whole input file is handed over as one single document.
		with open(args.filename) as source:
			documents = [source.read()]
		predict(documents)

Overwriting main.py


# Training

In [None]:
!python main.py -device 0 -batch_size 2 -epochs 5 -model RNN_RNN -seed 42 -save_dir checkpoints/SR_

2022-05-22 22:20:19,724 [INFO] TRAINING
2022-05-22 22:20:19,725 [INFO] Loading vocab, train and val dataset. Wait a second, please
Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
#Params: 111.9M
  0% 0/5 [00:00<?, ?it/s]2022-05-22 22:20:26,061 [INFO] 
Epoch: 1
2022-05-22 22:21:03,305 [INFO] Model Checkpoint Saved
2022-05-22 22:21:03,305 [INFO] Epoch:1 Min_Val_Loss: 0.2258691761701826 Cur_Val_Loss: 0.2258691761701826 training loss: 0.3719206308014691


# Testing

In [None]:
# Uncomment to remove previous pred. outputs
# !rm /content/SummaRuNNer/outputs/hyp/*
# !rm /content/SummaRuNNer/outputs/ref/*

!python main.py -device 0 -batch_size 1 -test -load_dir checkpoints/SR_RNN_RNN_seed_1.pt

# Inference

In [None]:
# !python main.py -batch_size 1 -predict -filename x.txt -load_dir checkpoints/SR_RNN_RNN_seed_1.pt

# Data prep

In [None]:
# NOTE: pass the access token through the GITHUB_TOKEN environment variable; never commit it in the notebook.
# !git clone https://RajdeepMukherjee:$GITHUB_TOKEN@github.com/rajdeep345/ECTSumm.git
# !git clone https://$GITHUB_TOKEN@github.com/abhinav-bohra/Long-Text-Summarization.git
# ! cp -r /content/ECTSumm/data/reuters/sr/exp2/* /content/Long-Text-Summarization/data/reuters/summarunner/
# %cd /content/SummaRuNNer/ECTSumm

In [None]:
import json
import numpy as np
from tqdm import tqdm
from ect_utils import *

# Experiment 2
# Doc - All lines with numerical/monetary figures/values that cover target summary sentences
# Summ - All lines except REFINITIV that are covered by document sentences


def getDocLines(fname):
    """Return the lines of `fname` whose processed form contains '[NUM]'.

    Lines come from get_DocLines and are returned unprocessed;
    getProcessedLines is only used to decide which ones to keep.
    """
    raw_lines = get_DocLines(fname)
    processed = getProcessedLines(raw_lines)
    assert len(raw_lines) == len(processed)
    return [raw_lines[idx] for idx, text in enumerate(processed) if '[NUM]' in text]


def getSummLines(fname):
    """Return the summary lines of `fname`, minus those mentioning 'REFINITIV IBES DATA'."""
    kept = []
    for summ_line in get_SummLines(fname):
        if 'REFINITIV IBES DATA' in summ_line:
            continue
        kept.append(summ_line)
    return kept
    

def prepare_data(dataPath, out_path):
    """Write a JSON-lines dataset built from paired transcript and summary files.

    For every ``*.txt`` file in ``{dataPath}/ects/`` the file of the same name
    in ``{dataPath}/gt_summaries/`` is read. A document line is labelled 1 when
    the values matched by ``pattern6`` in it are a superset of the values
    matched in at least one summary line that has such values; every other
    line is labelled 0.

    Each output record has the keys ``doc``, ``summaries`` and ``labels``, each
    a newline-joined string. Records are written to ``out_path`` one JSON
    object per line, and the record count is printed.

    Files are processed in ``os.listdir`` order, which is left untouched on
    purpose so the record order stays as before.
    """
    ect_path = f'{dataPath}/ects/'
    summ_path = f'{dataPath}/gt_summaries/'
    data = []
    for fname in tqdm(os.listdir(ect_path)):
        if not fname.endswith('.txt'):
            continue
        doc_lines = getDocLines(f'{ect_path}{fname}')
        summ_lines = getSummLines(f'{summ_path}{fname}')
        labels = np.zeros(len(doc_lines))

        # Value sets of the document lines, computed on first use and shared by
        # all summary lines instead of being re-extracted for each of them.
        # Assumes getPartiallyProcessedText is deterministic -- TODO confirm.
        doc_values = None
        for line in summ_lines:
            values_summ_line = set(re.findall(pattern6, getPartiallyProcessedText(line)))
            if not values_summ_line:
                continue
            if doc_values is None:
                doc_values = [
                    set(re.findall(pattern6, getPartiallyProcessedText(text)))
                    for text in doc_lines
                ]
            for i, values_doc_line in enumerate(doc_values):
                if values_doc_line.issuperset(values_summ_line):
                    labels[i] = 1

        data_point = {"doc": '\n'.join(doc_lines), "summaries":'\n'.join(summ_lines),"labels":'\n'.join([str(int(l)) for l in labels]) }
        data.append(json.dumps(data_point))

    # The context manager closes the file; no explicit close() is needed.
    with open(out_path, mode='wt', encoding='utf-8') as myfile:
        myfile.write('\n'.join(data))
        print(len(data))
    

# Build the dataset file of every split from its exp2 source folder.
for split in ('train', 'val', 'test'):
    print(f'\n\n{split} data')
    source_dir = f'/content/Long-Text-Summarization/data/reuters/exp2/{split}'
    target_file = f'/content/Long-Text-Summarization/data/reuters/summarunner/{split}.json'
    prepare_data(source_dir, target_file)

# Update Repo

In [None]:
!cp /content/SummaRuNNer/checkpoints/SR_RNN_RNN_seed_1.pt  /content/Long-Text-Summarization/models/

In [None]:
%cd /content/Long-Text-Summarization
!git add .
!git status
!git config --global user.email "abhinavbohra@iitkgp.ac.in"
!git config --global user.name "abhinav-bohra"

In [None]:
!git commit -m "Added SummaRuNNer Model Checkpoint"
# The token is read from the GITHUB_TOKEN environment variable; the token previously committed here must be revoked.
!git push https://$GITHUB_TOKEN@github.com/abhinav-bohra/Long-Text-Summarization.git

In [None]:
!git pull