In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
from preprocessing_algorithms import*

In [69]:
# load the T5 model and tokenizer
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name, model_max_length=4214)
model = T5ForConditionalGeneration.from_pretrained(model_name, max_length=4214)


# \Local\Programs\Python\Python310\lib\site-packages\transformers\models\t5\tokenization_t5.py:163: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.
# For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
# - Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
# - If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
# - To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.
#   warnings.warn(
# to fix this use model_max_length=512 and  max_length=512

In [3]:
data = pd.read_csv('DUC/Duc_first_reference/main_dataset/Duc_dataset_first_ref_summary.csv')
text = data.iloc[0,0]
text

"\r\nCambodian leader Hun Sen on Friday rejected opposition parties' demands \r\nfor talks outside the country, accusing them of trying to ``internationalize'' \r\nthe political crisis. Government and opposition parties have asked \r\nKing Norodom Sihanouk to host a summit meeting after a series of post-election \r\nnegotiations between the two opposition groups and Hun Sen's party \r\nto form a new government failed. Opposition leaders Prince Norodom \r\nRanariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition \r\nfigures after two alleged attempts on his life, said they could not \r\nnegotiate freely in Cambodia and called for talks at Sihanouk's residence \r\nin Beijing. Hun Sen, however, rejected that. ``I would like to make \r\nit clear that all meetings related to Cambodian affairs must be conducted \r\nin the Kingdom of Cambodia,'' Hun Sen told reporters after a Cabinet \r\nmeeting on Friday. ``No-one should internationalize Cambodian affairs. \r\nIt is detrimenta

In [57]:
sentences,filtered_sentences = preprocessing_text_with_spacy(text,remove_stopwords=True,lemmatization= True)

In [58]:
for i in filtered_sentences:
    text += i   


In [59]:
data['Original Summary'][0]

"Prospects were dim for resolution of the political crisis in Cambodia in October 1998.\r\nPrime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad.\r\nKing Sihanouk declined to chair talks in either place.\r\nA U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.\r\nBut in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.\r\nLeft out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians.\r\n"

In [70]:
# tokenize the text and generate the summary
inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", truncation=True, padding=True)

In [71]:
summary_ids = model.generate(inputs, num_beams=4, max_length=100, early_stopping=True)

In [72]:
len(summary_ids[0])

64

In [73]:
for i,j in enumerate(summary_ids[0]):
    print(j, tokenizer.decode(summary_ids[0][i], skip_special_tokens=True))

tensor(0) <pad>
tensor(3) 
tensor(1765) king
tensor(3701) nor
tensor(32) o
tensor(5012) dom
tensor(108) si
tensor(2618) han
tensor(32) o
tensor(1598) uk
tensor(845) says
tensor(3) 
tensor(88) he
tensor(19) is
tensor(3) 
tensor(31) '
tensor(25866) strong
tensor(120) ly
tensor(1638) interested
tensor(31) '
tensor(16) in
tensor(3) 
tensor(9) a
tensor(126) new
tensor(789) government
tensor(3) 
tensor(5) .
tensor(3) 
tensor(88) he
tensor(845) says
tensor(8) the
tensor(8263) opposition
tensor(19) is
tensor(1119) trying
tensor(12) to
tensor(3) 
tensor(31) '
tensor(27817) international
tensor(1737) ize
tensor(31) '
tensor(8) the
tensor(1827) political
tensor(5362) crisis
tensor(3) 
tensor(5) .
tensor(3) 
tensor(88) he
tensor(845) says
tensor(8) the
tensor(8263) opposition
tensor(19) is
tensor(3) 
tensor(31) '
tensor(202) un
tensor(8894) will
tensor(53) ing
tensor(12) to
tensor(143) make
tensor(136) any
tensor(12326) compromise
tensor(31) '
tensor(3) 
tensor(5) .
tensor(1) </s>


In [74]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [75]:
summary

"king norodom sihanouk says he is'strongly interested' in a new government. he says the opposition is trying to 'internationalize' the political crisis. he says the opposition is 'unwilling to make any compromise'."

In [76]:
# score each sentence in the original text based on how closely it matches the summary
sentences = text.split('. ')
scores = []
for sentence in sentences:
    input_text = "summarize: " + sentence
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    summary_ids = model.generate(input_ids, num_beams=4, max_length=20, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    if summary:
        score = len(set(summary.split()).intersection(set(text.split()))) / len(set(summary.split()))
        scores.append(score)
    else:
        scores.append(0.0)

In [77]:
summary_size=5

In [78]:
sentence_scores = []
# print the scores for each sentence
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")
    sentence_scores.append(scores[i])
    print(f"Score: {scores[i]}")
len(sentence_scores)

Sentence 1: 
Cambodian leader Hun Sen on Friday rejected opposition parties' demands 
for talks outside the country, accusing them of trying to ``internationalize'' 
the political crisis
Score: 0.75
Sentence 2: Government and opposition parties have asked 
King Norodom Sihanouk to host a summit meeting after a series of post-election 
negotiations between the two opposition groups and Hun Sen's party 
to form a new government failed
Score: 1.0
Sentence 3: Opposition leaders Prince Norodom 
Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition 
figures after two alleged attempts on his life, said they could not 
negotiate freely in Cambodia and called for talks at Sihanouk's residence 
in Beijing
Score: 0.8888888888888888
Sentence 4: Hun Sen, however, rejected that
Score: 1.0
Sentence 5: ``I would like to make 
it clear that all meetings related to Cambodian affairs must be conducted 
in the Kingdom of Cambodia,'' Hun Sen told reporters after a Cabinet 
meeting on Frid

202

In [79]:
import numpy as np
sorted_ix = np.argsort(scores)[::-1]

summary_index=[]
for i in sorted_ix[:summary_size]:
    summary_index.append(i)
print(summary_index)

summary = ""
for i in summary_index:
    summary += sentences[i]       
print(summary)

[100, 200, 107, 106, 105]
24Other details of the Senate, including how much 
power it will be given in the promulgation of legislation, have yet 
to be ironed out by the two partiesHun Sen's Cambodian People's Party 
dropped insistence on a joint assembly chairmanship shared by Ranariddh 
and party boss Chea Sim, the current speakerIn a 
long-elusive compromise, opposition leader Prince Norodom Ranariddh 
will become president of the National Assembly resulting from disputed 
elections in July, even though Hun Sen's party holds a majority of 
64 seats in the 122-member chamber
 
Cambodia's bickering political parties broke a three-month deadlock 
Friday and agreed to a coalition government leaving strongman Hun 
Sen as sole prime minister, King Norodom Sihanouk announced


In [80]:


# batch the texts
batch_size = 8
text_batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]

# generate summaries for each batch
summaries = []
for batch in text_batches:
    input_ids = tokenizer.batch_encode_plus(batch, return_tensors="pt", max_length=512, truncation=True)["input_ids"].to(device)
    summary_ids = model.generate(input_ids, num_beams=4, max_length=20, early_stopping=True)
    batch_summaries = [tokenizer.decode(summary_ids[i], skip_special_tokens=True) for i in range(len(batch))]
    summaries.extend(batch_summaries)


NameError: name 'texts' is not defined