In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# IMPORTS

In [2]:

!pip install -q transformers  rouge sentence-transformers

[K     |████████████████████████████████| 5.5 MB 4.5 MB/s 
[K     |████████████████████████████████| 85 kB 4.2 MB/s 
[K     |████████████████████████████████| 7.6 MB 54.6 MB/s 
[K     |████████████████████████████████| 182 kB 33.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 34.6 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [3]:
import spacy
nlp = spacy.load('en_core_web_sm') # Load the English Model

In [4]:
import numpy as np
from scipy.sparse.csgraph import connected_components
from scipy.special import softmax


In [20]:
import pandas as pd
import regex as re

In [6]:
import os

In [8]:
%%capture
import nltk
nltk.download('all')

In [8]:
from sentence_transformers import SentenceTransformer, util
import numpy as np


In [9]:
from tqdm import tqdm

# Calc LEX Rank

In [10]:
def degree_centrality_scores(similarity_matrix,threshold=None,increase_power=True):
    
    if not (threshold is None or isinstance(threshold, float)and 0 <= threshold < 1):
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1) or None',
        )

    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)

    else:
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )

    return scores


def _power_method(transition_matrix, increase_power=True, max_iter=10000):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    for _ in range(max_iter):
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next

        if increase_power:
            transition = np.dot(transition, transition)

    print("Maximum number of iterations for power method exceeded without convergence!")
    return eigenvector_next


def connected_nodes(matrix):
    _, labels = connected_components(matrix)

    groups = []

    for tag in np.unique(labels):
        group = np.where(labels == tag)[0]
        groups.append(group)

    return groups


def create_markov_matrix(weights_matrix):
    n_1, n_2 = weights_matrix.shape
    if n_1 != n_2:
        raise ValueError('\'weights_matrix\' should be square')

    row_sum = weights_matrix.sum(axis=1, keepdims=True)

    # normalize probability distribution differently if we have negative transition values
    if np.min(weights_matrix) <= 0:
        return softmax(weights_matrix, axis=1)

    return weights_matrix / row_sum


def create_markov_matrix_discrete(weights_matrix, threshold):
    discrete_weights_matrix = np.zeros(weights_matrix.shape)
    ixs = np.where(weights_matrix >= threshold)
    discrete_weights_matrix[ixs] = 1

    return create_markov_matrix(discrete_weights_matrix)


def stationary_distribution(transition_matrix,increase_power=True,normalized=True,):
    n_1, n_2 = transition_matrix.shape
    if n_1 != n_2:
        raise ValueError('\'transition_matrix\' should be square')

    distribution = np.zeros(n_1)

    grouped_indices = connected_nodes(transition_matrix)

    for group in grouped_indices:
        t_matrix = transition_matrix[np.ix_(group, group)]
        eigenvector = _power_method(t_matrix, increase_power=increase_power)
        distribution[group] = eigenvector

    if normalized:
        distribution /= n_1

    return distribution

# Load the data and preprocess

In [46]:
train_df = pd.read_csv("/content/drive/MyDrive/NLP Project/Data/T1_Text_Summarization_English/T1_Text_Summarization_English_train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/NLP Project/Data/T1_Text_Summarization_English/T1_Text_Summarization_English_test.csv")


In [48]:
def handle_abbr(example):
    for i in range(len(example)):
        tok = example[i]
        if example[-1]=='.':
            q = tok[:-1]
            if '.' in q:
                r = example[i].split('.')
                example[i] = ''.join(r)
        if ".." in example[i]:
            r = example[i].split('.')
            r = ''.join(r)
            example[i] = r        

        if ("u.n."in example[i] or "u.k."in example[i] or "u.s." in example[i]  or "u.s.a." in example[i]):
            tok = example[i].split('.')
            example[i] = ''.join(tok)
        
    example = ' '.join(example)
    return example

def preprocess(row):
    row = str(row).lower()
    row = re.split(r"\*",row)
    row = row[0]
    
    row = row.split(" ")
    row =  handle_abbr(row)

    row = re.sub("(\\t)", " ", str(row)).lower()
    row = re.sub("(\\r)", " ", str(row)).lower()
    #row = re.sub("(\\n)", " ", str(row)).lower()

    # Remove _ if it occurs more than one time consecutively
    row = re.sub("(__+)", " ", str(row)).lower()

    # Remove - if it occurs more than one time consecutively
    row = re.sub("(--+)", " ", str(row)).lower()

    # Remove ~ if it occurs more than one time consecutively
    row = re.sub("(~~+)", " ", str(row)).lower()

    # Remove + if it occurs more than one time consecutively
    row = re.sub("(\+\++)", " ", str(row)).lower()

    #remove slashes
    row = re.sub(r"\\", "", str(row)).lower()
    row = re.sub(r"/", "", str(row)).lower()

    #remove apostrophe
    row = re.sub("'","", str(row)).lower()


    # Remove \x9* in text
    row = re.sub(r"(\\x9\d)", " ", str(row)).lower()

    #remove unicode space symbols
    row = re.sub(r'\xa0', r' ', str(row)).lower()

    # Replace any url to only the domain name
    try:
        url = re.search(r"((https*:\/*)([^\/\s]+))(.[^\s]+)", str(row))
        repl_url = url.group(3)
        row = re.sub(r"((https*:\/*)([^\/\s]+))(.[^\s]+)", repl_url, str(row))
    except:
          pass

    # Remove multiple spaces
    row = re.sub("(\s+)", " ", str(row)).lower()

    # Remove the single character hanging between any two spaces
    row = re.sub("(\s+.\s+)", " ", str(row)).lower()
      

    return row

In [90]:
train_df['cleaned_article'] = train_df['Article'].apply(preprocess)
test_df['cleaned_article'] = test_df['Article'].apply(preprocess)


## Generate Extractive Summaries

In [14]:
preprocess(list(train_df['cleaned_article'])[0])

'at sco, india refuses to back chinas belt and road projectindia on sunday refused to back chinas ambitious belt and road initiative at the sco summit with prime minister narendra modi asserting that any mega connectivity project must respect sovereignty and territorial integrity of the countries. india was the only country in the eight-nation shanghai cooperation organisation grouping to oppose the bri project by china. declaration signed by leaders of the shanghai cooperation organisation (sco) countries, including modi, at the end of two-day summit of the bloc said russia, pakistan, kazakhstan, uzbekistan, kyrgyzstan and tajikistan have been "reaffirming" their support for chinas belt and road initiative (bri).related stories sco summit: president xi accepts pm modis invitation for informal summit in india in 2019 pm modi at sco summit 2018: connectivity with neighbourhood and in sco region india’s prioritypm modi, pak pres hussain shake hands at sco summitsco summit: pm modi calls 

In [15]:

model = SentenceTransformer('sentence-transformers/stsb-mpnet-base-v2')

Downloading:   0%|          | 0.00/868 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/588 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [16]:
def return_extractive_summ(doc):
  
  sentences = nltk.sent_tokenize(doc)
  print(len(sentences))
  
  #Compute the sentence embeddings
  embeddings = model.encode(sentences, convert_to_tensor=True)

  #Compute the pair-wise cosine similarities
  cos_scores = util.cos_sim(embeddings, embeddings).numpy()

  #Compute the centrality for each sentence
  centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

  #We argsort so that the first element is the sentence with the highest score
  most_central_sentence_indices = np.argsort(-centrality_scores)


  res = []
  #return the top 5 sentences
  for idx in most_central_sentence_indices[0:5]:
    res.append(sentences[idx].strip())
    
  return ''.join(res)


In [25]:
list_of_article = list(test_df['cleaned_article'])

In [26]:
list_of_article[1977]

' '

In [None]:
list_of_ex_sum = []

for idx,article in enumerate(tqdm(list_of_article)):
  try:
    res = return_extractive_summ(article)
  except:
    print("Summary not generated at ",idx)

    res = " "
  
  list_of_ex_sum.append(res)

In [28]:
len(list_of_ex_sum)

2513

In [29]:
res = []

res.append(list_of_ex_sum[10])

res

['small private plane carrying six passengers and crew of two crashed sunday into vacant, two-story office building in milan suburb, and italian news reports said all aboard perished.the lapresse news agency initially quoted firefighters at the scene saying the pilot and all five passengers aboard were killed.fire officials couldnt immediately be reached to confirm the nationalities or the number of people aboard the plane.firefighters tweeted that no one other than those aboard was involved in the early afternoon crash near subway station in san donato milanese, small town near milan.firefighters work on the site of plane crash, in san donato milanese suburb of milan, italy, sunday.they said several cars in nearby parking lot were set ablaze, but apparently, the vehicles were unoccupied at the time.a thick column of dark smoke rose from the crash site and was visible for kilometers.firefighters were extinguishing the flames of the now-charred building, which reportedly was under renov

In [2]:
import pickle

In [31]:
with open(r"/content/drive/MyDrive/NLP Project/Results/Test_extractive.pkl", "wb") as output_file:
   pickle.dump(list_of_ex_sum, output_file)
   

## Generate Results from Extractive

In [78]:
with open(r"/content/drive/MyDrive/NLP Project/Results/Test_extractive.pkl", "rb") as file:
   list_of_ex_sum = pickle.load(file)
   

In [79]:
res = []

res.append(list_of_ex_sum[1])
res

['ppp instead asked its leaders, khursheed shah, the leader of the opposition in the national assembly, and naveed qamar to file their nomination papers.hope that god will help me in furthering nawaz sharif’s policies,” he told media persons.nawaz sharif on saturday announced that his younger brother, punjab chief minister shehbaz sharif, would be his successor.he nominated abbasi as the interim prime minister as it will take about two months for shehbaz to be elected to the national assembly.meanwhile, the opposition parties on monday failed to agree on joint candidate, with each party throwing its weight behind different candidates who submitted their nomination papers.pakistan’s national assembly will elect new prime minister today to replace ousted leader nawaz sharif .a total of six candidates are in the race to become the new prime minister although pakistan muslim league (nawaz) candidate shahid khaqan abbasi has clear edge.in 342-member house, the pml-n and its coalition partne

In [80]:
doc = list_of_ex_sum[1]

In [81]:
sentences = doc.split('.')[0]
print(len(sentences))

150


In [82]:
sentences

'ppp instead asked its leaders, khursheed shah, the leader of the opposition in the national assembly, and naveed qamar to file their nomination papers'

In [83]:
def handle_abbr(example):
    for i in range(len(example)):
        tok = example[i]
        if example[-1]=='.':
            q = tok[:-1]
            if '.' in q:
                r = example[i].split('.')
                example[i] = ''.join(r)
        if ".." in example[i]:
            r = example[i].split('.')
            r = ''.join(r)
            example[i] = r        

        if ("u.n."in example[i] or "u.k."in example[i] or "u.s." in example[i]  or "u.s.a." in example[i]):
            tok = example[i].split('.')
            example[i] = ''.join(tok)
        
    example = ' '.join(example)
    return example

In [84]:
def gen_csv(list_of_ex_summ):
  list_of_idx = []
  list_of_summ = []

  for idx,summ in enumerate(list_of_ex_sum):
    summ = summ.split(" ")
    summ = handle_abbr(summ)
    list_of_idx.append(idx)

    # sent tokenize each summ
    first_sen = summ.split('.')[0]

    list_of_summ.append(first_sen)

  res_df = pd.DataFrame({"Summary":list_of_summ, "id":list_of_idx})
  

  return res_df






In [85]:
res_df = gen_csv(list_of_ex_sum)
res_df.to_csv("/content/drive/MyDrive/NLP Project/Results/LexRank_Extractive_Test.csv")

how many empty pred

In [74]:
count = 0
for summ in list_of_ex_sum:
  if not summ:
    count = count + 1



In [76]:
count # awesome

0

#Testing on train_data

In [44]:
with open(r"/content/Train_extractive.pkl", "rb") as file:
   list_of_ex_sum = pickle.load(file)

In [45]:

res_df_train = gen_csv(list_of_ex_sum)
res_df_train.to_csv("res_df_train.csv")

In [49]:
train_df['cleaned_summary'] = train_df['Summary'].apply(preprocess)

In [50]:
gold_summ = list(train_df['cleaned_summary'])

In [63]:
pred_summ = list(res_df_train['Summary'])

In [52]:
pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [53]:
from rouge import Rouge

In [None]:
pred_summ

In [None]:
gold_summ

In [65]:
new_pred = []
new_gold = []
for pred,summ in zip(pred_summ,gold_summ):
  
  if not pred:
    continue

  new_pred.append(pred)
  new_gold.append(summ)
  

In [66]:
len(new_pred)

10042

In [68]:
len(new_gold)

10042

In [71]:
rouge = Rouge()

scores = rouge.get_scores(new_pred,new_gold,avg=True) # hyp,ref

In [72]:
scores

{'rouge-1': {'r': 0.36357489067363125,
  'p': 0.3650794271125573,
  'f': 0.3519916067098555},
 'rouge-2': {'r': 0.2264376679528056,
  'p': 0.22039065052152135,
  'f': 0.21575764710876053},
 'rouge-l': {'r': 0.32208910265328494,
  'p': 0.3244935965421752,
  'f': 0.31251766195903224}}

# Adding and saving the train_df with the extractive summaries

In [86]:
with open(r"/content/Train_extractive.pkl", "rb") as file:
   list_of_ex_sum_train = pickle.load(file)

In [87]:
train_df["Ext_Summary"] = list_of_ex_sum_train

In [88]:
train_df.to_csv("/content/drive/MyDrive/NLP Project/Data/Extractive/Train_With_EXTSumm")

# SAving the Test Df

In [91]:
test_df

Unnamed: 0,Heading,Article,id,cleaned_article
0,EXPLAINER: How worrying is the variant first s...,How worrying is the variant first seen in Indi...,0,how worrying is the variant first seen in indi...
1,Pakistan Parliament to elect new prime ministe...,Pakistan’s National Assembly will elect a new ...,1,pakistan’s national assembly will elect new pr...
2,Indian-origin pathologist accused of botching ...,Dr. Khalid AhmedAn Indian-origin pathologist h...,2,dr. khalid ahmedan indian-origin pathologist h...
3,China begins world's biggest census drive to c...,China begins world's biggest census drive to c...,3,china begins worlds biggest census drive to co...
4,"Indonesia prison fire kills 41 drug inmates, i...","Indonesia prison fire kills 41 drug inmates, i...",4,"indonesia prison fire kills 41 drug inmates, i..."
...,...,...,...,...
2508,"Arab League calls for Israel boycott, terms it...",The Arab League (AL) called on Arab States on ...,2508,the arab league (al) called on arab states on ...
2509,Beirut explosion among most powerful non-nucle...,Beirut explosion among most powerful non-nucle...,2509,beirut explosion among most powerful non-nucle...
2510,Anti-aircraft gun bullets found near Pak PM Im...,Imran KhanPolice in Pakistan have seized 18 li...,2510,imran khanpolice in pakistan have seized 18 li...
2511,Air-Launched Ballistic Missile will realise Ch...,Representational ImageThe US Department of Def...,2511,representational imagethe us department of def...


In [93]:
with open(r"/content/drive/MyDrive/NLP Project/Results/Test_extractive.pkl", "rb") as file:
   list_of_ex_sum_test = pickle.load(file)
   

In [94]:
test_df["Ext_Summary"] = list_of_ex_sum_test

In [95]:
test_df

Unnamed: 0,Heading,Article,id,cleaned_article,Ext_Summary
0,EXPLAINER: How worrying is the variant first s...,How worrying is the variant first seen in Indi...,0,how worrying is the variant first seen in indi...,experts think the next few weeks should provid...
1,Pakistan Parliament to elect new prime ministe...,Pakistan’s National Assembly will elect a new ...,1,pakistan’s national assembly will elect new pr...,"ppp instead asked its leaders, khursheed shah,..."
2,Indian-origin pathologist accused of botching ...,Dr. Khalid AhmedAn Indian-origin pathologist h...,2,dr. khalid ahmedan indian-origin pathologist h...,"in may last year, the senior coroner at the no..."
3,China begins world's biggest census drive to c...,China begins world's biggest census drive to c...,3,china begins worlds biggest census drive to co...,china begins worlds biggest census drive to co...
4,"Indonesia prison fire kills 41 drug inmates, i...","Indonesia prison fire kills 41 drug inmates, i...",4,"indonesia prison fire kills 41 drug inmates, i...",block was stuffed full of 122 convicts when th...
...,...,...,...,...,...
2508,"Arab League calls for Israel boycott, terms it...",The Arab League (AL) called on Arab States on ...,2508,the arab league (al) called on arab states on ...,the arab league (al) called on arab states on ...
2509,Beirut explosion among most powerful non-nucle...,Beirut explosion among most powerful non-nucle...,2509,beirut explosion among most powerful non-nucle...,"""this was an unprecedented event because never..."
2510,Anti-aircraft gun bullets found near Pak PM Im...,Imran KhanPolice in Pakistan have seized 18 li...,2510,imran khanpolice in pakistan have seized 18 li...,imran khanpolice in pakistan have seized 18 li...
2511,Air-Launched Ballistic Missile will realise Ch...,Representational ImageThe US Department of Def...,2511,representational imagethe us department of def...,the office of the us secretary of defence note...


In [96]:
test_df.to_csv("/content/drive/MyDrive/NLP Project/Data/Extractive/Test_With_EXTSumm")