## Extractive Summarization

In [61]:
# Importing libraries
import re            
from tqdm import tqdm,trange 
import os  
import numpy as np   
import pandas as pd  
import nltk    
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.metrics.pairwise import cosine_similarity
# !pip install rouge
# !pip install networkx
import networkx as nx
from rouge import Rouge
import string

### The available article categories are Entertainment, Business, Politics, Sport, Tech

In [105]:
print("Article Categories - Entertainment, Business, Politics, Sport, Tech")
type_of_article = input("Enter the category of articles: ") #entertainment/business/politices/sport/tech
print(type_of_article)
root_path = r"C:\Users\HP PAV -15 AU111TX\Desktop\NLP\Project\archive (1)\BBC News Summary\BBC News Summary"
num_of_article = len(os.listdir(f"{root_path}/News Articles/{type_of_article}"))
print(f'"Reading the {type_of_article} data"')
df = pd.DataFrame(columns=['title','article','summary'])

for i in tqdm(range(num_of_article)):
    with open(f'{root_path}/News Articles/{type_of_article}/{(i+1):03d}.txt', 'r') as f:
        article = f.read().splitlines()
    with open(f'{root_path}/Summaries/{type_of_article}/{(i+1):03d}.txt', 'r') as f:
        summary = f.read().splitlines()
    df.loc[i] = [article[0],article[2:],summary[0]]

Article Categories - Entertainment, Business, Politics, Sport, Tech
Enter the category of articles: Entertainment
Entertainment
"Reading the Entertainment data"


100%|████████████████████████████████████████████████████████████████████████████████| 386/386 [00:20<00:00, 18.80it/s]


In [106]:
df.head()

Unnamed: 0,title,article,summary
0,Gallery unveils interactive tree,[A Christmas tree that can receive text messag...,"The messages will be ""unwrapped"" by sculptor R..."
1,Jarre joins fairytale celebration,[French musician Jean-Michel Jarre is to perfo...,Bloom is to be formally presented with the Han...
2,Musical treatment for Capra film,[The classic film It's A Wonderful Life is to ...,The classic film It's A Wonderful Life is to b...
3,Richard and Judy choose top books,[The 10 authors shortlisted for a Richard and ...,"""It was very hard to follow last year's extrem..."
4,Poppins musical gets flying start,[The stage adaptation of children's film Mary ...,Mary Poppins was originally created by author ...


### Example of the article

In [107]:
def article(article):
    """
    Iterates and prints the articles in the dataset.
    Parameter: Article index  
    Return: Article
    """
    print("Example Article: ")
    for i in article:
        print(i)
article(df['article'][2])

Example Article: 
The classic film It's A Wonderful Life is to be turned into a musical by the producer of the controversial hit show Jerry Springer - The Opera.

Frank Capra's 1946 movie starring James Stewart, is being turned into a Â£7m musical by producer Jon Thoday. He is working with Steve Brown, who wrote the award-winning musical Spend Spend Spend. A spokeswoman said the plans were in the "very early stages", with no cast, opening date or theatre announced.

A series of workshops have been held in London, and on Wednesday a cast of singers unveiled the musical to a select group of potential investors. Mr Thoday said the idea of turning the film into a musical had been an ambition of his for almost 20 years. It's a Wonderful Life was based on a short story, The Greatest Gift, by Philip van Doren Stern. Mr Thoday managed to buy the rights to the story from Van Doren Stern's family in 1999, following Mr Brown's success with Spend Spend Spend. He later secured the film rights from 

In [67]:
def sentence_tokenize(sentence):
    """
    Tokenizes each sentence of the article
    Parameter: list of split sentence
    Return: Tokenized sentences
    """
    l= []
    temp_list = []
    for i in range(len(sentence)):
        if not (len(temp_list)==0 and sentence[i]==' '):
            temp_list.append(sentence[i])
        if i==len(sentence)-1 or (sentence[i]=='.' and (not(sentence[i+1].isdigit()) or sentence[i+1]==" ")):
            l.append(''.join(temp_list))
            temp_list = []
    return l

def split_sentence(article):
    """
    Splits the articles into sentences
    Parameter: article with index
    Return: Tokenized sentences
    """
    sentences = []
    for s in article:
        sentences.extend(sentence_tokenize(s))  
    return sentences

def lower_sentence(sentence):
    """
    Converts all the sentences into lowercase
    Parameter: Sentences
    Return: lowercase sentences
    """
    lower_sentences = [s.lower() for s in sentence]
    return lower_sentences

In [68]:
# shows the stop words obtained from NLTK library
stop_words[:20:2]

['i',
 'my',
 'we',
 'ours',
 'you',
 "you've",
 "you'd",
 'yours',
 'yourselves',
 'him']

In [69]:
def stopwords_removal(sentence):
    """
    Removes stop words from the sentences
    Parameter: list of sentences
    Return: Sentences with removed stopwords
    """
    new_sentence = " ".join([i for i in sentence if i not in stop_words])
    return new_sentence

def clean_sentence(sentences):
    """
    Removes all the special characters from the sentences
    Parameter: list of sentences
    Return: Sentences with removed special characters
    """    
    s=pd.Series(sentences)
    clean_sentences = s.str.replace("[^a-zA-Z]", " ",regex=True)
    return clean_sentences

In [70]:
def word_embeddings(path):
    """
    Get the word embedding vectors
    Parameter: Glove 100d path
    Return: word embedding vectors
    """
    word_embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split() 
            word = values[0]      
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs
    return word_embeddings

In [71]:
# Checking the word embeddings
word= word_embeddings(r"C:\Users\HP PAV -15 AU111TX\Desktop\NLP\Project\glove.6B\glove.6B.100d.txt")
print('Word vector size',len(word['all']))
word['all']

Word vector size 100


array([-0.21823 ,  0.69199 ,  0.70441 , -0.59642 , -0.21818 ,  0.55387 ,
       -0.32052 ,  0.52602 , -0.31667 , -0.19129 ,  0.76109 ,  0.047439,
        0.43199 ,  0.12232 ,  0.25664 , -0.52453 ,  0.048994,  0.81621 ,
       -0.53336 ,  0.53093 ,  0.24589 , -0.046352,  0.38898 , -0.41434 ,
        0.28169 , -0.35422 ,  0.24713 , -0.44007 ,  0.023343, -0.38592 ,
        0.31762 ,  0.26774 , -0.19487 ,  0.024135, -0.056042,  0.33799 ,
        0.12103 ,  0.32306 , -0.67209 , -0.028449, -0.79051 , -0.29798 ,
        0.25696 , -0.1822  , -0.066176,  0.28468 ,  0.019382, -0.51672 ,
       -0.065801, -0.74178 , -0.043   ,  0.10303 , -0.22385 ,  0.96676 ,
       -0.38914 , -2.1671  ,  0.25583 ,  0.067169,  2.0256  ,  0.86387 ,
       -0.14699 ,  1.0254  , -0.42629 ,  0.19325 ,  0.83025 ,  0.097585,
        0.79303 ,  0.4349  ,  0.26404 , -0.17101 , -0.13859 , -0.55096 ,
        0.020747, -0.39791 ,  0.43081 ,  0.37966 , -0.52257 , -0.20961 ,
       -1.1568  , -0.38041 ,  0.81093 , -0.050365, 

In [72]:
def get_sentence_vector(clean_sentences,word_embeddings,dim):
    """
    Takes the sentences and word embeddings and gives the sentence vector
    Parameter: cleaned sentences, word embeddings, dimension of vector 
    Return: sentence vector
    """
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0 and len(i.split())!=0:
            v = sum([word_embeddings.get(w, np.zeros((dim,))) for w in i.split()])/ (len(i.split()))
        else:
            v = np.zeros((dim,))
        sentence_vectors.append(v)
    return sentence_vectors

def get_sim_mat(sentences,sentence_vectors,dim):
    """
    Takes the sentences and sentence vectors to create similarity matrix using cosine similarity
    Parameter: sentences, sentence vectors, dimension of vector 
    Return: similarity matrix
    """
    similarity_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,dim), sentence_vectors[j].reshape(1,dim))[0,0]
    return similarity_mat

def rank_sentence(similarity_matrix,sentences):
    """
    Takes the sentences and similarity matrix to rank the sentences using networkx library
    Parameter: similarity matrix, sentences
    Return: ranked sentences
    """
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i],i) for i,s in enumerate(sentences)), reverse=True)
    return ranked_sentences

def extract_summary(ranked_sentences, sentence_number):
    """
    Takes the ranked sentences and number of sentences that should be taken from the original summary to extract the new summary
    Parameter: ranked sentences, number of sentences that should be considered from the original summary
    Return: extracted summary
    """
    result_lst = []
    for i in range(sentence_number):
        result_lst.append(ranked_sentences[i][1])
    return result_lst

In [109]:
def test(i,word_embeddings,dim):
    """
    The function tests the extractive summarizartion model by calling all the functions for a particular article and gives the accuracy
    Parameter: index of article, word embeddings, dimension of vector
    Return: accuracy, old summary, new summary, ranked sentence list
    """
    sentences = split_sentence(df['article'][i])
    stop_words = stopwords.words('english')
    ls=lower_sentence(sentences)
    clean_stopword_sentences = [stopwords_removal(r.split()) for r in ls]
    clean_sentences = clean_sentence(clean_stopword_sentences)
    sentence_vectors = get_sentence_vector(clean_sentences,word,dim)
    similarity_matrix = get_sim_mat(sentences,sentence_vectors,dim)
    ranked_sentences = rank_sentence(similarity_matrix,sentences)
    data = df['summary'][i]
    data = sentence_tokenize(data)
    gold = split_sentence(data)
    sentence_num = len(gold)
    result_lst = extract_summary(ranked_sentences,sentence_num)
    result_lst.sort()
    gold_lst = []
    for i,sent in enumerate(sentences):
        if sent in gold:
            gold_lst.append(i)
    correct = 0
    for i in range(len(result_lst)):
        if result_lst[i] in gold_lst:
            correct+=1
    accuracy=correct/sentence_num
    return accuracy, data, sentences, result_lst

In [110]:
dim = 100
word =word_embeddings(r"C:\Users\HP PAV -15 AU111TX\Desktop\NLP\Project\glove.6B\glove.6B.100d.txt")
t=trange(len(df))

def result(t,word_embeddings,dim):
    """
    Gives the accuracy of the model 
    Parameter: length of dataframe, word embeddings and dimesion of vector
    Return: None
    """
    result = []
    for i in t:
        accuracy,data,sentences,result_lst = test(i,word,dim)
        result.append(accuracy)
        t.set_postfix(Acc='%g' % accuracy)
    print("Accuracy: ",round(sum(result)/len(result),2))

res=result(t,word,dim)

100%|██████████████████████████████████████████████████████████████████| 386/386 [01:07<00:00,  5.74it/s, Acc=0.444444]

Accuracy:  0.54





In [111]:
def rouge_score(data,sentences,result_lst):
    """
    Gives the rouge score of original summary and extractive summary of any article 
    Parameter: length of dataframe, word embeddings and dimesion of vector
    Return: None
    """
    hypothesis = ''.join([sentences[i] for i in result_lst])
    reference = ''.join([ i for i in data])
    rouge = Rouge()
    scores = rouge.get_scores(hypothesis, reference, avg=True)
    score_1 = round(scores['rouge-1']['f'], 2)    
    score_2 = round(scores['rouge-2']['f'], 2)    
    score_L = round(scores['rouge-l']['f'], 2)  
    print("\nExtracted Summary: \n",hypothesis)
    print("\nOriginal Summary: \n",reference)
    print("\nArticle summary:" ,"rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",score_2, "\nAverage rouge:",
          round(np.mean([score_1,score_2,score_L]), 2))

i=50
accuracy,data,sentences,result_lst = test(i,word,dim)
score=rouge_score(data,sentences,result_lst)


Extracted Summary: 
 Jamie Foxx and Hilary Swank have won the Screen Actors Guild Awards for best male and female film actors, boosting their Oscars hopes this month.Foxx's portrayal of late soul-singer Ray Charles in Ray had already earned him a prestigious Golden Globe award.Modest wine country comedy Sideways knocked out favourites Million Dollar Baby and The Aviator by taking the top prize for best cast performance.Veteran actor Morgan Freeman took the best supporting actor award for playing a prize-fighter turned gym manager in Million Dollar Baby."Thank you for Ray Charles for just living so complex and so interesting, and making us all just come together," said Foxx, accepting his award in Los Angeles on Saturday.He also praised the film director: "Thank you for Taylor Hackford for taking a chance with an African-American film." Swank, too, was full of praise for her director and co-star Clint Eastwood.Both Foxx and Swank are now considered to be among the favourites to get Osc

## TextRank using Gensim

In [75]:
# !pip install "gensim==3.8.2"

In [76]:
import gensim

In [77]:
gensim.__version__

'3.8.1'

In [78]:
from gensim.summarization import summarize
from gensim.summarization import keywords

In [99]:
df['article'][50]

['A blind student has developed software that turns colours into musical notes so that he can read weather maps.',
 '',
 'Victor Wong, a graduate student from Hong Kong studying at Cornell University in New York State, had to read coloured maps of the upper atmosphere as part of his research. To study "space weather" Mr Wong needed to explore minute fluctuations in order to create mathematical models. A number of solutions were tried, including having a colleague describe the maps and attempting to print them in Braille. Mr Wong eventually hit upon the idea of translating individual colours into music, and enlisted the help of a computer graphics specialist and another student to do the programming work.',
 '',
 '"The images have three dimensions and I had to find a way of reading them myself," Mr Wong told the BBC News website. "For the sake of my own study - and for the sake of blind scientists generally - I felt it would be good to develop software that could help us to read colour 

In [112]:
def textrank(article, ratio=0.2):
    """
    Summarizes the article with TextRank using Gensim summarizer.  
    Parameter: article, ratio or length of summary (ex. 20% of the text)
    Return: list of summaries
    """
    article=str(article)
    if type(article) is str:     
        article = [article]  
    list_summaries = [gensim.summarization.summarize(i,ratio=ratio) for i in article]    
    return list_summaries
i=50
summary = textrank(article=df['article'][i], ratio=0.5)    
summary[0]

'[\'Jamie Foxx and Hilary Swank have won the Screen Actors Guild Awards for best male and female film actors, boosting their Oscars hopes this month.\', \'\', "Foxx\'s portrayal of late soul-singer Ray Charles in Ray had already earned him a prestigious Golden Globe award.\nSwank triumphed for playing a gutsy female boxer in Million Dollar Baby.\nModest wine country comedy Sideways knocked out favourites Million Dollar Baby and The Aviator by taking the top prize for best cast performance.", \'\', \'The Screen Actors Guild (SAG) represents US film and TV actors.\nVeteran actor Morgan Freeman took the best supporting actor award for playing a prize-fighter turned gym manager in Million Dollar Baby.\', \'\', \'"Thank you for Ray Charles for just living so complex and so interesting, and making us all just come together," said Foxx, accepting his award in Los Angeles on Saturday.\', \'\', \'He also praised the film director: "Thank you for Taylor Hackford for taking a chance with an Afric

In [119]:
def evaluate_summary(y_test, predicted):  
    
    rouge = Rouge()
    scores = rouge.get_scores(y_test, predicted, avg=True)       
    score_1 = round(scores['rouge-1']['f'], 2)    
    score_2 = round(scores['rouge-2']['f'], 2)    
    score_L = round(scores['rouge-l']['f'], 2)    
    print("\nArticle Summary: ", "rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",
         score_2, "\nAverage rouge:", round(np.mean([score_1,score_2,score_L]), 2))

In [120]:
def summ(i):
    """
    Uses TextRank function to extract the summary from the article
    Parameter: index of article
    Return: None
    """
    summary = textrank(df['article'][i], ratio=0.2)
    s=summary[0]  
    punc = '''!()-[]{};:'"\,<>/@#$%^&*_~[""'''
    for element in s:
        if element in punc:
            test = s.replace(element, "")
    return test

# index of article
i=50
s=summ(i)
evaluate_summary(df["summary"][i], s)



Article Summary:  rouge1: 0.56 | rouge2: 0.46 | rougeL: 0.46 
Average rouge: 0.53


In [121]:
def printsumm(i):
    """
    Prints the original summary and the predicted summary
    Parameter: index of article
    Return: None
    """
    predicted_summary= summ(i)
    print("\nExtracted Summary\n","".join(predicted_summary))
    original_summary=df["summary"][i]
    print("\nOriginal Summary\n",original_summary)
    s=summ(i)
    evaluate_summary(df["summary"][i], s)
    
printsumm(50)


Extracted Summary
 Modest wine country comedy Sideways knocked out favourites Million Dollar Baby and The Aviator by taking the top prize for best cast performance.", '', 'The Screen Actors Guild (SAG) represents US film and TV actors.
Veteran actor Morgan Freeman took the best supporting actor award for playing a prizefighter turned gym manager in Million Dollar Baby.', '', '"Thank you for Ray Charles for just living so complex and so interesting, and making us all just come together," said Foxx, accepting his award in Los Angeles on Saturday.', '', 'He also praised the film director: "Thank you for Taylor Hackford for taking a chance with an AfricanAmerican film.

Original Summary
 Jamie Foxx and Hilary Swank have won the Screen Actors Guild Awards for best male and female film actors, boosting their Oscars hopes this month.Swank triumphed for playing a gutsy female boxer in Million Dollar Baby.Both Foxx and Swank are now considered to be among the favourites to get Oscars - the Hol