# Extractive Text Summarization

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import re

In [2]:
# Location of the news dataset, relative to this notebook.
DATA_PATH = "../data/data_news.csv"

# NOTE(review): the rendered table carries an "Unnamed: 0" column, which looks
# like an index saved into the CSV - consider index_col=0 if it is not needed.
data = pd.read_csv(DATA_PATH, encoding="utf-8")
data

Unnamed: 0,category,summary,text
0,entertainment,Nigel McCune from the Musicians' Union said Br...,Musicians to tackle US red tape Musicians' gr...
1,entertainment,But they still want more.They have to want to ...,"U2's desire to be number one U2, who have won..."
2,entertainment,"Babyshambles, which he formed after his acrimo...",Rocker Doherty in on-stage fight Rock singer ...
3,entertainment,A Series of Unfortunate Events also stars Scot...,Snicket tops US box office chart The film ada...
4,entertainment,"Ocean's Twelve, the crime caper sequel starrin...",Ocean's Twelve raids box office Ocean's Twelv...
...,...,...,...
2219,tech,Writing a Microsoft Word document can be a dan...,Warning over Windows Word files Writing a Mic...
2220,tech,The lifts take only 30 seconds to whisk passen...,Fast lifts rise into record books Two high-sp...
2221,tech,Nintendo is releasing an adapter for its DS ha...,Nintendo adds media playing to DS Nintendo is...
2222,tech,The new strains of the Cabir mobile phone viru...,Fast moving phone viruses appear Security fir...


In [3]:
def build_similarity_matrix(sentences, stop_words):
    """Return the square matrix of pairwise sentence similarities.

    Entry ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j],
    stop_words)`` for every ``i != j``; the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never scored against itself.
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix




In [4]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between the word-count vectors of two sentences.

    Parameters
    ----------
    sent1, sent2 : str or iterable of str
        Either a raw sentence or an already tokenised sequence of words.
        Raw strings are split into words here; token sequences are used
        as given. Words are compared case-insensitively.
    stopwords : collection of str, optional
        Lower-case words to leave out of both vectors. Defaults to none.

    Returns
    -------
    float
        The cosine of the angle between the two count vectors, or ``0.0``
        when either sentence has no words left after stop-word removal.
    """
    # A set gives O(1) membership checks for every token.
    excluded = frozenset(stopwords) if stopwords is not None else frozenset()

    def _words(sentence):
        # Iterating a str yields single characters, so a raw sentence has to
        # be split into words first; apostrophes inside a word are kept so
        # that contractions can match stop words such as "don't".
        if isinstance(sentence, str):
            sentence = re.findall(r"\w+(?:'\w+)*", sentence)
        lowered = (w.lower() for w in sentence)
        return [w for w in lowered if w not in excluded]

    words1 = _words(sent1)
    words2 = _words(sent2)

    # An all-zero vector has no direction: the cosine would be 0/0 (NaN).
    if not words1 or not words2:
        return 0.0

    # Map each distinct word to a vector slot (dict lookup instead of list.index).
    slot = {}
    for w in words1 + words2:
        slot.setdefault(w, len(slot))

    vector1 = np.zeros(len(slot))
    vector2 = np.zeros(len(slot))
    for w in words1:
        vector1[slot[w]] += 1
    for w in words2:
        vector2[slot[w]] += 1

    norms = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return float(np.dot(vector1, vector2) / norms)


In [14]:
def generate_summary(text, top_n=5):
    """Build an extractive summary from the highest-ranked sentences of ``text``.

    The text is split into sentences, a pairwise similarity matrix is built
    with ``build_similarity_matrix``, and the sentences are scored by running
    PageRank over the graph made from that matrix.

    Parameters
    ----------
    text : str
        The document to summarise.
    top_n : int, optional
        Maximum number of sentences to keep. Fewer are returned when the
        text contains fewer sentences.

    Returns
    -------
    str
        The selected sentences joined by single spaces, highest score first.
        Empty when the text has no sentences or ``top_n`` is not positive.
    """
    stop_words = stopwords.words('english')

    # Step 1 - Split the text into sentences
    sentences = sent_tokenize(text)
    if not sentences or top_n <= 0:
        return ""

    # Step 2 - Generate the similarity matrix across sentences
    similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences with PageRank over the similarity graph
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Step 4 - Sort by score, best first
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Step 5 - Keep the top sentences. Slicing instead of indexing 0..top_n-1
    # avoids an IndexError when the text has fewer than top_n sentences.
    return " ".join(sentence for _, sentence in ranked[:top_n])

In [15]:
# Copy the `text` and `summary` columns of the frame into plain Python lists,
# keeping row order.
all_news_text = list(data.text.values)
all_summary_text = list(data.summary.values)

In [25]:
# Show one article next to its dataset summary and the generated summary.

n = 0  # index of the article to inspect

original_text = all_news_text[n]
original_summary = all_summary_text[n]
generated_summary = generate_summary(original_text, 5)

print(f"""
      Original Text: 
      
      {original_text}
      
      Original Summary: 
      
      {original_summary}
      
      Generated Summary: 
      
      {generated_summary}
      
      """)


      Original Text: 
      
      Musicians to tackle US red tape  Musicians' groups are to tackle US visa regulations which are blamed for hindering British acts' chances of succeeding across the Atlantic.  A singer hoping to perform in the US can expect to pay $1,300 (£680) simply for obtaining a visa. Groups including the Musicians' Union are calling for an end to the "raw deal" faced by British performers. US acts are not faced with comparable expense and bureaucracy when visiting the UK for promotional purposes.  Nigel McCune from the Musicians' Union said British musicians are "disadvantaged" compared to their US counterparts. A sponsor has to make a petition on their behalf, which is a form amounting to nearly 30 pages, while musicians face tougher regulations than athletes and journalists. "If you make a mistake on your form, you risk a five-year ban and thus the ability to further your career," says Mr McCune.  "The US is the world's biggest music market, which means somethi

In [22]:
# it's funny