In [None]:
# Data scraping for testing the algorithm step by step.
import requests
from bs4 import BeautifulSoup
import re

# Fail fast on network problems instead of hanging forever or parsing an error page.
r = requests.get('https://en.wikipedia.org/wiki/COVID-19_pandemic', timeout=30)
r.raise_for_status()
page = r.content
soup = BeautifulSoup(page, 'html.parser')
parag = soup.find_all('p')
title = soup.title.text

# Use the 2nd to 5th <p> elements. Slicing (instead of indexing parag[1]..parag[4])
# does not raise IndexError when the page yields fewer paragraphs.
l = [p.text.strip('\n').replace('\xa0', ' ') for p in parag[1:5]]

# Join once, after every paragraph has been collected, then drop bracketed numeric
# citation markers such as "[12]". The brackets are escaped on purpose: the earlier
# pattern '[[0-9]*]' was read as a character class followed by a literal ']', so it
# also removed stray ']' characters and digit runs that happened to precede one.
data = ' '.join(l)
document = re.sub(r'\[[0-9]*\]', '', data)



In [None]:
# Fixed sample article used as the summarizer's input in the cells below.
# NOTE: this rebinds `document`, so the text built by the scraping cell above is
# discarded; the following cells operate on this literal instead.
document = '''Liverpool's all-conquering season that brought them the club's first league title in 30 years seemed an age away as they were eliminated by Real Madrid in the Champions League quarter-final at a deserted Anfield.

Jurgen Klopp's side, who carried all before them in the league last season, are now left with only the Premier League's top four - and a place in Europe's elite competition next term - to aim for if they are to salvage something from this fragmented, disappointing, injury-hit campaign.

As Zinedine Zidane's players celebrated a professional job that saw them set up a Champions League semi-final against Chelsea, Klopp was left to ponder Liverpool's fall from grace in the past 12 months.

He will no doubt face calls from frustrated Reds fans to revamp a squad that has served him and the club magnificently in recent seasons - indeed the social media jury was already delivering that knee-jerk verdict moments after elimination.

It's clear we lost the quarter-final tie in Madrid - Klopp
The manager, quite rightly, will guard against over-reaction. Liverpool have achieved too much and have too many high-class players for that - but that does not mean he does not have serious questions to consider.

So does this Liverpool squad really need a major overhaul, or just minor renewal?

A team as good as Liverpool have been seems an unlikely candidate for an extensive rebuild - but fresh faces can create momentum, renewal and a new mood, and this group has been together a long time.

When Liverpool and Real met in the Champions League final in Kyiv in 2018, Klopp's line-up included seven of those who started at Anfield on Wednesday.
'''

In [None]:
#Split the document into sentences.
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Flatten the text onto a single line: trim both ends, then turn every newline
# into one space (blank lines therefore become two consecutive spaces).
document = document.strip().replace('\n', ' ')
# The Punkt tokenizer is created without any training text.
sentence_tokenizer = PunktSentenceTokenizer()
sentences = sentence_tokenizer.tokenize(document)

In [None]:
sentences

["Liverpool's all-conquering season that brought them the club's first league title in 30 years seemed an age away as they were eliminated by Real Madrid in the Champions League quarter-final at a deserted Anfield.",
 "Jurgen Klopp's side, who carried all before them in the league last season, are now left with only the Premier League's top four - and a place in Europe's elite competition next term - to aim for if they are to salvage something from this fragmented, disappointing, injury-hit campaign.",
 "As Zinedine Zidane's players celebrated a professional job that saw them set up a Champions League semi-final against Chelsea, Klopp was left to ponder Liverpool's fall from grace in the past 12 months.",
 'He will no doubt face calls from frustrated Reds fans to revamp a squad that has served him and the club magnificently in recent seasons - indeed the social media jury was already delivering that knee-jerk verdict moments after elimination.',
 "It's clear we lost the quarter-final t

In [None]:
#Convert a sentence into a bag-of-words (word -> number of occurrences).
from collections import Counter
 
def bag_of_words(sentence):
    """Count how often each word occurs in ``sentence``.

    The sentence is split on any run of whitespace; every token is lower-cased
    and has leading/trailing periods and commas removed. Tokens that end up
    empty after that clean-up are ignored.

    Args:
        sentence: Text to tokenize.

    Returns:
        A ``collections.Counter`` mapping each cleaned token to its count.
    """
    # split() with no argument collapses repeated whitespace (and handles tabs and
    # newlines), whereas split(' ') yields '' for every extra space and for an
    # empty sentence.
    cleaned = (word.lower().strip('.,') for word in sentence.split())
    # Skip tokens that consisted only of the stripped punctuation (e.g. "...").
    return Counter(token for token in cleaned if token)

In [None]:
bag_of_words(sentences[0])

Counter({'30': 1,
         'a': 1,
         'age': 1,
         'all-conquering': 1,
         'an': 1,
         'anfield': 1,
         'as': 1,
         'at': 1,
         'away': 1,
         'brought': 1,
         'by': 1,
         'champions': 1,
         "club's": 1,
         'deserted': 1,
         'eliminated': 1,
         'first': 1,
         'in': 2,
         'league': 2,
         "liverpool's": 1,
         'madrid': 1,
         'quarter-final': 1,
         'real': 1,
         'season': 1,
         'seemed': 1,
         'that': 1,
         'the': 2,
         'them': 1,
         'they': 1,
         'title': 1,
         'were': 1,
         'years': 1})

In [None]:
#Build a SciPy sparse matrix out of a collection of texts.
from sklearn.feature_extraction.text import CountVectorizer

# Fit a fresh vectorizer on a one-element corpus: just the first sentence.
c = CountVectorizer()
bow_array = c.fit_transform([sentences[0]])
# Bare expression: the dense form of the result is what the cell displays.
bow_array.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 1, 1, 1, 1]])

In [None]:
#Apply this to the entire collection, and get back a matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Rebinds `c` to a new vectorizer, this time fitted on every sentence.
c = CountVectorizer()
bow_matrix = c.fit_transform(sentences)
# Bare expression: displays the matrix repr (the saved output reports a 9x180 sparse matrix).
bow_matrix

<9x180 sparse matrix of type '<class 'numpy.int64'>'
	with 250 stored elements in Compressed Sparse Row format>

In [None]:
#Converting to a Graph
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with scikit-learn's TfidfTransformer.
normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

# Multiply the matrix by its own transpose: entry (i, j) is the dot product of the
# TF-IDF rows of sentences i and j, which serves as the edge weight between them.
similarity_graph = normalized_matrix @ normalized_matrix.T
similarity_graph.toarray()

array([[1.        , 0.19481437, 0.18518798, 0.10603924, 0.16758751,
        0.03824826, 0.01580574, 0.09412877, 0.27127046],
       [0.19481437, 1.        , 0.1579694 , 0.08884095, 0.07867906,
        0.04706443, 0.02514469, 0.04729526, 0.15180013],
       [0.18518798, 0.1579694 , 1.        , 0.11801513, 0.12149322,
        0.08211884, 0.01680086, 0.04862441, 0.17491438],
       [0.10603924, 0.08884095, 0.11801513, 1.        , 0.08891205,
        0.09666963, 0.0371855 , 0.04183458, 0.06398199],
       [0.16758751, 0.07867906, 0.12149322, 0.08891205, 1.        ,
        0.        , 0.        , 0.        , 0.12042031],
       [0.03824826, 0.04706443, 0.08211884, 0.09666963, 0.        ,
        1.        , 0.09114083, 0.13497709, 0.02366474],
       [0.01580574, 0.02514469, 0.01680086, 0.0371855 , 0.        ,
        0.09114083, 1.        , 0.08066602, 0.01796402],
       [0.09412877, 0.04729526, 0.04862441, 0.04183458, 0.        ,
        0.13497709, 0.08066602, 1.        , 0.0362595 ],


In [None]:
# Using Pagerank with networkx
import networkx as nx

# `from_scipy_sparse_matrix` was removed in NetworkX 3.0. Prefer its replacement,
# `from_scipy_sparse_array`, and fall back only on older NetworkX releases.
if hasattr(nx, 'from_scipy_sparse_array'):
    nx_graph = nx.from_scipy_sparse_array(similarity_graph)
else:
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
# PageRank yields a dict keyed by node; nodes are the sentence indices.
scores = nx.pagerank(nx_graph)
scores

{0: 0.1287068954471938,
 1: 0.1145050084489093,
 2: 0.12035936449947437,
 3: 0.10874175543628124,
 4: 0.10360490881924775,
 5: 0.10551237805702114,
 6: 0.09758578917950579,
 7: 0.10375385117983243,
 8: 0.11723004893253386}

In [None]:
# Pair every sentence with its PageRank score, then order the pairs from the
# highest score to the lowest (ties fall back to comparing the sentence text).
ranked = [(scores[i], s) for i, s in enumerate(sentences)]
ranked.sort(reverse=True)
# Text of the top-ranked sentence.
ranked[0][1]

"Liverpool's all-conquering season that brought them the club's first league title in 30 years seemed an age away as they were eliminated by Real Madrid in the Champions League quarter-final at a deserted Anfield."

In [None]:
ranked # every (score, sentence) pair, sorted from highest to lowest score

[(0.1287068954471938,
  "Liverpool's all-conquering season that brought them the club's first league title in 30 years seemed an age away as they were eliminated by Real Madrid in the Champions League quarter-final at a deserted Anfield."),
 (0.12035936449947437,
  "As Zinedine Zidane's players celebrated a professional job that saw them set up a Champions League semi-final against Chelsea, Klopp was left to ponder Liverpool's fall from grace in the past 12 months."),
 (0.11723004893253386,
  "When Liverpool and Real met in the Champions League final in Kyiv in 2018, Klopp's line-up included seven of those who started at Anfield on Wednesday."),
 (0.1145050084489093,
  "Jurgen Klopp's side, who carried all before them in the league last season, are now left with only the Premier League's top four - and a place in Europe's elite competition next term - to aim for if they are to salvage something from this fragmented, disappointing, injury-hit campaign."),
 (0.10874175543628124,
  'He wi

In [None]:
#All in one function
import networkx as nx
import numpy as np
 
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer 

def _summary_length(total, requested=None):
    """Return how many of ``total`` ranked sentences go into the summary."""
    if requested is None:
        if total == 1:
            return 1
        if total <= 5:
            return total // 2
        # Longer documents: keep the original interactive behaviour as the fallback.
        requested = input('Number of sentences in the summary: ')
    # Clamp to the valid range so a negative or oversized request cannot produce a surprising slice.
    return max(0, min(int(requested), total))


def textrank(document, num_sentences=None):
    """Build an extractive summary of ``document`` with TextRank.

    The text is split into sentences with Punkt, the sentences are turned into
    TF-IDF vectors, a graph is built whose edge weights are the dot products of
    those vectors, and PageRank scores the sentences. The best-scoring
    sentences are joined with single spaces, in score order (not document order).

    Args:
        document: Text to summarize.
        num_sentences: Optional number of sentences to return; values outside
            ``0..number of sentences`` are clamped. When omitted, the original
            rules apply: one sentence for a one-sentence document, half of the
            sentences (rounded down) for up to five, and otherwise the count is
            read interactively with ``input()``.

    Returns:
        The summary string; ``''`` when ``document`` is empty or has no sentences.

    Raises:
        ValueError: If the requested count cannot be converted to an integer.
    """
    # Nothing to rank: return early instead of handing an empty corpus to CountVectorizer.
    if not document or not document.strip():
        return ''

    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    if not sentences:
        return ''

    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # `@` is matrix multiplication for every SciPy sparse container, while `*` is
    # element-wise on the newer sparse-array types.
    similarity_graph = normalized @ normalized.T

    # `from_scipy_sparse_matrix` was removed in NetworkX 3.0; use its replacement when present.
    if hasattr(nx, 'from_scipy_sparse_array'):
        nx_graph = nx.from_scipy_sparse_array(similarity_graph)
    else:
        nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    count = _summary_length(len(ranked), num_sentences)
    # Read the sentences straight from the ranking. Going through a score -> sentence
    # dict would return the same sentence twice whenever two sentences tie on score.
    return ' '.join(sentence for _, sentence in ranked[:count])

In [None]:
####### Testing Algorithm ########

In [None]:
textrank(document)

4


"Liverpool's all-conquering season that brought them the club's first league title in 30 years seemed an age away as they were eliminated by Real Madrid in the Champions League quarter-final at a deserted Anfield. As Zinedine Zidane's players celebrated a professional job that saw them set up a Champions League semi-final against Chelsea, Klopp was left to ponder Liverpool's fall from grace in the past 12 months. When Liverpool and Real met in the Champions League final in Kyiv in 2018, Klopp's line-up included seven of those who started at Anfield on Wednesday. Jurgen Klopp's side, who carried all before them in the league last season, are now left with only the Premier League's top four - and a place in Europe's elite competition next term - to aim for if they are to salvage something from this fragmented, disappointing, injury-hit campaign."

In [None]:
# Turkish-language sample text, passed to textrank() in the next cell.
tr='''Gelecek Partisi Genel Başkanı Ahmet Davutoğlu, partisinin Alanya ilçe 1’inci Olağan Kongresi’nde yaptığı konuşmada TL’nin değer kaybına ilişkin açıklamalarda bulundu.

Davutoğlu’nun açıklamalarının bir kısmı şöyle:

"Dün gece yarısı bir sosyal medya kampanyasıyla neymiş, emperyalistlere karşı Hazine ve Maliye Bakanı'nı koruyacakmışız. Bizim görevimiz bir bakanı korumak değil, sizin ve bizim göreviniz Türk lirasını korumak.

"Maalesef Türk lirası da düşmekte olduğu zirvelerden bir kademe daha aşağıya düştü. Ben Mayıs 2016'da Başbakanlığı devrettiği zaman dolar 2.85'ti. Şimdi 7.30'da tutmaya çalışıyorlar. 4 yılda Türk lirası yüzde 155 değer kaybetti. Seçilmiş son Başbakanı yerinden etmek için ayak oyunları çevirenlerin bu yüzde 155'lik değer kaybı nereden geldi diye hesap vermesi lazım.

"Bize takdim ederken dediler ki 'Türk ekonomisi uçacak, etkin kararlar alacağız, Merkez Bankası kontrolümüze geçecek.' Ne oldu? 2018'ten bu yana 2 yılda Türk parası yüzde 60 değer kaybetti. Allah aşkını, Türk lirasına dünya piyasasının aksine bu değeri kaybettirenlerin Türkiye'ye yeniden itibar kazandırmaları mümkün mü? Türk lirasını bugün bu seviyeye düşürenler milli ekonomiden bahsedemezler, millete karşı bir ihanet içindedirler. Önce onlar hesap versin."'''

In [None]:
textrank(tr)

'4 yılda Türk lirası yüzde 155 değer kaybetti. 2018\'ten bu yana 2 yılda Türk parası yüzde 60 değer kaybetti. "Maalesef Türk lirası da düşmekte olduğu zirvelerden bir kademe daha aşağıya düştü.'