In [4]:
import requests
from bs4 import BeautifulSoup
import re

r = requests.get('https://en.wikipedia.org/wiki/COVID-19_pandemic')
page = r.content
soup = BeautifulSoup(page, 'html.parser')
parag = soup.find_all('p')
title = soup.title.text

l = []
for p in parag[1], parag[2], parag[3], parag[4]:
    txt=p.text.strip('\n')
    text=txt.replace('\xa0',' ')
    l.append(text)
    data=' '.join(l)
    document=re.sub('[[0-9]*]','', data)

In [5]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [6]:
document = ' '.join(document.strip().split('\n'))

In [7]:
sentence_tokenizer = PunktSentenceTokenizer()

In [8]:
sentences = sentence_tokenizer.tokenize(document)

In [9]:
sentences

['The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2).',
 'It was first identified in December 2019 in Wuhan, China.',
 'The World Health Organization declared the outbreak a Public Health Emergency of International Concern in January 2020 and a pandemic in March 2020.',
 'As of 28 December 2020, more than 81.1 million cases have been confirmed, with more than 1.77 million deaths attributed to COVID-19.',
 'Symptoms of COVID-19 are highly variable, ranging from none to severe illness.',
 'The virus spreads mainly through the air when people are near each other.',
 '[b It leaves an infected person as they breathe, cough, sneeze, or speak and enters another person via their mouth, nose, or eyes.',
 'It may also spread via contaminated surfaces.',
 'People remain infectious for up to two weeks, and can spread the virus even if they do not show sym

In [18]:
from collections import Counter
 
def bag_of_words(sentence):
    return Counter(word.lower().strip('.,') for word in sentence.split(' '))

In [19]:
bag_of_words(sentences[0])

Counter({'the': 2,
         'covid-19': 1,
         'pandemic': 3,
         'also': 1,
         'known': 1,
         'as': 1,
         'coronavirus': 3,
         'is': 1,
         'an': 1,
         'ongoing': 1,
         'of': 1,
         'disease': 1,
         '2019': 1,
         '(covid-19)': 1,
         'caused': 1,
         'by': 1,
         'severe': 1,
         'acute': 1,
         'respiratory': 1,
         'syndrome': 1,
         '2': 1,
         '(sars-cov-2)': 1})

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer()
bow_array = c.fit_transform([sentences[0]])
bow_array.toarray()

array([[2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2]])

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer()
bow_matrix = c.fit_transform(sentences)
bow_matrix

<19x216 sparse matrix of type '<class 'numpy.int64'>'
	with 308 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

In [24]:
similarity_graph = normalized_matrix * normalized_matrix.T
similarity_graph.toarray()

array([[1.        , 0.04222127, 0.12365039, 0.11414665, 0.19566879,
        0.0628034 , 0.04794437, 0.04534055, 0.02517598, 0.        ,
        0.        , 0.05833562, 0.02906037, 0.0800811 , 0.18038161,
        0.06184139, 0.        , 0.        , 0.04433147],
       [0.04222127, 1.        , 0.17338549, 0.04991025, 0.        ,
        0.        , 0.03695374, 0.0723026 , 0.        , 0.07133342,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.03256068, 0.        , 0.        , 0.        ],
       [0.12365039, 0.17338549, 1.        , 0.0807391 , 0.02313602,
        0.0794774 , 0.00983075, 0.        , 0.0431015 , 0.10032384,
        0.01897858, 0.03213329, 0.01232811, 0.07427931, 0.13489198,
        0.07840652, 0.        , 0.01478899, 0.03068369],
       [0.11414665, 0.04991025, 0.0807391 , 1.        , 0.12648709,
        0.        , 0.02290888, 0.        , 0.01932959, 0.        ,
        0.        , 0.01949534, 0.02450632, 0.07110186, 0.        ,
        0.040

In [28]:
import networkx as nx
nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
scores = nx.pagerank(nx_graph)
scores

{0: 0.058607608354242294,
 1: 0.04601850507134548,
 2: 0.057010612605271425,
 3: 0.05146310022030544,
 4: 0.05292206864494559,
 5: 0.055478147303211045,
 6: 0.052324712879283315,
 7: 0.04650125802396069,
 8: 0.05514990276837194,
 9: 0.05748267984597134,
 10: 0.04760188651193677,
 11: 0.046632574078683535,
 12: 0.04481193684950849,
 13: 0.060230270320510274,
 14: 0.056320045974837064,
 15: 0.061130538190084016,
 16: 0.04689382572969491,
 17: 0.04587183540789658,
 18: 0.0575484912199396}

In [29]:
ranked = sorted(((scores[i],s) for i,s in enumerate(sentences)),
                reverse=True)
ranked[0][1]

'It has led to the postponement or cancellation of events, widespread supply shortages exacerbated by panic buying, agricultural disruption and food shortages, and decreased emissions of pollutants and greenhouse gases.'

In [30]:
sentences[0]

'The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2).'

In [12]:
import networkx as nx
import numpy as np
 
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
 
def textrank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
 
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
 
    similarity_graph = normalized * normalized.T
 
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i],s) for i,s in enumerate(sentences)),
                  reverse=True)

In [32]:
textrank(document)

[(0.061130538190084016,
  'It has led to the postponement or cancellation of events, widespread supply shortages exacerbated by panic buying, agricultural disruption and food shortages, and decreased emissions of pollutants and greenhouse gases.'),
 (0.060230270320510274,
  'Many places have also worked to increase testing capacity and trace contacts of the infected.'),
 (0.058607608354242294,
  'The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2).'),
 (0.0575484912199396,
  'There have been incidents of xenophobia and discrimination against Chinese people and against those perceived as being Chinese or as being from areas with high infection rates.'),
 (0.05748267984597134,
  "Recommended preventive measures include social distancing, wearing face masks in public, ventilation and air-filtering, hand washing, covering one's mouth when sneezing 