In [None]:
# Download the en_core_web_lg spaCy model, which this notebook loads with spacy.load.
!python -m spacy download en_core_web_lg

In [78]:
import spacy
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
from collections import Counter
import string

In [79]:
# Load the en_core_web_lg pipeline once; later cells call `nlp` to parse text
nlp = spacy.load(name='en_core_web_lg')

In [105]:
# Define the topic and fetch Google search results restricted to Wikipedia

topic = 'Pollution'

# Let requests URL-encode the query instead of hand-replacing spaces with '+'.
# `topic` keeps its human-readable form, so the cell that passes it to spaCy
# sees real words, and characters such as '&' or '#' can no longer corrupt
# the search URL.
resp = requests.get(
  'https://www.google.com/search',
  params={'q': f'{topic} site:wikipedia.org'},
  timeout=10,
)
resp.raise_for_status()  # stop here rather than parse an error page as results
soup = BeautifulSoup(resp.content, "html.parser")

In [106]:
# Parse google search results and collect all wikipedia links
# Clean the collected links to process them further

wiki_domain = 'https://en.wikipedia.org/wiki/'

def extract_topic(link):
  """Return the part of `link` after the `wiki_domain` prefix, with '_' turned into ' '.

  The prefix is removed purely by length; `link` is not checked to start with it.
  """
  title = link[len(wiki_domain):]
  return ' '.join(title.split('_'))
def get_base_link(link):
  """Percent-decode `link` and return only its scheme, host and path.

  The decoded URL is parsed with urlparse; params, query string and fragment
  are left out of the result.
  """
  parts = urlparse(unquote(link))
  return parts.scheme + '://' + parts.netloc + parts.path

# Anchors without an href attribute are skipped; indexing them with ['href']
# would raise KeyError.
all_links      = [a['href'] for a in soup.find_all('a', href=True)]
search_links   = [link for link in all_links if link.startswith('/url?q=')]
# Keep what lies between the '/url?q=' prefix and the first '&'. split() also
# handles an href with no '&' at all, where slicing up to find() == -1 would
# cut off the last character of the URL.
stripped_links = [link[len('/url?q='):].split('&', 1)[0] for link in search_links]
wiki_links     = [link for link in stripped_links if link.startswith(wiki_domain)]
base_links     = [get_base_link(link) for link in wiki_links]
nocateg_links  = [link for link in base_links if not extract_topic(link).startswith('Category:')]

# De-duplicate; sorting makes the starting order repeatable, since set
# iteration order for strings can differ between kernel restarts.
links = sorted(set(nocateg_links))

In [107]:
# Order the collected links from most to least similar to the topic

topic_doc = nlp(topic)

related_topics = list(map(extract_topic, links))

# One similarity score per article title, measured against the topic document
similarity_map = dict(
  (title, topic_doc.similarity(nlp(title)))
  for title in related_topics
)

# In-place sort, highest similarity first
links.sort(key=lambda url: similarity_map[extract_topic(url)], reverse=True)

In [108]:
# Download every collected article and parse its paragraph text with spaCy

docs_map = {}      # link -> spaCy Doc built from the page's <p> text
original_len = 0   # total number of characters of text that was parsed

for link in links:
  # Loop-local names: reusing `resp` / `soup` here would overwrite the Google
  # search results, so re-running the link-extraction cell afterwards would
  # parse a Wikipedia article instead of the search page.
  page_resp = requests.get(link, timeout=10)
  if not page_resp.ok:
    # Do not feed an error page into the corpus; report it and move on
    print(f'Skipping {link}: HTTP {page_resp.status_code}')
    continue
  page_soup = BeautifulSoup(page_resp.content, "html.parser")
  paragraphs = page_soup.find_all('p')
  text = ' '.join(p.get_text() for p in paragraphs)
  text = re.sub(r'\[\d+\]+', '', text)  # strip numeric bracket markers such as [12]
  docs_map[link] = nlp(text)

  original_len += len(text)

In [109]:
# Build the word-frequency table: number of occurrences of each lemma
# across all parsed documents

wordFreq = Counter()

# Built once instead of once per token; a set gives constant-time lookups
excluded_lemmas = {*string.punctuation, '\n '}

for doc in docs_map.values():
  for token in doc:
    # spaCy's own flags cover what the literal set misses: punctuation that is
    # not a single ASCII character and whitespace tokens other than '\n '.
    if token.is_stop or token.is_punct or token.is_space:
      continue
    if token.lemma_ in excluded_lemmas:
      continue
    wordFreq[token.lemma_] += 1


In [110]:
# Score every sentence by the mean word frequency of its tokens' lemmas

sentence_scores = Counter()   # sentence text -> mean lemma frequency

all_sent_len = 0   # summed character length of all sentences
total_sent = 0     # number of sentences seen

for doc in docs_map.values():
  for sent in doc.sents:
    freq_total = sum(wordFreq[word.lemma_] for word in sent)
    # Assign instead of accumulating: the table is keyed by sentence text, so
    # with `+=` a sentence that occurs more than once would have its averaged
    # score added to and divided a second time.
    sentence_scores[sent.text] = freq_total / len(sent)
    total_sent += 1
    all_sent_len += len(sent.text)

# Average sentence length in characters; 0 when no sentence was found
avg_sent_len = all_sent_len // total_sent if total_sent else 0

In [111]:
# Generate the summary: keep sentences that score well above average and
# are not much shorter than the average sentence

# Floor division truncates the mean score to a whole number before it is used
# as the cutoff base. An empty score table yields 0 instead of ZeroDivisionError.
avg_sent_score = (
  sum(sentence_scores.values()) // len(sentence_scores)
  if sentence_scores else 0
)

summary = []
summary_threshold = 3  # used both as score multiplier and as length divisor

for sent, score in sentence_scores.items():
  if score > (avg_sent_score * summary_threshold) and len(sent) > (avg_sent_len / summary_threshold):
    clean_sent = sent.strip()
    summary.append(clean_sent.replace('\n', ' '))

result = ' '.join(summary)
# Guard the ratio: original_len is 0 when no text was collected
final_perc = (len(result) * 100) / original_len if original_len else 0.0
print(f'Final size: {round(final_perc, 2)}% of original text')
print(result)

Final size: 1.58% of original text
Pollution is often classed as point source or nonpoint source pollution. Major forms of pollution include air pollution, light pollution, litter, noise pollution, plastic pollution, soil contamination, radioactive contamination, thermal pollution, visual pollution, and water pollution. Air pollution has always accompanied civilizations. Severe incidents of pollution helped increase consciousness. Pollution can also create costs for the firms producing the pollution. Motor vehicle emissions are one of the leading causes of air pollution. Pollution control is a term used in environmental management. The Rossby waves impact on air pollution has been observed in the daily fluctuations in surface air pollution. Water pollution results when contaminants are introduced into the natural environment. Water pollution can be classified as surface water or groundwater pollution. Marine pollution and nutrient pollution are subsets of water pollution. Sources of wa