In [1]:
#Imports
import nltk
import re
import numpy as np
import pickle
from nltk.corpus import stopwords
from sklearn.datasets import load_files
import bs4 as bs

In [2]:
import urllib.request

In [3]:
# Download the raw HTML of the article. The context manager closes the HTTP
# response (and its underlying connection) as soon as the body has been read,
# instead of leaving it open until garbage collection.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/Climate_change") as response:
    source = response.read()

In [1]:
# Preview the first 500 bytes rather than echoing the entire downloaded page.
source[:500]

In [2]:
# Prerequisite: the 'lxml' parser used below. If it is missing, install it once with: %pip install lxml

In [6]:
# Parse the downloaded HTML into a navigable tree using the lxml parser.
soup = bs.BeautifulSoup(markup=source, features='lxml')

In [3]:
# Sanity-check the parse by showing the page's <title> element rather than
# echoing the whole parsed document.
soup.title

# Data Extraction

In [11]:
# Concatenate the text of every <p> element, in document order, into one string.
text_raw = "".join(paragraph.text for paragraph in soup.find_all('p'))

In [4]:
# Preview the first 500 characters rather than echoing the whole article text.
text_raw[:500]

# Data Preprocessing

In [17]:
# `text`: replace bracketed numeric markers such as "[12]" with a space, then
# collapse every run of whitespace to a single space. Casing and punctuation
# are kept.
text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', text_raw))

# `clean_text`: lowercase copy of `text` in which non-word characters and
# digits are replaced by spaces, and whitespace is collapsed again afterwards.
clean_text = text.lower()
for pattern in (r'\W', r'\d', r'\s+'):
    clean_text = re.sub(pattern, ' ', clean_text)

In [5]:
# Preview the first 500 characters rather than echoing the whole cleaned text.
clean_text[:500]

# Tokenize Sentences

In [21]:
# Split `text` (original casing and punctuation), not `clean_text`:
# clean_text has had every non-word character replaced by a space, so it no
# longer contains the punctuation that marks sentence boundaries.
sentences = nltk.sent_tokenize(text)

In [22]:
len(sentences)

475

# Word Count / Histogram

In [23]:
# English stop words from NLTK's stopwords corpus; the histogram cell below
# skips any token found in this collection.
# NOTE(review): assumes the corpus is already available locally
# (nltk.download('stopwords')) — confirm on a fresh environment.
stop_words = stopwords.words('english')

In [24]:
# Count how often each non-stop word occurs in the cleaned text.
# The stop words are copied into a set so each membership test is a hash
# lookup; testing against the original collection directly rescans it for
# every token.
stop_word_lookup = set(stop_words)
word_count = dict()
for word in nltk.word_tokenize(clean_text):
    if word not in stop_word_lookup:
        # dict.get supplies 0 the first time a word is seen, so no separate
        # "new key" branch is needed.
        word_count[word] = word_count.get(word, 0) + 1

In [25]:
word_count

{'contemporary': 2,
 'climate': 138,
 'change': 84,
 'includes': 2,
 'global': 55,
 'warming': 75,
 'impacts': 20,
 'earth': 24,
 'weather': 7,
 'patterns': 5,
 'previous': 1,
 'periods': 1,
 'current': 10,
 'changes': 23,
 'distinctly': 1,
 'rapid': 5,
 'due': 14,
 'natural': 15,
 'causes': 5,
 'instead': 5,
 'caused': 8,
 'emission': 3,
 'greenhouse': 40,
 'gases': 27,
 'mostly': 2,
 'carbon': 44,
 'dioxide': 7,
 'co': 48,
 'methane': 10,
 'burning': 5,
 'fossil': 15,
 'fuels': 9,
 'energy': 40,
 'use': 17,
 'creates': 1,
 'emissions': 63,
 'agriculture': 8,
 'steelmaking': 1,
 'cement': 3,
 'production': 9,
 'forest': 9,
 'loss': 7,
 'additional': 7,
 'sources': 2,
 'transparent': 4,
 'sunlight': 14,
 'allowing': 1,
 'heat': 34,
 'surface': 21,
 'emits': 1,
 'infrared': 3,
 'radiation': 6,
 'absorb': 7,
 'trapping': 2,
 'near': 5,
 'planet': 5,
 'heats': 2,
 'like': 10,
 'reflecting': 4,
 'snow': 7,
 'cover': 6,
 'amplifying': 2,
 'land': 26,
 'temperatures': 11,
 'risen': 3,
 'twic

# Weighted Histogram

In [28]:
# Peak raw count in the histogram, i.e. the denominator for turning counts
# into relative weights. Compute it while word_count still holds raw counts:
# once the counts have been normalized, this expression evaluates to 1.0.
max_count = max(word_count.values())

In [29]:
# Rescale every count to a weight relative to the most frequent word (which
# gets 1.0). The peak is taken from the dict's current contents and a new dict
# is built, so re-running this cell is a no-op (the peak is then 1.0) instead
# of dividing already-normalized weights by the raw maximum a second time.
# `default=1` keeps an empty histogram from raising; the result is then an
# empty dict.
peak = max(word_count.values(), default=1)
word_count = {word: count / peak for word, count in word_count.items()}

In [30]:
word_count

{'contemporary': 0.014492753623188406,
 'climate': 1.0,
 'change': 0.6086956521739131,
 'includes': 0.014492753623188406,
 'global': 0.39855072463768115,
 'warming': 0.5434782608695652,
 'impacts': 0.14492753623188406,
 'earth': 0.17391304347826086,
 'weather': 0.050724637681159424,
 'patterns': 0.036231884057971016,
 'previous': 0.007246376811594203,
 'periods': 0.007246376811594203,
 'current': 0.07246376811594203,
 'changes': 0.16666666666666666,
 'distinctly': 0.007246376811594203,
 'rapid': 0.036231884057971016,
 'due': 0.10144927536231885,
 'natural': 0.10869565217391304,
 'causes': 0.036231884057971016,
 'instead': 0.036231884057971016,
 'caused': 0.057971014492753624,
 'emission': 0.021739130434782608,
 'greenhouse': 0.2898550724637681,
 'gases': 0.1956521739130435,
 'mostly': 0.014492753623188406,
 'carbon': 0.3188405797101449,
 'dioxide': 0.050724637681159424,
 'co': 0.34782608695652173,
 'methane': 0.07246376811594203,
 'burning': 0.036231884057971016,
 'fossil': 0.108695652

# Sentence Scores

In [31]:
# Score each sentence by summing the weights of the words it contains.
# The length filter depends only on the sentence, so it is applied once per
# sentence, before tokenizing, rather than once per matching word.
# Note: the following cell rebuilds sent_scores from scratch, so this cell's
# result is overwritten there.
sent_scores = dict()
for sentence in sentences:
    if len(sentence.split(" ")) >= 25:
        continue  # skip sentences with 25 or more space-separated pieces
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word_count:
            sent_scores[sentence] = sent_scores.get(sentence, 0) + word_count[word]

In [37]:
# Score each short sentence by the summed weights of the words it contains.
# Sentences with this many space-separated pieces or more are left out.
MAX_SENTENCE_WORDS = 25

sent_scores = dict()
for sentence in sentences:
    if len(sentence.split(" ")) >= MAX_SENTENCE_WORDS:
        continue
    weights = [word_count[word]
               for word in nltk.word_tokenize(sentence.lower())
               if word in word_count]
    # Only sentences with at least one weighted word get an entry. The score
    # is assigned rather than accumulated into an existing key, so a sentence
    # that occurs more than once in the text is not scored twice over.
    if weights:
        sent_scores[sentence] = sum(weights)

In [38]:
sent_scores

{" Contemporary climate change includes both global warming and its impacts on Earth's weather patterns.": 2.985507246376812,
 'There have been previous periods of climate change, but the current changes are distinctly more rapid and not due to natural causes.': 2.152173913043479,
 'Instead, they are caused by the emission of greenhouse gases, mostly carbon dioxide (CO2) and methane.': 1.0579710144927537,
 'Burning fossil fuels for energy use creates most of these emissions.': 1.0869565217391304,
 'Agriculture, steelmaking, cement production, and forest loss are additional sources.': 0.33333333333333337,
 "Greenhouse gases are transparent to sunlight, allowing it through to heat the Earth's surface.": 1.1956521739130437,
 "When the Earth emits that heat as infrared radiation the gases absorb it, trapping the heat near the Earth's surface.": 1.3623188405797102,
 'As the planet heats up it causes changes like the loss of sunlight-reflecting snow cover, amplifying global warming.': 1.4275

# Getting the Best Lines

In [33]:
import heapq

In [39]:
# Select the highest-scoring sentences, best first, to form the summary.
number_of_best_lines = 5
ranked_sentences = sorted(sent_scores, key=sent_scores.get, reverse=True)
best_lines = ranked_sentences[:number_of_best_lines]

In [40]:
best_lines

['Global warming usually refers to human-induced warming of the Earth system, whereas climate change can refer to natural or anthropogenic change.',
 'Climate change can be mitigated by reducing greenhouse gas emissions and by enhancing sinks that absorb greenhouse gases from the atmosphere.',
 'The long-term effects of climate change include further ice melt, ocean warming, sea level rise, and ocean acidification.',
 'To determine the human contribution to climate change, known internal climate variability and natural external forcings need to be ruled out.',
 " Contemporary climate change includes both global warming and its impacts on Earth's weather patterns."]

In [41]:
# Print the summary, one sentence per line.
for line in best_lines:
    print(line)

Global warming usually refers to human-induced warming of the Earth system, whereas climate change can refer to natural or anthropogenic change.
Climate change can be mitigated by reducing greenhouse gas emissions and by enhancing sinks that absorb greenhouse gases from the atmosphere.
The long-term effects of climate change include further ice melt, ocean warming, sea level rise, and ocean acidification.
To determine the human contribution to climate change, known internal climate variability and natural external forcings need to be ruled out.
 Contemporary climate change includes both global warming and its impacts on Earth's weather patterns.
