In [53]:
import nltk
import requests
from bs4 import BeautifulSoup
from string import punctuation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re

In [83]:
# ------ PREPROCESS ------
# request data from URL
URL = "https://en.wikipedia.org/wiki/The_Great_Wave_off_Kanagawa"
page = requests.get(URL).text

# scrape it off and find all text contents
soup = BeautifulSoup(page, "html.parser")
paragraphs = soup.find_all('p')

# combine all paragraphs into text without <p> tags
text = ""
for i in paragraphs:
    text += i.text

# split into sentences and tokenizing
text = text.lower()
text = re.sub(r'\[[0-9]*\]', ' ', text)
sentences = nltk.sent_tokenize(text)
word_token = nltk.word_tokenize(text)
stop_words = stopwords.words('english')
filtered_words = [word for word in word_token if word not in stop_words]
filtered_words = [word for word in filtered_words if word not in punctuation]

In [84]:
# compute TF-IDF on all the words
fdist = nltk.FreqDist(filtered_words)
for i in fdist.keys():
    fdist[i] /= len(filtered_words)

sentence_scores = {}
for sent in sentences:
    for word in nltk.word_tokenize(sent.lower()):
        if word in fdist:
            if sent not in sentence_scores:
                sentence_scores[sent] = fdist[word]
            else:
                sentence_scores[sent] += fdist[word]

for sent in sentence_scores:
    sentence_scores[sent] /= len(sent)

threshhold = np.mean(list(sentence_scores.values()))

In [85]:
# get summary
summary = ""
for sent in sentence_scores:
    if sentence_scores[sent] >= threshhold * 1.5:
        summary += sent
print(len(summary))
print(summary)

3680

the great wave off kanagawa (japanese: 神奈川沖浪裏, hepburn: kanagawa-oki nami ura, lit.'under the wave off kanagawa')[a] is a woodblock print by japanese ukiyo-e artist hokusai, created in late 1831 during the edo period of japanese history.the print is hokusai's best-known work and the first in his series thirty-six views of mount fuji, in which the use of prussian blue revolutionized japanese prints.several museums throughout the world hold copies of the great wave, many of which came from 19th-century private collections of japanese prints.the great wave off kanagawa has been described as "possibly the most reproduced image in the history of all art",  as well as being a contender for the "most famous artwork in japanese history".in the great wave off kanagawa, mount fuji is depicted in blue with white highlights in a similar way to the wave in the foreground.the big wave's foam-curves generate other curves, which are divided into many small waves that repeat the image of the larg