In [1]:
from bs4 import BeautifulSoup
import requests
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

In [2]:
def extract_website_data(url):
    response = requests.get(url)
    content = response.content
    soup = soup = BeautifulSoup(content, 'html.parser') 
    links = soup.find_all('h2',{'class':'c-entry-box--compact__title'})
    return soup, links

In [3]:
def get_only_text(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    title = ' '.join(soup.title.stripped_strings)
    return title, text

In [4]:
list_of_links = []
list_of_summaries = []
list_of_titles = []
def generate_content(links):
    for i in list(range(len(links))):
        link = links[i].a.get('href')
        list_of_links.append(link)
        page_content = get_only_text(link)
        summary = summarize(repr(page_content[1]), ratio=0.05)
        list_of_summaries.append(summary)
        list_of_titles.append(page_content[0])

In [5]:
soup, links = extract_website_data("https://www.vox.com/")
generate_content(links)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
summary_mat = vectorizer.fit_transform(list_of_summaries)

In [15]:
print(summary_mat.shape)

(49, 2408)


In [16]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=6)
nmf_features = nmf.fit_transform(summary_mat)



In [18]:
import pandas as pd
from sklearn.preprocessing import normalize
norm_features = normalize(nmf_features)
df = pd.DataFrame(norm_features, index=list_of_titles)

In [19]:
print(list_of_titles)

['The Supreme Court’s polls are in free fall. Will that matter? - Vox', 'Effective altruism went from underfunded idea to  philanthropic force - Vox', 'The Inflation Reduction Act: The policies in the IRA, explained - Vox', 'How to come to terms with your own mortality - Vox', 'Recovering America’s Wildlife Act, explained - Vox', 'Nathan Fielder’s wild HBO show The Rehearsal, explained - Vox', 'Your monkeypox vaccine questions, answered - Vox', 'Vaccines for Covid-19 aren’t required in schools this fall - Vox', 'Vox’s audience support program, explained - Vox', 'How will the Inflation Reduction Act affect you? - Vox', 'Why the Inflation Reduction Act’s Senate passage is so significant - Vox', 'What does the Inflation Reduction Act do for climate change? - Vox', 'The Year of Miracles: Ella Risbridger cooks through the end of the world - Vox', 'Free online crossword puzzles from Vox - Vox', 'China-US relations will worsen after Taiwan crisis  - Vox', 'The best $4 I ever spent: A sparkly 

In [24]:
current_article = df.loc['Your monkeypox vaccine questions, answered - Vox']
similarities = df.dot(current_article)
print(similarities.sort_values(ascending=False))

Your monkeypox vaccine questions, answered - Vox                                                                   1.000000
How the CDC and public health agencies dropped the ball on the Monkeypox reponse - Vox                             0.999409
How a polio case in New York affects global eradication efforts - Vox                                              0.999375
Second Covid-19 vaccine boosters, explained  - Vox                                                                 0.997559
Vaccines for Covid-19 aren’t required in schools this fall - Vox                                                   0.996610
Your AC can be cut off in a heat wave over an unpaid energy bill - Vox                                             0.979745
We have an affordable housing crisis. States are realizing they, too, can build homes. - Vox                       0.973052
How the US monkeypox response is failing gay men  - Vox                                                            0.931747
The Bide