In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
url = 'https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en'
response = requests.get(url)
response

<Response [200]>

In [3]:
bs = BeautifulSoup(response.text,'xml')

In [4]:
print(bs.prettify())

<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
 <channel>
  <generator>
   NFE/5.0
  </generator>
  <title>
   Top stories - Google News
  </title>
  <link>
   https://news.google.com/?hl=en-US&amp;gl=US&amp;ceid=US:en
  </link>
  <language>
   en-US
  </language>
  <webMaster>
   news-webmaster@google.com
  </webMaster>
  <copyright>
   2023 Google Inc.
  </copyright>
  <lastBuildDate>
   Tue, 01 Aug 2023 14:09:27 GMT
  </lastBuildDate>
  <description>
   Google News
  </description>
  <item>
   <title>
    Wife of Gilgo Beach serial killings suspect and her attorney open up about the family’s experience since Rex Heuermann’s arrest - CNN
   </title>
   <link>
    https://news.google.com/rss/articles/CBMiWWh0dHBzOi8vd3d3LmNubi5jb20vMjAyMy8wOC8wMS91cy9yZXgtaGV1ZXJtYW5uLXdpZmUtZ2lsZ28tYmVhY2gta2lsbGluZ3MtaG9tZS9pbmRleC5odG1s0gFdaHR0cHM6Ly9hbXAuY25uLmNvbS9jbm4vMjAyMy8wOC8wMS91cy9yZXgtaGV1ZXJtYW5uLXdpZmUtZ2lsZ28tYmVhY2gta2lsbGluZ3Mta

In [5]:
articles = bs.find_all('item')

In [6]:
from datetime import datetime, timedelta

data = []
cutoff = datetime.now() - timedelta(hours=24)
for article in articles:
    pub_date = datetime.strptime(article.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")
    if pub_date>=cutoff:
        title = article.title.text
        link = article.link.text
        desc = article.description.text
        data.append([title, link, desc])

df = pd.DataFrame(data, columns=['title','link','desc'])
df.head()

Unnamed: 0,title,link,desc
0,Wife of Gilgo Beach serial killings suspect an...,https://news.google.com/rss/articles/CBMiWWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,Trump's campaign finances are strained as lega...,https://news.google.com/rss/articles/CBMic2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,Hunter Biden associate tells Congress that VP'...,https://news.google.com/rss/articles/CBMiQmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
3,Lori Vallow Sentenced to Life in Prison Withou...,https://news.google.com/rss/articles/CBMiiwFod...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,"Biden Shores Up Democratic Support, but Faces ...",https://news.google.com/rss/articles/CBMiRGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."


In [16]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['desc'])
X

<35x1146 sparse matrix of type '<class 'numpy.float64'>'
	with 2174 stored elements in Compressed Sparse Row format>

In [17]:
kms = KMeans(n_clusters=5, random_state=42)
kms.fit(X)
kms.labels_



array([3, 1, 3, 3, 1, 1, 3, 0, 1, 3, 1, 1, 3, 3, 3, 3, 1, 2, 2, 2, 3, 3,
       3, 0, 4, 3, 1, 4, 3, 3, 3, 3, 1, 3, 3])

In [18]:
df['cluster'] = kms.labels_

In [22]:
df['cluster'].value_counts().sort_index()

0     2
1     9
2     3
3    19
4     2
Name: cluster, dtype: int64