In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

In [26]:
# Define the url
url = "https://news.google.com/rss?h1=en-US&gl=US&ceid=US:en&hl=en-US"

In [27]:
response = requests.get(url)

In [28]:
bs = BeautifulSoup(response.content, "xml")
articles = bs.find_all("item")

for each in articles:
    print(f'{each}\n\n\n')

<item><title>What to watch for during Trump’s court appearance - CNN</title><link>https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LmNubi5jb20vMjAyMy8wNi8xMy9wb2xpdGljcy90cnVtcC1jb3VydC1hcHBlYXJhbmNlLXdoYXQtdG8td2F0Y2gvaW5kZXguaHRtbNIBW2h0dHBzOi8vYW1wLmNubi5jb20vY25uLzIwMjMvMDYvMTMvcG9saXRpY3MvdHJ1bXAtY291cnQtYXBwZWFyYW5jZS13aGF0LXRvLXdhdGNoL2luZGV4Lmh0bWw?oc=5</link><guid isPermaLink="false">2126021645</guid><pubDate>Tue, 13 Jun 2023 09:01:00 GMT</pubDate><description>&lt;ol&gt;&lt;li&gt;&lt;a href="https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LmNubi5jb20vMjAyMy8wNi8xMy9wb2xpdGljcy90cnVtcC1jb3VydC1hcHBlYXJhbmNlLXdoYXQtdG8td2F0Y2gvaW5kZXguaHRtbNIBW2h0dHBzOi8vYW1wLmNubi5jb20vY25uLzIwMjMvMDYvMTMvcG9saXRpY3MvdHJ1bXAtY291cnQtYXBwZWFyYW5jZS13aGF0LXRvLXdhdGNoL2luZGV4Lmh0bWw?oc=5" target="_blank"&gt;What to watch for during Trump’s court appearance&lt;/a&gt;&amp;nbsp;&amp;nbsp;&lt;font color="#6f6f6f"&gt;CNN&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="https://news.google.

In [29]:
data = []
cutoff_time = datetime.now() - timedelta(hours = 24)

In [30]:
for article in articles:
    pub_date = datetime.strptime(article.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")
    if pub_date >=cutoff_time:
        title = article.title.text
        link = article.link.text 
        description = article.description.text 
        
        data.append([title,link,description])
        
df = pd.DataFrame(data, columns = ["Title", "Link", "Description"])

In [31]:
df

Unnamed: 0,Title,Link,Description
0,What to watch for during Trump’s court appeara...,https://news.google.com/rss/articles/CBMiV2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,"Russia-Ukraine war: List of key events, day 47...",https://news.google.com/rss/articles/CBMiVmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,Russian missile attack on Zelenskyy's hometown...,https://news.google.com/rss/articles/CBMiZmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
3,Three people found dead in major incident in N...,https://news.google.com/rss/articles/CBMiR2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,Chris Christie slams 'loser' Trump over docume...,https://news.google.com/rss/articles/CBMiaGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
5,Trump indictment in Miami: Experts monitor cha...,https://news.google.com/rss/articles/CBMibGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
6,Former President Donald Trump in Doral ahead o...,https://news.google.com/rss/articles/CBMieGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
7,1 dead after tour boat capsizes inside Erie Ca...,https://news.google.com/rss/articles/CBMiV2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
8,"I-95 Philadelphia collapse: body recovered, de...",https://news.google.com/rss/articles/CCAiC213S...,"<ol><li><a href=""https://news.google.com/rss/a..."
9,Climate change lawsuit: Montana youth file cli...,https://news.google.com/rss/articles/CCAiC01mO...,"<ol><li><a href=""https://news.google.com/rss/a..."


## Apply Supervised Clustering almgorithm to get the labelled data

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [33]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Description"])

In [34]:
# Apply K Means
k = 5
kmeans = KMeans(n_clusters = k, random_state = 42)
kmeans.fit(X)



In [35]:
labels = kmeans.labels_
labels

array([4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       2, 3, 3, 2, 2, 0, 2, 2, 2, 2, 2], dtype=int32)

In [36]:
df["Cluster"] = labels

In [37]:
cluster_counts = df["Cluster"].value_counts().sort_index()
cluster_counts

Cluster
0     2
1     2
2    23
3     2
4     4
Name: count, dtype: int64