# Scrapping Hacker News Data for last 24 hours

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime,timedelta

In [2]:
#define the URL for Google News RSS feed (Rick Site Summary)
url="https://news.google.com/rss?h1=en-US&gl=US&ceid=US:en&hl=en-US"


In [4]:
#make a request to the Rss feed url
response=requests.get(url)

In [5]:
#parse the XML content using BeautifulSoup
soup=BeautifulSoup(response.content,"xml")

In [6]:
#find all <item> elements which represent individual news articles
articles=soup.find_all("item")
articles

[<item><title>The different ways Republicans defend Trump over indictment is revealing - BBC</title><link>https://news.google.com/rss/articles/CBMiMWh0dHBzOi8vd3d3LmJiYy5jb20vbmV3cy93b3JsZC11cy1jYW5hZGEtNjU4NjMzOTnSATVodHRwczovL3d3dy5iYmMuY29tL25ld3Mvd29ybGQtdXMtY2FuYWRhLTY1ODYzMzk5LmFtcA?oc=5</link><guid isPermaLink="false">2126021645</guid><pubDate>Mon, 12 Jun 2023 23:44:25 GMT</pubDate><description>&lt;ol&gt;&lt;li&gt;&lt;a href="https://news.google.com/rss/articles/CBMiMWh0dHBzOi8vd3d3LmJiYy5jb20vbmV3cy93b3JsZC11cy1jYW5hZGEtNjU4NjMzOTnSATVodHRwczovL3d3dy5iYmMuY29tL25ld3Mvd29ybGQtdXMtY2FuYWRhLTY1ODYzMzk5LmFtcA?oc=5" target="_blank"&gt;The different ways Republicans defend Trump over indictment is revealing&lt;/a&gt;&amp;nbsp;&amp;nbsp;&lt;font color="#6f6f6f"&gt;BBC&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="https://news.google.com/rss/articles/CCAiC0l0eG5aLWVjRENNmAEB?oc=5" target="_blank"&gt;Trump DOJ Charges: 'Smoking gun' evidence came from Trump himself, reports Ari Melber&lt

In [7]:
data=[]#list to all store scrapped data

In [8]:
#calculate the cutoff time for the past 24 hours
cutoff_time=datetime.now() - timedelta(hours=24)

In [16]:
for article in articles:
    #extract relevat data from each article
    pub_date=datetime.strptime(article.pubDate.text,"%a, %d %b %Y %H:%M:%S %Z")
    #check if the article was published within the last 24 hours
    if pub_date >= cutoff_time:
        title=article.title.text #extract title
        link=article.link.text #Extract link
        description=article.description.text #Extract description
        
        #Append the extracted data to the list
        data.append([title,link,description])

df=pd.DataFrame(data,columns=['title','link','description'])

# Apply UnSupervised Clustering algorithm to get the labelled data

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [19]:
#COnvert the Description text into numericla features
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(df["description"])


In [27]:
#APply  k-Means clustering
k=5 #Number of clusters
kmeans=KMeans(n_clusters=k,random_state=42)
kmeans.fit(X)



In [29]:
#GEt the cluster labels for each news article
labels=kmeans.labels_
labels

array([3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 4, 4, 1, 3, 1, 1, 3, 3, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 0, 1, 0, 1, 1, 1, 1,
       1, 4, 4, 1], dtype=int32)

In [30]:
df['Cluster']=labels

In [32]:
#COunt the number of articles in each cluster
cluster_counts=df['Cluster'].value_counts().sort_index()

In [33]:
cluster_counts


Cluster
0     4
1    52
2     4
3     6
4     4
Name: count, dtype: int64