In [16]:
!pip install feedparser
#here is code example shows how to extract news data from a RSS feed
import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

rawrss = [
    #this RSS feed is from the BBC NEWS 
    'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/front_page/rss.xml'
    ]

   
posts = []

##first to loop through all “links” in the RSS feed and parsing feed using python library feedparser at each iteration
for url in rawrss:
    feed = feedparser.parse(url)
    
    for post in feed.entries:

        #Request the article url to get the web page content and create a BeautifulSoup object with the HTML from that page
        article = requests.get(post.link)
        articles = BeautifulSoup(article.content, 'html.parser')

        #extract all paragraph elements inside the page body, and for each paragraph , extract its element text and append it to a list. 
        articles_body = articles.findAll('body')    
        p_blocks = articles_body[0].findAll('p')

        body=[]
        # Loop trough paragraph to extract its element text 
        for i in range(0,len(p_blocks)):
  
          body.append(p_blocks[i].text)
        #unpack list  
        body=''.join(body)

        #in each post, save its link, title, description and text.
        posts.append((post.title, post.link, post.description, body))

#create a Pandas dataframe from RSS parsing results above with title, link, description and text of all news articles  
df = pd.DataFrame(posts, columns=['title', 'link','description','Text'])


 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
#number of words
df['# of words']=[len(x.split()) for x in df['Text']]
#number of characters
df['# of characters']=[len(x) for x in df['Text']]
#average word length
ave=[]
for x in df['Text']:
  words = x.split()
  ave.append(sum(map(len, words))/len(words))
df['Average word length']=ave
#number of stopwords
import nltk
nltk.download('stopwords')    
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['# of stopwords'] = df['Text'].str.split().apply(lambda x: len(set(x) & stop_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
df.head()

Unnamed: 0,title,link,description,Text,# of words,# of characters,Average word length,# of stopwords
0,Health bosses seek strike deal as talks take p...,https://www.bbc.co.uk/news/health-63627958?at_...,Unions to meet Health Secretary Steve Barclay ...,A deal must be found to end the NHS strikes ah...,676,4193,5.204142,62
1,"As the 8 billionth child is born, who were 5th...",https://www.bbc.co.uk/news/world-63623307?at_m...,"The world is still growing, but not as fast as...",The UN says the world's population has hit eig...,1317,7665,4.820805,83
2,Princess Anne and Prince Edward to become stan...,https://www.bbc.co.uk/news/uk-63626113?at_medi...,"The King requests extra stand-ins, as Prince A...",King Charles has begun the process of increasi...,662,4022,5.077039,56
3,Michelle Obama: Being kind to myself is a chal...,https://www.bbc.co.uk/news/entertainment-arts-...,The ex-first lady tells the BBC about her nega...,This video can not be playedWatch: Michelle Ob...,1140,6518,4.718421,92
4,Ukraine war: US and Russian spy chiefs meet fa...,https://www.bbc.co.uk/news/world-63631100?at_m...,William Burns meets his Russian counterpart in...,US and Russian spy chiefs have met face-to-fac...,828,5045,5.091787,58


In [19]:
df.to_csv('bbcdataframe.csv') 

## Bag of words

In [20]:
#Bag-of-words using count vectorization (TF)
from sklearn.feature_extraction.text import CountVectorizer
#token_pattern=r'\b[a-zA-Z]{3,}\b' exclude anything that has numbers in it.
vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words='english')
X = vectorizer.fit_transform(df['Text'])
#in matrix form 
tf=pd.DataFrame(X.toarray(), 
             columns=vectorizer.get_feature_names())
tf.head()



Unnamed: 0,abandoned,abandoning,abdullah,abdullahi,aberdeen,aberdeenshire,ability,able,abolished,abroad,...,zealand,zealandavailable,zelensky,zero,zhou,zlotys,zone,zones,zoopla,zooplashelteronly
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [21]:
#Bag-of-words using TF_IDF
from sklearn.feature_extraction.text import TfidfVectorizer
#token_pattern=r'\b[a-zA-Z]{3,}\b' exclude anything that has numbers in it.
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words='english')
X = vectorizer.fit_transform(df['Text'])
#in matrix form
tf_idf=pd.DataFrame(X.toarray(), 
             columns=vectorizer.get_feature_names())
tf_idf.head()



Unnamed: 0,abandoned,abandoning,abdullah,abdullahi,aberdeen,aberdeenshire,ability,able,abolished,abroad,...,zealand,zealandavailable,zelensky,zero,zhou,zlotys,zone,zones,zoopla,zooplashelteronly
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047851,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.040722,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#Printing the features gives the following output
print(vectorizer.get_feature_names())





## example 2

In [23]:
##example 2

rawrss = [
    'https://www.yahoo.com/news/rss/'
    ]
posts = []
for url in rawrss:
    feed = feedparser.parse(url)
    
    for post in feed.entries:
        #Request the article url to get the web page content.
        article = requests.get(post.link)

        # 1. extract all paragraph elements inside the page body
        articles = BeautifulSoup(article.content, 'html.parser')
        articles_body = articles.findAll('body')  
          
        p_blocks = articles_body[0].findAll('p')

        body=[]
        # Loop trough paragraph to extract its element text 
        for i in range(0,len(p_blocks)):
  
          body.append(p_blocks[i].text)

        #unpack list  
        body=''.join(body)
        posts.append((post.title, post.link, post.published, body))


df = pd.DataFrame(posts, columns=['title', 'link','published_date','Text'])


#number of words
df['# of words']=[len(x.split()) for x in df['Text']]
#number of characters
df['# of characters']=[len(x) for x in df['Text']]
#average word length
ave=[]
for x in df['Text']:
  words = x.split()
  ave.append(sum(map(len, words))/len(words))
df['Average word length']=ave
#number of stopwords
import nltk
nltk.download('stopwords')    
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['# of stopwords'] = df['Text'].str.split().apply(lambda x: len(set(x) & stop_words))


df.to_csv('yohoodataframe.csv')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
df.head()

Unnamed: 0,title,link,published_date,Text,# of words,# of characters,Average word length,# of stopwords
0,"DEA’s most corrupt agent: Parties, sex amid 'u...",https://news.yahoo.com/dea-most-corrupt-agent-...,2022-11-14T13:43:05Z,"SAN JUAN, Puerto Rico (AP) — José Irizarry acc...",3634,22550,5.204733,101
1,California man attacks Thai restaurant owner's...,https://news.yahoo.com/california-man-attacks-...,2022-11-14T23:55:19Z,A customer was captured on surveillance footag...,1159,7148,5.163934,80
2,‘Crime Of Passion:’ Cops Identify Four Univers...,https://news.yahoo.com/crime-passion-cops-iden...,2022-11-14T12:54:27Z,The four University of Idaho students allegedl...,1384,8772,5.321532,78
3,Herschel Walker: U.S. should keep 'gas-guzzlin...,https://news.yahoo.com/herschel-walker-us-shou...,2022-11-14T20:15:06Z,"Campaigning in Georgia on Sunday, Republican S...",1350,8261,5.115556,83
4,Ivanka Trump Cropped Kimberly Guilfoyle Out of...,https://news.yahoo.com/ivanka-trump-cropped-ki...,2022-11-14T16:50:00Z,Ivanka Trump definitely had a theme for her ha...,931,5878,5.312567,70


## example 3

In [25]:
##example 3

rawrss = [
    'http://www.huffingtonpost.co.uk/feeds/index.xml'
    ]
posts = []
for url in rawrss:
    feed = feedparser.parse(url)
    
    for post in feed.entries:
        #Request the article url to get the web page content.
        article = requests.get(post.link)

        # 1. extract all paragraph elements inside the page body
        articles = BeautifulSoup(article.content, 'html.parser')
        articles_body = articles.findAll('body')  
          
        p_blocks = articles_body[0].findAll('p')

        body=[]
        # Loop trough paragraph to extract its element text 
        for i in range(0,len(p_blocks)):
  
          body.append(p_blocks[i].text)

        #unpack list  
        body=''.join(body)
        posts.append((post.title, post.link, post.published, body))


df = pd.DataFrame(posts, columns=['title', 'link','published_date','Text'])


#number of words
df['# of words']=[len(x.split()) for x in df['Text']]
#number of characters
df['# of characters']=[len(x) for x in df['Text']]
#average word length
ave=[]
for x in df['Text']:
  words = x.split()
  ave.append(sum(map(len, words))/len(words))
df['Average word length']=ave
#number of stopwords
import nltk
nltk.download('stopwords')    
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['# of stopwords'] = df['Text'].str.split().apply(lambda x: len(set(x) & stop_words))

df.to_csv('huffingdataframe.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df.head()

Unnamed: 0,title,link,published_date,Text,# of words,# of characters,Average word length,# of stopwords
0,Matt Hancock Admits He's 'Starting To Enjoy' T...,https://www.huffingtonpost.co.uk/entry/matt-ha...,"Mon, 14 Nov 2022 22:00:49 +0000","News Editor, HuffPost UKMatt Hancock has admit...",344,2054,4.973837,46
1,Former Minister Admits UK's Post-Brexit Trade ...,https://www.huffingtonpost.co.uk/entry/george-...,"Mon, 14 Nov 2022 18:57:39 +0000","News Editor, HuffPost UKFormer environment sec...",616,3896,5.326299,63
2,Deadly Shooting At The University Of Virginia,https://www.huffingtonpost.co.uk/entry/deadly-...,"Mon, 14 Nov 2022 18:03:02 +0000",Microbio,1,8,8.0,0
3,"UK Economy ‘Permanently Damaged’ By Brexit, Sa...",https://www.huffingtonpost.co.uk/entry/brexit-...,"Mon, 14 Nov 2022 17:16:48 +0000","News Editor, HuffPost UKA former Bank of Engla...",443,2845,5.424379,46
4,The Unusual Covid Symptom Affecting Kids You S...,https://www.huffingtonpost.co.uk/entry/covid-t...,"Mon, 14 Nov 2022 16:52:44 +0000","Head, shoulders, knees and Covid... toes?As ca...",289,1838,5.363322,49


## example 4

In [38]:
##example 4

rawrss = [
    'https://buzzfeed.com/world.xml'
    ]
posts = []
for url in rawrss:
    feed = feedparser.parse(url)
    
    for post in feed.entries:
        #Request the article url to get the web page content.
        article = requests.get(post.link)

        # 1. extract all paragraph elements inside the page body
        articles = BeautifulSoup(article.content, 'html.parser')
        articles_body = articles.findAll('body')  
          
        p_blocks = articles_body[0].findAll('p')

        body=[]
        
        # Loop trough paragraph to extract its element text 
        for i in range(0,len(p_blocks)):
  
          body.append(p_blocks[i].text)

        #unpack list  
        body=''.join(body)
        posts.append((post.title, post.link, body))

df = pd.DataFrame(posts, columns=['title', 'link','Text'])

#number of words
df['# of words']=[len(x.split()) for x in df['Text']]
#number of characters
df['# of characters']=[len(x) for x in df['Text']]
#average word length
ave=[]
for x in df['Text']:
  words = x.split()
  ave.append(sum(map(len, words))/len(words))
df['Average word length']=ave
#number of stopwords
import nltk
nltk.download('stopwords')    
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['# of stopwords'] = df['Text'].str.split().apply(lambda x: len(set(x) & stop_words))


df.to_csv('buzzfeed.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
df.head()

Unnamed: 0,title,link,Text,# of words,# of characters,Average word length,# of stopwords
0,The Taliban In Afghanistan Is Still Preventing...,https://www.buzzfeednews.com/article/syedzabiu...,“It is in their interest to impose restriction...,1660,10441,5.290361,83
1,Brittney Griner's Lawyers Don't Know Where She...,https://www.buzzfeednews.com/article/davidmack...,The WNBA star was sentenced to nine years in p...,466,2836,5.085837,59
2,A New WhatsApp “Communities” Feature Makes Org...,https://www.buzzfeednews.com/article/pranavdix...,A researcher doubts WhatsApp's claim that Comm...,939,6026,5.41853,69
3,At Least 146 People Were Killed After A Crowd ...,https://www.buzzfeednews.com/article/stefficao...,A witness told BuzzFeed News she was making he...,706,4123,4.84136,68
4,Thousands Marched In Solidarity With Iranian W...,https://www.buzzfeednews.com/article/stefficao...,Protesters around the world gathered to protes...,243,1608,5.621399,33
