# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [48]:
import feedparser
import re

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [5]:
radar = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [6]:
radar.keys()

dict_keys(['feed', 'entries', 'bozo', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [8]:
radar['feed'].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'subtitle', 'subtitle_detail', 'updated', 'updated_parsed', 'language', 'sy_updateperiod', 'sy_updatefrequency', 'generator_detail', 'generator', 'feedburner_info', 'geo_lat', 'geo_long', 'feedburner_emailserviceid', 'feedburner_feedburnerhostname'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [28]:
title    = radar['feed']['title']
subtitle = radar['feed']['subtitle']
link     = radar['feed']['link']
author   = [radar['entries'][i]['author'] for i in range(len(radar.entries))]

### 5. Count the number of entries that are contained in this RSS feed.

In [29]:
len(radar['entries'])

60

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [39]:
radar['entries'][i].keys()
    

dict_keys(['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink'])

### 7. Extract a list of entry titles.

In [46]:
titles_lst = [radar['entries'][i]['title'] for i in range(len(radar['entries']))]
titles_lst

['Four Short Links: 19 August 2020',
 'Why Best-of-Breed is a Better Choice than All-in-One Platforms for Data Science',
 'Four short links: 14 August 2020',
 'The Least Liked Programming Languages',
 'Four short links: 11 Aug 2020',
 'Four short links: 7 Aug 2020',
 'Four short links: 5 August 2020',
 'Radar trends to watch: August 2020',
 'Four short links: 31 July 2020',
 'Four short links: 30 July 2020',
 'Four short links: 29 July 2020',
 'Bringing an AI Product to Market',
 'Power, Harms, and Data',
 'Four short links: 27 July 2020',
 'Four short links: 24 July 2020',
 'Four short links: 26 July 2020',
 'Four short links: 22 July 2020',
 'AI, Protests, and Justice',
 'Four short links: 21 July 2020',
 'Four short links: 20 July 2020',
 'Four short links: 17 July 2020',
 'Four short links: 16 July 2020',
 'Microservices Adoption in 2020',
 'Four short links: 15 July 2020',
 'Society-Centered Design',
 'Four short links: 14 July 2020',
 'Four short links: 13 July 2020',
 'Four shor

### 8. Calculate the percentage of "Four short links" entry titles.

In [53]:
fsl = 0
for i in titles_lst:
    if bool(re.search(r'four\sshort\slinks',i.lower())):
        fsl += 1
print(f'Percentage of four short links: {(fsl/len(titles_lst))*100}%')

Percentage of four short links: 75.0%


### 9. Create a Pandas data frame from the feed's entries.

In [54]:
import pandas as pd

In [117]:
df = pd.DataFrame(radar['entries'])
df.head(3)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Four Short Links: 19 August 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Wed, 19 Aug 2020 11:44:06 +0000","(2020, 8, 19, 11, 44, 6, 2, 232, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13228,False,The Design Space of Computational Notebooks &#...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
1,Why Best-of-Breed is a Better Choice than All-...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/why-best-of-bree...,"Tue, 18 Aug 2020 11:30:42 +0000","(2020, 8, 18, 11, 30, 42, 1, 231, 0)",[{'name': 'Matthew Rocklin and Hugo Bowne-Ande...,Matthew Rocklin and Hugo Bowne-Anderson,{'name': 'Matthew Rocklin and Hugo Bowne-Ander...,"[{'term': 'AI & ML', 'scheme': None, 'label': ...",https://www.oreilly.com/radar/?p=13220,False,So you need to redesign your company’s data in...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/why-best-of-bree...,0,https://www.oreilly.com/radar/why-best-of-bree...
2,Four short links: 14 August 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 14 Aug 2020 11:38:56 +0000","(2020, 8, 14, 11, 38, 56, 4, 227, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=13217,False,Sinter &#8212; Sinter uses the user-mode Endpo...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [77]:
df['author'].value_counts(ascending=False)

Nat Torkington                                      45
Mike Loukides                                        9
Justin Norman, Peter Skomoroch and Mike Loukides     1
Mike Loukides and Steve Swoyer                       1
Hugo Bowne-Anderson                                  1
Adam Jacob, Nat Torkington and Mike Loukides         1
Sarah Gold                                           1
Matthew Rocklin and Hugo Bowne-Anderson              1
Name: author, dtype: int64

In [80]:
df.groupby('author', as_index=False).agg({'title':'count'}).sort_values('title',ascending=False)

Unnamed: 0,author,title
6,Nat Torkington,45
4,Mike Loukides,9
0,"Adam Jacob, Nat Torkington and Mike Loukides",1
1,Hugo Bowne-Anderson,1
2,"Justin Norman, Peter Skomoroch and Mike Loukides",1
3,Matthew Rocklin and Hugo Bowne-Anderson,1
5,Mike Loukides and Steve Swoyer,1
7,Sarah Gold,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [92]:
df['length of title'] = [len(df['title'][i]) for i in range(len(df['title']))]
df[['title','author','length of title']].head(3)

Unnamed: 0,title,author,length of title
0,Four Short Links: 19 August 2020,Nat Torkington,32
1,Why Best-of-Breed is a Better Choice than All-...,Matthew Rocklin and Hugo Bowne-Anderson,79
2,Four short links: 14 August 2020,Nat Torkington,32


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [114]:
ml_title_lst = []
ml_lst = []
for i,j in zip(df['summary'],df['title']):
    if bool(re.search(r'machine learning',i.lower())):
        ml_title_lst.append(j)
        ml_lst.append(i)

print(ml_title_lst)

['Four short links: 8 July 2020', 'Machine Learning and the Production Gap']


In [115]:
ml_lst[0]

'When Data is Messy &#8212; I love stories that illustrate the ways machine learning can draw the wrong conclusions. Researchers at the University of Tuebingen trained a neural net to recognize images, and then had it point out which parts of the images were the most important for its decision. When they asked it to [&#8230;]'

In [116]:
ml_lst[1]

'The biggest problem facing machine learning today isn&#8217;t the need for better algorithms; it isn&#8217;t the need for more computing power to train models; it isn&#8217;t even the need for more skilled practitioners. It&#8217;s getting machine learning from the researcher&#8217;s laptop to production. That&#8217;s the real gap. It&#8217;s one thing to build a model; it&#8217;s [&#8230;]'