# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [17]:
!pip install feedparser
import feedparser



### 1. Use feedparser to parse the following RSS feed URL.

In [18]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [19]:
new = feedparser.parse(url)

### 2. Obtain a list of components (keys) that are available for this feed.

In [20]:
list(new.keys())

['feed',
 'entries',
 'bozo',
 'headers',
 'etag',
 'updated',
 'updated_parsed',
 'href',
 'status',
 'encoding',
 'version',
 'namespaces']

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [21]:
list(new['feed'].keys())

['title',
 'title_detail',
 'links',
 'link',
 'subtitle',
 'subtitle_detail',
 'updated',
 'updated_parsed',
 'language',
 'sy_updateperiod',
 'sy_updatefrequency',
 'generator_detail',
 'generator',
 'feedburner_info',
 'geo_lat',
 'geo_long',
 'feedburner_emailserviceid',
 'feedburner_feedburnerhostname']

### 4. Extract and print the feed title, subtitle, author, and link.

In [32]:
feed_title = new['feed'].title
feed_subtitle = new['feed'].subtitle
feed_link = new['feed'].link
#feed_author = new['feed'].author
#This will give us an error, because the author is not defined
print(feed_title,feed_subtitle,feed_link)

Radar Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology https://www.oreilly.com/radar


### 5. Count the number of entries that are contained in this RSS feed.

In [38]:
entries_number = new.entries

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [44]:
entry = entries_number[0]

entry_keys_feed_list = list(entry.keys())
print(entry_keys_feed_list)

['title', 'title_detail', 'links', 'link', 'comments', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'id', 'guidislink', 'summary', 'summary_detail', 'content', 'wfw_commentrss', 'slash_comments', 'feedburner_origlink']


### 7. Extract a list of entry titles.

In [47]:
entry_titles = [new.entries[i].title for i in range(0, 18)]

In [48]:
print(entry_titles)

['Four short links: 6 April 2020', 'Four short links: 3 April 2020', 'Four short links: 2 April 2020', 'Four short links: 1 April 2020', 'Four short links: 31 March 2020', 'What you need to know about product management for AI', 'The unreasonable importance of data preparation', 'Four short links: 24 March 2020', '3 ways to confront modern business challenges', 'An enterprise vision is your company’s North Star', 'Leaders need to mobilize change-ready workforces', 'Great leaders inspire innovation and creativity from within their workforces', 'Strong leaders forge an intersection of knowledge and experience', 'Four short links: 23 March 2020', 'Four short links: 20 March 2020', '6 trends framing the state of AI and ML', 'Four short links: 19 March 2020', 'It’s an unprecedented crisis: 8 things to do right now']


### 8. Calculate the percentage of "Four short links" entry titles.

In [51]:
four_short_links = [i for i in entry_titles if i.startswith('Four short links:')]

percentage = len(four_short_links)/len(entry_titles) *100

print(f'The percentage of "Four short links" entry titles equals to {percentage:.3}')

The percentage of "Four short links" entry titles equals to 50.0


### 9. Create a Pandas data frame from the feed's entries.

In [54]:
import pandas as pd

In [60]:
df = pd.DataFrame(entries_number)
df.head(2)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink
0,Four short links: 6 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Mon, 06 Apr 2020 11:53:01 +0000","(2020, 4, 6, 11, 53, 1, 0, 97, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12590,False,Rufus &#8212; Create bootable USB drives the e...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...
1,Four short links: 3 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 03 Apr 2020 11:59:08 +0000","(2020, 4, 3, 11, 59, 8, 4, 94, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12585,False,The Zero Trust Learning Curve (Palo Alto Netwo...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...


### 10. Count the number of entries per author and sort them in descending order.

In [62]:
df1 = df['authors'].value_counts()
print(df1)

[{'name': 'Nat Torkington'}]                       37
[{'name': 'Jenn Webb'}]                             4
[{'name': 'Roger Magoulas and Steve Swoyer'}]       4
[{'name': 'Mike Loukides'}]                         2
[{'name': 'George Fairbanks'}]                      1
[{'name': 'Mary Poppendieck'}]                      1
[{'name': 'Martin Fowler'}]                         1
[{'name': 'Tim O’Reilly'}]                          1
[{'name': 'Pamela Rucker'}]                         1
[{'name': 'Mark Richards'}]                         1
[{'name': 'Cynthia Owens'}]                         1
[{'name': 'Mac Slocum'}]                            1
[{'name': 'Peter Skomoroch and Mike Loukides'}]     1
[{'name': 'Hugo Bowne-Anderson'}]                   1
[{'name': 'Kai Holnes'}]                            1
[{'name': 'Rita J. King'}]                          1
[{'name': 'Rachel Laycock and Neal Ford'}]          1
Name: authors, dtype: int64


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [68]:
number_of_characters = df_entries['title'].str.len()

df_entries['title_length'] = number_of_characters
df_entries.head(2)

Unnamed: 0,title,title_detail,links,link,comments,published,published_parsed,authors,author,author_detail,tags,id,guidislink,summary,summary_detail,content,wfw_commentrss,slash_comments,feedburner_origlink,title_length
0,Four short links: 6 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Mon, 06 Apr 2020 11:53:01 +0000","(2020, 4, 6, 11, 53, 1, 0, 97, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12590,False,Rufus &#8212; Create bootable USB drives the e...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...,30
1,Four short links: 3 April 2020,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,https://www.oreilly.com/radar/four-short-links...,"Fri, 03 Apr 2020 11:59:08 +0000","(2020, 4, 3, 11, 59, 8, 4, 94, 0)",[{'name': 'Nat Torkington'}],Nat Torkington,{'name': 'Nat Torkington'},"[{'term': 'Four Short Links', 'scheme': None, ...",https://www.oreilly.com/radar/?p=12585,False,The Zero Trust Learning Curve (Palo Alto Netwo...,"{'type': 'text/html', 'language': None, 'base'...","[{'type': 'text/html', 'language': None, 'base...",https://www.oreilly.com/radar/four-short-links...,0,https://www.oreilly.com/radar/four-short-links...,30


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [69]:
phrase = list(df_entries['summary'].str.contains('machine learning.'))

machine_learning = []

for i,e in enumerate(phrase):
    if e == True:
        machine_learning.append(df_entries.iloc[i,0])

print(machine_learning)

['What you need to know about product management for AI', 'Four short links: 13 February 2020']
