In [1]:
import requests
import random
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4.element import Comment
from datetime import datetime
from headers import headers_list
from loading_bar import log_progress
from secrets import rss_links

In [2]:
def get_id(entry):
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except:
        return None

def get_text(item):
    try:
        return item.text
    except:
        return None

def get_url(item):
    try:
        url = item['href']
        return url.split('url=')[-1].split('&')[0]
    except:
        return None

def get_author(entry):
    authors = entry.find('author')
    if len(authors) == 0:
        return None
    try:
        return '; '.join(a.text for a in authors)
    except:
        return None

def get_page_text(url):
    try:
        page = requests.get(url, headers=random.choice(headers_list))
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts if len(t.strip()) > 0).replace('\n', ' ')
    except:
        return None

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

In [3]:
def get_content(url):
    page = requests.get(url, headers=random.choice(headers_list))
    soup = BeautifulSoup(page.content, 'html.parser')
    entries = soup.findAll('entry')
    entry_list = []
    entry_list = []
    for entry in entries:
        page_url = get_url(entry.find('link'))
        entry_list.append({
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': page_url,
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
            'page_text': get_page_text(page_url),
        })

    df = pd.DataFrame.from_dict(entry_list)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [4]:
df_ds = get_content(rss_links['Data Science'])
df_ds.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,263989786796450142,UI professor uses machine learning to indicate...,https://dailyiowan.com/2021/09/14/university-o...,2021-09-15 02:36:47+00:00,2021-09-15 02:36:47+00:00,"Steven Baek, an associate professor of <b>data...",,Close Menu Search News Politics COVID-19 Sport...
1,16484538185634236000,"iCubed Seminar: Paul Ardis, GE Research - The ...",https://datascience.columbia.edu/event/icubed-...,2021-09-15 02:28:41+00:00,2021-09-15 02:28:41+00:00,Join to learn about real-world uses of <b>data...,,Faculty Students Alumni Events Data Science Da...
2,9911918169098648189,New university-industry partnership will help ...,https://www.blackengineer.com/article/new-univ...,2021-09-15 01:30:00+00:00,2021-09-15 01:30:00+00:00,"... computer engineering, <b>data science</b> ...",,Subscribe Login × Close / Sign Up × Close ` Su...
3,56486881612013991,Hayden AI Welcomes Bryan Shea as Vice Presiden...,https://www.prnewswire.com/news-releases/hayde...,2021-09-14 22:07:30+00:00,2021-09-14 22:07:30+00:00,"Led by a team of experts in machine learning, ...",,Resources Blog Journalists Log In Sign Up Data...
4,16332187922152765913,Syneos Health Acquires StudyKIK | Your Money |...,https://www.kulr8.com/news/money/syneos-health...,2021-09-14 21:33:45+00:00,2021-09-14 21:33:45+00:00,... capabilities to improve the patient journe...,,You have permission to edit this article. Edit...


In [5]:
df_ai = get_content(rss_links['Artificial Intelligence'])
df_ai.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,1627639864080882518,Museum Of Wild And Newfangled Art&#39;s Openin...,https://www.broadwayworld.com/article/Museum-O...,2021-09-14 23:33:53+00:00,2021-09-14 23:33:53+00:00,The <b>Artificial Intelligence</b> curated exh...,,Skip to main content Skip to footer site map W...
1,9150869453030743568,"Elon is Right, AI is Hard: Five Pitfalls to Av...",https://www.eweek.com/big-data-and-analytics/e...,2021-09-14 22:46:55+00:00,2021-09-14 22:46:55+00:00,"Elon is Right, AI is Hard: Five Pitfalls to Av...",,Close Latest News Cybersecurity Big Data and A...
2,10363741836443607626,Is the new federal AI Advisory Committee too l...,https://www.natlawreview.com/article/new-feder...,2021-09-14 20:46:19+00:00,2021-09-14 20:46:19+00:00,The Department of Commerce issued a press rele...,,"Skip to main content September 14, 2021 Volume..."
3,2757328550078784558,De Wandel and Mathews: <b>Artificial intellige...,https://www.stltoday.com/opinion/columnists/de...,2021-09-14 20:41:49+00:00,2021-09-14 20:41:49+00:00,"Briefly put, <b>artificial intelligence</b> ca...",,Skip to main content Return to homepage × Subs...
4,13406682930218379223,Love &amp; data: Stephanie Dinkins&#39; new UM...,https://news.umich.edu/love-data-stephanie-din...,2021-09-14 19:41:15+00:00,2021-09-14 19:41:15+00:00,“My intention is to encourage action towards m...,,Skip to content Home All Stories Podcasts Facu...


In [6]:
df_ml = get_content(rss_links['Machine Learning'])
df_ml.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,11734179441753964654,<b>Machine Learning</b> &amp; Big Data Analyti...,https://www.stillwatercurrent.com/machine-lear...,2021-09-15 04:18:45+00:00,2021-09-15 04:18:45+00:00,Global <b>Machine Learning</b> &amp; Big Data ...,,Business Health Science Technology World Publi...
1,10613670742011434573,Data science hasn&#39;t fixed its huge gender ...,https://venturebeat.com/2021/09/14/data-scienc...,2021-09-15 03:56:15+00:00,2021-09-15 03:56:15+00:00,“Given the shortage of qualified employees in ...,,Events GamesBeat Jobs Low Code / No Code Summi...
2,4364049398981071401,UI professor uses <b>machine learning</b> to i...,https://dailyiowan.com/2021/09/14/university-o...,2021-09-15 02:36:47+00:00,2021-09-15 02:36:47+00:00,UI professor uses <b>machine learning</b> to i...,,Close Menu Search News Politics COVID-19 Sport...
3,14128517232156713636,<b>Machine learning</b> model enables shippers...,https://aircargoworld.com/news/machine-learnin...,2021-09-15 00:33:45+00:00,2021-09-15 00:33:45+00:00,Carriers and freight forwarders are investing ...,,Subscribe News Data & Tools Events 2021 Webina...
4,1658357943575483617,"Elon is Right, AI is Hard: Five Pitfalls to Av...",https://www.eweek.com/big-data-and-analytics/e...,2021-09-14 22:46:55+00:00,2021-09-14 22:46:55+00:00,<b>Machine Learning</b> development lifecycle ...,,Close Latest News Cybersecurity Big Data and A...


In [7]:
df = df_ds.append(df_ai)
df = df.append(df_ml)
df = df.sort_values(by='id')
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,10363741836443607626,Is the new federal AI Advisory Committee too l...,https://www.natlawreview.com/article/new-feder...,2021-09-14 20:46:19+00:00,2021-09-14 20:46:19+00:00,The Department of Commerce issued a press rele...,,"Skip to main content September 14, 2021 Volume..."
1,10613670742011434573,Data science hasn&#39;t fixed its huge gender ...,https://venturebeat.com/2021/09/14/data-scienc...,2021-09-15 03:56:15+00:00,2021-09-15 03:56:15+00:00,“Given the shortage of qualified employees in ...,,Events GamesBeat Jobs Low Code / No Code Summi...
2,11127234293902512321,KNIME Fall Data Talks: Bringing Business and <...,https://datatalks.knime.com/fall2021,2021-09-14 14:56:13+00:00,2021-09-14 14:56:13+00:00,Join our fall event to learn how the newest fe...,,No notifications yet. Sign Out Toggle navigati...
3,11330190616341987755,Apple&#39;s most popular iPad delivers even mo...,https://www.apple.com/newsroom/2021/09/apples-...,2021-09-14 17:26:05+00:00,2021-09-14 17:26:05+00:00,The Neural Engine in A13 Bionic also powers ne...,,Global Nav Open Menu Global Nav Close Menu App...
4,11488070297902970385,HDSI Keynote w. Elizabeth Stuart – 10/5 - Harv...,https://www.hsph.harvard.edu/biostatistics/202...,2021-09-14 14:53:46+00:00,2021-09-14 14:53:46+00:00,"Oct 05, 12:00 PM – 1:30 PM EDT Webinar – RSVP ...",,Menu Close Menu Skip to content Information Fo...


In [8]:
df.loc[df['page_text'].isna()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
13,14092562582309430869,The latest research news in <b>Artificial Inte...,https://www.newswise.com/articles/the-latest-r...,2021-09-14 17:37:30+00:00,2021-09-14 17:37:30+00:00,from Purdue University. For <b>artificial inte...,,
20,1573469020536301082,UNESCO launches <b>Artificial Intelligence</b>...,https://www.plenglish.com/index.php%3Fo%3Drn%2...,2021-09-14 15:00:18+00:00,2021-09-14 15:00:18+00:00,"... of the Globalpolicy.ai page, which will se...",,
27,16865084505065755165,AstronomicAL: An interactive dashboard for vis...,http://www.spaceref.com/news/viewsr.html%3Fpid...,2021-09-14 15:20:54+00:00,2021-09-14 15:20:54+00:00,This makes the software a tool for experimenti...,,
42,4318192895469453662,Incorporating <b>Artificial Intelligence</b> I...,https://www.itnonline.com/article/incorporatin...,2021-09-14 18:42:44+00:00,2021-09-14 18:42:44+00:00,This is an <b>artificial intelligence</b> syst...,,


In [9]:
df.to_csv('results/contents.csv', mode='a', header=False, index=False)