In [1]:
import requests
import random
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from headers import headers_list
from loading_bar import log_progress
from secrets import rss_links

In [12]:
def get_content(url, timeout=30):
    """Download a feed and return its ``<entry>`` elements as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the feed to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per ``<entry>`` with the columns ``id``, ``title``, ``url``,
        ``published_date``, ``updated_date``, ``content`` and ``author``.
        Both date columns are converted with ``pd.to_datetime``. A feed
        without entries yields an empty frame that still has these columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    columns = [
        'id',
        'title',
        'url',
        'published_date',
        'updated_date',
        'content',
        'author',
    ]

    # Each request is sent with a header set picked at random from headers_list.
    page = requests.get(
        url,
        headers=random.choice(headers_list),
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty result.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    entry_list = [
        {
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': get_url(entry.find('link')),
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
        }
        for entry in soup.find_all('entry')
    ]

    # Passing the column list explicitly keeps the schema stable when the feed
    # has no entries; otherwise the date conversions below raise KeyError.
    df = pd.DataFrame(entry_list, columns=columns)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [13]:
def get_id(entry):
    """Return the last ``:``-separated segment of the entry's ``id`` text.

    Parameters
    ----------
    entry : object
        Anything exposing ``find(name)``; the result of ``find('id')`` is
        expected to carry a string ``text`` attribute.

    Returns
    -------
    str or None
        The text after the final ``:``, or ``None`` when ``find('id')``
        returns nothing usable (e.g. ``None``).
    """
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except AttributeError:
        # Missing element (None) or a text value that is not a string.
        # Only this case is treated as "no id"; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return None

def get_text(item):
    """Return ``item.text``, or ``None`` when ``item`` has no such attribute.

    ``None`` itself (for example the result of a failed lookup) therefore
    maps to ``None``.
    """
    # getattr with a default only absorbs AttributeError, so unrelated errors
    # (and KeyboardInterrupt) are no longer swallowed as a bare except would.
    return getattr(item, 'text', None)

def get_url(item):
    """Return ``item['href']``, or ``None`` when it cannot be looked up.

    Parameters
    ----------
    item : object or None
        Something subscriptable by attribute name, or ``None``.

    Returns
    -------
    object or None
        The value stored under ``'href'``; ``None`` if ``item`` is not
        subscriptable (such as ``None``) or has no ``'href'`` key.
    """
    try:
        return item['href']
    except (KeyError, TypeError):
        # KeyError: no href present. TypeError: item is None / not subscriptable.
        # Other exceptions propagate rather than being hidden.
        return None

def get_author(entry):
    """Join the ``text`` of every child of the entry's ``author`` element.

    Parameters
    ----------
    entry : object
        Anything exposing ``find(name)``.

    Returns
    -------
    str or None
        The children's ``text`` values joined with ``'; '``. ``None`` when
        ``find('author')`` returns ``None``, when the result is empty, or when
        a child does not provide a string ``text``.
    """
    authors = entry.find('author')
    # find() yields None for an entry without an author; calling len() on it
    # would raise TypeError, so that case is handled before the emptiness check.
    if authors is None or len(authors) == 0:
        return None
    try:
        return '; '.join(a.text for a in authors)
    except (AttributeError, TypeError):
        # A child without .text, or a .text that is not a string.
        return None

In [19]:
# Load the feed registered under 'Data Science' and preview its first five rows.
df_ds = get_content(url=rss_links['Data Science'])
df_ds.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,17140745780390785304,A 10-Course Introduction to <b>Data Science</b...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 00:18:17+00:00,2021-09-13 00:18:17+00:00,Data is now everywhere. And those who can harn...,
1,9727360739504798718,"Who&#39;s hiring in Pittsburgh? Aurora, Carlow...",https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 00:00:00+00:00,2021-09-13 00:00:00+00:00,Aurora has an opening for an Autonomy Safety <...,
2,12332808078289061785,"Our classroom is becoming more flexible, hybri...",https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 22:30:00+00:00,2021-09-12 22:30:00+00:00,We also launched the online BSc in Programming...,
3,15919742716733308327,Radio ratings to include livestreaming data - ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 21:00:00+00:00,2021-09-12 21:00:00+00:00,... strongest radio markets in the world and t...,
4,17834455628474359737,Axiata Digital Labs announces first-ever ADL A...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 21:00:00+00:00,2021-09-12 21:00:00+00:00,“This event will enable <b>data science</b> an...,


In [20]:
# Load the feed registered under 'Artificial Intelligence' and preview its first five rows.
df_ai = get_content(url=rss_links['Artificial Intelligence'])
df_ai.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,2067305417285265579,<b>Artificial Intelligence</b> (AI) in Cyberse...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 00:56:15+00:00,2021-09-13 00:56:15+00:00,Organizations across industries are turning to...,
1,14877738212203842889,CMS Awards NCI $112M Contract Extension Suppor...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 22:52:30+00:00,2021-09-12 22:52:30+00:00,Under the Comprehensive Error Rate Testing con...,
2,6357025403358493258,Patrol cams adding more security to local park...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 22:18:03+00:00,2021-09-12 22:18:03+00:00,Lindsey also says the cameras use <b>artificia...,
3,7397605872053070764,BioXcel Therapeutics to Present Updates from O...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 21:56:15+00:00,2021-09-12 21:56:15+00:00,... a clinical-stage biopharmaceutical company...,
4,13462936679277205821,Decisiv Unveils Automatic VMRS Encoding Soluti...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 21:53:21+00:00,2021-09-12 21:53:21+00:00,The VMRS Encoding for Fleets analyzes individu...,


In [21]:
# Load the feed registered under 'Machine Learning' and preview its first five rows.
df_ml = get_content(url=rss_links['Machine Learning'])
df_ml.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,2346801807311996214,Artificial Intelligence (AI) in Cybersecurity ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 00:56:15+00:00,2021-09-13 00:56:15+00:00,Platforms enhanced by <b>machine learning</b> ...,
1,2817446043385123314,"Who&#39;s hiring in Pittsburgh? Aurora, Carlow...",https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 00:00:00+00:00,2021-09-13 00:00:00+00:00,Peptilogics seeks a <b>Machine Learning</b> Sc...,
2,15874292272497799359,CMS Awards NCI $112M Contract Extension Suppor...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 22:52:30+00:00,2021-09-12 22:52:30+00:00,“As federal agencies continue to embrace artif...,
3,14512119619492113720,SJC sets limits on use of police body camera f...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 22:07:30+00:00,2021-09-12 22:07:30+00:00,“When combined with limitless data storage and...,
4,17187327455440134029,BioXcel Therapeutics to Present Updates from O...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 21:56:15+00:00,2021-09-12 21:56:15+00:00,... validated product candidates together with...,


In [22]:
# Stack the three topic frames into one table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and produces the same rows in the same order.
df = (
    pd.concat([df_ds, df_ai, df_ml])
    .sort_values(by='id')      # ordering follows the stored id values
    .drop_duplicates()         # drops rows that are identical in every column
    .reset_index(drop=True)    # discard the per-feed indexes carried over by concat
)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,10058609923349150707,Computer science continues to be favourite amo...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 18:33:45+00:00,2021-09-12 18:33:45+00:00,"Once again, Computer Science, <b>Artificial In...",
1,10257980602360460347,Yinqiu He - The <b>Data Science</b> Institute ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 08:46:44+00:00,2021-09-12 08:46:44+00:00,Yinqiu He. <b>Data Science</b> Institute: Post...,
2,10369779799486769808,<b>Data Science</b> Renee on Twitter: &quot; ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 07:27:28+00:00,2021-09-12 07:27:28+00:00,<b>Data Science</b> Renee &middot; @BecomingDa...,
3,10882429297873722339,Risks versus opportunities in national securit...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 20:00:13+00:00,2021-09-12 20:00:13+00:00,Advances in additive manufacturing (also known...,
4,11381515111146361903,SAMK among top for applied sciences - Telangan...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-12 18:27:30+00:00,2021-09-12 18:27:30+00:00,Of all the technologies that drive digital tra...,


In [24]:
# Append the combined rows to results/contents.csv, writing neither a header
# line nor the index. Because the file is opened in append mode, running this
# cell again adds the same rows a second time.
df.to_csv(
    path_or_buf='results/contents.csv',
    index=False,
    header=False,
    mode='a',
)