In [1]:
import random
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

from headers import headers_list
from loading_bar import log_progress
from secrets import rss_links

In [12]:
def get_content(url, timeout=30):
    """Download a feed and return its ``<entry>`` elements as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the feed to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per ``<entry>`` element with the columns ``id``, ``title``,
        ``url``, ``published_date``, ``updated_date``, ``content`` and
        ``author``. The two date columns are converted with
        ``pd.to_datetime``. A feed without entries yields an empty frame
        that still has all seven columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    columns = [
        'id',
        'title',
        'url',
        'published_date',
        'updated_date',
        'content',
        'author',
    ]

    # A random header set is picked from headers_list for every request.
    page = requests.get(
        url,
        headers=random.choice(headers_list),
        timeout=timeout,
    )
    # Fail loudly instead of parsing an error page into an empty result.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    entry_list = [
        {
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': get_url(entry.find('link')),
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
        }
        for entry in soup.find_all('entry')
    ]

    # Passing the column list keeps the schema stable when there are no
    # entries; otherwise the date conversions below would raise KeyError.
    df = pd.DataFrame(entry_list, columns=columns)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [13]:
def get_id(entry):
    """Return the last colon-separated segment of an entry's ``<id>`` text.

    Parameters
    ----------
    entry
        Parsed feed entry exposing ``find(name)``.

    Returns
    -------
    str or None
        The part of the ``<id>`` text after its final ``:`` (the whole text
        when it contains no colon), or ``None`` when the entry has no
        ``<id>`` element.
    """
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except AttributeError:
        # find() returned None (no <id> element) or the node has no text.
        # Only this case is treated as "missing"; anything else propagates.
        return None

def get_text(item):
    """Return ``item.text``, or ``None`` when the item has no such attribute.

    Callers pass the result of ``entry.find(...)``, which is ``None`` when
    the element is absent; that case maps to ``None`` here.
    """
    try:
        return item.text
    except AttributeError:
        # Missing element (None) or an object without .text. Other errors,
        # including KeyboardInterrupt, are no longer swallowed.
        return None

def get_url(item):
    """Return the ``href`` value of ``item``, or ``None`` if unavailable.

    ``None`` is returned when ``item`` is ``None`` (or otherwise not
    subscriptable) and when it has no ``href`` key.
    """
    try:
        return item['href']
    except (TypeError, KeyError):
        # TypeError: item is None / not subscriptable.
        # KeyError: the element exists but carries no href.
        return None

def get_author(entry):
    """Return the text of all ``<author>`` elements of an entry, joined by '; '.

    Parameters
    ----------
    entry
        Parsed feed entry exposing ``find_all(name)``.

    Returns
    -------
    str or None
        The whitespace-stripped text of every ``<author>`` element joined
        with ``'; '``, or ``None`` when the entry has no ``<author>`` element.
        An author element that is present but empty contributes ``''``.
    """
    # find_all returns a (possibly empty) list. The previous find() returned
    # a single element or None, so len() crashed on entries without an author.
    authors = entry.find_all('author')
    if not authors:
        return None
    return '; '.join(author.text.strip() for author in authors)

In [14]:
# Fetch the feed stored under the 'Data Science' key of rss_links and
# preview the first five rows.
df_ds = get_content(url=rss_links['Data Science'])
df_ds.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,12787498176642890753,Data scientist: Will there be enough of them? ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 03:58:30+00:00,2021-09-10 03:58:30+00:00,Businesses will need <b>data science</b> perso...,
1,2846781292743719960,Six factors to consider when undertaking <b>da...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 02:15:00+00:00,2021-09-10 02:15:00+00:00,One of the exciting things in <b>data science<...,
2,16097979489213616773,What do new Israeli data say about effect of v...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 02:03:04+00:00,2021-09-10 02:03:04+00:00,Covid-19 <b>Data Science</b> &middot; Home &mi...,
3,4055865653021147402,Clinical <b>Data Science</b> and AI: Current r...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 23:34:21+00:00,2021-09-09 23:34:21+00:00,"With presenting successful projects, we want t...",
4,3736738143324814272,NUWC Division Keyport observes Hispanic Herita...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 23:15:00+00:00,2021-09-09 23:15:00+00:00,She received her undergraduate degree in Busin...,


In [15]:
# Fetch the feed stored under the 'Artificial Intelligence' key of rss_links
# and preview the first five rows.
df_ai = get_content(url=rss_links['Artificial Intelligence'])
df_ai.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,11910678115065525275,How Do We Use <b>Artificial Intelligence</b> E...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 05:48:45+00:00,2021-09-10 05:48:45+00:00,How Do We Use <b>Artificial Intelligence</b> E...,
1,14409766189043817002,53 Schools in Madhya Pradesh to Teach <b>Artif...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 05:03:45+00:00,2021-09-10 05:03:45+00:00,At least 53 schools in Madhya Pradesh will tea...,
2,16588844248336343721,Harnessing <b>artificial intelligence</b> to h...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 04:07:30+00:00,2021-09-10 04:07:30+00:00,They write below on the potential for harnessi...,
3,4936977401380553336,Ed Vasicek: Not all of us are on board with ro...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 03:22:30+00:00,2021-09-10 03:22:30+00:00,"In our times, however, <b>Artificial Intellige...",
4,6165229525177409776,Enterprise <b>Artificial Intelligence</b> Mark...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 22:52:30+00:00,2021-09-09 22:52:30+00:00,Latest Study on Industrial Growth of Worldwide...,


In [16]:
# Fetch the feed stored under the 'Machine Learning' key of rss_links and
# preview the first five rows.
df_ml = get_content(url=rss_links['Machine Learning'])
df_ml.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,9436700736077957601,"The Edge Is Just A Massive, Geographically Dis...",https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 01:30:00+00:00,2021-09-10 01:30:00+00:00,Sponsored If you have a hundred or a thousand ...,
1,7732805458052610372,Microsoft explores <b>machine learning</b> in ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-10 00:04:34+00:00,2021-09-10 00:04:34+00:00,Microsoft has signed a MoU with the University...,
2,2698608161748421201,DataPlus 3.0 Uses <b>Machine Learning</b> to H...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 23:26:15+00:00,2021-09-09 23:26:15+00:00,DataPlus 3.0 Uses <b>Machine Learning</b> to H...,
3,16648851031499874060,Kangaroo Court: Developing Competitive Strateg...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 23:26:15+00:00,2021-09-09 23:26:15+00:00,The recent developments in AI and <b>machine l...,
4,6217991123822188099,<b>Machine Learning</b> &amp; High Dimensional...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 22:00:30+00:00,2021-09-09 22:00:30+00:00,<b>Machine learning</b> focuses on the creatio...,


In [17]:
# Stack the three topic frames, then sort, de-duplicate and renumber the rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Note: get_id returns strings, so sorting by 'id' is lexicographic, not numeric.
df = (
    pd.concat([df_ds, df_ai, df_ml])
    .sort_values(by='id')
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,10183835959684607262,IT Leaders Adjust Hiring Strategies With Tech ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 20:12:59+00:00,2021-09-09 20:12:59+00:00,<b>Data science</b> jobs are difficult roles t...,
1,10920338801564153710,Assessment Industry Leader Holds Summit on <b>...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 16:58:06+00:00,2021-09-09 16:58:06+00:00,"Global ATP Chair John Kleeman, Founder of Ques...",
2,11214010975535366483,Ethical <b>Artificial Intelligence</b> is Focu...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 18:11:15+00:00,2021-09-09 18:11:15+00:00,Ethical <b>Artificial Intelligence</b> is Focu...,
3,1127802956531070073,Global Artificial Intelligence Market (2021 to...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 16:52:30+00:00,2021-09-09 16:52:30+00:00,"“Artificial Intelligence Market, By Offering (...",
4,11409262356049543199,Featurespace&#39;s ARIC Risk Hub™ v3.19 delive...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-09 17:37:30+00:00,2021-09-09 17:37:30+00:00,“Our <b>machine learning</b> models are proven...,


In [18]:
# Write the combined frame to disk. The target directory is created first so
# the export also works on a fresh checkout where results/ does not exist yet.
output_path = Path('results/contents.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)