In [49]:
import random
import re
from datetime import datetime
from urllib.parse import parse_qs, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment

from headers import headers_list
from loading_bar import log_progress
# NOTE(review): this must resolve to the project-local secrets.py, which
# shadows the standard-library module of the same name.
from secrets import rss_links

In [71]:
def get_id(entry):
    """Return the last colon-separated segment of an entry's ``<id>`` text.

    For example an id text of ``'a:b:123'`` yields ``'123'``; a text without
    any colon is returned unchanged.

    Args:
        entry: Object exposing ``find(name)`` (e.g. a BeautifulSoup tag).

    Returns:
        The trailing id segment as a string, or ``None`` when the entry has
        no ``<id>`` element or that element carries no text.
    """
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except AttributeError:
        # Raised when find() returned None or the text is not a string.
        # Only this error is swallowed, so Ctrl-C (KeyboardInterrupt) and
        # unrelated failures are no longer silently turned into None.
        return None

def get_text(item):
    """Return ``item.text``, or ``None`` when the item has no such attribute.

    Args:
        item: Any object, typically the result of a ``find`` call, which may
            be ``None`` when the element was not found.

    Returns:
        The value of ``item.text``, or ``None`` if ``item`` is ``None`` or
        otherwise lacks a ``text`` attribute.
    """
    try:
        return item.text
    except AttributeError:
        # Only the "element is missing" case is treated as best-effort;
        # KeyboardInterrupt and other errors propagate instead of being hidden.
        return None

def get_url(item):
    """Extract the destination URL from a link element's ``href``.

    If the href carries a ``url`` query parameter (a redirect link), the
    percent-decoded value of that parameter is returned. Otherwise the href
    is returned unchanged.

    The query string is parsed properly instead of splitting on the
    substrings ``'url='`` and ``'&'``. That approach truncated direct links
    at their first ``&``, was fooled by parameters whose name merely ends in
    ``url`` (e.g. ``myurl=``), and left the target percent-encoded.

    Args:
        item: Mapping-like object supporting ``item['href']`` (e.g. a
            BeautifulSoup ``<link>`` tag), or ``None``.

    Returns:
        The destination URL as a string, or ``None`` when ``item`` is
        ``None``, has no ``href``, or the ``href`` is not a string.
    """
    try:
        href = item['href']
    except (TypeError, KeyError):
        # TypeError: item is None / not subscriptable; KeyError: no href.
        return None
    if not isinstance(href, str):
        return None

    targets = parse_qs(urlparse(href).query).get('url')
    return targets[0] if targets else href

def get_author(entry):
    """Return the entry's author names joined with ``'; '``.

    Every ``<author>`` element of the entry is collected with ``find_all``.
    The previous ``find`` call returned a single element (or ``None``), so
    ``len(None)`` raised ``TypeError`` for entries without an author, and
    iterating the single element walked its children instead of the list of
    authors.

    Args:
        entry: Object exposing ``find_all(name)`` (e.g. a BeautifulSoup tag).

    Returns:
        The whitespace-stripped, non-empty author texts joined by ``'; '``,
        or ``None`` when the entry has no author with any text.
    """
    authors = entry.find_all('author')
    try:
        names = [a.text.strip() for a in authors]
    except AttributeError:
        # An author node without usable text: keep the best-effort contract.
        return None

    # Blank authors (e.g. an empty <name/>) carry no information.
    names = [name for name in names if name]
    return '; '.join(names) if names else None

def get_page_text(url, timeout=10):
    """Download a page and return its visible text as one line.

    The page is fetched with a randomly chosen entry of ``headers_list`` as
    request headers, parsed with ``html.parser``, and every text node that
    passes ``tag_visible`` is stripped and joined with single spaces.

    Args:
        url: Address of the page to download. A falsy value (``None`` or
            ``''``) short-circuits to ``None`` without any network call.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled server would block the whole scrape indefinitely.

    Returns:
        The visible text with newlines replaced by spaces, or ``None`` when
        the URL is missing, the response status is not 200, or fetching or
        parsing fails.
    """
    if not url:
        return None
    try:
        page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        stripped = (t.strip() for t in soup.find_all(text=True) if tag_visible(t))
        return u" ".join(t for t in stripped if t).replace('\n', ' ')
    except Exception:
        # Deliberately best-effort: one bad page must not abort the scrape.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so a long-running loop can still be stopped.
        return None

def tag_visible(element):
    """Decide whether a parsed text node counts as visible.

    A node is rejected when its parent's name is one of ``style``,
    ``script``, ``head``, ``title``, ``meta`` or ``[document]``, or when the
    node itself is a ``Comment``. Every other node is accepted.

    Args:
        element: Node exposing ``parent.name``.

    Returns:
        ``False`` for rejected nodes, ``True`` otherwise.
    """
    non_visible_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    # The parent check stays first so the Comment test only runs when needed.
    rejected = element.parent.name in non_visible_parents or isinstance(element, Comment)
    return not rejected

In [12]:
def get_content(url, timeout=10):
    """Download a feed and return its ``<entry>`` elements as a DataFrame.

    Each entry becomes one row. Besides the fields read from the feed
    itself, the linked article is downloaded through ``get_page_text``, so
    one extra HTTP request is made per entry.

    Args:
        url: Address of the feed to download.
        timeout: Seconds passed to ``requests.get`` for the feed request, so
            an unresponsive server cannot hang the notebook.

    Returns:
        DataFrame with the columns ``id``, ``title``, ``url``,
        ``published_date``, ``updated_date``, ``content``, ``author`` and
        ``page_text``. The two date columns are converted with
        ``pd.to_datetime``. A feed without entries yields an empty frame
        with the same columns instead of raising ``KeyError``.

    Raises:
        requests.HTTPError: If the feed request returns an error status.
            Previously this surfaced later as an unrelated ``KeyError``.
    """
    columns = ['id', 'title', 'url', 'published_date', 'updated_date',
               'content', 'author', 'page_text']

    page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    entry_list = []
    for entry in soup.find_all('entry'):
        page_url = get_url(entry.find('link'))
        entry_list.append({
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': page_url,
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
            'page_text': get_page_text(page_url),
        })

    # Explicit columns keep the schema stable even when entry_list is empty.
    df = pd.DataFrame(entry_list, columns=columns)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [25]:
# Scrape the 'Data Science' feed. get_content also downloads every linked
# article, so this cell is network-bound and can be slow.
# presumably rss_links maps a topic name to its feed URL; verify in secrets.py
# NOTE(review): the saved output below has no page_text column although
# get_content produces one; it looks stale, so re-run to refresh.
df_ds = get_content(rss_links['Data Science'])
df_ds.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,7179758955197475935,The 3 Best Databricks Tutorials on YouTube to ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 21:33:45+00:00,2021-09-13 21:33:45+00:00,This list of the best Databricks tutorials on ...,
1,12221833554656008570,"CUIMC Update - Sept. 13, 2021 | Columbia Unive...",https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:59:19+00:00,2021-09-13 20:59:19+00:00,"Harry Reyes, PhD student in Biomedical Informa...",
2,12256308127531032409,PepsiCo Launches &#39;Pepviz&#39; Data Practic...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:37:30+00:00,2021-09-13 20:37:30+00:00,... has launched an in-house data practice cal...,
3,10726530668617939684,Accenture to Buy Product Management Specialist...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:26:15+00:00,2021-09-13 20:26:15+00:00,Accenture is to buy French-based product manag...,
4,15715324497736364757,NSF Awards Nearly $3 Million for Graduate Rese...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:03:13+00:00,2021-09-13 20:03:13+00:00,<b>Data science</b> and artificial intelligenc...,


In [26]:
# Scrape the 'Artificial Intelligence' feed. get_content also downloads every
# linked article, so this cell is network-bound and can be slow.
# presumably rss_links maps a topic name to its feed URL; verify in secrets.py
df_ai = get_content(rss_links['Artificial Intelligence'])
df_ai.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,11650460663231519892,How Do You Build a Better Machine? You Can Use...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 23:15:00+00:00,2021-09-13 23:15:00+00:00,Machines are built now using <b>artificial int...,
1,15480672277869411898,"Meet C.L.Ai.R.A., The First Female Afro-Latina...",https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 21:56:15+00:00,2021-09-13 21:56:15+00:00,"R.A., the first <b>artificial intelligence</b>...",
2,1391972872686827912,Advantages and Disadvantages of <b>Artificial ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:44:45+00:00,2021-09-13 20:44:45+00:00,<b>Artificial intelligence</b> is a major mark...,
3,11817590913806961005,&#39;National <b>Artificial Intelligence</b> A...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:03:45+00:00,2021-09-13 20:03:45+00:00,"During the Trump administration, the federal g...",
4,14516395019874402958,Is <b>Artificial Intelligence</b> the future o...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 18:21:23+00:00,2021-09-13 18:21:23+00:00,The Benefits of Combining <b>Artificial Intell...,


In [27]:
# Scrape the 'Machine Learning' feed. get_content also downloads every linked
# article, so this cell is network-bound and can be slow.
# presumably rss_links maps a topic name to its feed URL; verify in secrets.py
df_ml = get_content(rss_links['Machine Learning'])
df_ml.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,2950556556104518871,Joaquin Quiñonero Candela Has Left Facebook&#3...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 23:05:24+00:00,2021-09-13 23:05:24+00:00,MIT Tech Review reported Candela had direct ac...,
1,6992381706304313706,Google&#39;s TensorFlow Similarity helps AI mo...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 22:18:45+00:00,2021-09-13 22:18:45+00:00,"... TensorFlow Similarity, a Python package de...",
2,10500345553214248792,AI on the Edge – Scaling <b>Machine Learning</...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:40:10+00:00,2021-09-13 20:40:10+00:00,AI on the Edge – Scaling <b>Machine Learning</...,
3,2881740393808953397,AI can estimate corporate greenhouse gas emiss...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 19:52:30+00:00,2021-09-13 19:52:30+00:00,... claim to have successfully trained a <b>ma...,
4,16410110446674799631,Inside eBay&#39;s Optimization Techniques for ...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 19:07:30+00:00,2021-09-13 19:07:30+00:00,eBay&#39;s head of <b>machine learning</b> and...,


In [28]:
# Combine the three topic frames into one, ordered by entry id.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0. The chain builds `df` only from the per-topic frames, so
# re-running this cell always gives the same result.
df = (
    pd.concat([df_ds, df_ai, df_ml])
    .sort_values(by='id')   # ids are strings, so the order is lexicographic
    .drop_duplicates()      # drops rows identical in every column
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author
0,10500345553214248792,AI on the Edge – Scaling <b>Machine Learning</...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:40:10+00:00,2021-09-13 20:40:10+00:00,AI on the Edge – Scaling <b>Machine Learning</...,
1,10726530668617939684,Accenture to Buy Product Management Specialist...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 20:26:15+00:00,2021-09-13 20:26:15+00:00,Accenture is to buy French-based product manag...,
2,11584893623844855328,Watch live on Sept. 29 at 12:30 p.m. ET: The a...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 16:52:30+00:00,2021-09-13 16:52:30+00:00,... solutions integrate proprietary technology...,
3,11623022194422864250,Aunalytics to Present on Natural Language Inte...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 15:56:15+00:00,2021-09-13 15:56:15+00:00,The architecture of the NL2SQL is built on Wik...,
4,11650460663231519892,How Do You Build a Better Machine? You Can Use...,https://www.google.com/url?rct=j&sa=t&url=http...,2021-09-13 23:15:00+00:00,2021-09-13 23:15:00+00:00,Machines are built now using <b>artificial int...,


In [29]:
# Append the combined rows to the running CSV, without header or index column.
# Because of mode='a', re-running this cell appends the same rows again, and
# with header=False this cell never writes a header row.
df.to_csv('results/contents.csv', mode='a', header=False, index=False)