In [1]:
import requests
import random
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4.element import Comment
from datetime import datetime
from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore
from secrets import rss_links

In [2]:
def get_id(entry):
    """Return the trailing identifier from an entry's ``id`` element.

    The text of the element found by ``entry.find('id')`` is split on
    ``':'`` and the last piece is returned.

    Args:
        entry: Object exposing ``find(name)`` (here, a parsed feed entry).

    Returns:
        The substring after the last ``':'`` in the id text, or ``None``
        when no ``id`` element was found.
    """
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except AttributeError:
        # ``find`` returned None, or ``.text`` is not a string. Only this
        # failure is swallowed so interrupts are not silenced.
        return None

def get_text(item):
    """Return ``item.text``, or ``None`` when ``item`` has no such attribute.

    Args:
        item: Any object; typically the result of a ``find`` call, which
            may be ``None`` when the element is absent.

    Returns:
        The value of ``item.text``, or ``None`` if the attribute is missing.
    """
    try:
        return item.text
    except AttributeError:
        # Missing element (None) or an object without ``.text``.
        return None

def get_url(item):
    """Extract the target URL from a link element's ``href``.

    The href is cut after the last ``'url='`` marker and then truncated at
    the first ``'&'``. An href without ``'url='`` is returned up to its
    first ``'&'``.

    Args:
        item: Mapping-like object supporting ``item['href']`` (here, a
            parsed ``link`` element), or ``None``.

    Returns:
        The extracted URL string, or ``None`` when ``item`` is ``None``,
        has no ``href``, or the href is not a string.
    """
    # NOTE(review): the extracted value is not percent-decoded; confirm
    # whether the feed encodes the target URL inside the ``url=`` parameter.
    try:
        url = item['href']
        return url.split('url=')[-1].split('&')[0]
    except (TypeError, KeyError, AttributeError):
        # TypeError: item is None / not subscriptable by name.
        # KeyError: no href attribute. AttributeError: href is not a string.
        return None

def get_author(entry):
    """Return the author text of an entry joined with ``'; '``.

    Looks up ``entry.find('author')`` and joins the ``.text`` of every item
    obtained by iterating over the result.

    Args:
        entry: Object exposing ``find(name)`` (here, a parsed feed entry).

    Returns:
        A ``'; '``-separated string, or ``None`` when there is no author
        element, the element is empty, or its items have no usable text.
    """
    authors = entry.find('author')
    # ``find`` yields None when the element is absent; calling len() on it
    # would raise TypeError, so test for None first.
    if authors is None or len(authors) == 0:
        return None
    try:
        return '; '.join(a.text for a in authors)
    except (AttributeError, TypeError):
        # An item without ``.text`` or with non-string text.
        return None

def get_page_text(url, timeout=30):
    """Download a page and return its visible text as one line.

    The page is fetched with a randomly chosen header set from
    ``headers_list``, parsed with ``html.parser``, and every text node that
    passes ``tag_visible`` is stripped; non-empty pieces are joined with
    single spaces and remaining newlines are replaced by spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so one
            stalled host cannot block the whole scrape.

    Returns:
        The visible text, or ``None`` when the response status is not 200
        or anything goes wrong while fetching or parsing.
    """
    try:
        page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        texts = soup.findAll(text=True)
        # Strip each visible node once, then drop the ones that end up empty.
        stripped = (t.strip() for t in texts if tag_visible(t))
        return u" ".join(s for s in stripped if s).replace('\n', ' ')
    except Exception:
        # Best-effort scrape of arbitrary sites: any ordinary failure yields
        # None. KeyboardInterrupt/SystemExit still propagate, so a long run
        # can be stopped.
        return None

def tag_visible(element):
    """Tell whether a text node should count as visible page text.

    A node is rejected when its parent's ``name`` is one of the listed
    non-content tags, or when the node itself is a ``Comment``.

    Args:
        element: Text node exposing ``.parent.name``.

    Returns:
        ``False`` for nodes under the listed tags and for comments,
        ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # ``or`` short-circuits, so the parent check still runs first.
    return not (element.parent.name in hidden_parents or isinstance(element, Comment))

In [3]:
def get_content(url, timeout=30):
    """Fetch a feed and return one DataFrame row per ``entry`` element.

    For each entry the id, title, link URL, published/updated timestamps,
    content and author are read from the feed, and the linked page is
    downloaded to obtain its visible text.

    Args:
        url: Address of the feed.
        timeout: Seconds to wait for the feed server before giving up.

    Returns:
        A DataFrame with columns ``id``, ``title``, ``url``,
        ``published_date``, ``updated_date``, ``content``, ``author`` and
        ``page_text``. The two date columns are converted with
        ``pd.to_datetime``. A feed without entries gives an empty frame
        that still has all of these columns.

    Raises:
        requests.HTTPError: If the feed request returns an error status.
    """
    columns = [
        'id', 'title', 'url', 'published_date', 'updated_date',
        'content', 'author', 'page_text',
    ]
    page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as a feed.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    entry_list = []
    for entry in soup.findAll('entry'):
        page_url = get_url(entry.find('link'))
        entry_list.append({
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': page_url,
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
            'page_text': get_page_text(page_url),
        })

    # Explicit columns keep the schema stable when the feed has no entries;
    # otherwise the date conversions below would raise KeyError.
    df = pd.DataFrame(entry_list, columns=columns)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [4]:
# Fetch the 'Data Science' feed and preview its first rows.
df_ds = get_content(url=rss_links['Data Science'])
df_ds.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,17587951140663912394,Zeotap launches Consent Orchestration to simpl...,https://www.prnewswire.com/news-releases/zeota...,2021-09-23 04:52:30+00:00,2021-09-23 04:52:30+00:00,"... launch of Predictive Audiences, which enab...",,Resources Blog Journalists Log In Sign Up Data...
1,2911744195267280153,Are Eastern European startups overlooked and u...,https://techcrunch.com/2021/09/22/are-eastern-...,2021-09-23 03:20:57+00:00,2021-09-23 03:20:57+00:00,... Skills Index found that Russian learners h...,,Are Eastern European startups overlooked and u...
2,16361801888048029235,WU to offer <b>data science</b> major as joint...,https://www.studlife.com/news/2021/09/22/wu-to...,2021-09-23 02:15:00+00:00,2021-09-23 02:15:00+00:00,Washington University will offer a Bachelor of...,,News Sports Forum Scene Cadenza Multimedia Spe...
3,6300445606079918611,Metropolitan Chicago Data-science Corps to par...,https://news.northwestern.edu/stories/2021/sep...,2021-09-23 01:58:59+00:00,2021-09-23 01:58:59+00:00,The MCDC team also includes Northwestern facul...,,Skip to main content Search Search this site S...
4,16203871111303959639,David Bader Selected to Receive the 2021 IEEE ...,https://www.hpcwire.com/off-the-wire/david-bad...,2021-09-23 00:11:15+00:00,2021-09-23 00:11:15+00:00,Bader is a Distinguished Professor and founder...,,Search the site Go Tabor Network: Datanami Ent...


In [5]:
# Fetch the 'Artificial Intelligence' feed and preview its first rows.
df_ai = get_content(url=rss_links['Artificial Intelligence'])
df_ai.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,18333500483809827862,4 ways startups can use AI to make a real soci...,https://www.startupdaily.net/2021/09/artificia...,2021-09-23 23:27:57+00:00,2021-09-23 23:27:57+00:00,The possibilities of <b>artificial intelligenc...,,News News & Analysis Profiles Insights Insight...
1,1113762748151444535,Vulnerabilities May Slow Air Force&#39;s Adopt...,https://www.defenseone.com/threats/2021/09/vul...,2021-09-23 22:18:45+00:00,2021-09-23 22:18:45+00:00,More data on the battlefield means a wider att...,,"Continue to the site Trump’s Red, White & Blue..."
2,5942997513806668189,What green <b>artificial intelligence</b> need...,https://www.gulf-times.com/story/701013/What-g...,2021-09-23 21:54:47+00:00,2021-09-23 21:54:47+00:00,Data and technologies like <b>artificial intel...,,"Friday, September 24, 2021 4:37 AM Daily Newsp..."
3,14776869013826394660,These Are The Top Tech Startups Attracting Tal...,https://www.inc.com/gabrielle-bienasz/linkedin...,2021-09-23 19:35:05+00:00,2021-09-23 19:35:05+00:00,Several of them have succeeded through their u...,,Login Subscribe Subscribe Follow: Subscribe Fo...
4,5328550499125077373,"Simmons Cancer Center, MD Anderson scientists ...",https://www.utsouthwestern.edu/newsroom/articl...,2021-09-23 18:33:45+00:00,2021-09-23 18:33:45+00:00,"DALLAS – Sept. 23, 2021 – Researchers and data...",,COVID-19 Updates Back to UT Southwestern Home ...


In [6]:
# Fetch the 'Machine Learning' feed and preview its first rows.
df_ml = get_content(url=rss_links['Machine Learning'])
df_ml.head(n=5)

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,15431908268898467278,Researchers apply deep learning to PS-OCT syst...,https://www.news-medical.net/news/20210923/Res...,2021-09-23 06:22:30+00:00,2021-09-23 06:22:30+00:00,"Deep learning, a subset of <b>machine learning...",,We use cookies to enhance your experience. By ...
1,7584600740429265422,"Cogitativo Introduces &#39;Visión&#39;, A <b>M...",https://www.marktechpost.com/2021/09/22/cogita...,2021-09-23 05:03:45+00:00,2021-09-23 05:03:45+00:00,Visión combines cutting-edge <b>machine learni...,,Home Free AI Courses Free AI Intro Course Coun...
2,14133170768379532714,Harnessing <b>machine learning</b> to help pat...,https://www.irishtimes.com/news/science/harnes...,2021-09-23 04:52:30+00:00,2021-09-23 04:52:30+00:00,What inspired your interest in using <b>machin...,,html Sign up for alerts from The Irish Times N...
3,544016708985531042,<b>Machine Learning</b> - Oracle Blogs,https://blogs.oracle.com/javamagazine/category...,2021-09-23 04:29:57+00:00,2021-09-23 04:29:57+00:00,<b>Machine Learning</b>. How to program <b>mac...,,This site requires JavaScript to be enabled.
4,7488568507391957345,Spotting and rooting out bias in AI algorithms...,https://medcitynews.com/2021/09/spotting-and-r...,2021-09-22 23:03:45+00:00,2021-09-22 23:03:45+00:00,“We know it&#39;s a real phenomenon in the <b>...,,Menu Search Home Investing & Startups AI Digit...


In [7]:
# Stack the three per-topic frames, order by id, drop rows that are exact
# duplicates, and renumber the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): drop_duplicates() compares every column, so two rows with
# the same id but different page_text are both kept - confirm this is intended.
df = (
    pd.concat([df_ds, df_ai, df_ml])
    .sort_values(by='id')
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,10153889852810309872,<b>Artificial intelligence</b> optimizes CNC m...,https://www.compositesworld.com/news/artificia...,2021-09-23 14:03:03+00:00,2021-09-23 14:03:03+00:00,The Augsburg AI (<b>artificial intelligence</b...,,Gardner Business Media | Modern Machine Shop |...
1,1069999399235548346,Save 98% off this Premium Machine Learning <b>...,https://www.neowin.net/news/save-98-off-this-p...,2021-09-23 16:41:15+00:00,2021-09-23 16:41:15+00:00,Today&#39;s highlighted deal comes via our Onl...,,Neowin Login Sign up Facebook Twitter Follow ...
2,1113762748151444535,Vulnerabilities May Slow Air Force&#39;s Adopt...,https://www.defenseone.com/threats/2021/09/vul...,2021-09-23 22:18:45+00:00,2021-09-23 22:18:45+00:00,More data on the battlefield means a wider att...,,"Continue to the site Trump’s Red, White & Blue..."
3,11282058973657029588,<b>Artificial Intelligence</b>: The Future Of ...,https://www.oodaloop.com/technology/2021/09/23...,2021-09-23 14:15:00+00:00,2021-09-23 14:15:00+00:00,Read Steve Durbin argue how <b>Artificial Inte...,,"Understand tomorrow, today. Home OODA Analysis..."
4,11320432209041687096,Economics faculty discuss changes gift will br...,https://news.harvard.edu/gazette/story/2021/09...,2021-09-22 16:46:46+00:00,2021-09-22 16:46:46+00:00,Targeted to first years and sophomores — and h...,,Skip to content The Harvard Gazette Search for...


In [8]:
# Show the rows for which no page text is available.
df[df['page_text'].isnull()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text


In [9]:
# Append the rows to the contents CSV, without a header row or index column.
df.to_csv(
    'results/contents.csv',
    index=False,
    header=False,
    mode='a',
)

In [10]:
# Build the whole 'skills' column in one pass and assign it once. The column
# then always exists, even for an empty frame or when no row has usable text.
skills_column = []
for page_text, summary in zip(df['page_text'], df['content']):
    # Prefer the page text; fall back to the feed's content field.
    text = page_text if isinstance(page_text, str) else summary
    if not isinstance(text, str):
        # No text at all: leave the cell missing so .isna() flags the row.
        skills_column.append(float('nan'))
        continue
    all_skills = extract_skills(text)
    # The second return value is unused; it is not bound to `_` because
    # IPython uses that name for the last cell output.
    keep_skills, _ignored_skills = extract_ignore(all_skills)
    skills_column.append('; '.join(sorted(keep_skills)))

df['skills'] = skills_column

df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text,skills
0,10153889852810309872,<b>Artificial intelligence</b> optimizes CNC m...,https://www.compositesworld.com/news/artificia...,2021-09-23 14:03:03+00:00,2021-09-23 14:03:03+00:00,The Augsburg AI (<b>artificial intelligence</b...,,Gardner Business Media | Modern Machine Shop |...,Aerospace; Algorithm; Artificial Intelligence ...
1,1069999399235548346,Save 98% off this Premium Machine Learning <b>...,https://www.neowin.net/news/save-98-off-this-p...,2021-09-23 16:41:15+00:00,2021-09-23 16:41:15+00:00,Today&#39;s highlighted deal comes via our Onl...,,Neowin Login Sign up Facebook Twitter Follow ...,Advertising; Algorithm; Artificial Intelligenc...
2,1113762748151444535,Vulnerabilities May Slow Air Force&#39;s Adopt...,https://www.defenseone.com/threats/2021/09/vul...,2021-09-23 22:18:45+00:00,2021-09-23 22:18:45+00:00,More data on the battlefield means a wider att...,,"Continue to the site Trump’s Red, White & Blue...",Advertising; Agriculture; Analysis; Analytical...
3,11282058973657029588,<b>Artificial Intelligence</b>: The Future Of ...,https://www.oodaloop.com/technology/2021/09/23...,2021-09-23 14:15:00+00:00,2021-09-23 14:15:00+00:00,Read Steve Durbin argue how <b>Artificial Inte...,,"Understand tomorrow, today. Home OODA Analysis...",Analysis; Artificial Intelligence (AI); C; Con...
4,11320432209041687096,Economics faculty discuss changes gift will br...,https://news.harvard.edu/gazette/story/2021/09...,2021-09-22 16:46:46+00:00,2021-09-22 16:46:46+00:00,Targeted to first years and sophomores — and h...,,Skip to content The Harvard Gazette Search for...,Apache Spark; Arts; Behavioural Economics; Big...


In [11]:
# Show the rows for which no skills value was set.
df[df['skills'].isnull()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text,skills


In [12]:
# Append the rows, minus the page_text column, to the skills CSV
# (no header row, no index column).
df.drop(columns='page_text').to_csv(
    'results/contents_skills.csv',
    index=False,
    header=False,
    mode='a',
)