In [1]:
import requests
import random
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4.element import Comment
from datetime import datetime
from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore
from secrets import rss_links

In [2]:
def get_id(entry):
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except:
        return None

def get_text(item):
    try:
        return item.text
    except:
        return None

def get_url(item):
    try:
        url = item['href']
        return url.split('url=')[-1].split('&')[0]
    except:
        return None

def get_author(entry):
    authors = entry.find('author')
    if len(authors) == 0:
        return None
    try:
        return '; '.join(a.text for a in authors)
    except:
        return None

def get_page_text(url):
    try:
        page = requests.get(url, headers=random.choice(headers_list))
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts if len(t.strip()) > 0).replace('\n', ' ')
    except:
        return None

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

In [3]:
def get_content(url):
    page = requests.get(url, headers=random.choice(headers_list))
    soup = BeautifulSoup(page.content, 'html.parser')
    entries = soup.findAll('entry')
    entry_list = []
    entry_list = []
    for entry in entries:
        page_url = get_url(entry.find('link'))
        entry_list.append({
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': page_url,
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
            'page_text': get_page_text(page_url),
        })

    df = pd.DataFrame.from_dict(entry_list)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [4]:
df_ds = get_content(rss_links['Data Science'])
df_ds.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,16203871111303959639,David Bader Selected to Receive the 2021 IEEE ...,https://www.hpcwire.com/off-the-wire/david-bad...,2021-09-23 00:11:15+00:00,2021-09-23 00:11:15+00:00,Bader is a Distinguished Professor and founder...,,Search the site Go Tabor Network: Datanami Ent...
1,15198693570844161937,"Tech jobs thriving, while others are in declin...",https://www.ktvh.com/news/tech-jobs-thriving-w...,2021-09-22 23:57:54+00:00,2021-09-22 23:57:54+00:00,"On the other hand, the U.S Bureau of Labor Sta...",,1 weather alerts 1 closings/delays 1 weather a...
2,12221072494161426516,LMU-Led Team Wins National Science Foundation ...,https://www.prnewswire.com/news-releases/lmu-l...,2021-09-22 22:52:30+00:00,2021-09-22 22:52:30+00:00,"... computer science, <b>data science</b> and ...",,Resources Blog Journalists Log In Sign Up Data...
3,4181479459365450241,UHV graduate students find time for new skills...,https://news.uhv.edu/release.aspx%3Fid%3D9409,2021-09-22 21:45:00+00:00,2021-09-22 21:45:00+00:00,He also holds several certificates and license...,,
4,12508106254385350086,Charlotte Jackson - D.C. Policy Center,https://www.dcpolicycenter.org/people/charlott...,2021-09-22 21:33:45+00:00,2021-09-22 21:33:45+00:00,Charlotte Lee Jackson is a <b>data science</b>...,,COVID-19 Publications Articles Reports Testimo...


In [5]:
df_ai = get_content(rss_links['Artificial Intelligence'])
df_ai.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,4372502486794381264,Pages - Special Webcast: <b>Artificial Intelli...,https://na.theiia.org/training/eLearning/Pages...,2021-09-22 23:27:09+00:00,2021-09-22 23:27:09+00:00,As <b>artificial intelligence</b> (AI) becomes...,,You may be trying to access this site from a s...
1,13653759635895529833,Pentagon looks to cement career paths for soft...,https://federalnewsnetwork.com/workforce/2021/...,2021-09-22 21:56:15+00:00,2021-09-22 21:56:15+00:00,... like <b>artificial intelligence</b> and ma...,,Technology Artificial Intelligence Ask the CIO...
2,11440374372692045446,<b>Artificial Intelligence</b> for Asset Manag...,https://www.computerworld.com/resources/224422...,2021-09-22 21:27:25+00:00,2021-09-22 21:27:25+00:00,This paper discusses the applications of using...,,computerworld UNITED STATES United States Aust...
3,9296931072927057980,Britain publishes 10-year National <b>Artifici...,https://www.theregister.com/2021/09/22/uk_10_y...,2021-09-22 21:11:15+00:00,2021-09-22 21:11:15+00:00,Britain publishes 10-year National <b>Artifici...,,Sign in Topics Security Off-Prem All Off-Prem ...
4,18377640414608949856,UK publishes National <b>Artificial Intelligen...,https://www.theregister.com/2021/09/22/uk_nati...,2021-09-22 21:11:15+00:00,2021-09-22 21:11:15+00:00,Britain publishes 10-year National <b>Artifici...,,Sign in Topics Security Off-Prem All Off-Prem ...


In [6]:
df_ml = get_content(rss_links['Machine Learning'])
df_ml.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,2384421490710584169,Professors Use <b>Machine Learning</b> to Guid...,https://news.syr.edu/blog/2021/09/22/professor...,2021-09-22 23:15:00+00:00,2021-09-22 23:15:00+00:00,<b>Machine learning</b>. two headshots. Davoud...,,Skip to main content Home About Faculty Expert...
1,15914158271175535678,The state of MLOps in 2021 is dominated by sta...,https://venturebeat.com/2021/09/22/the-state-o...,2021-09-22 21:22:30+00:00,2021-09-22 21:22:30+00:00,Algorithmia&#39;s 2021 enterprise trends in <b...,,Skip to main content Events GamesBeat Jobs Low...
2,17175140344776868876,"FDA OKs new pathology AI software, launches AI...",https://www.raps.org/news-and-articles/news-ar...,2021-09-22 20:55:05+00:00,2021-09-22 20:55:05+00:00,"Separately, the agency announced Wednesday tha...",,RAPS.org needs your explicit consent to store ...
3,1240293248898276964,"Mike Schroepfer, Facebook&#39;s C.T.O., to Ste...",https://www.nytimes.com/2021/09/22/technology/...,2021-09-22 20:50:55+00:00,2021-09-22 20:50:55+00:00,"In 2013, Mr. Schroepfer was promoted to chief ...",,Sections SEARCH Skip to content Skip to site i...
4,682628171577016345,StethoMe&#39;s smart stethoscope lets your kid...,https://techcrunch.com/2021/09/22/stethomes-sm...,2021-09-22 19:18:45+00:00,2021-09-22 19:18:45+00:00,"StethoMe, a team competing in this week&#39;s ...",,StethoMe’s smart stethoscope lets your kid’s d...


In [7]:
df = df_ds.append(df_ai)
df = df.append(df_ml)
df = df.sort_values(by='id')
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,10270955116780684517,MicroAI To Bring AI Training To Renesas MCUs -...,https://aithority.com/machine-learning/microai...,2021-09-22 16:46:49+00:00,2021-09-22 16:46:49+00:00,"MicroAI, the pioneer in edge-native artificial...",,Artificial Intelligence | News | Insights | Ai...
1,10587977690130051795,Using <b>artificial intelligence</b> to manage...,https://www.mddionline.com/artificial-intellig...,2021-09-22 18:45:00+00:00,2021-09-22 18:45:00+00:00,To realize the full potential of <b>artificial...,,https://www.mddionline.com/sites/all/themes/pe...
2,109093307791724589,AI system cannot be named as the inventor on a...,https://www.ft.com/content/1c79e834-f1c8-40ef-...,2021-09-22 17:48:45+00:00,2021-09-22 17:48:45+00:00,An <b>artificial intelligence</b> system canno...,,Accessibility help Skip to navigation Skip to ...
3,11025837506321078156,The Blue Box is betting on the future of at-ho...,https://techcrunch.com/2021/09/22/the-blue-box...,2021-09-22 17:26:15+00:00,2021-09-22 17:26:15+00:00,Benet imagines a product where you might be ab...,,The Blue Box is betting on the future of at-ho...
4,11065248911920094116,Top 10 Principal Real-World Applications of <b...,https://www.analyticsinsight.net/top-10-princi...,2021-09-22 17:26:15+00:00,2021-09-22 17:26:15+00:00,"Various industries like manufacturing, finance...",,Toggle Menu Homepage Insights Artificial Intel...


In [8]:
df.loc[df['page_text'].isna()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
37,4181479459365450241,UHV graduate students find time for new skills...,https://news.uhv.edu/release.aspx%3Fid%3D9409,2021-09-22 21:45:00+00:00,2021-09-22 21:45:00+00:00,He also holds several certificates and license...,,
50,7876296478002446920,<b>Artificial Intelligence</b>-Based Battle Ma...,https://www.wads.ang.af.mil/News/Article-Displ...,2021-09-22 18:56:15+00:00,2021-09-22 18:56:15+00:00,BMTN was developed in partnership with Vectron...,,
52,8582260355863996167,Artificial Intelligence-Based Battle Managemen...,https://www.wads.ang.af.mil/News/Article-Displ...,2021-09-22 18:56:15+00:00,2021-09-22 18:56:15+00:00,"... combined artificial intelligence, <b>machi...",,


In [9]:
df.to_csv('results/contents.csv', mode='a', header=False, index=False)

In [10]:
for i, row in df.iterrows():
    content = row['page_text']
    if not isinstance(content, str):
        content = row['content']
    if not isinstance(content, str):
        continue
    all_skills = extract_skills(content)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    df.loc[i, 'skills'] = '; '.join(keep_skills)

df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text,skills
0,10270955116780684517,MicroAI To Bring AI Training To Renesas MCUs -...,https://aithority.com/machine-learning/microai...,2021-09-22 16:46:49+00:00,2021-09-22 16:46:49+00:00,"MicroAI, the pioneer in edge-native artificial...",,Artificial Intelligence | News | Insights | Ai...,Agriculture; Analytical; Artificial Intelligen...
1,10587977690130051795,Using <b>artificial intelligence</b> to manage...,https://www.mddionline.com/artificial-intellig...,2021-09-22 18:45:00+00:00,2021-09-22 18:45:00+00:00,To realize the full potential of <b>artificial...,,https://www.mddionline.com/sites/all/themes/pe...,Accuracy; Algorithm; Analysis; Analytical; Art...
2,109093307791724589,AI system cannot be named as the inventor on a...,https://www.ft.com/content/1c79e834-f1c8-40ef-...,2021-09-22 17:48:45+00:00,2021-09-22 17:48:45+00:00,An <b>artificial intelligence</b> system canno...,,Accessibility help Skip to navigation Skip to ...,Artificial Intelligence (AI); Artificial Neura...
3,11025837506321078156,The Blue Box is betting on the future of at-ho...,https://techcrunch.com/2021/09/22/the-blue-box...,2021-09-22 17:26:15+00:00,2021-09-22 17:26:15+00:00,Benet imagines a product where you might be ab...,,The Blue Box is betting on the future of at-ho...,Artificial Intelligence (AI); Azure Machine Le...
4,11065248911920094116,Top 10 Principal Real-World Applications of <b...,https://www.analyticsinsight.net/top-10-princi...,2021-09-22 17:26:15+00:00,2021-09-22 17:26:15+00:00,"Various industries like manufacturing, finance...",,Toggle Menu Homepage Insights Artificial Intel...,Advertising; Artificial Intelligence (AI); Aut...


In [11]:
df.loc[df['skills'].isna()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text,skills


In [12]:
df.drop(columns=['page_text']).to_csv('results/contents_skills.csv', mode='a', header=False, index=False)