In [16]:
import requests
import random
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4.element import Comment
from datetime import datetime
from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore
from secrets import rss_links

In [2]:
def get_id(entry):
    """Return the trailing identifier of an entry's ``id`` element.

    The text of the node returned by ``entry.find('id')`` is split on ``':'``
    and the last piece is returned.

    Args:
        entry: Object exposing ``find(name)`` (e.g. a BeautifulSoup tag).

    Returns:
        The last colon-separated piece of the id text, or ``None`` when no
        ``id`` node is found or the node carries no string text.
    """
    id_ = entry.find('id')
    try:
        return id_.text.split(':')[-1]
    except AttributeError:
        # ``find`` returned None, or ``text`` is not a string. Only this
        # expected failure is swallowed, so interrupts still propagate.
        return None

def get_text(item):
    """Return ``item.text``, or ``None`` when ``item`` has no such attribute.

    Args:
        item: Any object; typically the result of a ``find`` call, which may
            be ``None`` when the element is absent.

    Returns:
        The value of ``item.text``, or ``None`` if the attribute is missing
        (for example because ``item`` is ``None``).
    """
    try:
        return item.text
    except AttributeError:
        # Missing element: report "no value" rather than failing the whole
        # entry. Other errors (and interrupts) are allowed to propagate.
        return None

def get_url(item):
    """Extract the target URL embedded in an element's ``href`` attribute.

    The ``href`` value is cut after its last ``'url='`` marker and truncated at
    the following ``'&'``. An ``href`` without ``'url='`` is returned up to its
    first ``'&'``.

    Args:
        item: Mapping-like object supporting ``item['href']`` (e.g. a
            BeautifulSoup ``link`` tag); may be ``None``.

    Returns:
        The extracted URL string, or ``None`` when ``item`` is ``None``, has
        no ``href`` key, or the ``href`` value is not a string.
    """
    try:
        url = item['href']
        return url.split('url=')[-1].split('&')[0]
    except (TypeError, KeyError, AttributeError):
        # TypeError: item is None / not subscriptable; KeyError: no href;
        # AttributeError: href is not a string. Anything else propagates.
        return None

def get_author(entry):
    """Return the entry's author names joined by ``'; '``.

    All ``author`` elements are collected with ``find_all`` so that an entry
    without any author yields ``None`` instead of raising, and an entry with
    several authors lists every one of them.

    Args:
        entry: Object exposing ``find_all(name)`` (e.g. a BeautifulSoup tag).

    Returns:
        A ``'; '``-separated string of the non-empty, whitespace-stripped
        author texts, or ``None`` when there is no author element or every
        author text is empty.
    """
    names = []
    for author in entry.find_all('author'):
        text = getattr(author, 'text', None)
        if isinstance(text, str) and text.strip():
            names.append(text.strip())
    # An empty list means "unknown author"; report it as a missing value.
    return '; '.join(names) if names else None

def get_page_text(url, timeout=10):
    """Download a page and return its visible text as one space-joined string.

    The page is requested with a header set picked at random from
    ``headers_list``. Text nodes rejected by ``tag_visible`` and nodes that are
    empty after stripping are dropped; the remaining pieces are joined with a
    single space and any newline left inside a piece is replaced by a space.

    Args:
        url: Address of the page to fetch. A falsy value returns ``None``
            without issuing a request.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the caller indefinitely.

    Returns:
        The visible text, or ``None`` when ``url`` is falsy, the response
        status is not 200, or fetching/parsing fails.
    """
    if not url:
        return None
    try:
        page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        texts = soup.find_all(string=True)
        pieces = (t.strip() for t in texts if tag_visible(t))
        return u" ".join(p for p in pieces if p).replace('\n', ' ')
    except Exception:
        # Best effort: one unreachable or malformed page must not abort the
        # whole scrape. ``Exception`` (not a bare except) keeps the kernel
        # interruptible with KeyboardInterrupt.
        return None

def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is treated as hidden when its parent's tag name is one of the
    names listed below, or when the node itself is a ``Comment``.

    Args:
        element: Text node exposing ``parent.name``.

    Returns:
        ``True`` for visible text, ``False`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    inside_hidden_parent = element.parent.name in hidden_parents
    return not (inside_hidden_parent or isinstance(element, Comment))

In [3]:
def get_content(url, timeout=10):
    """Fetch a feed and return one DataFrame row per ``entry`` element.

    For every entry the id, title, link URL, published/updated timestamps,
    content and author are read from the feed, and the linked page is
    downloaded through ``get_page_text``.

    Args:
        url: Address of the feed to download.
        timeout: Seconds passed to ``requests.get`` for the feed request.

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``url``,
        ``published_date``, ``updated_date``, ``content``, ``author`` and
        ``page_text``; the two date columns are converted with
        ``pd.to_datetime``. A feed without entries yields an empty frame that
        still has these columns.

    Raises:
        requests.HTTPError: If the feed request returns an error status.
    """
    columns = ['id', 'title', 'url', 'published_date', 'updated_date',
               'content', 'author', 'page_text']

    page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    entry_list = []
    for entry in soup.find_all('entry'):
        page_url = get_url(entry.find('link'))
        entry_list.append({
            'id': get_id(entry),
            'title': get_text(entry.find('title')),
            'url': page_url,
            'published_date': get_text(entry.find('published')),
            'updated_date': get_text(entry.find('updated')),
            'content': get_text(entry.find('content')),
            'author': get_author(entry),
            'page_text': get_page_text(page_url),
        })

    # Passing ``columns`` keeps the schema stable even when the feed is empty,
    # so the date conversions below cannot fail with a KeyError.
    df = pd.DataFrame(entry_list, columns=columns)
    df['published_date'] = pd.to_datetime(df['published_date'])
    df['updated_date'] = pd.to_datetime(df['updated_date'])
    return df

In [10]:
# Build the frame for the feed stored under 'Data Science' in rss_links
# (get_content also downloads every linked page), then preview the first rows.
df_ds = get_content(rss_links['Data Science'])
df_ds.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,5410161394141722453,<b>Data Science</b> Platform Market Expected t...,https://soccernurds.com/uncategorized/5794380/...,2021-09-20 01:28:31+00:00,2021-09-20 01:28:31+00:00,<b>Data Science</b> Platform Market Expected t...,,Skip to content Search for: SoccerNurds Primar...
1,9544898260592202046,Dubber acquires world class AI technology comp...,https://www.prnewswire.com/news-releases/dubbe...,2021-09-19 23:48:45+00:00,2021-09-19 23:48:45+00:00,We welcome the Notiv team to Dubber along with...,,Resources Blog Journalists Log In Sign Up Data...
2,8799730211579595871,"Srikrishnan V, vice-president - Enterprise Bus...",https://www.financialexpress.com/education-2/w...,2021-09-19 21:56:15+00:00,2021-09-19 21:56:15+00:00,"... full stack development, <b>data science</b...",,Home India News Markets Stocks Health Economy ...
3,4619331074291668668,"Targeted, hyperlocal, personalized… PepsiCo mi...",https://www.foodnavigator-usa.com/Article/2021...,2021-09-19 20:48:45+00:00,2021-09-19 20:48:45+00:00,A proprietary data practice developed by Pepsi...,,CONTINUE TO SITE > Or wait... Home News News M...
4,4339679955055703986,Unleashing 5G with semiconductor chip innovati...,https://www.straitstimes.com/branded-content/u...,2021-09-19 20:05:15+00:00,2021-09-19 20:05:15+00:00,... last year as a Global Lighthouse for its e...,,


In [11]:
# Build the frame for the feed stored under 'Artificial Intelligence' in
# rss_links (get_content also downloads every linked page), then preview it.
df_ai = get_content(rss_links['Artificial Intelligence'])
df_ai.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,13014725768370886466,"Justice, Equity, And Fairness: Exploring The T...",https://www.forbes.com/sites/anniebrown/2021/0...,2021-09-20 01:30:00+00:00,2021-09-20 01:30:00+00:00,How <b>artificial intelligence</b> is impactin...,,Explore Billionaires All Billionaires World's ...
1,14622698968680201748,Hamilton County court officials see number of ...,https://www.wlwt.com/article/hamilton-county-c...,2021-09-20 01:30:00+00:00,2021-09-20 01:30:00+00:00,The court is launching new <b>artificial intel...,,Skip to content NOWCAST WLWT News on METV at 1...
2,12394354945011969774,Dubber acquires world class AI technology comp...,https://www.prnewswire.com/news-releases/dubbe...,2021-09-19 23:48:45+00:00,2021-09-19 23:48:45+00:00,Adds advanced technology capabilities in real-...,,Resources Blog Journalists Log In Sign Up Data...
3,6871009256076724404,Covid-19 is a &#39;wake-up call&#39; to act on...,https://www.cnbc.com/2021/09/20/covid-is-call-...,2021-09-19 23:26:15+00:00,2021-09-19 23:26:15+00:00,The firm developed an <b>artificial intelligen...,,Skip Navigation Markets Pre-Markets U.S. Marke...
4,11536432840799974222,Can apps manage our chronic health conditions?...,https://www.bbc.com/news/business-58556777,2021-09-19 23:03:45+00:00,2021-09-19 23:03:45+00:00,"The 55-year-old, from Stockholm, says it&#39;s...",,BBC Homepage Skip to content Accessibility Hel...


In [12]:
# Build the frame for the feed stored under 'Machine Learning' in rss_links
# (get_content also downloads every linked page), then preview the first rows.
df_ml = get_content(rss_links['Machine Learning'])
df_ml.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,16875981523583924953,How to reshape business strategy to be future-...,https://www.bit.com.au/guide/how-to-reshape-bu...,2021-09-20 01:41:15+00:00,2021-09-20 01:41:15+00:00,<b>Machine learning</b> will filter the standa...,,Log In Subscribe Reviews Photo Galleries Guide...
1,7827283391740030815,"Justice, Equity, And Fairness: Exploring The T...",https://www.forbes.com/sites/anniebrown/2021/0...,2021-09-20 01:30:00+00:00,2021-09-20 01:30:00+00:00,There have also been studies that prove that <...,,Explore Billionaires All Billionaires World's ...
2,2165638237057178897,Leonid Bershidsky column: &#39;Techlash&#39; i...,https://richmond.com/opinion/columnists/leonid...,2021-09-19 23:03:45+00:00,2021-09-19 23:03:45+00:00,(Bloomberg uses supervised <b>machine learning...,,Skip to main content × × Please register or lo...
3,12161401120620571128,Doctors to use remotely controlled robot to as...,https://uk.sports.yahoo.com/news/doctors-remot...,2021-09-19 23:03:45+00:00,2021-09-19 23:03:45+00:00,... will use technology pioneered by The Natio...,,guce Your data. Your experience. Yahoo is part...
4,257248246512481826,Fusionex Signs MoU with Alliance Islamic Bank ...,https://www.kulr8.com/news/money/fusionex-sign...,2021-09-19 22:52:30+00:00,2021-09-19 22:52:30+00:00,... <b>Machine Learning</b> and Artificial Int...,,You have permission to edit this article. Edit...


In [21]:
# Stack the three per-topic frames into one. pd.concat replaces the chained
# DataFrame.append calls, which are deprecated and removed in pandas 2.0.
# The ids are strings, so the sort below is lexicographic.
# NOTE(review): drop_duplicates() compares entire rows; the previews above show
# the same article arriving from two feeds with different ids, so such rows are
# kept — confirm whether de-duplicating on 'url' is wanted instead.
df = (
    pd.concat([df_ds, df_ai, df_ml])
    .sort_values(by='id')
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
0,10398587313766518210,Fusionex Signs MoU with Alliance Islamic Bank ...,https://finance.yahoo.com/news/fusionex-signs-...,2021-09-19 22:52:30+00:00,2021-09-19 22:52:30+00:00,"PETALING JAYA, Malaysia, September 19, 2021--(...",,HOME MAIL NEWS FINANCE SPORTS ENTERTAINMENT LI...
1,11399624425384127367,Honor and Microsoft have teamed up to build ne...,https://thedigitalhacker.com/honor-and-microso...,2021-09-19 18:33:15+00:00,2021-09-19 18:33:15+00:00,Honor and Microsoft have teamed up to build ne...,,"Sunday, September 19 2021 Trending Honor and M..."
2,11536432840799974222,Can apps manage our chronic health conditions?...,https://www.bbc.com/news/business-58556777,2021-09-19 23:03:45+00:00,2021-09-19 23:03:45+00:00,"The 55-year-old, from Stockholm, says it&#39;s...",,BBC Homepage Skip to content Accessibility Hel...
3,11932885372878281443,Safety-minded feds want Tesla to pump the brak...,https://mashable.com/article/tesla-full-self-d...,2021-09-19 16:56:16+00:00,2021-09-19 16:56:16+00:00,"These features, which are powered by <b>machin...",,Tech Life Social Good Entertainment Newsletter...
4,12104198783093585461,Network Security Market 2021 - Datamation,https://www.datamation.com/security/network-se...,2021-09-19 16:52:30+00:00,2021-09-19 16:52:30+00:00,"More recently, the network detection and respo...",,Trends Big Data Data Center AI Cloud Applicati...


In [22]:
# Show the rows for which no page text was retrieved.
df[df['page_text'].isna()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text
28,4339679955055703986,Unleashing 5G with semiconductor chip innovati...,https://www.straitstimes.com/branded-content/u...,2021-09-19 20:05:15+00:00,2021-09-19 20:05:15+00:00,... last year as a Global Lighthouse for its e...,,


In [23]:
# Append every row of df to results/contents.csv, without a header row or the index.
# NOTE(review): mode='a' means re-running this cell appends the same rows
# again — confirm that duplicates are handled downstream.
df.to_csv('results/contents.csv', mode='a', header=False, index=False)

In [24]:
def _page_skills(text):
    """Return the kept skills found in one page as a sorted '; '-joined string.

    Args:
        text: Page text; anything that is not a ``str`` counts as missing.

    Returns:
        The skills kept by ``extract_ignore`` (first element of its result),
        sorted and joined with ``'; '``, or NaN when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return float('nan')
    all_skills = extract_skills(text)
    keep_skills, _ = extract_ignore(all_skills)
    return '; '.join(sorted(keep_skills))


# Assign the whole column in one step: the 'skills' column then always exists
# (even when no row has page text) and re-running the cell leaves no stale
# values behind, unlike a row-by-row df.loc assignment inside iterrows().
df['skills'] = df['page_text'].map(_page_skills)

df.head()

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text,skills
0,10398587313766518210,Fusionex Signs MoU with Alliance Islamic Bank ...,https://finance.yahoo.com/news/fusionex-signs-...,2021-09-19 22:52:30+00:00,2021-09-19 22:52:30+00:00,"PETALING JAYA, Malaysia, September 19, 2021--(...",,HOME MAIL NEWS FINANCE SPORTS ENTERTAINMENT LI...,.NET Framework; Array; Artificial Intelligence...
1,11399624425384127367,Honor and Microsoft have teamed up to build ne...,https://thedigitalhacker.com/honor-and-microso...,2021-09-19 18:33:15+00:00,2021-09-19 18:33:15+00:00,Honor and Microsoft have teamed up to build ne...,,"Sunday, September 19 2021 Trending Honor and M...",Artificial Intelligence (AI); Collaboration; C...
2,11536432840799974222,Can apps manage our chronic health conditions?...,https://www.bbc.com/news/business-58556777,2021-09-19 23:03:45+00:00,2021-09-19 23:03:45+00:00,"The 55-year-old, from Stockholm, says it&#39;s...",,BBC Homepage Skip to content Accessibility Hel...,.NET Framework; Artificial Intelligence (AI); ...
3,11932885372878281443,Safety-minded feds want Tesla to pump the brak...,https://mashable.com/article/tesla-full-self-d...,2021-09-19 16:56:16+00:00,2021-09-19 16:56:16+00:00,"These features, which are powered by <b>machin...",,Tech Life Social Good Entertainment Newsletter...,Blink; Branding; Communication; Cybercrime; Cy...
4,12104198783093585461,Network Security Market 2021 - Datamation,https://www.datamation.com/security/network-se...,2021-09-19 16:52:30+00:00,2021-09-19 16:52:30+00:00,"More recently, the network detection and respo...",,Trends Big Data Data Center AI Cloud Applicati...,Analysis; Analytical; Anomaly Detection; Archi...


In [25]:
# Show the rows that ended up without any extracted skills.
df[df['skills'].isna()]

Unnamed: 0,id,title,url,published_date,updated_date,content,author,page_text,skills
28,4339679955055703986,Unleashing 5G with semiconductor chip innovati...,https://www.straitstimes.com/branded-content/u...,2021-09-19 20:05:15+00:00,2021-09-19 20:05:15+00:00,... last year as a Global Lighthouse for its e...,,,


In [27]:
# Append df without its page_text column to results/contents_skills.csv,
# without a header row or the index.
# NOTE(review): mode='a' means re-running this cell appends the same rows
# again — confirm that duplicates are handled downstream.
df.drop(columns=['page_text']).to_csv('results/contents_skills.csv', mode='a', header=False, index=False)