In [83]:
import datetime as dt
import json
import random
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

from data_skills import DATA_SKILLS
from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore, extract_data_skills

In [9]:
# URL template for a monthly tutorials listing page: the first placeholder is the
# year, the second the month (':>02' right-aligns it and pads with a leading 0).
base_url = 'https://www.kdnuggets.com/{}/{:>02}/tutorials.html' #:>02 to add leading 0 to month
# Build and display the listing URL for January 2021.
url = base_url.format(2021, 1)
url

'https://www.kdnuggets.com/2021/01/tutorials.html'

In [10]:
# Download the listing page with a randomly chosen header set. The timeout keeps a
# stalled connection from blocking the kernel indefinitely.
page = requests.get(url, headers=random.choice(headers_list), timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [13]:
# Show only the beginning of the prettified HTML: enough to inspect the page
# structure without storing the entire document as cell output.
print(soup.prettify()[:1500])

<!DOCTYPE html>
<html lang="en-US" xmlns="https://www.w3.org/1999/xhtml">
 <head profile="https://gmpg.org/xfn/11">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="max-image-preview:large" name="robots"/>
  <title>
   2021 Jan Tutorials, Overviews - KDnuggets
  </title>
  <link href="/wp-content/themes/kdn17/images/favicon.ico" rel="shortcut icon"/>
  <link href="/wp-content/themes/kdn17/style.css" media="screen" rel="stylesheet" type="text/css"/>
  <script src="/wp-content/themes/kdn17/js/jquery-1.9.1.min.js" type="text/javascript">
  </script>
  <script src="/aps/kda_all.js" type="text/javascript">
  </script>
  <link href="/feed/" rel="alternate" title="KDnuggets: AI, Analytics, Data Science, Machine Learning Feed" type="application/rss+xml"/>
  <link href="https://www.kdnuggets.com/2021-01/k01-tutorials/" rel="canonical"/>
  <meta content="https://www.kdnuggets.com/2021-0

In [26]:
# Collect every <li> inside the <ul> matched by class_='three_ul test'.
items = soup.find('ul', class_='three_ul test').find_all('li')
# Display the first entry to inspect its markup.
items[0]

<li> <a href="https://www.kdnuggets.com/2021/01/top-10-technology-trends-2021.html" id="122087"><b>Baidu Research: 10 Technology Trends in 2021</b></a>, by Baidu Research         <font color="#808080">- Jan 29, 2021</font>. 
         <div style="margin-left: 12px; font-size: small;">
         Understanding future technology trends may never have been as important as it is today. Check out the prediction of the 10 technology trends in 2021 from Baidu Research.
 </div>
<p class="tags">Tags: <a href="https://www.kdnuggets.com/tag/2021-predictions" rel="tag">2021 Predictions</a>, <a href="https://www.kdnuggets.com/tag/baidu" rel="tag">Baidu</a>, <a href="https://www.kdnuggets.com/tag/research" rel="tag">Research</a>, <a href="https://www.kdnuggets.com/tag/trends" rel="tag">Trends</a></p>
</li>

In [78]:
# Take the first listing entry and read the id attribute of its first <a> element.
item = items[0]
item.find('a')['id']

'122087'

In [86]:
def get_id(item):
    """Return the ``id`` attribute of the first ``<a>`` element inside ``item``.

    Args:
        item: A parsed listing entry exposing ``find`` (e.g. a BeautifulSoup tag).

    Returns:
        The value of the link's ``id`` attribute, or ``None`` when the link or
        the attribute is missing.
    """
    try:
        return item.find('a')['id']
    except (AttributeError, KeyError, TypeError):
        # AttributeError: item has no find(); TypeError: find() returned None;
        # KeyError: the link carries no id attribute. Anything else (including
        # KeyboardInterrupt) is deliberately left to propagate.
        return None

def get_title(item):
    """Return the text of the first ``<b>`` element inside ``item``.

    Args:
        item: A parsed listing entry exposing ``find`` (e.g. a BeautifulSoup tag).

    Returns:
        The ``text`` of the first ``<b>`` element, or ``None`` when there is none.
    """
    try:
        return item.find('b').text
    except (AttributeError, TypeError):
        # AttributeError covers both a missing find() and find() returning None.
        # Interrupts and unrelated errors are left to propagate.
        return None

def get_author(item, title):
    """Extract the author name that follows the word "by" in the entry text.

    The title is removed from the entry text first so that a "by" or a dash
    inside the title cannot be mistaken for the author marker. The author is
    the text between the standalone word "by" and the dash that introduces the
    date (a dash followed by whitespace), so names that merely contain the
    letters "by" or an inner hyphen are kept intact.

    Args:
        item: A parsed listing entry exposing a ``text`` attribute.
        title: The entry title, as returned by ``get_title``.

    Returns:
        The stripped author string, or ``None`` when no "by" marker is found
        or the inputs are unusable (e.g. ``title`` is ``None``).
    """
    try:
        this_text = item.text.replace(title, '')
    except (AttributeError, TypeError):
        # item has no text, or title is not a string.
        return None
    # \bby\s+  -> "by" as a standalone word, not the tail of e.g. "Abby".
    # (.+?)    -> the author, as short as possible ...
    # \s*(?:-\s|$) -> ... up to a dash followed by whitespace (the date
    #                 separator) or, failing that, the end of the text.
    match = re.search(r'\bby\s+(.+?)\s*(?:-\s|$)', this_text, re.DOTALL)
    if match is None:
        return None
    return match.group(1).strip()

def get_date(item):
    """Return the date text held in the first ``<font>`` element of ``item``.

    The date is rendered with a separating dash (e.g. ``"- Jan 29, 2021"``).
    Only dashes at the edges of the text are removed, so a date that itself
    contains hyphens (e.g. ``"2021-01-29"``) is returned unchanged.

    Args:
        item: A parsed listing entry exposing ``find`` (e.g. a BeautifulSoup tag).

    Returns:
        The cleaned date string, or ``None`` when there is no ``<font>`` element.
    """
    try:
        date = item.find('font').text
        # Trim whitespace, then the separator dash, then the whitespace the
        # dash was hiding.
        return date.strip().strip('-').strip()
    except (AttributeError, TypeError):
        # Missing element or non-string text; other errors propagate.
        return None

def get_url(item):
    """Return the ``href`` attribute of the first ``<a>`` element inside ``item``.

    Args:
        item: A parsed listing entry exposing ``find`` (e.g. a BeautifulSoup tag).

    Returns:
        The link target, or ``None`` when the link or its ``href`` is missing.
    """
    try:
        return item.find('a')['href']
    except (AttributeError, KeyError, TypeError):
        # AttributeError: item has no find(); TypeError: find() returned None;
        # KeyError: the link has no href. Other errors propagate.
        return None

def get_description(item):
    """Return the stripped text of the first ``<div>`` element inside ``item``.

    Args:
        item: A parsed listing entry exposing ``find`` (e.g. a BeautifulSoup tag).

    Returns:
        The description text without surrounding whitespace, or ``None`` when
        the entry has no ``<div>``.
    """
    try:
        return item.find('div').text.strip()
    except (AttributeError, TypeError):
        # Missing element or non-string text; other errors propagate.
        return None

def get_tags(item):
    """Return the comma-separated tag list from the entry's ``<p class="tags">``.

    The paragraph text starts with a label (``"Tags: "``); everything after the
    first ``': '`` is returned, so a tag that itself contains ``': '`` is not
    cut off.

    Args:
        item: A parsed listing entry exposing ``find`` (e.g. a BeautifulSoup tag).

    Returns:
        The tag list as a single string, or ``None`` when the entry has no
        tags paragraph.
    """
    try:
        tags = item.find('p', class_='tags').text
        # maxsplit=1: only the label separator is split on; when there is no
        # label at all the whole text is returned.
        return tags.split(': ', 1)[-1]
    except (AttributeError, TypeError):
        # Missing element or non-string text; other errors propagate.
        return None

def get_skills(title, description, tags):
    """Derive the skill strings for a post from its title, description and tags.

    The title and whichever of ``description`` / ``tags`` are not ``None`` are
    concatenated with single spaces and handed to ``extract_skills``. The first
    value returned by ``extract_ignore`` is the list of skills to keep; it is
    sorted in place and then passed to ``extract_data_skills``.

    Returns:
        A ``(skills, data_skills)`` tuple where each element is a ``'; '``-joined
        string. ``data_skills`` is ``None`` when ``extract_data_skills`` yields
        nothing, and both elements are ``None`` when no skills are kept.
    """
    # Build the text to scan, skipping the optional parts that are absent.
    context = title
    for extra in (description, tags):
        if extra is not None:
            context = context + ' ' + extra

    kept, _ignored = extract_ignore(extract_skills(context))
    kept.sort()
    if len(kept) == 0:
        return None, None

    data_skills = extract_data_skills(kept)
    skills_text = '; '.join(kept)
    if len(data_skills) > 0:
        return skills_text, '; '.join(data_skills)
    return skills_text, None

In [104]:
# type_: tutorials/opinions
def get_posts(type_, month, year, timeout=30):
    """Scrape one monthly KDnuggets listing page into a DataFrame.

    Args:
        type_: Listing name used in the URL, e.g. ``'tutorials'`` or ``'opinions'``.
        month: Month number; it is zero-padded to two digits in the URL.
        year: Year used in the URL.
        timeout: Seconds passed to ``requests.get`` so that a stalled connection
            cannot block forever.

    Returns:
        ``None`` (after printing the response and its reason) when the page
        does not answer with HTTP 200. Otherwise a DataFrame sorted by ``id``
        with one row per listing entry that has a title, and the columns
        ``id, title, author, date, url, description, type, tags, skills,
        data_skills``; ``date`` is converted with ``pd.to_datetime``. The frame
        is empty (but keeps its columns) when the page contains no listing or
        no titled entries.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            request exceeds ``timeout``.
    """
    columns = ['id', 'title', 'author', 'date', 'url', 'description',
               'type', 'tags', 'skills', 'data_skills']
    base_url = 'https://www.kdnuggets.com/{}/{:>02}/{}.html' #:>02 to add leading 0 to month
    url = base_url.format(year, month, type_)
    page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
    if page.status_code != 200:
        print(page, page.reason)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # The listing <ul> may be absent (e.g. a changed layout); treat that as
    # "no posts" instead of crashing on None.find_all.
    listing = soup.find('ul', class_='three_ul test')
    items = listing.find_all('li') if listing is not None else []
    post_list = []
    for item in items:
        title = get_title(item)
        if title is None:
            # Entries without a title are skipped.
            continue
        description = get_description(item)
        tags = get_tags(item)
        skills, data_skills = get_skills(title, description, tags)
        post_list.append({
            'id': get_id(item),
            'title': title,
            'author': get_author(item, title),
            'date': get_date(item),
            'url': get_url(item),
            'description': description,
            'type': type_,
            'tags': tags,
            'skills': skills,
            'data_skills': data_skills,
        })
    # Passing the columns explicitly keeps the schema even when post_list is
    # empty, so the 'date' and 'id' lookups below cannot raise KeyError.
    df = pd.DataFrame(post_list, columns=columns)
    df['date'] = pd.to_datetime(df['date'])
    # NOTE: ids are strings, so this ordering is lexicographic.
    df = df.sort_values(by='id')
    return df

In [105]:
type_ = 'tutorials'
year = 2021
today = dt.datetime.today()
this_month = today.month

# Months of `year` that are already complete. Once `year` is in the past all 12
# months are scraped, so the result no longer depends on when the notebook runs.
if today.year > year:
    last_complete_month = 12
elif today.year == year:
    last_complete_month = this_month - 1
else:
    last_complete_month = 0

# Collect the monthly frames first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every iteration.
frames = [
    get_posts(type_, month, year)
    for month in log_progress(range(1, last_complete_month + 1))
]
# get_posts returns None for pages that could not be fetched.
frames = [frame for frame in frames if frame is not None]
df_tut = pd.concat(frames) if frames else pd.DataFrame()

df_tut.tail()

VBox(children=(HTML(value=''), IntProgress(value=0, max=9)))

Unnamed: 0,id,title,author,date,url,description,type,tags,skills,data_skills
4,133211,Important Statistics Data Scientists Need to Know,Lekshmi Sunil,2021-09-29,https://www.kdnuggets.com/2021/09/important-st...,Several fundamental statistical concepts must ...,tutorials,"Bayes Theorem, Data Science, Probability, Stat...",Data Science; Probability; Python; Statistics,Python Programming; Statistics
3,133225,GitHub Desktop for Data Scientists,Drew Seewald,2021-09-29,https://www.kdnuggets.com/2021/09/github-deskt...,Less scary than version control in the command...,tutorials,"Data Science, Data Scientist, GitHub, Version ...",Data Science; Github; LeSS; Version Control,Github
2,133235,Use These Unique Data Sets to Sharpen Your Dat...,U. of North Florida,2021-09-29,https://www.kdnuggets.com/2021/09/springboard-...,Want to get your hands on some real-world data...,tutorials,"Data Science Skills, Datasets",Data Science; Dataset,
1,133244,Advanced Statistical Concepts in Data Science,Nagesh Singh Chauhan,2021-09-30,https://www.kdnuggets.com/2021/09/advanced-sta...,The article contains some of the most commonly...,tutorials,"Career Advice, Data Science, Distribution, Pro...",Data Science; Probability; Python; Statistics,Python Programming; Statistics
0,133262,How to Determine the Best Fitting Data Distrib...,Matthew Mayo,2021-09-30,https://www.kdnuggets.com/2021/09/determine-be...,"Approaches to data sampling, modeling, and ana...",tutorials,"Data Analysis, Distribution, Probability, Pyth...",Data Analysis; Modelling; Probability; Python;...,Python Programming; Statistics


In [106]:
type_ = 'opinions'
year = 2021
today = dt.datetime.today()
this_month = today.month

# Months of `year` that are already complete. Once `year` is in the past all 12
# months are scraped, so the result no longer depends on when the notebook runs.
if today.year > year:
    last_complete_month = 12
elif today.year == year:
    last_complete_month = this_month - 1
else:
    last_complete_month = 0

# Collect the monthly frames first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every iteration.
frames = [
    get_posts(type_, month, year)
    for month in log_progress(range(1, last_complete_month + 1))
]
# get_posts returns None for pages that could not be fetched.
frames = [frame for frame in frames if frame is not None]
df_op = pd.concat(frames) if frames else pd.DataFrame()

df_op.tail()

VBox(children=(HTML(value=''), IntProgress(value=0, max=9)))

Unnamed: 0,id,title,author,date,url,description,type,tags,skills,data_skills
4,132944,Nine Tools I Wish I Mastered Before My PhD in ...,Aliaksei Mikhailiuk,2021-09-22,https://www.kdnuggets.com/2021/09/nine-tools-m...,Whether you are building a start up or making ...,opinions,"AI, Data Science, Data Science Tools, Machine ...",Artificial Intelligence (AI); Data Science Too...,AI
3,133046,How Data Scientists Can Compete in the Global ...,Devin Partida,2021-09-24,https://www.kdnuggets.com/2021/09/data-scienti...,Data scientists wanting to stay competitive or...,opinions,"Career Advice, Data Science, Data Scientist",Data Science; Market,
2,133163,MLOps and ModelOps: What’s the Difference and...,Stu Bailey,2021-09-28,https://www.kdnuggets.com/2021/09/mlops-modelo...,These two terms are often used interchangeably...,opinions,"Enterprise, MLOps, ModelOps",Artificial Intelligence (AI); MLOps,AI
1,133202,Data Science Process Lifecycle,"Lillian Pierson, P.E.",2021-09-29,https://www.kdnuggets.com/2021/09/data-science...,How would it feel to know that without a doubt...,opinions,"Analytics, Data Science, Data Scientist, Workflow",Analytical; Data Science,
0,133268,Scale and Govern AI Initiatives with ModelOps,Giuliano Liguori,2021-09-30,https://www.kdnuggets.com/2021/09/scale-govern...,AI/ML model life cycle automation and orchestr...,opinions,"AI, MLOps, ModelOps, Scalability",Artificial Intelligence (AI); Automation; MLOps,AI


In [110]:
# Stack tutorials and opinions and order them chronologically, breaking ties by id.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_kd = pd.concat([df_tut, df_op]).sort_values(by=['date', 'id'])
df_kd.head()

Unnamed: 0,id,title,author,date,url,description,type,tags,skills,data_skills
20,121033,Six Tips on Building a Data Science Team at a ...,Zbar & Vallejo,2021-01-04,https://www.kdnuggets.com/2021/01/six-tips-bui...,When a company decides that they want to start...,opinions,"Data Science, Data Science Team, Data Scientist",Data Science,
38,121054,All Machine Learning Algorithms You Should Kno...,Terence Shin,2021-01-04,https://www.kdnuggets.com/2021/01/machine-lear...,Many machine learning algorithms exits that ra...,tutorials,"Algorithms, Decision Trees, Explained, Gradien...",Bayesian Regression; Boosting; Decision Tree; ...,Regressions
37,121083,DeepMind’s MuZero is One of the Most Important...,Jesus Rodriguez,2021-01-04,https://www.kdnuggets.com/2021/01/deepmind-muz...,MuZero takes a unique approach to solve the pr...,tutorials,"AlphaZero, Deep Learning, DeepMind, MuZero, Re...",Deep Learning; Reinforcement Learning,Deep Learning; Reinforcement
36,121109,"Model Experiments, Tracking and Registration u...",Dash Desai,2021-01-05,https://www.kdnuggets.com/2021/01/model-experi...,This post covers how StreamSets can help exped...,tutorials,"Data Science, Databricks, DataOps, Experimenta...",Data Science; Databricks; MLFlow; MLOps; Machi...,
19,121117,How to Get a Job as a Data Engineer,Anna Anisienia,2021-01-05,https://www.kdnuggets.com/2021/01/get-job-as-d...,Data engineering skills are currently in high ...,opinions,"Career Advice, Data Engineer, Data Engineering",Data Engineering; Sentry,


In [111]:
# Summarise the combined frame: entry count, per-column non-null counts and dtypes.
df_kd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 587 entries, 20 to 0
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           587 non-null    object        
 1   title        587 non-null    object        
 2   author       587 non-null    object        
 3   date         587 non-null    datetime64[ns]
 4   url          587 non-null    object        
 5   description  587 non-null    object        
 6   type         587 non-null    object        
 7   tags         587 non-null    object        
 8   skills       582 non-null    object        
 9   data_skills  391 non-null    object        
dtypes: datetime64[ns](1), object(9)
memory usage: 50.4+ KB


In [114]:
# Write the combined posts to CSV without the index column. The target folder is
# created first so the export also works when 'database/' does not exist yet.
output_path = Path('database/kdnuggets.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_kd.to_csv(output_path, index=False)