In [1]:
import datetime as dt
import json
import os
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

from data_skills import DATA_SKILLS
from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore, extract_data_skills

In [2]:
# URL template for a monthly KDnuggets tutorials archive: year, then zero-padded month.
base_url = 'https://www.kdnuggets.com/{}/{:>02}/tutorials.html' #:>02 to add leading 0 to month
# Build the October 2021 archive URL as a sample page to explore.
url = base_url.format(2021, 10)
url

'https://www.kdnuggets.com/2021/10/tutorials.html'

In [3]:
# Fetch the sample archive page with a randomly chosen header set.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(url, headers=random.choice(headers_list), timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page below.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Preview only the start of the document; printing the whole page floods the notebook output.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en-US" xmlns="https://www.w3.org/1999/xhtml">
 <head profile="https://gmpg.org/xfn/11">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="max-image-preview:large" name="robots"/>
  <title>
   2021 Oct Tutorials, Overviews - KDnuggets
  </title>
  <link href="/wp-content/themes/kdn17/images/favicon.ico" rel="shortcut icon"/>
  <link href="/wp-content/themes/kdn17/style.css" media="screen" rel="stylesheet" type="text/css"/>
  <script src="/wp-content/themes/kdn17/js/jquery-1.9.1.min.js" type="text/javascript">
  </script>
  <script src="/aps/kda_all.js" type="text/javascript">
  </script>
  <link href="/feed/" rel="alternate" title="KDnuggets: AI, Analytics, Data Science, Machine Learning Feed" type="application/rss+xml"/>
  <link href="https://www.kdnuggets.com/2021-10/k10-tutorials/" rel="canonical"/>
  <meta content="https://www.kdnuggets.com/2021-1

In [5]:
# The post entries are the <li> children of the <ul class="three_ul test"> list.
items = soup.find('ul', class_='three_ul test').find_all('li')
# Display the first entry to inspect its structure.
items[0]

<li> <a href="https://www.kdnuggets.com/2021/10/multivariate-time-series-analysis-lstm-based-rnn.html" id="134487"><b>Multivariate Time Series Analysis with an LSTM based RNN</b></a>, by Kathrin Melcher         <font color="#808080">- Oct 29, 2021</font>. 
         <div style="margin-left: 12px; font-size: small;">
         Check out this codeless solution using the Keras integration.
 </div>
<p class="tags">Tags: <a href="https://www.kdnuggets.com/tag/keras" rel="tag">Keras</a>, <a href="https://www.kdnuggets.com/tag/knime" rel="tag">Knime</a>, <a href="https://www.kdnuggets.com/tag/low-code" rel="tag">Low-Code</a>, <a href="https://www.kdnuggets.com/tag/lstm" rel="tag">LSTM</a>, <a href="https://www.kdnuggets.com/tag/time-series" rel="tag">Time Series</a></p>
</li>

In [6]:
# Take the first entry as a sample and read the `id` attribute of its first <a> tag.
item = items[0]
item.find('a')['id']

'134487'

In [7]:
def get_id(item):
    """Return the ``id`` attribute of the item's first ``<a>`` tag.

    Args:
        item: Parsed list entry for one post (an object exposing ``find``).

    Returns:
        The anchor's ``id`` value, or ``None`` when the item has no anchor
        or the anchor carries no ``id``.
    """
    try:
        return item.find('a')['id']
    except (AttributeError, KeyError, TypeError):
        # Only lookup failures of a malformed item are treated as "missing";
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return None

def get_title(item):
    """Return the text of the item's first ``<b>`` tag, used as the post title.

    Args:
        item: Parsed list entry for one post (an object exposing ``find``).

    Returns:
        The title text, or ``None`` when the item has no ``<b>`` tag.
    """
    try:
        return item.find('b').text
    except AttributeError:
        # ``find`` returned None (no <b> tag) or the item itself is unusable.
        # Narrow catch so KeyboardInterrupt/SystemExit are not swallowed.
        return None

def get_author(item, title):
    """Extract the author name from the item's text.

    The title is removed from the item's text first; the author is then the
    text between the first ``by`` and the following ``-``.

    Args:
        item: Parsed list entry for one post (an object exposing ``text``).
        title: The post title, as returned by ``get_title``.

    Returns:
        The stripped author name, or ``None`` when no ``by`` marker is found
        or the inputs are unusable (e.g. ``title`` is ``None``).
    """
    try:
        this_text = item.text.replace(title, '')
        # Split on the FIRST 'by' only, so a name that itself contains "by"
        # (e.g. "Bobby") is not cut short.
        after_by = this_text.split('by', 1)[1]
        # NOTE(review): a hyphenated author name is still truncated at its
        # first '-'; confirm whether such names occur in the source pages.
        author = after_by.split('-')[0]
        return author.strip()
    except (AttributeError, IndexError, TypeError):
        # Narrow catch so KeyboardInterrupt/SystemExit are not swallowed.
        return None

def get_date(item):
    """Return the date text held in the item's first ``<font>`` tag.

    Args:
        item: Parsed list entry for one post (an object exposing ``find``).

    Returns:
        The tag text with every ``-`` removed and surrounding whitespace
        stripped, or ``None`` when the item has no ``<font>`` tag.
    """
    try:
        date = item.find('font').text
        return date.replace('-', '').strip()
    except AttributeError:
        # ``find`` returned None (no <font> tag) or the item itself is unusable.
        # Narrow catch so KeyboardInterrupt/SystemExit are not swallowed.
        return None

def get_url(item):
    """Return the ``href`` attribute of the item's first ``<a>`` tag.

    Args:
        item: Parsed list entry for one post (an object exposing ``find``).

    Returns:
        The link target, or ``None`` when the item has no anchor or the
        anchor carries no ``href``.
    """
    try:
        return item.find('a')['href']
    except (AttributeError, KeyError, TypeError):
        # Only lookup failures of a malformed item are treated as "missing";
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return None

def get_description(item):
    """Return the stripped text of the item's first ``<div>`` tag.

    Args:
        item: Parsed list entry for one post (an object exposing ``find``).

    Returns:
        The description text, or ``None`` when the item has no ``<div>`` tag.
    """
    try:
        return item.find('div').text.strip()
    except AttributeError:
        # ``find`` returned None (no <div> tag) or the item itself is unusable.
        # Narrow catch so KeyboardInterrupt/SystemExit are not swallowed.
        return None

def get_tags(item):
    """Return the tag list text of the item's ``<p class="tags">`` element.

    Args:
        item: Parsed list entry for one post (an object exposing ``find``).

    Returns:
        The text after the first ``': '`` separator (the label prefix), or
        the whole text when no separator is present; ``None`` when the item
        has no tags paragraph.
    """
    try:
        tags = item.find('p', class_='tags').text
    except AttributeError:
        # ``find`` returned None (no tags paragraph) or the item is unusable.
        # Narrow catch so KeyboardInterrupt/SystemExit are not swallowed.
        return None
    # Cut at the FIRST ': ' only, so a tag that itself contains ': ' does not
    # cause the earlier tags to be dropped.
    _, separator, tag_text = tags.partition(': ')
    return tag_text if separator else tags

def get_skills(title, description, tags):
    """Derive the skill strings for a post from its title, description and tags.

    The title plus any non-``None`` description and tags are joined with
    single spaces and passed through ``extract_skills`` and
    ``extract_ignore``; the kept skills are sorted in place and handed to
    ``extract_data_skills``.

    Args:
        title: Post title (always used).
        description: Post description, or ``None`` to leave it out.
        tags: Post tag text, or ``None`` to leave it out.

    Returns:
        A ``(skills, data_skills)`` tuple. Each element is a ``'; '``-joined
        string, or ``None`` when the corresponding collection is empty;
        ``(None, None)`` when no skills are kept.
    """
    context = title
    for optional_part in (description, tags):
        if optional_part is not None:
            context = context + ' ' + optional_part

    all_skills = extract_skills(context)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()

    # Nothing kept: neither output column has a value.
    if len(keep_skills) == 0:
        return None, None

    data_skills = extract_data_skills(keep_skills)
    skills_text = '; '.join(keep_skills)
    data_skills_text = '; '.join(data_skills) if len(data_skills) > 0 else None
    return skills_text, data_skills_text

In [8]:
# type_: tutorials/opinions
def get_posts(type_, month, year, timeout=30):
    """Scrape one monthly KDnuggets archive page into a DataFrame.

    Args:
        type_: Archive section used in the URL, e.g. 'tutorials' or 'opinions'.
        month: Month number; zero-padded to two digits in the URL.
        year: Year used in the URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A DataFrame with one row per post (columns: id, title, author, date,
        url, description, type, tags, skills, data_skills), sorted by ``id``
        and with ``date`` parsed to datetimes. The frame is empty when the
        page lists no parsable posts. Returns ``None`` when the response
        status is not 200 or the page contains no post list.

    Raises:
        requests.exceptions.RequestException: If the request itself fails,
            including when ``timeout`` is exceeded.
    """
    base_url = 'https://www.kdnuggets.com/{}/{:>02}/{}.html' #:>02 to add leading 0 to month
    url = base_url.format(year, month, type_)
    page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
    if page.status_code != 200:
        print(page, page.reason)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    post_container = soup.find('ul', class_='three_ul test')
    if post_container is None:
        # Page layout differs from what is expected; report it instead of
        # failing with an AttributeError on None.
        print('No post list found at', url)
        return None
    post_list = []
    for item in post_container.find_all('li'):
        title = get_title(item)
        if title is None:
            # Entries without a title cannot be processed further.
            continue
        description = get_description(item)
        tags = get_tags(item)
        skills, data_skills = get_skills(title, description, tags)
        post_list.append({
            'id': get_id(item),
            'title': title,
            'author': get_author(item, title),
            'date': get_date(item),
            'url': get_url(item),
            'description': description,
            'type': type_,
            'tags': tags,
            'skills': skills,
            'data_skills': data_skills,
        })
    # Explicit columns keep the frame well-formed (and the 'date'/'id' lookups
    # below valid) even when no post was collected.
    columns = [
        'id', 'title', 'author', 'date', 'url',
        'description', 'type', 'tags', 'skills', 'data_skills',
    ]
    df = pd.DataFrame(post_list, columns=columns)
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by='id')
    return df

In [9]:
type_ = 'tutorials'
# NOTE(review): the year is fixed to 2021 while the month bound comes from
# today's date, so re-running in a later year changes which months are fetched.
this_month = dt.datetime.today().month

# Collect one frame per month and concatenate once: DataFrame.append is
# deprecated (removed in pandas 2.0) and re-copies the frame on every call.
tut_frames = []
# range(1, this_month) covers January up to, but not including, the current month.
for i in log_progress(range(1, this_month)):
    month_posts = get_posts(type_, i, 2021)
    if month_posts is not None:  # get_posts returns None for a non-200 response
        tut_frames.append(month_posts)

df_tut = pd.concat(tut_frames) if tut_frames else pd.DataFrame()

df_tut.tail()

VBox(children=(HTML(value=''), IntProgress(value=0, max=10)))

KeyboardInterrupt: 

In [None]:
type_ = 'opinions'
# NOTE(review): the year is fixed to 2021 while the month bound comes from
# today's date, so re-running in a later year changes which months are fetched.
this_month = dt.datetime.today().month

# Collect one frame per month and concatenate once: DataFrame.append is
# deprecated (removed in pandas 2.0) and re-copies the frame on every call.
op_frames = []
# range(1, this_month) covers January up to, but not including, the current month.
for i in log_progress(range(1, this_month)):
    month_posts = get_posts(type_, i, 2021)
    if month_posts is not None:  # get_posts returns None for a non-200 response
        op_frames.append(month_posts)

df_op = pd.concat(op_frames) if op_frames else pd.DataFrame()

df_op.tail()

In [None]:
# Stack tutorials and opinions into one frame. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
df_kd = pd.concat([df_tut, df_op])
df_kd = df_kd.sort_values(by=['date', 'id'])
df_kd.head()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the combined frame before export.
df_kd.info()

In [None]:
# Create the output folder first so the export also works on a fresh checkout
# where 'database/' does not exist yet.
output_path = 'database/kdnuggets.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_kd.to_csv(output_path, index=False)