In [1]:
import random
import requests
import pandas as pd
import json
import datetime as dt
import time
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from headers import headers_list
from data_skills import DATA_SKILLS
from skill_extraction import extract_skills, extract_ignore, extract_data_skills
from secrets import settings, api_keys

## JMLR

In [None]:
def jmlr_scraper(engine):
    base_url = 'https://jmlr.org'
    url = base_url + '/papers/v22/'
    page = requests.get(url, headers=random.choice(headers_list))
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    dls = soup.findAll('dl')
    # Get existing papers in database
    df_ex = pd.read_sql_query('select cj.id, cj.title from "ContentJMLR" cj', engine)
    ex_papers = df_ex['title'].unique().tolist()
    papers = []
    # Iterate through each paper
    has_new = False
    for dl in dls:
        title = dl.find('dt').get_text()
        if title in ex_papers:
            continue
        paper = {}
        dd = dl.find('dd')
        paper['title'] = title
        paper['authors'] = dd.get_text().split(';')[0].strip()
        paper['journal_num'] = dd.get_text().split(';')[-1].split('\n')[0].strip()
        for a in dd.findAll('a'):
            if a.get_text() == '(Machine Learning Open Source Software Paper)':
                continue
            href = a['href']
            if 'http' not in href:
                href = 'https://jmlr.org' + href
            paper[a.get_text()] = href
        # Get abstract of paper and extract skills
        output = get_abstract_skills(paper)
        if output is not None:
            paper['abstract'] = output[0]
            if len(output[1]) > 0:
                paper['skills'] = '; '.join(output[1])
                data_skills = extract_data_skills(output[1])
                if len(data_skills) > 0:
                    paper['data_skills'] = '; '.join(data_skills)
        papers.append(paper)
        has_new = True
    # Compile into dataframe if we have new papers
    if has_new:
        df = pd.DataFrame.from_dict(papers)
        df['id'] = df.index + max(df_ex['id']) + 1
        return df
    return None

def get_abstract_skills(paper):
    page = requests.get(paper['abs'], headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    abstract = soup.find('p', class_='abstract').get_text().strip('\n')
    all_skills = extract_skills(paper['title'] + ' ' + abstract)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    return abstract, keep_skills

In [None]:
engine = create_engine(settings['skills_db'])
df_jmlr = jmlr_scraper(engine)
df_jmlr.head()

In [None]:
engine.dispose()

In [None]:
df_jmlr.to_csv('database/jmlr.csv', index=False)

## Youtube

In [32]:
def get_youtube_videos(skill, filter_time=None):
    base_url = 'https://www.youtube.com'
    # Dictionary for filtering search query
    sp_dict = {'this_year': 'EgQIBRAB', 'this_month': 'EgQIBBAB', 'this_week': 'EgQIAxAB', 'today': 'EgQIAhAB'}
    if filter_time not in sp_dict.keys():
        return None
    url = base_url + '/results'
    query = 'learn ' + skill
    params = {'search_query': query.replace(' ', '+')}
    # Default is no filter
    if filter_time is not None:
        params['sp'] = sp_dict[filter_time]
    page = requests.get(url, params=params, headers=random.choice(headers_list))
    if page.status_code != 200:
        print(page, page.reason)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    json_text = str(soup.find_all('script')).split('var ytInitialData = ')[-1].split(';</script>')[0]
    res = json.loads(json_text)
    res = res['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']
    video_list = []
    # Iterate through each video
    for contents in res:
        # Get only those with video
        if 'itemSectionRenderer' not in contents:
            continue
        contents = contents['itemSectionRenderer']['contents']
        for content in contents:
            # Ignore ads
            if 'videoRenderer' not in content:
                continue
            content = content['videoRenderer']
            title = get_text(content, 'title')
            if title is None:
                continue
            description = get_description(content)
            skills, data_skills = get_skills(title, description)
            published_year, published_month = get_published_date(content)
            video_list.append({
                'id': content['videoId'],
                'title': title,
                'channel_id': get_channel_id(content),
                'channel': get_text(content, 'ownerText'),
                'published_year': published_year,
                'published_month': published_month,
                'length': get_length(content),
                'view_count': get_view_count(content),
                'url': get_url(content, base_url),
                'description': description,
                'skills': skills,
                'data_skills': data_skills
            })
    df = pd.DataFrame.from_dict(video_list)
    # df['length'] = pd.to_timedelta(df['length'])
    return df
    
def get_channel_id(content):
    try:
        return content['ownerText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId']
    except:
        return None

def get_text(content, info):
    try:
        return ' '.join(t['text'] for t in content[info]['runs'])
    except:
        return None

def get_length(content):
    try:
        length = content['lengthText']['simpleText']
        length = length.split(':')
        if len(length) == 1:
            length.insert(0, '00')
        if len(length) == 2:
            length.insert(0, '00')
        return ':'.join(length)
    except:
        return None

def get_published_date(content):
    try:
        published_time = content['publishedTimeText']['simpleText']
        val = [int(s) for s in published_time.split() if s.isdigit()][0]
        current = dt.datetime.now()
        if 'year' in published_time:
            published = current - dt.timedelta(days=365.25*val)
        elif 'month' in published_time:
            published = current - dt.timedelta(days=30.436875*val)
        elif 'week' in published_time:
            published = current - dt.timedelta(weeks=val)
        elif 'day' in published_time:
            published = current - dt.timedelta(days=val)
        elif 'hour' in published_time:
            published = current - dt.timedelta(hours=val)
        elif 'minute' in published_time:
            published = current - dt.timedelta(minutes=val)
        elif 'second' in published_time:
            published = current - dt.timedelta(seconds=val)
        return published.year, published.month
    except:
        return None, None

def get_view_count(content):
    try:
        view_count = content['viewCountText']['simpleText']
        view_count = view_count.split(' views')[0].replace(',', '')
        return int(view_count)
    except:
        return None

def get_url(content, base_url):
    try:
        url = content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
        return base_url + url
    except:
        return None

def get_description(content):
    try:
        description = ' '.join([t['text'] for t in content['detailedMetadataSnippets'][0]['snippetText']['runs']])
        return description
    except:
        return None

def get_skills(title, description):
    context = title
    if description is not None:
        context = context + ' ' + description
    all_skills = extract_skills(context)
    # Ignore the Video skill as it is not relevant for Youtube
    if 'Video' in all_skills:
        all_skills.remove('Video')
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    if len(keep_skills) > 0:
        data_skills = extract_data_skills(keep_skills)
        if len(data_skills) > 0:
            return '; '.join(keep_skills), '; '.join(data_skills)
        return '; '.join(keep_skills), None
    return None, None

In [33]:
df_yt = pd.DataFrame()

for skill in DATA_SKILLS:
    print(skill)
    try:
        # df_temp = get_youtube_videos(skill, 'this_month')
        df_temp = get_youtube_videos(skill, 'this_year')
        df_yt = df_yt.append(df_temp)
    except Exception as e:
        print('Error in scraping Youtube for {}'.format(skill), e)
    time.sleep(5)

df_yt.head()

APACHE
C++
Financial Statement
MongoDB
R
Python
Java
Microsoft Excel
Tableau
Statistics
Natural Language Processing (NLP)
Unsupervised Machine Learning
Structured Query Language (SQL)
Computer Vision
Supervised Machine Learning
Matlab
Extract Transform Load (ETL)
Deep Learning
Dashboard
NOSQL
Apache Spark
Artificial Intelligence (AI)
Time Series Analysis
Microsoft Power BI
Regression
Reinforcement Learning
Graph Theory
Database Management System (DBMS)
Apache Hadoop
Cloud Technology
Data Cleansing
Data Preparation
Github
JavaScript
Presentation
AB Testing
Visualization


Unnamed: 0,id,title,channel_id,channel,published_year,published_month,length,view_count,url,description,skills,data_skills
0,_C8kWso4ne4,PySpark Tutorial,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,2021.0,8.0,1:49:02,418629.0,https://www.youtube.com/watch?v=_C8kWso4ne4,"Learn PySpark, an interface for Apache Spar...",Apache Spark; Processing; PySpark; Python,Python Programming; SPARK
1,FMUl-9Ze8_k,What is Apache & Nginx? | Apache vs Nginx 🔥🔥,UCeVMnSShP_Iviwkknt83cww,CodeWithHarry,2021.0,7.0,00:15:59,63707.0,https://www.youtube.com/watch?v=FMUl-9Ze8_k,"Apache vs Nginx in Hindi: In this video, we w...",Apache Ant; Hindi,
2,8dZxPJJCqmU,Learn Apache Airflow with Python easily in 1 h...,UCcMLc5sy-_ko6rSH1wZNkYQ,TejaWithData,2021.0,8.0,1:06:49,10683.0,https://www.youtube.com/watch?v=8dZxPJJCqmU,"Hello Everyone, In this video, we will learn ...",Apache Airflow; Data Pipeline; PipelineC; Python,Python Programming
3,9B2fhX4MfOE,DCS: AH-64D Tutorial | Getting started with th...,UCcZauUXQ-dHSFqqqTXMLBSg,CasmoTV,2022.0,4.0,00:29:17,64465.0,https://www.youtube.com/watch?v=9B2fhX4MfOE,NOTE: All footage is pre-EA release General co...,Apache,Apache
4,A0x-lnWpYdA,Learning Apache mod_rewrite - learn Web Develo...,UCyGosiDlDjfC0Ogq1Ib48Xw,Daisy Hilda,2021.0,8.0,00:1:11,2.0,https://www.youtube.com/watch?v=A0x-lnWpYdA,link to this course ...,Apache; Web Development,Apache


In [46]:
engine = create_engine(settings['skills_db2'])
df_yf = pd.read_sql_query('select * from "ContentYoutube"', engine)
df_yf

Unnamed: 0,id,title,channel_id,channel,published_year,published_month,length,view_count,url,description,skills,data_skills,is_lesson
0,-T_l8TjAzXk,WHY LEARN TABLEAU? | Is Tableau worth learning...,,Abhishek Agarrwal,2020.0,10.0,00:12:04,6634.0,https://www.youtube.com/watch?v=-T_l8TjAzXk,Many people who are a beginner and aspiring to...,Business Intelligence (BI); Business Intellige...,Tableau,
1,0nH1ospwJd4,4 Quick Tips to Make an Engaging PowerPoint Pr...,,ClassPoint,2020.0,10.0,00:09:36,5578.0,https://www.youtube.com/watch?v=0nH1ospwJd4,Here're 4 quick tips on how to make engaging P...,Animation; Microsoft PowerPoint; Presentation,Presentation Skill,
2,1LHch7usbzY,Processing Covid-19 Data with Apache Spark,,Manning Publications,2020.0,10.0,01:14:32,1056.0,https://www.youtube.com/watch?v=1LHch7usbzY,"""Spark in Action, Second Edition: Covers Apac...",Apache Spark; Java; Processing; Python; Scala,SPARK; Java; Python Programming,True
3,2ViNp_UijMQ,Apache Spark with Scala By Example - learn Apa...,,Petrina Corine,2020.0,10.0,00:03:46,5.0,https://www.youtube.com/watch?v=2ViNp_UijMQ,Learn the fundamentals and run examplesof Spa...,Apache Spark; Dataset; Distributed Database; S...,SPARK,True
4,2sTeNU3nE3o,ETL explained in 5 Minutes,,DaveSplains,2020.0,10.0,00:04:50,132.0,https://www.youtube.com/watch?v=2sTeNU3nE3o,"The concept of Extract Transform Load , or E...",Extract Transform Load (ETL); Programming; Tra...,ETL,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,j1mzSKItdcg,Advanced Analytics using Apache Spark in Azure...,,Global AI Community,,,,,https://www.youtube.com/watch?v=j1mzSKItdcg,In this session you will learn the fundament...,Analytical; Apache Spark; Databricks; Microsof...,SPARK,True
6596,k5NX209dfcw,Display Number Formats in MATLAB part 2 | MATL...,,Laplace Academy,,,00:0:58,,https://www.youtube.com/watch?v=k5NX209dfcw,,MATLAB,MATLAB,True
6597,mUD32ZytNnQ,Display Number Formats in MATLAB part 1 | MATL...,,Laplace Academy,,,00:0:58,,https://www.youtube.com/watch?v=mUD32ZytNnQ,,MATLAB,MATLAB,True
6598,pqAgHkfboUM,Introduction to Deep Learning Course | MATLAB ...,,MATLAB Helper ®,,,,,https://www.youtube.com/watch?v=pqAgHkfboUM,Get introduced to the Premium Online Course of...,Deep Learning; MATLAB,Deep Learning; MATLAB,True


In [51]:
def add_url(x):
    return 'https://www.youtube.com/watch?v=' + x

df_y = df_yf.copy()
df_y.loc[df_y['url'].isna(), 'url'] = df_y.loc[df_y['url'].isna(), 'id'].apply(lambda x: add_url(x))
df_y

Unnamed: 0,id,title,channel_id,channel,published_year,published_month,length,view_count,url,description,skills,data_skills,is_lesson
0,-T_l8TjAzXk,WHY LEARN TABLEAU? | Is Tableau worth learning...,,Abhishek Agarrwal,2020.0,10.0,00:12:04,6634.0,https://www.youtube.com/watch?v=-T_l8TjAzXk,Many people who are a beginner and aspiring to...,Business Intelligence (BI); Business Intellige...,Tableau,
1,0nH1ospwJd4,4 Quick Tips to Make an Engaging PowerPoint Pr...,,ClassPoint,2020.0,10.0,00:09:36,5578.0,https://www.youtube.com/watch?v=0nH1ospwJd4,Here're 4 quick tips on how to make engaging P...,Animation; Microsoft PowerPoint; Presentation,Presentation Skill,
2,1LHch7usbzY,Processing Covid-19 Data with Apache Spark,,Manning Publications,2020.0,10.0,01:14:32,1056.0,https://www.youtube.com/watch?v=1LHch7usbzY,"""Spark in Action, Second Edition: Covers Apac...",Apache Spark; Java; Processing; Python; Scala,SPARK; Java; Python Programming,True
3,2ViNp_UijMQ,Apache Spark with Scala By Example - learn Apa...,,Petrina Corine,2020.0,10.0,00:03:46,5.0,https://www.youtube.com/watch?v=2ViNp_UijMQ,Learn the fundamentals and run examplesof Spa...,Apache Spark; Dataset; Distributed Database; S...,SPARK,True
4,2sTeNU3nE3o,ETL explained in 5 Minutes,,DaveSplains,2020.0,10.0,00:04:50,132.0,https://www.youtube.com/watch?v=2sTeNU3nE3o,"The concept of Extract Transform Load , or E...",Extract Transform Load (ETL); Programming; Tra...,ETL,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,j1mzSKItdcg,Advanced Analytics using Apache Spark in Azure...,,Global AI Community,,,,,https://www.youtube.com/watch?v=j1mzSKItdcg,In this session you will learn the fundament...,Analytical; Apache Spark; Databricks; Microsof...,SPARK,True
6596,k5NX209dfcw,Display Number Formats in MATLAB part 2 | MATL...,,Laplace Academy,,,00:0:58,,https://www.youtube.com/watch?v=k5NX209dfcw,,MATLAB,MATLAB,True
6597,mUD32ZytNnQ,Display Number Formats in MATLAB part 1 | MATL...,,Laplace Academy,,,00:0:58,,https://www.youtube.com/watch?v=mUD32ZytNnQ,,MATLAB,MATLAB,True
6598,pqAgHkfboUM,Introduction to Deep Learning Course | MATLAB ...,,MATLAB Helper ®,,,,,https://www.youtube.com/watch?v=pqAgHkfboUM,Get introduced to the Premium Online Course of...,Deep Learning; MATLAB,Deep Learning; MATLAB,True


In [54]:
df_y.to_sql('ContentYoutube', engine, index=False, if_exists='replace')
engine.dispose()

In [None]:
engine = create_engine(settings['skills_db2'])
df_yt = pd.read_csv('database/youtube.csv')
df_yt['length'] = df_yt['length'].apply(lambda x: x.split()[-1])
df_yt.head()

In [None]:
df_yt.to_sql('ContentYoutube', engine, index=False, if_exists='replace')
engine.dispose()

In [None]:
len(df_yt), len(df_yt[~df_yt['data_skills'].isna()])

## Medium

In [None]:
def medium_scraper(tag, date):
    base_url = 'https://medium.com/tag/{}/archive/'
    url = base_url.format(tag) + date.strftime('%Y/%m/%d')
    page = requests.get(url, headers=random.choice(headers_list))
    soup = BeautifulSoup(page.content, 'html.parser')
    # Pulls each card from the feed. Each card is a story or comment
    cards = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')
    card_list = []
    for card in cards:
        title = get_title(card)
        subtitle = get_subtitle(card)
        claps = get_claps(card)
        if title is None or is_comment(card) or claps is None:
            continue
        if claps < 100:
            continue
        skills, data_skills = get_skills(title, subtitle)
        card_list.append({
            'id': get_id(card),
            'title': title,
            'subtitle': subtitle,
            'author': get_author(card),
            'publication': get_publication(card),
            'published_date': date,
            'read_time_mins': get_read_time(card),
            'claps': claps,
            'url': get_url(card),
            'skills': skills,
            'data_skills': data_skills,
        })
    df = pd.DataFrame.from_dict(card_list)
    return df

def get_id(card):
    id_ = card.find('div', class_='postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls')
    if id_ is not None:
        return id_['data-post-id']
    return id_

def get_title(card):
    # Different combination of classes possible for titles
    combinations = [('h3', 'graf graf--h3 graf-after--figure graf--title'),
                    ('h3', 'graf graf--h3 graf-after--figure graf--trailing graf--title'),
                    ('h4', 'graf graf--h4 graf--leading'),
                    ('h3', 'graf graf--h3 graf--leading graf--title'),
                    ('p', 'graf graf--p graf--leading'),
                    ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf--leading graf--title'),
                    ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf-after--figure graf--trailing graf--title')]
    title = None
    for combi in combinations:
        title = card.find(combi[0], class_=combi[1])
        if title is not None:
            return title.text
    return title

def get_subtitle(card):
    # Different combination of classes possible for subtitles
    combinations = [('h4', 'graf graf--h4 graf-after--h3 graf--subtitle'),
                    ('h4', 'graf graf--h4 graf-after--h3 graf--trailing graf--subtitle'),
                    ('strong', 'markup--strong markup--p-strong'),
                    ('h4', 'graf graf--p graf-after--h3 graf--trailing'),
                    ('p', 'graf graf--p graf-after--h3 graf--trailing'),
                    ('blockquote', 'graf graf--pullquote graf-after--figure graf--trailing'),
                    ('p', 'graf graf--p graf-after--figure'),
                    ('blockquote', 'graf graf--blockquote graf-after--h3 graf--trailing'),
                    ('p', 'graf graf--p graf-after--figure graf--trailing'),
                    ('em', 'markup--em markup--p-em'),
                    ('p', 'graf graf--p graf-after--p graf--trailing')]
    subtitle = None
    for combi in combinations:
        subtitle = card.find(combi[0], class_=combi[1])
        if subtitle is not None:
            return subtitle.text
    return subtitle

def get_author(card):
    author = card.find('a', class_='ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken')
    if author is not None:
        return author.text
    return author

def get_publication(card):
    pub = card.find('a', class_='ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal')
    if pub is not None:
        return pub.text
    return pub

def get_read_time(card):
    time = card.find('span', class_='readingTime')
    if time is not None:
        time = time['title']
        return time.replace(' min read', '')
    return time

def get_claps(card):
    claps = card.find('button', class_='button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents')
    if claps is not None:
        claps = claps.text
        if 'K' in claps:
            try:
                return int(float(claps.replace('K', '')) * 1000)
            except:
                return None
        else:
            try:
                return int(claps)
            except:
                return None
    return claps

def is_comment(card):
    # Check if card is a story or comment
    comment = card.find('div', class_='u-fontSize14 u-marginTop10 u-marginBottom20 u-padding14 u-xs-padding12 u-borderRadius3 u-borderCardBackground u-borderLighterHover u-boxShadow1px4pxCardBorder')
    return comment is not None

def get_url(card):
    url = card.find('a', class_='')
    if url is not None:
        return url['href'].split('?')[0]
    return url

def get_skills(title, subtitle):
    context = title
    if subtitle is not None:
        context = context + ' ' + subtitle
    all_skills = extract_skills(context)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    if len(keep_skills) > 0:
        data_skills = extract_data_skills(keep_skills)
        if len(data_skills) > 0:
            return '; '.join(keep_skills), '; '.join(data_skills)
        return '; '.join(keep_skills), None
    return None, None

In [None]:
tag = 'data-science'
start_date = dt.datetime(2021, 9, 30)
end_date = dt.datetime(2021, 10, 18)
current_date = start_date

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if i == 0:
        df.to_csv('database/medium_data_science.csv', index=False)
    else:
        df.to_csv('database/medium_data_science.csv', index=False, mode='a', header=False)
    current_date = current_date + dt.timedelta(days=1)
    time.sleep(3)

In [None]:
tag = 'machine-learning'
current_date = start_date

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if i == 0:
        df.to_csv('database/medium_machine_learning.csv', index=False)
    else:
        df.to_csv('database/medium_machine_learning.csv', index=False, mode='a', header=False)
    current_date = current_date + dt.timedelta(days=1)
    time.sleep(3)

In [None]:
tag = 'data-engineering'
current_date = start_date

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if i == 0:
        df.to_csv('database/medium_data_engineering.csv', index=False)
    else:
        df.to_csv('database/medium_data_engineering.csv', index=False, mode='a', header=False)
    current_date = current_date + dt.timedelta(days=1)
    time.sleep(3)

In [None]:
df_med = pd.read_csv('database/medium_data_science.csv')
df_med = df_med.append(pd.read_csv('database/medium_machine_learning.csv'))
df_med = df_med.append(pd.read_csv('database/medium_data_engineering.csv'))
df_med = df_med.drop_duplicates(subset=['id'])
df_med['published_date'] = pd.to_datetime(df_med['published_date'], dayfirst=True)
df_med = df_med.sort_values(by=['published_date', 'id'])
df_med['url'] = df_med['url'].apply(lambda x: x.split('?')[0])
df_med = df_med.reset_index(drop=True)
df_med.head()

In [None]:
df_med.to_csv('database/medium.csv', index=False)

In [None]:
start_date = dt.datetime(2022, 4, 13)
end_date = dt.datetime(2022, 5, 18)
current_date = start_date
for i in range((end_date - start_date).days):
    print(current_date)
    current_date = current_date + dt.timedelta(days=1)

In [None]:
tags = ['data-science', 'machine-learning', 'data-engineering']
# start_date = dt.datetime(2021, 12, 14)
# end_date = dt.datetime(2021, 12, 29)
df_med = pd.DataFrame()

for tag in tags:
    current_date = start_date
    print(tag)
    for i in range((end_date - start_date).days):
        df_temp = medium_scraper(tag, current_date)
        df_med = df_med.append(df_temp)
        current_date = current_date + dt.timedelta(days=1)
        time.sleep(random.randint(1,3))

df_med.head()

In [None]:
df_med = df_med.sort_values(by=['published_date', 'id'])
df_med = df_med.drop_duplicates(subset=['id'])
df_med

In [None]:
engine = create_engine(settings['skills_db2'])
# df_med = pd.read_csv('database/medium.csv')
df_med['published_date'] = pd.to_datetime(df_med['published_date'])
# df_med.to_sql('ContentMedium', engine, index=False, if_exists='replace')
df_med.to_sql('ContentMedium', engine, index=False, if_exists='append')
engine.dispose()

## KDnuggets

In [None]:
df_kd = pd.read_csv('database/kdnuggets.csv')
df_kd['date'] = pd.to_datetime(df_kd['date'])
df_kd.head()

In [None]:
df_kd.info()

In [None]:
engine = create_engine(settings['skills_db2'])
df_kd.to_sql('ContentKDnuggets', engine, index=False, if_exists='replace')
engine.dispose()

In [None]:
dt.datetime.now() - dt.timedelta(days=7)

In [None]:
dt.datetime.now() - dt.timedelta(days=7)