In [13]:
import random
import requests
import pandas as pd
import json
import datetime as dt
import time
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from headers import headers_list
from data_skills import DATA_SKILLS
from skill_extraction import extract_skills, extract_ignore, extract_data_skills
from secrets import settings, api_keys

## JMLR

In [24]:
def jmlr_scraper(engine):
    base_url = 'https://jmlr.org'
    url = base_url + '/papers/v22/'
    page = requests.get(url, headers=random.choice(headers_list))
    if page.status_code != 200:
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    dls = soup.findAll('dl')
    # Get existing papers in database
    df_ex = pd.read_sql_query('select cj.id, cj.title from "ContentJMLR" cj', engine)
    ex_papers = df_ex['title'].unique().tolist()
    papers = []
    # Iterate through each paper
    has_new = False
    for dl in dls:
        title = dl.find('dt').get_text()
        if title in ex_papers:
            continue
        paper = {}
        dd = dl.find('dd')
        paper['title'] = title
        paper['authors'] = dd.get_text().split(';')[0].strip()
        paper['journal_num'] = dd.get_text().split(';')[-1].split('\n')[0].strip()
        for a in dd.findAll('a'):
            if a.get_text() == '(Machine Learning Open Source Software Paper)':
                continue
            href = a['href']
            if 'http' not in href:
                href = 'https://jmlr.org' + href
            paper[a.get_text()] = href
        # Get abstract of paper and extract skills
        output = get_abstract_skills(paper)
        if output is not None:
            paper['abstract'] = output[0]
            if len(output[1]) > 0:
                paper['skills'] = '; '.join(output[1])
                data_skills = extract_data_skills(output[1])
                if len(data_skills) > 0:
                    paper['data_skills'] = '; '.join(data_skills)
        papers.append(paper)
        has_new = True
    # Compile into dataframe if we have new papers
    if has_new:
        df = pd.DataFrame.from_dict(papers)
        df['id'] = df.index + max(df_ex['id']) + 1
        return df
    return None

def get_abstract_skills(paper):
    page = requests.get(paper['abs'], headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    abstract = soup.find('p', class_='abstract').get_text().strip('\n')
    all_skills = extract_skills(paper['title'] + ' ' + abstract)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    return abstract, keep_skills

In [25]:
engine = create_engine(settings['skills_db'])
df_jmlr = jmlr_scraper(engine)
df_jmlr.head()

Unnamed: 0,title,authors,journal_num,abs,pdf,bib,code,abstract,skills,id
0,Mode-wise Tensor Decompositions: Multi-dimensi...,"HanQin Cai, Keaton Hamm, Longxiu Huang, Deanna...","(185):1−36, 2021.",https://jmlr.org/papers/v22/21-0287.html,https://jmlr.org/papers/volume22/21-0287/21-02...,https://jmlr.org/papers/v22/21-0287.bib,https://github.com/caesarcai/Modewise_Tensor_D...,Low rank tensor approximation is a fundamental...,Analysis; Approximation; Data Science; Dataset...,185


In [26]:
engine.dispose()

In [None]:
df_jmlr.to_csv('database/jmlr.csv', index=False)

## Youtube

In [11]:
# Possible orders: ['date', 'rating', 'relevance', 'title', 'videoCount', 'viewCount']
# start_date must be in RFC 3339 formatted date-time value (1970-01-01T00:00:00Z)
def youtube_scraper(max_results, order, start_date):
    # Do a search query for each data skills
    df = pd.DataFrame()
    for skill in DATA_SKILLS:
        print(skill)
        query = 'learn ' + skill
        df_temp = get_youtube_videos(query, max_results, order, start_date)
        df_temp['data_skills'] = skill
        df = df.append(df_temp)
    # Merge df on id to remove duplicates
    df = df.merge(df.groupby('id').mean(), left_on='id', right_index=True)
    df = df.merge(df[['id', 'data_skills']].groupby('id').agg(lambda x: '; '.join(x)), left_on='id', right_index=True)
    df = df.drop_duplicates(subset=['id'])
    df = df.drop(columns=['relevance_x', 'data_skills_x'])
    df = df.rename(columns={'relevance_y': 'relevance', 'data_skills_y': 'data_skills'})
    df = df.reset_index(drop=True)
    df = get_info_and_skills_for_videos(df)
    return df

def get_info_and_skills_for_videos(df):
    for i, row in df.iterrows():
        full_text = row['title']
        video_info = get_video_info(row['id'])
        if video_info is None:
            continue
        full_description = get_snippet(video_info, 'description')
        df.loc[i, 'description_full'] = full_description
        df.loc[i, 'language'] = get_snippet(video_info, 'defaultAudioLanguage')
        df.loc[i, 'view_count'] = get_statistics(video_info, 'viewCount')
        df.loc[i, 'like_count'] = get_statistics(video_info, 'likeCount')
        df.loc[i, 'dislike_count'] = get_statistics(video_info, 'dislikeCount')
        df.loc[i, 'comment_count'] = get_statistics(video_info, 'commentCount')
        # Get skills from description
        if full_description is not None:
            full_text += ' ' + full_description
        else:
            full_text += ' ' + row['description']
        all_skills = extract_skills(full_text)
        keep_skills, _ = extract_ignore(all_skills)
        keep_skills.sort()
        if len(keep_skills) > 0:
            df.loc[i, 'skills'] = '; '.join(keep_skills)
        data_skills = extract_data_skills(row['data_skills'].split('; ') + keep_skills)
        if len(data_skills) > 0:
            df.loc[i, 'data_skills'] = '; '.join(data_skills)
    return df

def get_youtube_videos(query, max_results, order, start_date=None):
    base_url = 'https://www.googleapis.com/youtube/v3/search'
    params = {'q': query, 'part': 'snippet', 'type': 'video', 'maxResults': max_results, 'order': order,
              'key': api_keys['youtube']}
    if start_date is not None:
        params['publishedAfter'] = start_date
    page = requests.get(base_url, params=params, headers=random.choice(headers_list))
    if page.status_code != 200:
        print(page, page.reason)
        return None
    res = json.loads(page.content.decode('utf8'))
    video_dict = []
    video_url = 'https://youtube.com/watch?v='
    for i, item in enumerate(res['items']):
        video_id = get_object(item['id'], 'videoId')
        snippet = item['snippet']
        video_dict.append({
            'id': video_id,
            'title': get_object(snippet, 'title'),
            'channel': get_object(snippet, 'channelTitle'),
            'url': video_url + video_id,
            'published_date': get_object(snippet, 'publishTime'),
            'description': get_object(snippet, 'description'),
            'relevance': i+1
        })
    return pd.DataFrame.from_dict(video_dict)

def get_video_info(video_id):
    base_url = 'https://www.googleapis.com/youtube/v3/videos'
    params = {'part': ['snippet', 'statistics'], 'id': video_id, 'key': api_keys['youtube']}
    page = requests.get(base_url, params=params, headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    try:
        res = json.loads(page.content.decode('utf8'))
        return res
    except:
        return None

def get_object(item, key):
    try:
        return item[key]
    except:
        return None

def get_snippet(res, info):
    try:
        return res['items'][0]['snippet'][info]
    except:
        return None

def get_statistics(res, info):
    try:
        return res['items'][0]['statistics'][info]
    except:
        return None

In [12]:
# start_date = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=365.25)).isoformat()
start_date = dt.datetime(2021, 1, 1).replace(tzinfo=dt.timezone.utc).isoformat()
df_yt = youtube_scraper(30, 'relevance', start_date)
df_yt.head()

APACHE
<Response [403]> Forbidden


TypeError: 'NoneType' object does not support item assignment

## Medium

In [None]:
def medium_scraper(tag, date):
    base_url = 'https://medium.com/tag/{}/archive/'
    url = base_url.format(tag) + date.strftime('%Y/%m/%d')
    page = requests.get(url, headers=random.choice(headers_list))
    soup = BeautifulSoup(page.content, 'html.parser')
    # Pulls each card from the feed. Each card is a story or comment
    cards = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')
    card_list = []
    for card in cards:
        title = get_title(card)
        subtitle = get_subtitle(card)
        claps = get_claps(card)
        if title is None or is_comment(card) or claps is None:
            continue
        if claps < 100:
            continue
        skills, data_skills = get_skills(title, subtitle)
        card_list.append({
            'id': get_id(card),
            'title': title,
            'subtitle': subtitle,
            'author': get_author(card),
            'publication': get_publication(card),
            'published_date': date,
            'read_time_mins': get_read_time(card),
            'claps': claps,
            'url': get_url(card),
            'skills': skills,
            'data_skills': data_skills,
        })
    df = pd.DataFrame.from_dict(card_list)
    return df

def get_id(card):
    id_ = card.find('div', class_='postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls')
    if id_ is not None:
        return id_['data-post-id']
    return id_

def get_title(card):
    # Different combination of classes possible for titles
    combinations = [('h3', 'graf graf--h3 graf-after--figure graf--title'),
                    ('h3', 'graf graf--h3 graf-after--figure graf--trailing graf--title'),
                    ('h4', 'graf graf--h4 graf--leading'),
                    ('h3', 'graf graf--h3 graf--leading graf--title'),
                    ('p', 'graf graf--p graf--leading'),
                    ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf--leading graf--title'),
                    ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf-after--figure graf--trailing graf--title')]
    title = None
    for combi in combinations:
        title = card.find(combi[0], class_=combi[1])
        if title is not None:
            return title.text
    return title

def get_subtitle(card):
    # Different combination of classes possible for subtitles
    combinations = [('h4', 'graf graf--h4 graf-after--h3 graf--subtitle'),
                    ('h4', 'graf graf--h4 graf-after--h3 graf--trailing graf--subtitle'),
                    ('strong', 'markup--strong markup--p-strong'),
                    ('h4', 'graf graf--p graf-after--h3 graf--trailing'),
                    ('p', 'graf graf--p graf-after--h3 graf--trailing'),
                    ('blockquote', 'graf graf--pullquote graf-after--figure graf--trailing'),
                    ('p', 'graf graf--p graf-after--figure'),
                    ('blockquote', 'graf graf--blockquote graf-after--h3 graf--trailing'),
                    ('p', 'graf graf--p graf-after--figure graf--trailing'),
                    ('em', 'markup--em markup--p-em'),
                    ('p', 'graf graf--p graf-after--p graf--trailing')]
    subtitle = None
    for combi in combinations:
        subtitle = card.find(combi[0], class_=combi[1])
        if subtitle is not None:
            return subtitle.text
    return subtitle

def get_author(card):
    author = card.find('a', class_='ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken')
    if author is not None:
        return author.text
    return author

def get_publication(card):
    pub = card.find('a', class_='ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal')
    if pub is not None:
        return pub.text
    return pub

def get_read_time(card):
    time = card.find('span', class_='readingTime')
    if time is not None:
        time = time['title']
        return time.replace(' min read', '')
    return time

def get_claps(card):
    claps = card.find('button', class_='button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents')
    if claps is not None:
        claps = claps.text
        if 'K' in claps:
            try:
                return int(float(claps.replace('K', '')) * 1000)
            except:
                return None
        else:
            try:
                return int(claps)
            except:
                return None
    return claps

def is_comment(card):
    # Check if card is a story or comment
    comment = card.find('div', class_='u-fontSize14 u-marginTop10 u-marginBottom20 u-padding14 u-xs-padding12 u-borderRadius3 u-borderCardBackground u-borderLighterHover u-boxShadow1px4pxCardBorder')
    return comment is not None

def get_url(card):
    url = card.find('a', class_='')
    if url is not None:
        return url['href'].split('?')[0]
    return url

def get_skills(title, subtitle):
    context = title
    if subtitle is not None:
        context = context + ' ' + subtitle
    all_skills = extract_skills(context)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    if len(keep_skills) > 0:
        data_skills = extract_data_skills(keep_skills)
        if len(data_skills) > 0:
            return '; '.join(keep_skills), '; '.join(data_skills)
        return '; '.join(keep_skills), None
    return None, None

In [None]:
tag = 'data-science'
start_date = dt.datetime(2021, 1, 1)
end_date = dt.datetime(2021, 9, 22)
current_date = start_date

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if i == 0:
        df.to_csv('database/medium_data_science.csv', index=False)
    else:
        df.to_csv('database/medium_data_science.csv', index=False, mode='a', header=False)
    current_date = current_date + dt.timedelta(days=1)
    time.sleep(3)

In [None]:
tag = 'machine-learning'
start_date = dt.datetime(2021, 1, 1)
end_date = dt.datetime(2021, 9, 22)
current_date = start_date

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if i == 0:
        df.to_csv('database/medium_machine_learning.csv', index=False)
    else:
        df.to_csv('database/medium_machine_learning.csv', index=False, mode='a', header=False)
    current_date = current_date + dt.timedelta(days=1)
    time.sleep(3)

In [None]:
df_med = pd.read_csv('database/medium_data_science.csv')
df_med = df_med.append(pd.read_csv('database/medium_data_science.csv'))
df_med = df_med.drop_duplicates(subset=['id'])
df_med = df_med.sort_values(by=['published_date', 'id'])
df_med['url'] = df_med['url'].apply(lambda x: x.split('?')[0])
df_med = df_med.reset_index(drop=True)
df_med.head()

In [None]:
len(df_med), len(df_med.loc[~df_med['data_skills'].isna()])

In [None]:
df_med.to_csv('database/medium.csv', index=False)

In [6]:
engine = create_engine(settings['skills_db2'])
df_jmlr = pd.read_csv('database/jmlr.csv')
df_jmlr.to_sql('ContentJMLR', engine, index=False)

ValueError: Table 'ContentJMLR' already exists.

In [7]:
df_med = pd.read_csv('database/medium.csv')
df_med['published_date'] = pd.to_datetime(df_med['published_date'])
df_med.to_sql('ContentMedium', engine, index=False, if_exists='replace')
engine.dispose()