In [14]:
import random
import re
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore

In [2]:
# Site root, and the index page for volume 22 built from it.
BASE_URL = 'https://jmlr.org'
url = f'{BASE_URL}/papers/v22/'
url

'https://jmlr.org/papers/v22/'

In [3]:
# Fetch the volume index using a header set picked at random from headers_list.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# which would hang the kernel if the server stops responding.
page = requests.get(url, headers=random.choice(headers_list), timeout=30)
page

<Response [200]>

In [5]:
# Parse the volume index: every <dl> is treated as one paper entry, with the
# title read from its <dt> and the authors, journal reference and links read
# from its <dd>.
soup = BeautifulSoup(page.content, 'html.parser')
dls = soup.find_all('dl')
papers = []

for dl in dls:
    dt = dl.find('dt')
    dd = dl.find('dd')
    if dt is None or dd is None:
        # Incomplete entry: skip it instead of failing on a None lookup.
        continue

    dd_text = dd.get_text()
    paper = {
        'title': dt.get_text(),
        # Authors are the text before the first ';'.
        'authors': dd_text.split(';')[0].strip(),
        # Journal reference is the first line of the text after the last ';'.
        'journal_num': dd_text.split(';')[-1].split('\n')[0].strip(),
    }

    for a in dd.find_all('a'):
        label = a.get_text()
        href = a.get('href')
        # Skip the MLOSS marker link and any anchor that carries no href.
        if label == '(Machine Learning Open Source Software Paper)' or not href:
            continue
        # Resolve against the URL of the fetched page: absolute links are kept
        # unchanged and relative ones (with or without a leading '/') become
        # full URLs. The link text is used as the dict key.
        paper[label] = urljoin(page.url, href)
    papers.append(paper)

# Preview only the first few entries rather than dumping the whole list.
papers[:3]

[{'title': 'On the Optimality of Kernel-Embedding Based Goodness-of-Fit Tests',
  'authors': 'Krishnakumar Balasubramanian, Tong Li, Ming Yuan',
  'journal_num': '(1):1−45, 2021.',
  'abs': 'https://jmlr.org/papers/v22/17-570.html',
  'pdf': 'https://jmlr.org/papers/volume22/17-570/17-570.pdf',
  'bib': 'https://jmlr.org/papers/v22/17-570.bib'},
 {'title': 'Domain Generalization by Marginal Transfer Learning',
  'authors': 'Gilles Blanchard, Aniket Anand Deshmukh, Urun Dogan, Gyemin Lee, Clayton Scott',
  'journal_num': '(2):1−55, 2021.',
  'abs': 'https://jmlr.org/papers/v22/17-679.html',
  'pdf': 'https://jmlr.org/papers/volume22/17-679/17-679.pdf',
  'bib': 'https://jmlr.org/papers/v22/17-679.bib',
  'code': 'https://github.com/aniketde/DomainGeneralizationMarginal'},
 {'title': 'Regulating Greed Over Time in Multi-Armed Bandits',
  'authors': 'Stefano Tracà, Cynthia Rudin, Weiyu Yan',
  'journal_num': '(3):1−99, 2021.',
  'abs': 'https://jmlr.org/papers/v22/17-720.html',
  'pdf': '

In [4]:
def get_abstract_skills(paper):
    """Fetch a paper's abstract page and extract skills from title + abstract.

    Args:
        paper: dict holding the paper's 'title' and, under 'abs', the URL of
            its abstract page.

    Returns:
        A tuple ``(abstract, keep_skills)`` where ``abstract`` is the abstract
        text and ``keep_skills`` is the sorted list of kept skills, or ``None``
        when the paper has no 'abs' URL, the request fails (network error,
        timeout or non-200 status), or the page has no
        ``<p class="abstract">`` element.
    """
    abs_url = paper.get('abs')
    if not abs_url:
        return None
    try:
        # The timeout keeps one unresponsive page from stalling the whole crawl.
        page = requests.get(
            abs_url, headers=random.choice(headers_list), timeout=30
        )
    except requests.RequestException:
        # Same "no result" signal as a bad status code; callers check for None.
        return None
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    abstract_tag = soup.find('p', class_='abstract')
    if abstract_tag is None:
        return None
    abstract = abstract_tag.get_text().strip('\n')
    all_skills = extract_skills(paper['title'] + ' ' + abstract)
    # extract_ignore returns a pair; only its first element is used here.
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    return abstract, keep_skills

In [7]:
# Attach the abstract and a '; '-joined skill string to every paper whose
# abstract lookup returned a result; papers without one are left unchanged.
for paper in papers:
    result = get_abstract_skills(paper)
    if result is None:
        continue
    paper['abstract'] = result[0]
    paper['skills'] = '; '.join(result[1])

In [11]:
# One row per paper dict; keys a paper lacks show up as NaN in its row.
df = pd.DataFrame(papers)
df.head()

Unnamed: 0,title,authors,journal_num,abs,pdf,bib,abstract,skills,code,supplementary,website,blog
0,On the Optimality of Kernel-Embedding Based Go...,"Krishnakumar Balasubramanian, Tong Li, Ming Yuan","(1):1−45, 2021.",https://jmlr.org/papers/v22/17-570.html,https://jmlr.org/papers/volume22/17-570/17-570...,https://jmlr.org/papers/v22/17-570.bib,The reproducing kernel Hilbert space (RKHS) em...,Testing,,,,
1,Domain Generalization by Marginal Transfer Lea...,"Gilles Blanchard, Aniket Anand Deshmukh, Urun ...","(2):1−55, 2021.",https://jmlr.org/papers/v22/17-679.html,https://jmlr.org/papers/volume22/17-679/17-679...,https://jmlr.org/papers/v22/17-679.bib,"In the problem of domain generalization (DG), ...",Algorithm; Analysis; Supervised Learning; Trai...,https://github.com/aniketde/DomainGeneralizati...,,,
2,Regulating Greed Over Time in Multi-Armed Bandits,"Stefano Tracà, Cynthia Rudin, Weiyu Yan","(3):1−99, 2021.",https://jmlr.org/papers/v22/17-720.html,https://jmlr.org/papers/volume22/17-720/17-720...,https://jmlr.org/papers/v22/17-720.bib,"In retail, there are predictable yet dramatic ...",Algorithm; Analysis; Exploit; Retail; Sentry; ...,https://github.com/ShrekFelix/Regulating-Greed...,,,
3,An Empirical Study of Bayesian Optimization: A...,"Erich Merrill, Alan Fern, Xiaoli Fern, Nima Do...","(4):1−25, 2021.",https://jmlr.org/papers/v22/18-220.html,https://jmlr.org/papers/volume22/18-220/18-220...,https://jmlr.org/papers/v22/18-220.bib,Bayesian optimization (BO) is a popular framew...,Accounting; Algorithm; Bayesian Optimization; ...,https://github.com/Eiii/opt_cmp,,,
4,The Decoupled Extended Kalman Filter for Dynam...,"Carlos A. Gomez-Uribe, Brian Karrer","(5):1−25, 2021.",https://jmlr.org/papers/v22/18-417.html,https://jmlr.org/papers/volume22/18-417/18-417...,https://jmlr.org/papers/v22/18-417.bib,Motivated by the needs of online large-scale r...,Exploit; Modelling; Uncertainty,,,,


In [12]:
# Show the rows that have no 'skills' value (papers for which the abstract
# lookup produced no result).
df[df['skills'].isna()]

Unnamed: 0,title,authors,journal_num,abs,pdf,bib,abstract,skills,code,supplementary,website,blog


In [13]:
# Write the table to results/jmlr.csv without the index column. The output
# directory is created first so the export also works on a fresh checkout.
output_path = Path('results') / 'jmlr.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)