In [93]:
from bs4 import BeautifulSoup as bs
import urllib.request
import difflib
from bs4 import NavigableString
import numpy as np
import operator

In [94]:
def get_paper_details(url):
    """
    Input: url string for scirate page for a specific paper
    Output: dictionary corresponding to paper with keys authors, scites, title and arxivID
    """
    source = urllib.request.urlopen(url).read()
    soup = bs(source,"html.parser")

    paper = {}
    paper['authors'] = []
    paper['scites'] = []
    paper['title'] = None
    paper['arxivID'] = None

    #Get title
    div_title = soup.find_all("h1", class_="title")
    paper['title'] = div_title[0].contents[0]

    #Get arXivID
    
    #Get authors
    div_authors = soup.find_all("ul", class_="authors")
    for div_author in div_authors:
        a_href_authors = div_author.find_all('a')
        for a_href_author in a_href_authors:
            paper['authors'].append(a_href_author.contents[0])

    #Get scites
    div_scites = soup.find_all("div", class_="scites")
    for scite in div_scites:
        a_href_names = scite.find_all('a')
        for a_href_name in a_href_names:
            paper['scites'].append(a_href_name.contents[0])

    return paper

In [3]:
#Test
url = 'https://scirate.com/arxiv/1612.06203'
get_paper_details(url)

{'arxivID': None,
 'authors': ['Dominic J. Moylett', 'Noah Linden', 'Ashley Montanaro'],
 'scites': ['Adam Paetznick',
  'Andrew Childs',
  'Anna Pappa',
  'Aram Harrow',
  'Ciarán Lee',
  'Daniel Brod',
  'Daniel Freeman',
  'David Elkouss',
  'David Gosset',
  'David Gross',
  'Dawei Ding',
  'Dominic Williamson',
  'Elizabeth Crosson',
  'Evgeniy Zheltonozhskiy',
  'Fabiano Andrade',
  'Han-Hsuan Lin',
  'Ion Nechita',
  'Jamie Parkinson',
  "Joe O'Gorman",
  'Man-Hong Yung',
  'Mario Berta',
  'Māris Ozols',
  'Martin Schwarz',
  'Michael Jarret',
  'mick',
  'Noon van der Silk',
  'Quntao Zhuang',
  'Raul Garcia-Patron',
  'Ryan L. Mann',
  'Sam Roberts',
  'Srinivasan',
  'Stephen Bartlett',
  'Stephen Jordan',
  'Steve Flammia',
  'Stuart Hadfield',
  'Supartha Podder',
  'Tom Wong',
  'Tongyang Li',
  'Varun Narasimhachar',
  'Xin Wang',
  'Yichen Huang',
  'Yinan Li',
  'Yong-Zhen Xu',
  'Zak Webb',
  'Zoltán Zimborás'],
 'title': 'Quantum speedup of the Travelling Salesman Pr

In [95]:
def get_url_list(date, Range):
    """
    Input: date, range (specified as number of days behind date)
    Output: dictionary of link-dictionaries for papers published within those dates
            Scans the first 20 pages of scirate
    """
    paperlist = {}
    for p in range(1,21):
        url = 'https://scirate.com/?date='+str(date)+'&page='+str(p)+'&range='+str(Range)
        source = urllib.request.urlopen(url).read()
        soup = bs(source,'html.parser')
        
        listing_page = soup.find_all("div",class_="paperlist")
        div_titles = listing_page[0].find_all("div",class_="title")
        number = 50*(p-1) + 0
        for div_title in div_titles:
            a_href_title = div_title.find_all('a')[0]
            title = a_href_title.contents[0]
            url = "https://scirate.com"+ a_href_title['href']
            paperlist[str(number)] = {}
            paperlist[str(number)]['url'] = url
            number = number + 1

        length = len(paperlist)

    return paperlist

In [97]:
paperlist = get_url_list('2017-01-19','500')
paperlist

{'0': {'url': 'https://scirate.com/arxiv/1603.03039'},
 '1': {'url': 'https://scirate.com/arxiv/1604.07450'},
 '2': {'url': 'https://scirate.com/arxiv/1604.01790'},
 '3': {'url': 'https://scirate.com/arxiv/1601.07601'},
 '4': {'url': 'https://scirate.com/arxiv/1511.04206'},
 '5': {'url': 'https://scirate.com/arxiv/1511.00657'},
 '6': {'url': 'https://scirate.com/arxiv/1606.03140'},
 '7': {'url': 'https://scirate.com/arxiv/1611.06999'},
 '8': {'url': 'https://scirate.com/arxiv/1609.05537'},
 '9': {'url': 'https://scirate.com/arxiv/1607.08473'},
 '10': {'url': 'https://scirate.com/arxiv/1607.05256'},
 '11': {'url': 'https://scirate.com/arxiv/1701.01062'},
 '12': {'url': 'https://scirate.com/arxiv/1611.04471'},
 '13': {'url': 'https://scirate.com/arxiv/1602.07674'},
 '14': {'url': 'https://scirate.com/arxiv/1510.02082'},
 '15': {'url': 'https://scirate.com/arxiv/1611.05450'},
 '16': {'url': 'https://scirate.com/arxiv/1512.03547'},
 '17': {'url': 'https://scirate.com/arxiv/1511.02306'},
 '

In [98]:
def get_all_papers(paperlist):
    """
    Input: dictionary of link-scite-dictionary
    Output: dictionary of paper-dictionaries
    """
    length = len(paperlist)
    for i in range(length):
        url_to_paper = paperlist[str(i)]['url']
        paper_details = get_paper_details(url_to_paper)
        paperlist[str(i)]['authors'] = paper_details['authors']
        paperlist[str(i)]['scites'] = paper_details['scites']
        paperlist[str(i)]['title'] = paper_details['title']
        paperlist[str(i)]['arxivID'] = paper_details['arxivID']
    return paperlist

In [99]:
#Test
scirate_papers = get_all_papers(paperlist)

In [100]:
def get_qip2017_accepted_titles():
    """
    Input: None
    Output: Dictionary of dictionaries, each representing a paper accepted to QIP 2017
                each inner dictionary containing title and authors
    """
    url = 'https://stationq.microsoft.com/qip-2017details/'
    source = urllib.request.urlopen(url).read()
    soup = bs(source,'html.parser')

    div_papers = soup.find_all('ul')

    plenary_talks_tags = div_papers[9]
    accepted_talks_tags = div_papers[10]

    plenary_talks_names = plenary_talks_tags.get_text()
    plenary_talks_papers = plenary_talks_names.split('\n')[1:-1]
    plenary_talks_dict = {}

    accepted_talks_names = accepted_talks_tags.get_text()
    accepted_talks_papers = accepted_talks_names.split('\n')[1:-1]
    accepted_talks_dict = {}

    counter = 0
    for name in plenary_talks_papers:
        split = name.split('—')
        plenary_talks_dict[str(counter)] = {}
        plenary_talks_dict[str(counter)]['title'] = split[0]
        list_names = split[1].split(',')
        plenary_talks_dict[str(counter)]['authors'] = []
        for list_name in list_names:
            plenary_talks_dict[str(counter)]['authors'].append(list_name.strip().replace('and ',''))
        counter += 1

    counter = 0
    for name in accepted_talks_papers:
        if name == "merged with":
            continue
        elif name == "and":
            continue
        else:
            split = name.split('—')
            accepted_talks_dict[str(counter)] = {}
            accepted_talks_dict[str(counter)]['title'] = split[0]
            if ',' in split[1]:
                list_names = split[1].split(',')
            else:
                list_names = split[1].split('and ')
            accepted_talks_dict[str(counter)]['authors'] = []
            for list_name in list_names:
                accepted_talks_dict[str(counter)]['authors'].append(list_name.strip().replace('and ',''))
            counter += 1

    return plenary_talks_dict, accepted_talks_dict

In [101]:
def get_qip2017_posters():
    """
    Input: None
    Output: Dictionary of dictionaries, each representing a paper accepted to QIP 2017
                each inner dictionary containing title and authors
    """
    url = 'https://stationq.microsoft.com/qip-2017details/'
    source = urllib.request.urlopen(url).read()
    soup = bs(source,'html.parser')

    div_papers = soup.find_all('ul')

    posters_tags = div_papers[11]

    posters_names = posters_tags.get_text()
    posters_papers = posters_names.split('\n')[1:-1]
    posters_dict = {}

    counter = 0
    for name in posters_papers:
        if name == "merged with":
            continue
        elif name == "and":
            continue
        else:
            split = name.split('—')
            posters_dict[str(counter)] = {}
            posters_dict[str(counter)]['title'] = split[0]
            if ',' in split[1]:
                list_names = split[1].split(',')
            else:
                list_names = split[1].split('and ')
            posters_dict[str(counter)]['authors'] = []
            for list_name in list_names:
                posters_dict[str(counter)]['authors'].append(list_name.strip().replace('and ',''))
            counter += 1

    return posters_dict

In [102]:
plenary_talks, accepted_talks = get_qip2017_accepted_titles()

In [103]:
rejected_talks = get_qip2017_posters()

In [106]:
#List of titles accepted to QIP
accepted_titles = []
for no_sci, paper_info in accepted_talks.items():
    accepted_titles.append(paper_info['title'])

#List of titles rejected from QIP
rejected_titles = []
for no_sci, paper_info in rejected_talks.items():
    rejected_titles.append(paper_info['title'])

#List of scite count for papers used to calculate avg, median, std_devn
accept_scite_count = []
reject_scite_count = []

for no_acc, details in scirate_papers.items():
    accept_match = difflib.get_close_matches(details['title'], accepted_titles, cutoff = 0.7)
    reject_match = difflib.get_close_matches(details['title'], rejected_titles, cutoff = 0.7)
    if accept_match != []:
        accept_scite_count.append(len(details['scites']))
    if reject_match != []:
        reject_scite_count.append(len(details['scites']))
    
accept_scite_count = np.array(accept_scite_count)
reject_scite_count = np.array(reject_scite_count)

accept_avg_scites = np.mean(accept_scite_count)
accept_median_scites = np.median(accept_scite_count)

reject_avg_scites = np.mean(reject_scite_count)
reject_median_scites = np.median(reject_scite_count)

print('\n-----------------------------------------------------\n')
print('------------ Accept stats ---------------------------')
print('\n-----------------------------------------------------\n')

print('Average number of scites for accepted papers: ' + str(accept_avg_scites))
print('Median number of scites for accepted papers: ' + str(accept_median_scites))

print('\n-----------------------------------------------------\n')
print('------------ Reject stats ---------------------------')
print('\n-----------------------------------------------------\n')

print('Average number of scites for rejected papers: ' + str(reject_avg_scites))
print('Median number of scites for rejected papers: ' + str(reject_median_scites))


-----------------------------------------------------

------------ Accept stats ---------------------------

-----------------------------------------------------

Average number of scites for accepted papers: 30.4166666667
Median number of scites for accepted papers: 29.5

-----------------------------------------------------

------------ Reject stats ---------------------------

-----------------------------------------------------

Average number of scites for rejected papers: 23.24
Median number of scites for rejected papers: 21.0


In [121]:
def get_cabal():
    sciters = []
    for no_sci, details in scirate_papers.items():
        for sciter in details['scites']:
            sciters.append(sciter)

    sciters = set(sciters)

    score = {}

    for sciter in sciters:
        score[sciter] = [0,0]

    for no_acc, details in scirate_papers.items():
        accepted_match = difflib.get_close_matches(details['title'], accepted_titles, cutoff = 0.7)
        rejected_match = difflib.get_close_matches(details['title'], rejected_titles, cutoff = 0.7)
        if accepted_match != []:            
            for sciter in details['scites']:
                score[sciter][0] = score[sciter][0] + 1
                score[sciter][1] = score[sciter][1] + 1
        if rejected_match != []:
            for sciter in details['scites']:
                score[sciter][0] = score[sciter][0] - 1
                score[sciter][1] = score[sciter][1] + 1

    for name, score_tuple in score.items():
        if score_tuple[1] != 0:
            normalized_score = score_tuple[0]/score_tuple[1]
            score[name].append(normalized_score)
        elif score_tuple[1] == 0:
            score[name].append(-1)

    return score

In [122]:
cabal = get_cabal()

In [124]:
sorted(cabal.items(), key=lambda x:x[1][2])

[('Robie Hennigar', [0, 0, -1]),
 ('Piers Lillystone', [0, 0, -1]),
 ('Serge-Olivier Paquette', [0, 0, -1]),
 ('Christian Gogolin ', [0, 0, -1]),
 ('Jiajun Ma', [0, 0, -1]),
 ('Deniz Stiegemann', [0, 0, -1]),
 ('Jing-Yan Haw', [0, 0, -1]),
 ('heather', [0, 0, -1]),
 ('Ryuji Takagi', [-1, 1, -1.0]),
 ('Teddy Megumi', [0, 0, -1]),
 ('Daniel Nagaj', [-1, 1, -1.0]),
 ('Marius Lewerenz', [0, 0, -1]),
 ('Carlo Ottaviani', [0, 0, -1]),
 ('Ken Brown', [0, 0, -1]),
 ('Wolfgang Pfaff', [0, 0, -1]),
 ('Senaida Hernández-Santana', [0, 0, -1]),
 ('Hamed Mohammady', [-1, 1, -1.0]),
 ('Nai-Hui Chia', [0, 0, -1]),
 ('Sacha Schwarz', [0, 0, -1]),
 ('Siva', [-1, 1, -1.0]),
 ('Ashley Milsted', [-1, 1, -1.0]),
 ('JGarre', [0, 0, -1]),
 ('Ali Husain', [-1, 1, -1.0]),
 ('Tomasz Darmetko', [0, 0, -1]),
 ('Jiannis Pachos', [0, 0, -1]),
 ('Mark Pearce', [0, 0, -1]),
 ('mmanu', [0, 0, -1]),
 ('Giuseppe Carleo', [0, 0, -1]),
 ('Jean-Charles Vialatte', [0, 0, -1]),
 ('David Perez-Garcia', [0, 0, -1]),
 ('Zi-Wen L

In [110]:
scirate_titles = []
for no_sci, paper_details in scirate_papers.items():
    scirate_titles.append(paper_details['title'])

unaccounted_for = dict(accepted_talks)

for no_acc, details in accepted_talks.items():
    accept_match = difflib.get_close_matches(details['title'], scirate_titles, cutoff = 0.7)
    if accept_match != []:
        del unaccounted_for[no_acc]

for key, value in unaccounted_for.items():
    print(value['title'])