In [346]:
import json
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from lxml import etree

In [347]:
BASE_URL = 'http://papers.nips.cc'

### Creating the base class for storing the article metadata

In [348]:
class NipsPaper:
    def __init__(self,year,title,doi,url,authors,abstract):
        self.year = year
        self.title = title
        self.doi = doi
        self.abstract = abstract
        self.authors = authors
        self.url = url
    
    
    def to_json(self):
        return {
            'id': self.doi,
            'title': self.title,
            'year' : self.year,
            'url': self.url,
            'abstract': self.abstract,
            'authors': self.authors
        }

### Functions for crawling the NeurIPS website

In [349]:
def get_conference_links(year_from, year_to):
    base = BASE_URL + '/paper_files/paper/'
    number_year_from = year_from - FIRST_YEAR + 1
    number_year_to = year_to - _FIRST_YEAR + 1
    for i in range(year_from,year_to +1):
        year = str(i)
        url = base + year
        yield (url,year)

        
def get_papers_year(url):
    url_request = requests.get(url)
    soup = BeautifulSoup(url_request.content,'html5lib')
    papers_in_html = soup.find_all('a',attrs = {'title':'paper title'})
    for paper in papers_in_html:
        paper_link = BASE_URL + paper["href"]
        yield paper_link

        
def get_paper_info(paper_url):
    url_request = requests.get(paper_url)
    soup = BeautifulSoup(url_request.content,'html5lib')
    #Paper title
    paper_title = soup.find('title').text
    #Paper authors
    authors_tags = soup.find_all('meta',attrs = {'name':'citation_author'})
    authors = []
    for author in authors_tags:
        authors.append(author['content'])
    #Paper publication date
    date = soup.find('meta',attrs = {'name':'citation_publication_date'})['content']
    #Paper url
    article_url = soup.find('meta',attrs = {'name':'citation_pdf_url'})['content']
    #Abstract 
    abstract_tag = soup.find('h4',text='Abstract')
    abstract_text = ''
    for p in abstract_tag.find_all_next('p'):
        if len(p.text) != 0:
            abstract_text += p.text
            break
    #Building paper ID
    splitting_url = article_url.split('/')
    index_hash = splitting_url.index("file")
    paper_id = splitting_url[index_hash + 1].split('-')[0]
    return NipsPaper(date,paper_title,paper_id,article_url,authors,abstract_text)

### Fetching data from 2021 and 2022

In [350]:
dumping_json = []
start_year = 2021
end_year = 2022
cpt = 0
base_folder = 'NIPS_papers_'
for url in tqdm(get_conference_links(start_year,end_year),'getting_conference_links'):
    time.sleep(0.1)
    output_folder = base_folder + str(start_year + cpt)
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    for paper_link in tqdm(get_papers_year(url[0])):
        time.sleep(0.1)
        nips_paper = get_paper_info(paper_link)
        dumping_json.append(nips_paper.to_json())
    cpt += 1
with open('dataset.json', 'w') as file:
    json.dump(dumping_json, file, indent=2)

getting_conference_links: 0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1it [00:05,  5.14s/it][A
2it [00:06,  2.89s/it][A
3it [00:07,  2.09s/it][A
4it [00:08,  1.75s/it][A
5it [00:10,  1.60s/it][A
6it [00:11,  1.49s/it][A
7it [00:12,  1.39s/it][A
8it [00:13,  1.31s/it][A
9it [00:14,  1.26s/it][A
10it [00:16,  1.28s/it][A
11it [00:17,  1.22s/it][A
12it [00:18,  1.21s/it][A
13it [00:19,  1.22s/it][A
14it [00:20,  1.22s/it][A
15it [00:22,  1.27s/it][A
16it [00:23,  1.25s/it][A
17it [00:24,  1.27s/it][A
18it [00:26,  1.27s/it][A
19it [00:27,  1.26s/it][A
20it [00:28,  1.28s/it][A
21it [00:29,  1.23s/it][A
22it [00:30,  1.20s/it][A
23it [00:32,  1.25s/it][A
24it [00:33,  1.23s/it][A
25it [00:34,  1.19s/it][A
26it [00:36,  1.27s/it][A
27it [00:37,  1.33s/it][A
28it [00:38,  1.31s/it][A
29it [00:40,  1.36s/it][A
30it [00:41,  1.32s/it][A
31it [00:42,  1.31s/it][A
32it [00:44,  1.30s/it][A
33it [00:45,  1.47s/it][A
34it [00:47,  1.48s/it][A
35it [00:48,  1.42s/it]

295it [06:14,  1.30s/it][A
296it [06:16,  1.34s/it][A
297it [06:17,  1.32s/it][A
298it [06:18,  1.31s/it][A
299it [06:19,  1.28s/it][A
300it [06:20,  1.25s/it][A
301it [06:22,  1.26s/it][A
302it [06:23,  1.24s/it][A
303it [06:24,  1.21s/it][A
304it [06:25,  1.21s/it][A
305it [06:27,  1.28s/it][A
306it [06:29,  1.61s/it][A
307it [06:30,  1.51s/it][A
308it [06:32,  1.44s/it][A
309it [06:33,  1.39s/it][A
310it [06:34,  1.36s/it][A
311it [06:36,  1.33s/it][A
312it [06:37,  1.32s/it][A
313it [06:38,  1.36s/it][A
314it [06:40,  1.33s/it][A
315it [06:41,  1.28s/it][A
316it [06:42,  1.22s/it][A
317it [06:43,  1.29s/it][A
318it [06:44,  1.24s/it][A
319it [06:46,  1.30s/it][A
320it [06:47,  1.29s/it][A
321it [06:48,  1.34s/it][A
322it [06:50,  1.32s/it][A
323it [06:51,  1.33s/it][A
324it [06:52,  1.28s/it][A
325it [06:54,  1.29s/it][A
326it [06:55,  1.29s/it][A
327it [06:56,  1.33s/it][A
328it [06:58,  1.37s/it][A
329it [06:59,  1.39s/it][A
330it [07:00,  1.35s

587it [12:19,  1.12s/it][A
588it [12:20,  1.13s/it][A
589it [12:21,  1.11s/it][A
590it [12:22,  1.13s/it][A
591it [12:24,  1.23s/it][A
592it [12:25,  1.19s/it][A
593it [12:26,  1.17s/it][A
594it [12:27,  1.14s/it][A
595it [12:28,  1.13s/it][A
596it [12:30,  1.14s/it][A
597it [12:31,  1.14s/it][A
598it [12:32,  1.12s/it][A
599it [12:33,  1.11s/it][A
600it [12:34,  1.19s/it][A
601it [12:36,  1.23s/it][A
602it [12:37,  1.19s/it][A
603it [12:38,  1.17s/it][A
604it [12:39,  1.14s/it][A
605it [12:40,  1.12s/it][A
606it [12:41,  1.11s/it][A
607it [12:42,  1.10s/it][A
608it [12:43,  1.10s/it][A
609it [12:44,  1.11s/it][A
610it [12:46,  1.15s/it][A
611it [12:47,  1.20s/it][A
612it [12:48,  1.27s/it][A
613it [12:49,  1.21s/it][A
614it [12:51,  1.22s/it][A
615it [12:52,  1.24s/it][A
616it [12:53,  1.25s/it][A
617it [12:54,  1.28s/it][A
618it [12:56,  1.25s/it][A
619it [12:57,  1.22s/it][A
620it [12:58,  1.20s/it][A
621it [12:59,  1.17s/it][A
622it [13:00,  1.17s

879it [18:01,  1.12s/it][A
880it [18:02,  1.14s/it][A
881it [18:03,  1.19s/it][A
882it [18:05,  1.21s/it][A
883it [18:06,  1.19s/it][A
884it [18:07,  1.21s/it][A
885it [18:08,  1.23s/it][A
886it [18:09,  1.19s/it][A
887it [18:11,  1.22s/it][A
888it [18:12,  1.23s/it][A
889it [18:13,  1.24s/it][A
890it [18:15,  1.30s/it][A
891it [18:16,  1.29s/it][A
892it [18:17,  1.24s/it][A
893it [18:18,  1.25s/it][A
894it [18:20,  1.24s/it][A
895it [18:21,  1.27s/it][A
896it [18:22,  1.28s/it][A
897it [18:24,  1.28s/it][A
898it [18:25,  1.27s/it][A
899it [18:26,  1.33s/it][A
900it [18:27,  1.31s/it][A
901it [18:29,  1.32s/it][A
902it [18:30,  1.26s/it][A
903it [18:31,  1.25s/it][A
904it [18:32,  1.23s/it][A
905it [18:34,  1.27s/it][A
906it [18:35,  1.28s/it][A
907it [18:36,  1.33s/it][A
908it [18:38,  1.31s/it][A
909it [18:39,  1.28s/it][A
910it [18:40,  1.25s/it][A
911it [18:41,  1.26s/it][A
912it [18:43,  1.32s/it][A
913it [18:44,  1.35s/it][A
914it [18:46,  1.39s

1165it [23:48,  1.15s/it][A
1166it [23:49,  1.16s/it][A
1167it [23:50,  1.17s/it][A
1168it [23:51,  1.16s/it][A
1169it [23:52,  1.15s/it][A
1170it [23:54,  1.19s/it][A
1171it [23:55,  1.21s/it][A
1172it [23:56,  1.18s/it][A
1173it [23:57,  1.17s/it][A
1174it [23:58,  1.17s/it][A
1175it [23:59,  1.16s/it][A
1176it [24:00,  1.14s/it][A
1177it [24:02,  1.13s/it][A
1178it [24:03,  1.14s/it][A
1179it [24:04,  1.14s/it][A
1180it [24:05,  1.15s/it][A
1181it [24:06,  1.15s/it][A
1182it [24:07,  1.14s/it][A
1183it [24:08,  1.14s/it][A
1184it [24:10,  1.13s/it][A
1185it [24:11,  1.13s/it][A
1186it [24:12,  1.14s/it][A
1187it [24:13,  1.13s/it][A
1188it [24:14,  1.13s/it][A
1189it [24:15,  1.14s/it][A
1190it [24:16,  1.15s/it][A
1191it [24:18,  1.17s/it][A
1192it [24:19,  1.19s/it][A
1193it [24:20,  1.18s/it][A
1194it [24:21,  1.19s/it][A
1195it [24:22,  1.17s/it][A
1196it [24:24,  1.19s/it][A
1197it [24:25,  1.18s/it][A
1198it [24:26,  1.17s/it][A
1199it [24:27,

1447it [29:21,  1.19s/it][A
1448it [29:22,  1.19s/it][A
1449it [29:23,  1.18s/it][A
1450it [29:24,  1.18s/it][A
1451it [29:25,  1.18s/it][A
1452it [29:27,  1.20s/it][A
1453it [29:28,  1.18s/it][A
1454it [29:29,  1.19s/it][A
1455it [29:30,  1.20s/it][A
1456it [29:31,  1.19s/it][A
1457it [29:33,  1.36s/it][A
1458it [29:34,  1.30s/it][A
1459it [29:36,  1.27s/it][A
1460it [29:37,  1.24s/it][A
1461it [29:38,  1.23s/it][A
1462it [29:39,  1.22s/it][A
1463it [29:40,  1.23s/it][A
1464it [29:42,  1.22s/it][A
1465it [29:43,  1.21s/it][A
1466it [29:44,  1.20s/it][A
1467it [29:45,  1.19s/it][A
1468it [29:46,  1.19s/it][A
1469it [29:47,  1.21s/it][A
1470it [29:49,  1.23s/it][A
1471it [29:50,  1.21s/it][A
1472it [29:51,  1.20s/it][A
1473it [29:52,  1.20s/it][A
1474it [29:54,  1.21s/it][A
1475it [29:55,  1.25s/it][A
1476it [29:56,  1.22s/it][A
1477it [29:57,  1.23s/it][A
1478it [29:58,  1.21s/it][A
1479it [30:00,  1.20s/it][A
1480it [30:01,  1.20s/it][A
1481it [30:02,

1729it [35:28,  1.16s/it][A
1730it [35:29,  1.16s/it][A
1731it [35:30,  1.16s/it][A
1732it [35:31,  1.15s/it][A
1733it [35:32,  1.16s/it][A
1734it [35:33,  1.16s/it][A
1735it [35:35,  1.18s/it][A
1736it [35:36,  1.19s/it][A
1737it [35:37,  1.18s/it][A
1738it [35:38,  1.18s/it][A
1739it [35:39,  1.18s/it][A
1740it [35:40,  1.17s/it][A
1741it [35:42,  1.17s/it][A
1742it [35:43,  1.18s/it][A
1743it [35:44,  1.16s/it][A
1744it [35:45,  1.16s/it][A
1745it [35:46,  1.18s/it][A
1746it [35:47,  1.17s/it][A
1747it [35:49,  1.16s/it][A
1748it [35:50,  1.17s/it][A
1749it [35:51,  1.15s/it][A
1750it [35:52,  1.17s/it][A
1751it [35:53,  1.17s/it][A
1752it [35:54,  1.19s/it][A
1753it [35:56,  1.18s/it][A
1754it [35:57,  1.17s/it][A
1755it [35:58,  1.18s/it][A
1756it [35:59,  1.18s/it][A
1757it [36:00,  1.17s/it][A
1758it [36:01,  1.17s/it][A
1759it [36:03,  1.16s/it][A
1760it [36:04,  1.16s/it][A
1761it [36:05,  1.17s/it][A
1762it [36:06,  1.18s/it][A
1763it [36:07,

2011it [41:03,  1.17s/it][A
2012it [41:04,  1.19s/it][A
2013it [41:05,  1.19s/it][A
2014it [41:06,  1.20s/it][A
2015it [41:08,  1.19s/it][A
2016it [41:09,  1.19s/it][A
2017it [41:10,  1.19s/it][A
2018it [41:11,  1.19s/it][A
2019it [41:12,  1.18s/it][A
2020it [41:14,  1.23s/it][A
2021it [41:15,  1.25s/it][A
2022it [41:16,  1.24s/it][A
2023it [41:17,  1.23s/it][A
2024it [41:19,  1.22s/it][A
2025it [41:20,  1.22s/it][A
2026it [41:21,  1.21s/it][A
2027it [41:22,  1.22s/it][A
2028it [41:23,  1.20s/it][A
2029it [41:25,  1.31s/it][A
2030it [41:26,  1.35s/it][A
2031it [41:28,  1.29s/it][A
2032it [41:29,  1.25s/it][A
2033it [41:30,  1.23s/it][A
2034it [41:31,  1.23s/it][A
2035it [41:32,  1.22s/it][A
2036it [41:34,  1.22s/it][A
2037it [41:35,  1.21s/it][A
2038it [41:36,  1.21s/it][A
2039it [41:37,  1.20s/it][A
2040it [41:38,  1.19s/it][A
2041it [41:39,  1.19s/it][A
2042it [41:41,  1.21s/it][A
2043it [41:42,  1.22s/it][A
2044it [41:43,  1.21s/it][A
2045it [41:44,

2293it [46:46,  1.21s/it][A
2294it [46:47,  1.21s/it][A
2295it [46:48,  1.22s/it][A
2296it [46:50,  1.21s/it][A
2297it [46:51,  1.21s/it][A
2298it [46:52,  1.19s/it][A
2299it [46:53,  1.20s/it][A
2300it [46:54,  1.20s/it][A
2301it [46:56,  1.20s/it][A
2302it [46:57,  1.21s/it][A
2303it [46:58,  1.20s/it][A
2304it [46:59,  1.19s/it][A
2305it [47:01,  1.24s/it][A
2306it [47:02,  1.26s/it][A
2307it [47:03,  1.24s/it][A
2308it [47:04,  1.21s/it][A
2309it [47:05,  1.23s/it][A
2310it [47:07,  1.23s/it][A
2311it [47:08,  1.21s/it][A
2312it [47:09,  1.20s/it][A
2313it [47:10,  1.20s/it][A
2314it [47:11,  1.19s/it][A
2315it [47:13,  1.20s/it][A
2316it [47:14,  1.19s/it][A
2317it [47:15,  1.18s/it][A
2318it [47:16,  1.19s/it][A
2319it [47:17,  1.19s/it][A
2320it [47:19,  1.18s/it][A
2321it [47:20,  1.19s/it][A
2322it [47:21,  1.19s/it][A
2323it [47:22,  1.19s/it][A
2324it [47:23,  1.21s/it][A
2325it [47:25,  1.20s/it][A
2326it [47:26,  1.19s/it][A
2327it [47:27,

251it [05:09,  1.18s/it][A
252it [05:10,  1.17s/it][A
253it [05:11,  1.17s/it][A
254it [05:15,  2.14s/it][A
255it [05:16,  1.86s/it][A
256it [05:18,  1.64s/it][A
257it [05:19,  1.51s/it][A
258it [05:20,  1.41s/it][A
259it [05:21,  1.36s/it][A
260it [05:22,  1.30s/it][A
261it [05:24,  1.27s/it][A
262it [05:25,  1.25s/it][A
263it [05:26,  1.24s/it][A
264it [05:27,  1.23s/it][A
265it [05:28,  1.21s/it][A
266it [05:30,  1.20s/it][A
267it [05:31,  1.22s/it][A
268it [05:32,  1.23s/it][A
269it [05:33,  1.25s/it][A
270it [05:35,  1.27s/it][A
271it [05:36,  1.25s/it][A
272it [05:37,  1.24s/it][A
273it [05:38,  1.24s/it][A
274it [05:39,  1.23s/it][A
275it [05:41,  1.22s/it][A
276it [05:42,  1.20s/it][A
277it [05:43,  1.17s/it][A
278it [05:44,  1.17s/it][A
279it [05:45,  1.17s/it][A
280it [05:46,  1.18s/it][A
281it [05:48,  1.17s/it][A
282it [05:49,  1.18s/it][A
283it [05:50,  1.17s/it][A
284it [05:51,  1.18s/it][A
285it [05:52,  1.18s/it][A
286it [05:54,  1.19s

543it [10:59,  1.19s/it][A
544it [11:00,  1.19s/it][A
545it [11:02,  1.18s/it][A
546it [11:03,  1.16s/it][A
547it [11:04,  1.17s/it][A
548it [11:05,  1.16s/it][A
549it [11:06,  1.17s/it][A
550it [11:08,  1.28s/it][A
551it [11:09,  1.24s/it][A
552it [11:10,  1.22s/it][A
553it [11:11,  1.20s/it][A
554it [11:12,  1.20s/it][A
555it [11:14,  1.19s/it][A
556it [11:15,  1.18s/it][A
557it [11:16,  1.19s/it][A
558it [11:17,  1.19s/it][A
559it [11:18,  1.18s/it][A
560it [11:19,  1.17s/it][A
561it [11:21,  1.17s/it][A
562it [11:22,  1.18s/it][A
563it [11:23,  1.19s/it][A
564it [11:24,  1.20s/it][A
565it [11:25,  1.19s/it][A
566it [11:27,  1.18s/it][A
567it [11:28,  1.17s/it][A
568it [11:29,  1.16s/it][A
569it [11:30,  1.15s/it][A
570it [11:31,  1.16s/it][A
571it [11:32,  1.16s/it][A
572it [11:33,  1.16s/it][A
573it [11:35,  1.17s/it][A
574it [11:36,  1.16s/it][A
575it [11:37,  1.17s/it][A
576it [11:38,  1.16s/it][A
577it [11:39,  1.17s/it][A
578it [11:40,  1.16s

835it [16:43,  1.19s/it][A
836it [16:44,  1.18s/it][A
837it [16:45,  1.18s/it][A
838it [16:46,  1.17s/it][A
839it [16:47,  1.17s/it][A
840it [16:49,  1.19s/it][A
841it [16:50,  1.18s/it][A
842it [16:51,  1.20s/it][A
843it [16:52,  1.19s/it][A
844it [16:53,  1.19s/it][A
845it [16:55,  1.18s/it][A
846it [16:56,  1.17s/it][A
847it [16:57,  1.21s/it][A
848it [16:58,  1.25s/it][A
849it [17:00,  1.28s/it][A
850it [17:01,  1.26s/it][A
851it [17:02,  1.24s/it][A
852it [17:03,  1.23s/it][A
853it [17:04,  1.22s/it][A
854it [17:06,  1.20s/it][A
855it [17:07,  1.20s/it][A
856it [17:08,  1.19s/it][A
857it [17:09,  1.19s/it][A
858it [17:10,  1.17s/it][A
859it [17:11,  1.17s/it][A
860it [17:13,  1.20s/it][A
861it [17:14,  1.19s/it][A
862it [17:15,  1.19s/it][A
863it [17:16,  1.20s/it][A
864it [17:18,  1.19s/it][A
865it [17:19,  1.20s/it][A
866it [17:20,  1.19s/it][A
867it [17:21,  1.18s/it][A
868it [17:22,  1.17s/it][A
869it [17:23,  1.17s/it][A
870it [17:25,  1.17s

1123it [22:27,  1.22s/it][A
1124it [22:29,  1.21s/it][A
1125it [22:30,  1.22s/it][A
1126it [22:31,  1.21s/it][A
1127it [22:32,  1.22s/it][A
1128it [22:33,  1.23s/it][A
1129it [22:35,  1.23s/it][A
1130it [22:36,  1.22s/it][A
1131it [22:37,  1.24s/it][A
1132it [22:38,  1.23s/it][A
1133it [22:40,  1.22s/it][A
1134it [22:41,  1.21s/it][A
1135it [22:42,  1.19s/it][A
1136it [22:43,  1.16s/it][A
1137it [22:44,  1.17s/it][A
1138it [22:45,  1.15s/it][A
1139it [22:46,  1.15s/it][A
1140it [22:48,  1.15s/it][A
1141it [22:49,  1.15s/it][A
1142it [22:50,  1.13s/it][A
1143it [22:51,  1.15s/it][A
1144it [22:52,  1.15s/it][A
1145it [22:53,  1.16s/it][A
1146it [22:55,  1.17s/it][A
1147it [22:56,  1.17s/it][A
1148it [22:57,  1.17s/it][A
1149it [22:58,  1.16s/it][A
1150it [22:59,  1.16s/it][A
1151it [23:00,  1.16s/it][A
1152it [23:01,  1.16s/it][A
1153it [23:03,  1.29s/it][A
1154it [23:05,  1.34s/it][A
1155it [23:06,  1.29s/it][A
1156it [23:07,  1.26s/it][A
1157it [23:08,

1405it [28:59,  1.24s/it][A
1406it [29:00,  1.23s/it][A
1407it [29:01,  1.27s/it][A
1408it [29:14,  4.62s/it][A
1409it [29:18,  4.40s/it][A
1410it [29:19,  3.50s/it][A
1411it [29:20,  2.83s/it][A
1412it [29:21,  2.36s/it][A
1413it [29:23,  2.03s/it][A
1414it [29:24,  1.79s/it][A
1415it [29:25,  1.64s/it][A
1416it [29:26,  1.50s/it][A
1417it [29:28,  1.41s/it][A
1418it [29:29,  1.37s/it][A
1419it [29:30,  1.34s/it][A
1420it [29:32,  1.54s/it][A
1421it [29:34,  1.48s/it][A
1422it [29:35,  1.41s/it][A
1423it [29:36,  1.40s/it][A
1424it [29:37,  1.35s/it][A
1425it [29:39,  1.33s/it][A
1426it [29:40,  1.29s/it][A
1427it [29:41,  1.27s/it][A
1428it [29:42,  1.28s/it][A
1429it [29:44,  1.26s/it][A
1430it [29:45,  1.28s/it][A
1431it [29:46,  1.29s/it][A
1432it [29:48,  1.29s/it][A
1433it [29:49,  1.30s/it][A
1434it [29:50,  1.29s/it][A
1435it [29:51,  1.29s/it][A
1436it [29:53,  1.29s/it][A
1437it [29:54,  1.27s/it][A
1438it [29:55,  1.26s/it][A
1439it [29:56,

1687it [35:28,  1.19s/it][A
1688it [35:29,  1.20s/it][A
1689it [35:30,  1.20s/it][A
1690it [35:31,  1.19s/it][A
1691it [35:32,  1.20s/it][A
1692it [35:34,  1.19s/it][A
1693it [35:35,  1.18s/it][A
1694it [35:36,  1.18s/it][A
1695it [35:37,  1.18s/it][A
1696it [35:38,  1.18s/it][A
1697it [35:39,  1.16s/it][A
1698it [35:41,  1.18s/it][A
1699it [35:42,  1.18s/it][A
1700it [35:43,  1.17s/it][A
1701it [35:44,  1.17s/it][A
1702it [35:45,  1.19s/it][A
1703it [35:47,  1.18s/it][A
1704it [35:48,  1.20s/it][A
1705it [35:49,  1.22s/it][A
1706it [35:50,  1.22s/it][A
1707it [35:52,  1.22s/it][A
1708it [35:53,  1.22s/it][A
1709it [35:54,  1.26s/it][A
1710it [35:55,  1.24s/it][A
1711it [35:57,  1.24s/it][A
1712it [35:58,  1.27s/it][A
1713it [35:59,  1.34s/it][A
1714it [36:01,  1.29s/it][A
1715it [36:02,  1.25s/it][A
1716it [36:03,  1.24s/it][A
1717it [36:04,  1.22s/it][A
1718it [36:05,  1.24s/it][A
1719it [36:07,  1.23s/it][A
1720it [36:08,  1.21s/it][A
1721it [36:09,

1969it [41:18,  1.25s/it][A
1970it [41:19,  1.23s/it][A
1971it [41:20,  1.24s/it][A
1972it [41:21,  1.23s/it][A
1973it [41:22,  1.23s/it][A
1974it [41:24,  1.21s/it][A
1975it [41:25,  1.20s/it][A
1976it [41:26,  1.24s/it][A
1977it [41:27,  1.23s/it][A
1978it [41:29,  1.21s/it][A
1979it [41:30,  1.21s/it][A
1980it [41:31,  1.23s/it][A
1981it [41:32,  1.23s/it][A
1982it [41:33,  1.22s/it][A
1983it [41:35,  1.23s/it][A
1984it [41:36,  1.22s/it][A
1985it [41:37,  1.21s/it][A
1986it [41:38,  1.21s/it][A
1987it [41:40,  1.23s/it][A
1988it [41:41,  1.22s/it][A
1989it [41:42,  1.22s/it][A
1990it [41:43,  1.23s/it][A
1991it [41:44,  1.22s/it][A
1992it [41:46,  1.20s/it][A
1993it [41:47,  1.20s/it][A
1994it [41:48,  1.20s/it][A
1995it [41:49,  1.21s/it][A
1996it [41:50,  1.22s/it][A
1997it [41:52,  1.23s/it][A
1998it [41:53,  1.22s/it][A
1999it [41:54,  1.23s/it][A
2000it [41:55,  1.22s/it][A
2001it [41:57,  1.21s/it][A
2002it [41:58,  1.23s/it][A
2003it [41:59,

2251it [46:55,  1.17s/it][A
2252it [46:56,  1.18s/it][A
2253it [46:57,  1.17s/it][A
2254it [46:59,  1.17s/it][A
2255it [47:00,  1.19s/it][A
2256it [47:01,  1.21s/it][A
2257it [47:02,  1.19s/it][A
2258it [47:04,  1.21s/it][A
2259it [47:05,  1.20s/it][A
2260it [47:06,  1.18s/it][A
2261it [47:07,  1.17s/it][A
2262it [47:08,  1.18s/it][A
2263it [47:09,  1.20s/it][A
2264it [47:11,  1.19s/it][A
2265it [47:12,  1.19s/it][A
2266it [47:13,  1.18s/it][A
2267it [47:14,  1.18s/it][A
2268it [47:15,  1.17s/it][A
2269it [47:16,  1.16s/it][A
2270it [47:18,  1.18s/it][A
2271it [47:19,  1.18s/it][A
2272it [47:20,  1.18s/it][A
2273it [47:21,  1.17s/it][A
2274it [47:22,  1.17s/it][A
2275it [47:23,  1.17s/it][A
2276it [47:25,  1.18s/it][A
2277it [47:26,  1.18s/it][A
2278it [47:27,  1.19s/it][A
2279it [47:28,  1.20s/it][A
2280it [47:29,  1.19s/it][A
2281it [47:31,  1.18s/it][A
2282it [47:32,  1.17s/it][A
2283it [47:33,  1.17s/it][A
2284it [47:34,  1.18s/it][A
2285it [47:35,

2533it [52:42,  1.28s/it][A
2534it [52:43,  1.24s/it][A
2535it [52:44,  1.22s/it][A
2536it [52:45,  1.21s/it][A
2537it [52:46,  1.21s/it][A
2538it [52:48,  1.18s/it][A
2539it [52:49,  1.20s/it][A
2540it [52:50,  1.20s/it][A
2541it [52:51,  1.21s/it][A
2542it [52:53,  1.22s/it][A
2543it [52:54,  1.23s/it][A
2544it [52:55,  1.22s/it][A
2545it [52:56,  1.21s/it][A
2546it [52:57,  1.20s/it][A
2547it [52:59,  1.20s/it][A
2548it [53:00,  1.20s/it][A
2549it [53:01,  1.19s/it][A
2550it [53:02,  1.20s/it][A
2551it [53:03,  1.19s/it][A
2552it [53:05,  1.20s/it][A
2553it [53:06,  1.19s/it][A
2554it [53:07,  1.20s/it][A
2555it [53:08,  1.22s/it][A
2556it [53:09,  1.24s/it][A
2557it [53:11,  1.21s/it][A
2558it [53:12,  1.21s/it][A
2559it [53:13,  1.21s/it][A
2560it [53:14,  1.20s/it][A
2561it [53:15,  1.21s/it][A
2562it [53:17,  1.20s/it][A
2563it [53:18,  1.20s/it][A
2564it [53:19,  1.20s/it][A
2565it [53:20,  1.20s/it][A
2566it [53:21,  1.20s/it][A
2567it [53:23,

2815it [58:17,  1.17s/it][A
2816it [58:19,  1.49s/it][A
2817it [58:20,  1.41s/it][A
2818it [58:21,  1.33s/it][A
2819it [58:22,  1.29s/it][A
2820it [58:23,  1.25s/it][A
2821it [58:25,  1.23s/it][A
2822it [58:26,  1.21s/it][A
2823it [58:27,  1.17s/it][A
2824it [58:28,  1.18s/it][A
2825it [58:29,  1.18s/it][A
2826it [58:30,  1.19s/it][A
2827it [58:32,  1.19s/it][A
2828it [58:33,  1.17s/it][A
2829it [58:34,  1.17s/it][A
2830it [58:35,  1.18s/it][A
2831it [58:36,  1.17s/it][A
2832it [58:37,  1.17s/it][A
2833it [58:39,  1.17s/it][A
2834it [58:40,  1.24s/it][A
getting_conference_links: 2it [1:46:16, 3188.06s/it]


### Cleaning abstracts

In [478]:
fichier_entree = "dataset.json"

with open(fichier_entree, 'r', encoding='utf-8') as fichier_json:
    clean_2021_data = json.load(fichier_json)

def clean_text(input_text):

    # Update caractere special from latex
    cleaned_text = re.sub(r'\\emph{([^}]+)}', r'\1', input_text)
    cleaned_text = re.sub(r'\\ell_(\d+)', r'l_\1', cleaned_text)
    cleaned_text = re.sub(r'\\mathcal{([^}]+)}', r'\1', cleaned_text)
    cleaned_text = re.sub(r'\\textbf\{(.+?)\}', r'\1', cleaned_text)
    cleaned_text = re.sub(r'\\textit\{(.+?)\}', r'\1', cleaned_text)
    cleaned_text = re.sub(r'\\text\{(.+?)\}', r'\1', cleaned_text)

    cleaned_text = re.sub(r'\\', '', cleaned_text)  # Supprime les caractères '\\' 
    cleaned_text = cleaned_text.replace('$', '') # Remove "$"
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Normalize spaces

    return cleaned_text

for document in clean_2021_data:
    document["abstract"] = clean_text(document["abstract"])

fichier_sortie = "dataset.json"
with open(fichier_sortie, 'w', encoding='utf-8') as fichier_json:
    json.dump(clean_2021_data, fichier_json, ensure_ascii=False, indent=2)

### Code for downloading the pdf content of an article given its URL

In [None]:
def download_papers(paper_file):
    with open(paper_file, 'r') as file:
        papers_data = json.load(file)
    base_folder = 'papers/'
    for paper in papers_data:
        time.sleep(0.1)
        year = paper['year']
        pdf_url = paper['url']
        year_folder = os.path.join(base_folder, year)
        os.makedirs(year_folder, exist_ok=True)
        response = requests.get(pdf_url)
        if response.status_code == 200:
            filename = paper['title'].replace(' ', '_')+'.pdf'
            file_path = os.path.join(year_folder, filename)
            with open(file_path, 'wb') as file:
                file.write(response.content)
        

In [None]:
download_papers('dataset.json')

### Setting up an article SQL DB

In [355]:
import sqlite3
with open('dataset.json', 'r') as file:
    papers_data = json.load(file)
conn = sqlite3.connect('nips.db')
cursor = conn.cursor()
for item in papers_data:
    cursor.execute("INSERT INTO articles (title, year, url, abstract,location) VALUES (?, ?, ?, ?,?)",
                   (item['title'], item['year'], item['url'], item['abstract'],'Papers/'+str(item['year']) + '/'+item['title']+'.pdf'))
conn.commit()
conn.close()

In [356]:
#Visualizing dataset
import pandas as pd
conn = sqlite3.connect('nips.db')
query = 'SELECT * FROM articles'
df = pd.read_sql_query(query, conn)
conn.close()
df.head()

Unnamed: 0,id,title,year,url,abstract,location
0,1,Beyond Value-Function Gaps: Improved Instance-...,2021-12-06,https://proceedings.neurips.cc/paper_files/pap...,We provide improved gap-dependent regret bound...,Papers/2021-12-06/Beyond Value-Function Gaps: ...
1,2,Learning One Representation to Optimize All Re...,2021-12-06,https://proceedings.neurips.cc/paper_files/pap...,We introduce the forward-backward (FB) represe...,Papers/2021-12-06/Learning One Representation ...
2,3,Matrix factorisation and the interpretation of...,2021-12-06,https://proceedings.neurips.cc/paper_files/pap...,"Given a graph or similarity matrix, we conside...",Papers/2021-12-06/Matrix factorisation and the...
3,4,UniDoc: Unified Pretraining Framework for Docu...,2021-12-06,https://proceedings.neurips.cc/paper_files/pap...,Document intelligence automates the extraction...,Papers/2021-12-06/UniDoc: Unified Pretraining ...
4,5,Finding Discriminative Filters for Specific De...,2021-12-06,https://proceedings.neurips.cc/paper_files/pap...,Recent blind super-resolution (SR) methods typ...,Papers/2021-12-06/Finding Discriminative Filte...


### Setting up code for pdf content extraction

#### Author info extraction

In [360]:
import requests
grobid_api_url = "http://localhost:8070/api/processFulltextDocument"
pdf_file_path = "https://proceedings.neurips.cc/paper_files/paper/2014/file/ffeed84c7cb1ae7bf4ec4bd78275bb98-Paper.pdf"


def pdf_2_xml(pdf_file_path):
    files = {'input': requests.get(pdf_file_path).content}
    response = requests.post(grobid_api_url, files=files)
    if response.status_code == 200:
        extracted_data = response.text
    else:
        print(f"Error {response.status_code}: {response.text}")
        extracted_data = ""
    return extracted_data


In [372]:
import xml.etree.ElementTree as ET
def extract_email(xml):
    xml_content = xml.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    root = etree.fromstring(xml_content)
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    # Utiliser XPath pour trouver les adresses e-mail
    emails = root.xpath('//tei:email/text()', namespaces=namespaces)
    return emails

def extract_author_info(xml):
    if xml != "":
        xml_content = xml.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
        root = ET.fromstring(xml_content)
        bibl_struct = root.find('.//tei:sourceDesc/tei:biblStruct', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
        authors_info = []
        if bibl_struct is not None:
            for author_element in bibl_struct.findall('.//tei:author', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
                author_info = {}
                pers_name = author_element.find('tei:persName', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                if pers_name is not None:
                    author_info['first_name'] = pers_name.findtext('tei:forename[@type="first"]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                    author_info['middle_name'] = pers_name.findtext('tei:forename[@type="middle"]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                    author_info['surname'] = pers_name.findtext('tei:surname', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                    author_info['email'] = author_element.findtext('tei:email', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                    authors_info.append(author_info)
    else:
        authors_info = []
    return authors_info

#### Extracting references

In [373]:
def extract_ref_from_pdf(pdf_file_path):
    grobid_url = 'http://localhost:8070/api/processReferences'
    files = {'input': requests.get(pdf_file_path).content}
    response = requests.post(grobid_url, files=files)
    if response.status_code == 200:
        grobid_response = response.text
        return grobid_response
    else:
        return "Erreur lors de la communication avec Grobid."


def extract_titles_and_authors(xml_content):
    if xml != "":
    # Analyser le contenu XML
        root = etree.fromstring(xml_content)
        references = []
        # Parcourir chaque référence
        for biblStruct in root.xpath('//tei:biblStruct', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
            # Extraire le titre
            title = biblStruct.xpath('.//tei:title[@level="a" and @type="main"]/text()', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
            title = title[0] if title else "Unknown"

            # Extraire les auteurs
            authors = []
            for author in biblStruct.xpath('.//tei:author/tei:persName', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
                forenames = author.xpath('.//tei:forename[@type="first"]/text()', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                middlenames = author.xpath('.//tei:forename[@type="middle"]/text()', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                surname = author.xpath('.//tei:surname/text()', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
                fullname = " ".join(forenames + middlenames + surname)
                authors.append(fullname)

            references.append({'title': title, 'authors': ', '.join(authors)})
    else:
        references = []
    return references

### Global functions for extracting info given a PDF file path

In [354]:
def get_authors_info(pdf_path):
    pdf_xml = pdf_2_xml(pdf_path)
    #emails
    info = extract_author_info(pdf_xml)
    return info

def get_citations(pdf_path):
    xml = extract_ref_from_pdf(pdf_file_path)
    refs = extract_titles_and_authors(xml)
    return refs

#### Examples

In [276]:
info = get_authors_info(pdf_file_path)
print(info)

[{'first_name': 'Neil', 'middle_name': 'M T', 'surname': 'Houlsby', 'email': 'neilhoulsby@google.com'}, {'first_name': 'David', 'middle_name': 'M', 'surname': 'Blei', 'email': 'david.blei@colombia.edu'}]


In [327]:
get_citations(pdf_file_path)

[{'title': 'Stochastic variational inference',
  'authors': 'M D Hoffman, D M Blei, C Wang, J Paisley'},
 {'title': 'Online learning for latent Dirichlet allocation',
  'authors': 'M D Hoffman, D M Blei, F Bach'},
 {'title': 'Unknown',
  'authors': 'J M Hernandez-Lobato, N M T Houlsby, Z Ghahramani'},
 {'title': 'Efficient discovery of overlapping communities in massive networks',
  'authors': 'P K Gopalan, D M Blei'},
 {'title': 'A scalable approach to probabilistic latent space inference of large-scale networks',
  'authors': 'J Yin, Q Ho, E Xing'},
 {'title': 'Unknown', 'authors': 'J Hensman, N Fusi, N D Lawrence'},
 {'title': 'A stochastic approximation method',
  'authors': 'H Robbins, S Monro'},
 {'title': 'An adaptive learning rate for stochastic variational inference',
  'authors': 'R Ranganath, C Wang, D M Blei, E P Xing'},
 {'title': 'Natural gradient works efficiently in learning',
  'authors': 'Shun-Ichi Amari'},
 {'title': 'A new approach to linear filtering and prediction

### Building an author dataset


In [376]:
with open('dataset.json', 'r') as file:
    papers_data = json.load(file)
author_json = []
for article in tqdm(papers_data):
    if '2022' not in article['year']:
        pdf_link = article['url']
        time.sleep(0.1)
        author_json += get_authors_info(pdf_link)
with open('authors.json', 'w') as file:
    json.dump(author_json, file, indent=2)

  7%|██████▏                                                                                   | 358/5168 [48:12<13:12:47,  9.89s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


  8%|███████▍                                                                                  | 430/5168 [56:50<15:40:17, 11.91s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 11%|██████████                                                                               | 585/5168 [1:15:06<7:46:49,  6.11s/it]

Error 500: [GENERAL] An exception occurred while running Grobid.


 13%|███████████▍                                                                            | 669/5168 [1:23:49<14:47:10, 11.83s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 16%|██████████████                                                                           | 819/5168 [1:40:56<9:45:07,  8.07s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 25%|██████████████████████▏                                                                 | 1303/5168 [2:37:37<9:16:33,  8.64s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 25%|██████████████████████                                                                 | 1313/5168 [2:38:52<10:49:34, 10.11s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 32%|███████████████████████████▉                                                           | 1658/5168 [3:16:53<12:39:59, 12.99s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 33%|████████████████████████████▋                                                           | 1688/5168 [3:20:00<6:40:50,  6.91s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 33%|█████████████████████████████▎                                                          | 1720/5168 [3:23:35<7:43:15,  8.06s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 34%|█████████████████████████████▌                                                          | 1738/5168 [3:25:35<6:38:59,  6.98s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 34%|██████████████████████████████                                                          | 1766/5168 [3:28:18<5:42:44,  6.04s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 40%|██████████████████████████████████▉                                                     | 2050/5168 [4:03:13<8:23:59,  9.70s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 44%|██████████████████████████████████████▋                                                | 2295/5168 [4:40:59<19:03:57, 23.89s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 45%|███████████████████████████████████████▌                                                | 2321/5168 [4:44:28<7:57:30, 10.06s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


100%|██████████████████████████████████████████████████████████████████████████████████████████| 5168/5168 [4:46:22<00:00,  3.32s/it]


In [384]:
#Going from JSON to SQL (Building the author SQL DB)
authors_file = 'authors.json'
with open(authors_file, 'r') as file:
        author_data = json.load(file)
conn = sqlite3.connect('nips.db')
cursor = conn.cursor()
for item in author_data:
    email = item['email']
    cursor.execute("SELECT id FROM authors WHERE email=?", (email,))
    existing_author_id = cursor.fetchone()
    if existing_author_id:
        continue
    else:
        cursor.execute("INSERT INTO authors (first_name, middle_name, last_name, email) VALUES (?, ?, ?,?)",
                       (item['first_name'], item['middle_name'], item['surname'], item['email']))
conn.commit()
conn.close()

In [415]:
#Visualizing the DB using Pandas
conn = sqlite3.connect('nips.db')
query = 'SELECT * FROM authors'
df = pd.read_sql_query(query, conn)
conn.close()
df.head(50)

Unnamed: 0,id,first_name,last_name,middle_name,email
0,1,Chris,Dann,,chrisdann@google.com
1,2,Teodor,Marinov,V,tvmarinov@google.com
2,3,Mehryar,Mohri,,mohri@google.com
3,4,Julian,Zimmert,,zimmert@google.com
4,5,Ahmed,Touati,,ahmed.touati@umontreal.ca
5,6,⇤,Mila,,
6,7,Yann,Ollivier,,
7,8,Nick,Whiteley,,nick.whiteley@bristol.ac.uk
8,9,Annie,Gray,,annie.gray@bristol.ac.uk
9,10,Patrick,Rubin-Delanchy,,patrick.rubin-delanchy@bristol.ac.uk


### Building the article_id,author_id SQL DB

#### Function to minimize calls to GROBID

In [438]:
import re
def get_authors_from_og(article):
    authors = []
    conn = sqlite3.connect('nips.db')
    cursor = conn.cursor()
    sep = r"[, ]"
    for author in article['authors']:
        splitting = re.split(sep, author)
        names = []
        for elem in splitting:
            if elem != '':
                names.append(elem)
        last_name,name = names[0],names[1]
        first_name = name
        #Looking for the name in the DB
        cursor.execute("SELECT COUNT(*) FROM authors WHERE first_name=? AND last_name = ?", (first_name,last_name))
        val = cursor.fetchone()[0]
        if val != 1:
            return get_authors_info(article['url'])
        else:
            cursor.execute("SELECT * FROM authors WHERE first_name=? AND last_name = ?", (first_name,last_name))
            ret = cursor.fetchone()
            dico = {'first_name':ret[1],'surname':ret[2],'middle_name':ret[3],'email':ret[4]}
            authors.append(dico)
    return authors

#### Some remaining issues with None values in the DB

In [441]:
#Setting up a DB for article_id and author_id
def get_article_id(db_path,article_title):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(f"SELECT id FROM articles WHERE title = ?", (article_title,))
    result = cursor.fetchone()
    # Check if the article was found
    if result:
        article_id = result[0]
        conn.close()
        return article_id
    else:
        print(f"No article found with the title '{article_title}'.")
        conn.close()
    
        
def get_author_id(db_path,first_name,last_name,email):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    if email :
        cursor.execute("SELECT id FROM authors WHERE first_name = ? AND last_name = ? AND email = ?",
                       (first_name,last_name,email))
    else:
        cursor.execute("SELECT id FROM authors WHERE first_name = ? AND last_name = ?",
                       (first_name,last_name))
    result = cursor.fetchone()
    if result:
        author_id = result[0]
        conn.close()
        return author_id
    else:
        print(f"No author found with the name '{first_name} {last_name}'.")
    conn.close()

with open('dataset.json', 'r') as file:
    article_metadata = json.load(file)
written_by = []
db_path = 'nips.db'
for article in tqdm(article_metadata):
    #Retrieve the article ID from DB
    if '2022' not in article['year']:
        article_id = get_article_id(db_path,article['title'])
        #Retrieve authors ID
        #Fetch name and email from URL
        authors = get_authors_from_og(article)
        for author in authors:
            author_id = get_author_id(db_path,author['first_name'],author['surname'],author['email'])
            written_by.append((article_id,author_id))
    else:
        continue

  1%|▍                                                                                           | 28/5168 [02:21<6:57:42,  4.88s/it]

No author found with the name 'None Basat'.


  2%|█▌                                                                                         | 92/5168 [09:06<13:17:34,  9.43s/it]

No author found with the name 'None Hkust'.


  2%|█▊                                                                                         | 104/5168 [10:28<6:37:40,  4.71s/it]

No author found with the name 'None Mcalinn'.


  3%|██▌                                                                                        | 142/5168 [13:29<6:02:28,  4.33s/it]

No author found with the name 'None Cachay'.


  3%|██▊                                                                                        | 162/5168 [15:09<8:15:18,  5.94s/it]

No author found with the name 'Mila Kawaguchi'.


  4%|███▎                                                                                      | 187/5168 [18:01<12:03:57,  8.72s/it]

No author found with the name 'None Lti'.


  5%|████▋                                                                                     | 270/5168 [27:37<12:31:10,  9.20s/it]

No author found with the name 'None Geurts'.


  6%|█████▏                                                                                     | 294/5168 [30:26<7:36:00,  5.61s/it]

No author found with the name 'None Spotify'.


  6%|█████▌                                                                                     | 319/5168 [34:03<6:14:42,  4.64s/it]

No author found with the name 'Zhongxiang Nguyen'.
No author found with the name 'None Jaillet'.


  6%|█████▋                                                                                     | 323/5168 [34:28<7:43:15,  5.74s/it]

No author found with the name 'None London'.


  6%|█████▊                                                                                     | 333/5168 [35:34<8:59:02,  6.69s/it]

No author found with the name 'None Tuan'.


  7%|██████                                                                                     | 344/5168 [36:31<5:46:44,  4.31s/it]

No author found with the name 'None Shin'ya Yamaguchi'.


  7%|██████▎                                                                                    | 357/5168 [37:43<7:32:04,  5.64s/it]

No author found with the name 'None Grewe'.


  7%|██████▏                                                                                   | 358/5168 [38:06<14:17:34, 10.70s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


  7%|██████▌                                                                                    | 371/5168 [39:56<9:29:52,  7.13s/it]

No author found with the name 'Marzyeh Ghassemi'.


  8%|███████▍                                                                                  | 430/5168 [45:38<13:34:51, 10.32s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


  8%|███████▌                                                                                   | 433/5168 [45:53<9:06:42,  6.93s/it]

No author found with the name 'None L2s'.
No author found with the name 'None Centralesupélec'.


  8%|███████▋                                                                                  | 439/5168 [48:06<27:34:18, 20.99s/it]

No author found with the name 'None Postech'.


 11%|██████████                                                                               | 585/5168 [1:00:01<5:45:03,  4.52s/it]

Error 500: [GENERAL] An exception occurred while running Grobid.


 12%|██████████▌                                                                              | 611/5168 [1:02:02<6:08:15,  4.85s/it]

No author found with the name 'None Tommaso D'orsi'.


 12%|██████████▌                                                                              | 615/5168 [1:02:15<5:43:14,  4.52s/it]

No author found with the name 'None Postech'.


 12%|██████████▋                                                                              | 621/5168 [1:02:40<5:58:44,  4.73s/it]

No author found with the name 'None Milano'.


 12%|███████████                                                                              | 639/5168 [1:04:07<5:21:09,  4.25s/it]

No author found with the name 'None Han'.


 13%|███████████▍                                                                            | 669/5168 [1:06:27<12:20:54,  9.88s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 14%|████████████                                                                             | 698/5168 [1:08:46<5:52:46,  4.74s/it]

No author found with the name 'Patrick Jaillet'.


 14%|████████████▌                                                                            | 727/5168 [1:10:40<6:46:55,  5.50s/it]

No author found with the name 'None Li'.


 15%|████████████▉                                                                            | 752/5168 [1:12:53<6:39:00,  5.42s/it]

No author found with the name 'None Research'.


 15%|█████████████                                                                            | 757/5168 [1:13:00<3:12:59,  2.63s/it]

No author found with the name 'None Research'.


 15%|█████████████▏                                                                           | 765/5168 [1:13:41<6:28:59,  5.30s/it]

No author found with the name 'None Pydi'.


 15%|█████████████▎                                                                           | 771/5168 [1:14:13<6:40:19,  5.46s/it]

No author found with the name 'None Combes'.


 15%|█████████████▊                                                                           | 799/5168 [1:16:18<4:40:06,  3.85s/it]

No author found with the name 'Tianyi Zhou'.


 16%|█████████████▉                                                                          | 819/5168 [1:18:12<10:12:37,  8.45s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 16%|██████████████▏                                                                          | 825/5168 [1:18:47<8:43:46,  7.24s/it]

No author found with the name 'None Kong'.


 16%|██████████████▍                                                                          | 841/5168 [1:19:59<7:26:31,  6.19s/it]

No author found with the name 'None Airi'.


 17%|███████████████                                                                          | 874/5168 [1:22:40<6:28:17,  5.43s/it]

No author found with the name 'None Deepmind'.


 17%|███████████████▍                                                                         | 900/5168 [1:25:14<7:48:45,  6.59s/it]

No author found with the name 'C Jinwoo'.


 18%|███████████████▋                                                                         | 911/5168 [1:26:08<5:01:28,  4.25s/it]

No author found with the name 'None Paschalidis'.


 18%|███████████████▌                                                                        | 916/5168 [1:27:05<14:10:19, 12.00s/it]

No author found with the name 'None Tuan'.


 18%|████████████████                                                                         | 933/5168 [1:28:55<8:03:31,  6.85s/it]

No author found with the name 'None Kim'.


 20%|█████████████████▊                                                                      | 1046/5168 [1:40:14<6:46:28,  5.92s/it]

No author found with the name 'None Liang Pang'.


 21%|██████████████████▏                                                                     | 1069/5168 [1:42:31<9:42:22,  8.52s/it]

No author found with the name 'None Cvlab'.


 22%|███████████████████▋                                                                    | 1156/5168 [1:51:30<7:02:05,  6.31s/it]

No author found with the name 'None Research'.
No author found with the name 'None Openai'.


 22%|███████████████████▋                                                                    | 1157/5168 [1:51:35<6:51:58,  6.16s/it]

No author found with the name 'None Lin'.
No author found with the name 'None Facebook'.


 23%|████████████████████▏                                                                   | 1188/5168 [1:54:22<7:15:36,  6.57s/it]

No author found with the name 'None Desai'.


 23%|████████████████████▏                                                                   | 1189/5168 [1:54:31<8:05:01,  7.31s/it]

No author found with the name 'None Milinković'.


 23%|████████████████████▍                                                                   | 1198/5168 [1:55:14<5:45:32,  5.22s/it]

No author found with the name 'None Nam'.


 24%|████████████████████▉                                                                   | 1228/5168 [1:57:43<4:26:52,  4.06s/it]

No author found with the name 'Tianyi Zhou'.


 24%|█████████████████████▎                                                                  | 1253/5168 [2:00:08<7:06:25,  6.54s/it]

No author found with the name 'None Mit'.


 25%|██████████████████████                                                                  | 1297/5168 [2:04:39<6:04:06,  5.64s/it]

No author found with the name 'Jose Blanchet'.


 25%|██████████████████████▏                                                                 | 1303/5168 [2:05:27<9:10:36,  8.55s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 25%|██████████████████████▎                                                                 | 1313/5168 [2:06:21<9:12:33,  8.60s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 26%|███████████████████████▏                                                                | 1359/5168 [2:10:01<4:51:00,  4.58s/it]

No author found with the name 'None Chen'.
No author found with the name 'None Gu'.


 27%|████████████████████████                                                                | 1413/5168 [2:14:14<6:58:48,  6.69s/it]

No author found with the name 'None -Champaign'.


 27%|████████████████████████▏                                                               | 1417/5168 [2:14:40<7:55:31,  7.61s/it]

No author found with the name 'None Mit'.
No author found with the name 'None Deepmind'.


 28%|████████████████████████▍                                                               | 1437/5168 [2:16:40<6:26:17,  6.21s/it]

No author found with the name 'None London'.
No author found with the name 'Hao Botao'.
No author found with the name 'None Deepmind'.


 28%|████████████████████████▌                                                               | 1439/5168 [2:16:50<6:01:36,  5.82s/it]

No author found with the name 'Zhuqing Liu'.


 28%|████████████████████████▋                                                               | 1450/5168 [2:18:09<7:05:10,  6.86s/it]

No author found with the name 'None Kini'.


 28%|█████████████████████████                                                               | 1471/5168 [2:20:07<6:29:59,  6.33s/it]

No author found with the name 'None Bastani'.


 29%|█████████████████████████▏                                                              | 1479/5168 [2:20:55<6:20:23,  6.19s/it]

No author found with the name 'Csaba Szepesvári'.


 29%|█████████████████████████▍                                                              | 1492/5168 [2:22:23<7:16:17,  7.12s/it]

No author found with the name 'None Uk'.


 29%|█████████████████████████▋                                                              | 1509/5168 [2:23:59<7:23:49,  7.28s/it]

No author found with the name 'None Australia'.


 30%|█████████████████████████▉                                                             | 1541/5168 [2:27:57<10:50:31, 10.76s/it]

No author found with the name 'Matthew Botvinick'.
No author found with the name 'None Deepmind'.


 31%|███████████████████████████                                                             | 1591/5168 [2:32:17<5:46:07,  5.81s/it]

No author found with the name 'None Nguyen'.


 31%|███████████████████████████▎                                                            | 1601/5168 [2:33:05<5:08:21,  5.19s/it]

No author found with the name 'None Hsu'.


 31%|███████████████████████████▋                                                            | 1626/5168 [2:35:20<4:48:37,  4.89s/it]

No author found with the name 'None Lai'.


 32%|████████████████████████████▏                                                           | 1656/5168 [2:37:34<5:11:42,  5.33s/it]

No author found with the name 'None Amazon'.


 32%|███████████████████████████▉                                                           | 1658/5168 [2:38:06<10:50:02, 11.11s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 33%|████████████████████████████▌                                                           | 1680/5168 [2:40:19<4:53:39,  5.05s/it]

No author found with the name 'None Nguyen'.


 33%|████████████████████████████▋                                                           | 1684/5168 [2:40:39<5:18:25,  5.48s/it]

No author found with the name 'None Papp'.


 33%|████████████████████████████▋                                                           | 1688/5168 [2:41:02<6:00:41,  6.22s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 33%|█████████████████████████████                                                           | 1706/5168 [2:42:02<4:15:27,  4.43s/it]

No author found with the name 'None Geist'.


 33%|█████████████████████████████▎                                                          | 1720/5168 [2:43:44<6:40:24,  6.97s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 34%|█████████████████████████████▌                                                          | 1737/5168 [2:45:21<5:07:55,  5.38s/it]

No author found with the name 'Han Hu'.


 34%|█████████████████████████████▋                                                          | 1747/5168 [2:45:58<4:20:46,  4.57s/it]

No author found with the name 'Marcello Restelli'.


 34%|██████████████████████████████                                                          | 1766/5168 [2:47:41<5:12:33,  5.51s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 34%|██████████████████████████████                                                          | 1769/5168 [2:48:02<6:09:15,  6.52s/it]

No author found with the name 'None -Champaign'.


 34%|██████████████████████████████▏                                                         | 1776/5168 [2:48:32<4:55:58,  5.24s/it]

No author found with the name 'None Sankararaman'.


 35%|██████████████████████████████▍                                                         | 1789/5168 [2:49:32<4:35:02,  4.88s/it]

No author found with the name 'None Mit'.


 36%|███████████████████████████████▋                                                        | 1858/5168 [2:55:41<7:31:51,  8.19s/it]

No author found with the name 'None Florence D'alché-Buc'.


 36%|███████████████████████████████▋                                                        | 1861/5168 [2:56:07<6:50:52,  7.45s/it]

No author found with the name 'None Paschalidis'.


 36%|███████████████████████████████▋                                                        | 1863/5168 [2:56:20<6:20:37,  6.91s/it]

No author found with the name 'None Mit'.


 36%|███████████████████████████████▊                                                        | 1871/5168 [2:57:05<5:15:09,  5.74s/it]

No author found with the name 'None Combes'.


 37%|████████████████████████████████▌                                                       | 1915/5168 [3:01:04<5:24:26,  5.98s/it]

No author found with the name 'Marzyeh Ghassemi'.


 38%|█████████████████████████████████▏                                                      | 1947/5168 [3:04:04<4:46:51,  5.34s/it]

No author found with the name 'None Tommaso D'orsi'.


 38%|█████████████████████████████████▌                                                      | 1971/5168 [3:05:30<2:46:00,  3.12s/it]

No author found with the name 'None London'.
No author found with the name 'Donoghue Deepmind'.
No author found with the name 'None Deepmind'.


 38%|█████████████████████████████████▊                                                      | 1984/5168 [3:06:23<4:39:49,  5.27s/it]

No author found with the name 'None Lions'.


 39%|██████████████████████████████████▎                                                     | 2013/5168 [3:08:52<4:50:34,  5.53s/it]

No author found with the name 'None Facebook'.


 39%|██████████████████████████████████▍                                                     | 2019/5168 [3:09:17<4:24:41,  5.04s/it]

No author found with the name 'None Dziugaite'.


 40%|██████████████████████████████████▊                                                     | 2042/5168 [3:11:09<6:58:43,  8.04s/it]

No author found with the name 'None Research'.


 40%|██████████████████████████████████▉                                                     | 2050/5168 [3:12:03<6:58:09,  8.05s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 40%|██████████████████████████████████▉                                                     | 2051/5168 [3:12:09<6:31:32,  7.54s/it]

No author found with the name 'None Openai'.


 40%|██████████████████████████████████▉                                                     | 2054/5168 [3:12:34<6:48:32,  7.87s/it]

No author found with the name 'None Patel'.


 40%|███████████████████████████████████▍                                                    | 2084/5168 [3:15:47<8:07:00,  9.47s/it]

No author found with the name 'None Rus'.


 41%|███████████████████████████████████▋                                                    | 2097/5168 [3:16:40<4:18:24,  5.05s/it]

No author found with the name 'None Deepmind'.


 42%|████████████████████████████████████▌                                                   | 2145/5168 [3:20:41<5:00:11,  5.96s/it]

No author found with the name 'None Epfl'.


 42%|████████████████████████████████████▋                                                   | 2154/5168 [3:21:33<5:32:02,  6.61s/it]

No author found with the name 'Bernard Thabet'.
No author found with the name 'None Ghanem'.


 42%|████████████████████████████████████▋                                                   | 2157/5168 [3:21:47<4:20:27,  5.19s/it]

No author found with the name 'None Gulluk'.


 42%|████████████████████████████████████▊                                                   | 2160/5168 [3:21:59<3:39:21,  4.38s/it]

No author found with the name 'None Inria'.
No author found with the name 'Alessandro Rudi Inria'.


 42%|█████████████████████████████████████                                                   | 2173/5168 [3:23:14<4:39:08,  5.59s/it]

No author found with the name 'None Liacs'.


 43%|█████████████████████████████████████▋                                                  | 2213/5168 [3:26:09<3:44:39,  4.56s/it]

No author found with the name 'None Deepmind'.


 44%|██████████████████████████████████████▋                                                 | 2271/5168 [3:32:06<4:04:33,  5.06s/it]

No author found with the name 'None Csail'.
No author found with the name 'None Hallucinated'.


 44%|███████████████████████████████████████                                                 | 2291/5168 [3:33:57<4:21:36,  5.46s/it]

No author found with the name 'None London'.


 44%|██████████████████████████████████████▋                                                | 2295/5168 [3:35:17<15:39:23, 19.62s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


 44%|██████████████████████████████████████▋                                                | 2296/5168 [3:35:23<12:35:18, 15.78s/it]

No author found with the name 'None Amazon'.
No author found with the name 'None Google'.


 45%|███████████████████████████████████████▌                                                | 2321/5168 [3:37:49<6:35:04,  8.33s/it]

Error 500: [NO_BLOCKS] PDF parsing resulted in empty content


100%|██████████████████████████████████████████████████████████████████████████████████████████| 5168/5168 [3:39:08<00:00,  2.54s/it]


In [450]:
#Building the SQL DB
conn = sqlite3.connect('nips.db')
cursor = conn.cursor()
for item in written_by:
    print(item)
    cursor.execute("SELECT * FROM written_by WHERE article_id= ? AND author_id =?", (item[0],item[1]))
    existing_author_id = cursor.fetchone()
    if existing_author_id:
        continue
    else:
        cursor.execute("INSERT INTO written_by (article_id,author_id) VALUES (?, ?)",
                   (item))
conn.commit()
conn.close()




In [451]:
#Visualizing the DB
conn = sqlite3.connect('nips.db')
query = 'SELECT * FROM written_by'
df = pd.read_sql_query(query, conn)
conn.close()
df.head()

Unnamed: 0,article_id,author_id
0,1,1.0
1,1,2.0
2,1,3.0
3,1,4.0
4,2,5.0


### Building a DB for article Citations

In [None]:
# Setting up the references DB
with open('dataset.json', 'r') as file:
    article_metadata = json.load(file)
references = []
db_path = 'nips.db'
for article in article_metadata:
    time.sleep(0.1)
    #Retrieve the article ID from DB
    article_id = get_article_id(db_path,article['title'])
    refs = get_citations(article['url'])
    for ref in refs : 
        references.append((article_id,ref['title'],ref['authors']))

In [475]:
conn = sqlite3.connect('nips.db')
cursor = conn.cursor()
for ref in references:
    cursor.execute("INSERT INTO citations (article_id,title,authors) VALUES (?, ?,?)",
                   (ref))
conn.commit()
conn.close()

In [476]:
conn = sqlite3.connect('nips.db')
query = 'SELECT * FROM citations'
df = pd.read_sql_query(query, conn)
conn.close()
df.head()

Unnamed: 0,article_id,title,authors
0,1083,Stochastic variational inference,"M D Hoffman, D M Blei, C Wang, J Paisley"
1,1083,Online learning for latent Dirichlet allocation,"M D Hoffman, D M Blei, F Bach"
2,1083,Unknown,"J M Hernandez-Lobato, N M T Houlsby, Z Ghahramani"
3,1083,Efficient discovery of overlapping communities...,"P K Gopalan, D M Blei"
4,1083,A scalable approach to probabilistic latent sp...,"J Yin, Q Ho, E Xing"
