In [121]:
import argparse
import json
import os
import re
import time
import uuid
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from lxml import etree

In [122]:
# Root URL of the proceedings site; index-page and paper links are built from it.
_BASE_URL = 'http://papers.nips.cc'
# Pause, in seconds, passed to time.sleep() between consecutive HTTP requests.
_CRAWLING_WAIT_TIME = 0.3
# NOTE(review): not referenced in the visible cells — presumably a uuid5
# namespace for deriving paper ids; confirm or remove.
_NEURIPS_NAMESPACE = uuid.UUID('5ee6531f-0d79-4cf1-8da6-dc83cb553336')
# NOTE(review): presumably the year of the first proceedings volume — confirm.
_FIRST_YEAR = 1988
# NOTE(review): not referenced in the visible cells — confirm or remove.
_PDF_FOLDER = 'pdfs'
# NOTE(review): not referenced in the visible cells (the crawl cell writes
# 'dataset.json') — confirm or remove.
_OUTPUT_PAPERS_FILE = 'papers_data.jsons'

In [123]:
class NipsPaper:
    """Plain record holding the metadata scraped for a single paper."""

    def __init__(self, year, title, doi, url, authors, abstract):
        # Every field is stored exactly as given; nothing is validated.
        self.year, self.title, self.doi = year, title, doi
        self.abstract, self.authors, self.url = abstract, authors, url

    def to_json(self):
        """Return the record as a plain dict suitable for ``json.dump``.

        The ``doi`` attribute is exported under the ``'id'`` key.
        """
        record = dict(id=self.doi, title=self.title, year=self.year)
        record.update(url=self.url, abstract=self.abstract, authors=self.authors)
        return record

In [124]:
def get_conference_links(year_from, year_to):
    """Yield one ``(url, year)`` pair per conference year, in ascending order.

    Args:
        year_from: First year to include (int).
        year_to: Last year to include (int, inclusive).

    Yields:
        tuple: The proceedings index URL for the year and the year itself as
        a string. Nothing is yielded when ``year_from > year_to``.
    """
    base = _BASE_URL + '/paper_files/paper/'
    # The index URL is simply the base plus the calendar year, so the years
    # are walked directly instead of going through an offset and a counter.
    for year_number in range(year_from, year_to + 1):
        year = str(year_number)
        yield (base + year, year)

        
def get_papers_year(url, timeout=30):
    """Yield the URL of every paper listed on one year's index page.

    Args:
        url: URL of the proceedings index page for a single year.
        timeout: Seconds to wait for the server before giving up.

    Yields:
        str: ``_BASE_URL`` followed by the ``href`` of each paper link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    url_request = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently yielding nothing from an error page.
    url_request.raise_for_status()
    soup = BeautifulSoup(url_request.content, 'html5lib')
    # Paper links are the anchors carrying title="paper title".
    for paper in soup.find_all('a', attrs={'title': 'paper title'}):
        yield _BASE_URL + paper["href"]

        
def get_paper_info(paper_url, timeout=30):
    """Download one paper page and scrape its metadata into a ``NipsPaper``.

    Args:
        paper_url: URL of the paper's HTML page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        NipsPaper: Built from the page ``<title>``, the ``citation_author``,
        ``citation_publication_date`` and ``citation_pdf_url`` meta tags, and
        the first non-empty paragraph after the "Abstract" heading. The
        abstract is ``''`` when the page has no such heading or paragraph.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        TypeError: If the publication-date or PDF-URL meta tag is missing.
        ValueError: If the PDF URL has no ``file`` path segment.
    """
    url_request = requests.get(paper_url, timeout=timeout)
    url_request.raise_for_status()
    soup = BeautifulSoup(url_request.content, 'html5lib')
    # Paper title
    paper_title = soup.find('title').text
    # Paper authors: one citation_author meta tag per author
    authors = [
        author['content']
        for author in soup.find_all('meta', attrs={'name': 'citation_author'})
    ]
    # Paper publication date
    date = soup.find('meta', attrs={'name': 'citation_publication_date'})['content']
    # Paper url
    article_url = soup.find('meta', attrs={'name': 'citation_pdf_url'})['content']
    # Abstract: first non-empty paragraph following the "Abstract" heading.
    # `string=` is the current spelling of bs4's legacy `text=` filter.
    abstract_text = ''
    abstract_tag = soup.find('h4', string='Abstract')
    if abstract_tag is not None:
        for p in abstract_tag.find_all_next('p'):
            if len(p.text) != 0:
                abstract_text = p.text
                break
    # Building paper ID: the path segment after "file", cut at the first '-'.
    splitting_url = article_url.split('/')
    index_hash = splitting_url.index("file")
    paper_id = splitting_url[index_hash + 1].split('-')[0]
    return NipsPaper(date, paper_title, paper_id, article_url, authors, abstract_text)

In [88]:
# Crawl the metadata of every paper from start_year to end_year (inclusive)
# and save the collected records to dataset.json.
dumping_json = []
start_year = 2012
end_year = 2014
base_folder = 'NIPS_papers_'
# The generator has no length, so the number of years is passed explicitly
# to give the outer progress bar a total.
year_links = tqdm(
    get_conference_links(start_year, end_year),
    desc='getting_conference_links',
    total=end_year - start_year + 1,
)
for year_url, year in year_links:
    time.sleep(_CRAWLING_WAIT_TIME)
    # One folder per year, named after the year yielded alongside the URL.
    # NOTE(review): nothing in the visible cells writes into these folders.
    os.makedirs(base_folder + year, exist_ok=True)
    for paper_link in tqdm(get_papers_year(year_url)):
        time.sleep(_CRAWLING_WAIT_TIME)
        nips_paper = get_paper_info(paper_link)
        dumping_json.append(nips_paper.to_json())
with open('dataset.json', 'w') as file:
    json.dump(dumping_json, file, indent=2)

getting_conference_links: 0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1it [00:03,  3.28s/it][A
2it [00:04,  2.22s/it][A
3it [00:06,  1.89s/it][A
4it [00:07,  1.73s/it][A
5it [00:09,  1.64s/it][A
6it [00:10,  1.55s/it][A
7it [00:12,  1.51s/it][A
8it [00:13,  1.48s/it][A
9it [00:14,  1.50s/it][A
10it [00:16,  1.52s/it][A
11it [00:18,  1.52s/it][A
12it [00:19,  1.60s/it][A
13it [00:21,  1.63s/it][A
14it [00:23,  1.67s/it][A
15it [00:24,  1.62s/it][A
16it [00:26,  1.59s/it][A
17it [00:27,  1.56s/it][A
18it [00:29,  1.55s/it][A
19it [00:30,  1.51s/it][A
20it [00:32,  1.52s/it][A
21it [00:33,  1.52s/it][A
22it [00:35,  1.49s/it][A
23it [00:36,  1.49s/it][A
24it [00:38,  1.52s/it][A
25it [00:39,  1.51s/it][A
26it [00:41,  1.50s/it][A
27it [00:42,  1.53s/it][A
28it [00:44,  1.55s/it][A
29it [00:46,  1.55s/it][A
30it [00:47,  1.52s/it][A
31it [00:48,  1.49s/it][A
32it [00:50,  1.47s/it][A
33it [00:51,  1.46s/it][A
34it [00:53,  1.45s/it][A
35it [00:54,  1.45s/it]

295it [07:12,  1.39s/it][A
296it [07:14,  1.42s/it][A
297it [07:15,  1.41s/it][A
298it [07:17,  1.40s/it][A
299it [07:18,  1.39s/it][A
300it [07:19,  1.38s/it][A
301it [07:21,  1.39s/it][A
302it [07:22,  1.39s/it][A
303it [07:24,  1.40s/it][A
304it [07:25,  1.41s/it][A
305it [07:26,  1.41s/it][A
306it [07:28,  1.41s/it][A
307it [07:29,  1.43s/it][A
308it [07:31,  1.42s/it][A
309it [07:32,  1.41s/it][A
310it [07:34,  1.41s/it][A
311it [07:35,  1.40s/it][A
312it [07:36,  1.39s/it][A
313it [07:38,  1.38s/it][A
314it [07:39,  1.37s/it][A
315it [07:40,  1.37s/it][A
316it [07:42,  1.41s/it][A
317it [07:43,  1.42s/it][A
318it [07:45,  1.44s/it][A
319it [07:46,  1.47s/it][A
320it [07:48,  1.47s/it][A
321it [07:49,  1.43s/it][A
322it [07:50,  1.42s/it][A
323it [07:52,  1.40s/it][A
324it [07:53,  1.40s/it][A
325it [07:55,  1.41s/it][A
326it [07:56,  1.42s/it][A
327it [07:58,  1.41s/it][A
328it [07:59,  1.42s/it][A
329it [08:00,  1.41s/it][A
330it [08:02,  1.42s

218it [05:15,  1.45s/it][A
219it [05:16,  1.45s/it][A
220it [05:18,  1.45s/it][A
221it [05:20,  1.74s/it][A
222it [05:22,  1.72s/it][A
223it [05:23,  1.68s/it][A
224it [05:25,  1.59s/it][A
225it [05:26,  1.52s/it][A
226it [05:27,  1.47s/it][A
227it [05:29,  1.46s/it][A
228it [05:30,  1.46s/it][A
229it [05:32,  1.44s/it][A
230it [05:33,  1.42s/it][A
231it [05:34,  1.40s/it][A
232it [05:36,  1.42s/it][A
233it [05:37,  1.44s/it][A
234it [05:39,  1.44s/it][A
235it [05:40,  1.42s/it][A
236it [05:41,  1.40s/it][A
237it [05:43,  1.41s/it][A
238it [05:44,  1.41s/it][A
239it [05:46,  1.45s/it][A
240it [05:47,  1.45s/it][A
241it [05:49,  1.48s/it][A
242it [05:50,  1.49s/it][A
243it [05:52,  1.46s/it][A
244it [05:53,  1.45s/it][A
245it [05:55,  1.43s/it][A
246it [05:56,  1.45s/it][A
247it [05:57,  1.44s/it][A
248it [05:59,  1.46s/it][A
249it [06:00,  1.43s/it][A
250it [06:02,  1.45s/it][A
251it [06:07,  2.42s/it][A
252it [06:08,  2.22s/it][A
253it [06:10,  2.14s

151it [03:59,  1.56s/it][A
152it [04:01,  1.51s/it][A
153it [04:03,  1.83s/it][A
154it [04:05,  1.75s/it][A
155it [04:06,  1.66s/it][A
156it [04:08,  1.64s/it][A
157it [04:09,  1.56s/it][A
158it [04:11,  1.52s/it][A
159it [04:12,  1.52s/it][A
160it [04:14,  1.54s/it][A
161it [04:16,  1.77s/it][A
162it [04:20,  2.32s/it][A
163it [04:21,  2.09s/it][A
164it [04:23,  1.94s/it][A
165it [04:24,  1.81s/it][A
166it [04:26,  1.71s/it][A
167it [04:27,  1.64s/it][A
168it [04:29,  1.62s/it][A
169it [04:30,  1.59s/it][A
170it [04:32,  1.56s/it][A
171it [04:33,  1.55s/it][A
172it [04:35,  1.51s/it][A
173it [04:36,  1.54s/it][A
174it [04:38,  1.54s/it][A
175it [04:39,  1.51s/it][A
176it [04:41,  1.47s/it][A
177it [04:42,  1.45s/it][A
178it [04:43,  1.44s/it][A
179it [04:45,  1.42s/it][A
180it [04:46,  1.43s/it][A
181it [04:48,  1.43s/it][A
182it [04:49,  1.43s/it][A
183it [04:51,  1.42s/it][A
184it [04:52,  1.42s/it][A
185it [04:53,  1.44s/it][A
186it [04:55,  1.43s

In [125]:
def download_papers(paper_file, base_folder='papers/', timeout=60):
    """Download the PDF of every paper listed in a JSON dataset file.

    Each PDF is saved as ``<base_folder>/<year>/<title>.pdf`` where spaces
    and ``/`` in the title are replaced by ``_``.

    Args:
        paper_file: Path of a JSON file holding a list of paper records with
            at least the ``'year'``, ``'url'`` and ``'title'`` keys.
        base_folder: Folder under which the per-year folders are created.
        timeout: Seconds to wait for each download before giving up.

    Returns:
        None. Papers whose download does not answer HTTP 200 are skipped.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    with open(paper_file, 'r') as dataset_file:
        papers_data = json.load(dataset_file)
    for paper in papers_data:
        time.sleep(_CRAWLING_WAIT_TIME)
        # str() so that a numeric year in the dataset still forms a path.
        year_folder = os.path.join(base_folder, str(paper['year']))
        os.makedirs(year_folder, exist_ok=True)
        # Without a timeout a stalled download would block the whole run.
        response = requests.get(paper['url'], timeout=timeout)
        if response.status_code != 200:
            # Best effort: a paper whose PDF cannot be fetched is skipped.
            continue
        # A '/' in the title would otherwise be read as a directory separator
        # and point into a folder that does not exist.
        filename = paper['title'].replace(' ', '_').replace('/', '_') + '.pdf'
        with open(os.path.join(year_folder, filename), 'wb') as pdf_file:
            pdf_file.write(response.content)
        

In [97]:
# Download the PDF of every paper listed in dataset.json (one HTTP request
# per paper, with a pause between requests).
download_papers('dataset.json')

papers/2012/Topology_Constraints_in_Graphical_Models.pdf
papers/2012/Clustering_Aggregation_as_Maximum-Weight_Independent_Set.pdf
papers/2012/FastEx:_Hash_Clustering_with_Exponential_Families.pdf


KeyboardInterrupt: 

In [126]:
import requests
# Full-text endpoint of the GROBID service, addressed on localhost:8070.
grobid_api_url = "http://localhost:8070/api/processFulltextDocument"
# Sample PDF used by the GROBID cells; presumably one of the files saved by
# download_papers — confirm it exists before running.
pdf_file_path = "papers/2012/Topology_Constraints_in_Graphical_Models.pdf"


def pdf_2_xml(pdf_file_path):
    """Send a PDF to the GROBID full-text endpoint and return its answer.

    Args:
        pdf_file_path: Path of the PDF file to process.

    Returns:
        str or None: The response body on HTTP 200. On any other status the
        error is printed and ``None`` is returned.
    """
    # The request is made while the file is still open so it can be uploaded
    # under the 'input' form field.
    with open(pdf_file_path, "rb") as file:
        files = {'input': file}
        response = requests.post(grobid_api_url, files=files)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None
    # Hand the extracted XML back to the caller. The original returned only
    # from the error branch, where the value was never defined.
    return response.text


In [127]:
def extract_email(xml):
    """Return the text of every TEI ``<email>`` element found in ``xml``.

    Args:
        xml: TEI XML document as ``str`` or ``bytes``. ``None`` or an empty
            value yields an empty list.

    Returns:
        list: The e-mail strings matched by ``//tei:email/text()``.
    """
    if not xml:
        return []
    if isinstance(xml, str):
        # lxml rejects str input that carries an encoding declaration, so a
        # leading XML declaration is dropped whatever its exact spelling
        # (quotes, spacing, extra attributes).
        xml = re.sub(r'^\ufeff?\s*<\?xml[^>]*\?>', '', xml, count=1)

    # Parse the XML content
    root = etree.fromstring(xml)
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Use XPath to collect the e-mail addresses
    return root.xpath('//tei:email/text()', namespaces=namespaces)
def extract_ref_from_pdf(pdf_file_path):
    """Post a PDF to GROBID's ``processReferences`` endpoint.

    Args:
        pdf_file_path: Path of the PDF file to upload.

    Returns:
        str: The response body on HTTP 200 (TEI XML, returned unparsed);
        otherwise a fixed error-message string.
    """
    endpoint = 'http://localhost:8070/api/processReferences'
    with open(pdf_file_path, "rb") as pdf_handle:
        # The file is uploaded under the 'input' form field.
        reply = requests.post(endpoint, files={'input': pdf_handle})
    if reply.status_code != 200:
        # NOTE(review): callers receive this message in place of XML, so
        # feeding the result to an XML parser will fail on this path.
        return "Erreur lors de la communication avec Grobid."
    # GROBID answers in TEI XML; parsing is left to the caller.
    return reply.text

In [117]:
# Fetch the GROBID reference extraction for the sample PDF (requires the
# GROBID service to be reachable on localhost:8070).
refs = extract_ref_from_pdf(pdf_file_path)
def extract_titles_and_authors(xml_content):
    """List the title and authors of every ``biblStruct`` in a TEI document.

    Args:
        xml_content: TEI XML accepted by ``etree.fromstring``.

    Returns:
        list[dict]: One ``{'title': ..., 'authors': ...}`` dict per
        reference. ``title`` is the first main article-level title, or
        ``"Unknown"`` when there is none; ``authors`` is the full names
        (first, middle, surname) joined with ``', '``.
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    def _select(node, path):
        # Run an XPath query with the TEI namespace bound to the 'tei' prefix.
        return node.xpath(path, namespaces=tei_ns)

    # Parse the XML content
    tree_root = etree.fromstring(xml_content)

    references = []
    # Walk through every reference
    for entry in _select(tree_root, '//tei:biblStruct'):
        # Title: first match, with a placeholder when none is present
        main_titles = _select(entry, './/tei:title[@level="a" and @type="main"]/text()')
        title = main_titles[0] if main_titles else "Unknown"

        # Authors: name parts are concatenated in first/middle/surname order
        authors = [
            " ".join(
                _select(person, './/tei:forename[@type="first"]/text()')
                + _select(person, './/tei:forename[@type="middle"]/text()')
                + _select(person, './/tei:surname/text()')
            )
            for person in _select(entry, './/tei:author/tei:persName')
        ]

        references.append({'title': title, 'authors': ', '.join(authors)})

    return references

In [118]:
# Parse the GROBID reference output into a list of title/authors dicts.
extract_titles_and_authors(refs)

[{'title': 'Model selection through sparse maximum likelihood estimation for multivariate gaussian or binary data',
  'authors': 'O Banerjee, L El Ghaoui, A Aspremont'},
 {'title': 'Emergence of scaling in random networks',
  'authors': 'A Barabási, R Albert'},
 {'title': 'Covariance selection', 'authors': 'A Dempster'},
 {'title': 'Sparse inverse covariance estimation with the graphical lasso',
  'authors': 'J Friedman, T Hastie, R Tibshirani'},
 {'title': 'Unknown', 'authors': 'J Friedman, T Hastie, R Tibshirani'},
 {'title': 'Estimating high-dimensional directed acyclic graphs with the PC-Algorithm',
  'authors': 'M Kalisch, P Bühlmann'},
 {'title': 'Unknown', 'authors': 'S Lauritzen'},
 {'title': 'Learning scale free networks by reweighted 1 regularization',
  'authors': 'Q Liu, A Ihler'},
 {'title': 'High-dimensional graphs and variable selection with the Lasso',
  'authors': 'N Meinshausen, P Bühlmann'},
 {'title': 'Unknown', 'authors': 'M Newman'},
 {'title': 'Partial correlatio

In [186]:
import sqlite3

# Insert one row per crawled paper into the `articles` table of articles.db.
# `papers_data` (the list loaded from dataset.json) must already be in memory.
# NOTE(review): re-running this cell inserts the same rows again — confirm
# whether that is intended.
conn = sqlite3.connect('articles.db')
try:
    rows = [
        (
            item['title'],
            item['year'],
            item['url'],
            item['abstract'],
            # Location of the PDF on disk: papers/<year>/<title>.pdf with
            # spaces turned into underscores, which is how the downloaded
            # files are named. '/' is replaced too since it cannot appear
            # inside a file name.
            'papers/' + str(item['year']) + '/'
            + item['title'].replace(' ', '_').replace('/', '_') + '.pdf',
        )
        for item in papers_data
    ]
    # `with conn` commits when the block succeeds and rolls back on error,
    # so a failure cannot leave a half-inserted batch behind.
    with conn:
        conn.executemany(
            "INSERT INTO articles (title, year, url, abstract,location) VALUES (?, ?, ?, ?,?)",
            rows,
        )
finally:
    # Always release the database file, even when the insert fails.
    conn.close()

In [188]:
import sqlite3
import pandas as pd

# Read the whole `articles` table of articles.db back into a DataFrame.
conn = sqlite3.connect('articles.db')
try:
    # Query selecting every row of the `articles` table
    query = 'SELECT * FROM articles'

    # Use the connection and query to read data into a DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    # Close the database connection even if the query fails
    conn.close()

# Display the DataFrame
print(df)


        id                                              title  year  \
0        1           Topology Constraints in Graphical Models  2012   
1        2  Clustering Aggregation as Maximum-Weight Indep...  2012   
2        3  FastEx: Hash Clustering with Exponential Families  2012   
3        4  The Bethe Partition Function of Log-supermodul...  2012   
4        5    Selective Labeling via Error Bound Minimization  2012   
...    ...                                                ...   ...   
3418  3419  Analysis of Brain States from Multi-Region LFP...  2014   
3419  3420       Clamping Variables and Approximate Inference  2014   
3420  3421  Neural Word Embedding as Implicit Matrix Facto...  2014   
3421  3422  Constrained convex minimization via model-base...  2014   
3422  3423  A Filtering Approach to Stochastic Variational...  2014   

                                                    url  \
0     https://proceedings.neurips.cc/paper_files/pap...   
1     https://proceedings.neu

(1, 2012)
(2, 2012)
(3, 2012)
(4, 2012)
(5, 2012)
(6, 2012)
(7, 2012)
(8, 2012)
(9, 2012)
(10, 2012)
(11, 2012)
(12, 2012)
(13, 2012)
(14, 2012)
(15, 2012)
(16, 2012)
(17, 2012)
(18, 2012)
(19, 2012)
(20, 2012)
(21, 2012)
(22, 2012)
(23, 2012)
(24, 2012)
(25, 2012)
(26, 2012)
(27, 2012)
(28, 2012)
(29, 2012)
(30, 2012)
(31, 2012)
(32, 2012)
(33, 2012)
(34, 2012)
(35, 2012)
(36, 2012)
(37, 2012)
(38, 2012)
(39, 2012)
(40, 2012)
(41, 2012)
(42, 2012)
(43, 2012)
(44, 2012)
(45, 2012)
(46, 2012)
(47, 2012)
(48, 2012)
(49, 2012)
(50, 2012)
(51, 2012)
(52, 2012)
(53, 2012)
(54, 2012)
(55, 2012)
(56, 2012)
(57, 2012)
(58, 2012)
(59, 2012)
(60, 2012)
(61, 2012)
(62, 2012)
(63, 2012)
(64, 2012)
(65, 2012)
(66, 2012)
(67, 2012)
(68, 2012)
(69, 2012)
(70, 2012)
(71, 2012)
(72, 2012)
(73, 2012)
(74, 2012)
(75, 2012)
(76, 2012)
(77, 2012)
(78, 2012)
(79, 2012)
(80, 2012)
(81, 2012)
(82, 2012)
(83, 2012)
(84, 2012)
(85, 2012)
(86, 2012)
(87, 2012)
(88, 2012)
(89, 2012)
(90, 2012)
(91, 2012)
(92, 201

In [179]:
# Load the crawled paper records from dataset.json into `papers_data`.
# NOTE(review): the database-insert cell reads `papers_data`, so this cell
# has to be executed before it.
with open('dataset.json', 'r') as dataset_file:
    papers_data = json.load(dataset_file)
