# JSON -> CSV
> ### Helper functions for formatting the nested dictionaries found in each JSON file.


In [2]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Record the wall-clock start time; the final cell prints the elapsed time
# as `datetime.now() - start1`.
from datetime import datetime
start1 = datetime.now()

In [3]:
def format_name(author):
    """Return an author's display name as "first middle last".

    Args:
        author: Author record with a ``'first'`` string, a ``'middle'`` list
            of strings and a ``'last'`` string.

    Returns:
        The name parts joined by single spaces. Parts that are empty or
        whitespace-only are skipped, so a record with a missing first or
        last name no longer produces leading/trailing spaces
        (``"Sarah"`` instead of ``"Sarah "``, ``"Gold"`` instead of
        ``" Gold"``).
    """
    parts = [author['first'], *author['middle'], author['last']]
    return " ".join(part.strip() for part in parts if part and part.strip())


def format_affiliation(affiliation):
    """Render an affiliation dict as a single comma-separated string.

    Args:
        affiliation: Dict that may contain an ``'institution'`` string and a
            ``'location'`` dict; either key may be missing or empty.

    Returns:
        The institution (when truthy) followed by every value of the
        location dict (when truthy), joined with ", ". An empty string is
        returned when neither is present.
    """
    parts = []

    institution = affiliation.get('institution')
    if institution:
        parts.append(institution)

    location = affiliation.get('location')
    if location:
        parts += list(location.values())

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Join a list of author records into one comma-separated string.

    Args:
        authors: Iterable of author dicts accepted by ``format_name``; when
            ``with_affiliation`` is true each must also carry an
            ``'affiliation'`` dict accepted by ``format_affiliation``.
        with_affiliation: When true, append " (affiliation)" after each name
            whose formatted affiliation is non-empty.

    Returns:
        The formatted authors joined with ", ".
    """
    def _render(author):
        # Name alone, or "name (affiliation)" when requested and available.
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join(_render(author) for author in authors)

def format_body(body_text):
    """Flatten a list of paragraph dicts into one text grouped by section.

    Args:
        body_text: Iterable of dicts, each with a ``'section'`` title and a
            ``'text'`` paragraph.

    Returns:
        For every distinct section, in order of first appearance: the
        section title, a blank line, all of that section's paragraphs
        concatenated without a separator, and another blank line.
    """
    grouped = {}
    for entry in body_text:
        grouped.setdefault(entry['section'], []).append(entry['text'])

    chunks = []
    for heading, paragraphs in grouped.items():
        chunks.extend((heading, "\n\n", "".join(paragraphs), "\n\n"))

    return "".join(chunks)

def format_bib(bibs):
    """Format bibliography entries as one "; "-separated string.

    Args:
        bibs: Either a dict mapping reference ids to entry dicts, or an
            iterable of entry dicts. Each entry needs ``'title'``,
            ``'authors'``, ``'venue'`` and ``'year'`` keys.

    Returns:
        Each entry rendered as "title, authors, venue, year" (authors
        without affiliations), with the entries joined by "; ".

    The input is not modified.
    """
    # isinstance also accepts dict subclasses (e.g. OrderedDict), which an
    # exact type comparison would wrongly treat as a list of entries.
    if isinstance(bibs, dict):
        bibs = bibs.values()

    formatted = []
    for bib in bibs:
        # Build the author string locally instead of writing it back into
        # the entry; this avoids deep-copying every bibliography.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [str(bib['title']), authors, str(bib['venue']), str(bib['year'])]
        formatted.append(", ".join(fields))

    return "; ".join(formatted)

In [4]:
def load_files(dirname):
    """Parse every file in ``dirname`` as JSON and return the results.

    Args:
        dirname: Path of the directory to read. Every entry returned by
            ``os.listdir`` is opened and parsed, so the directory is
            expected to contain only JSON files.

    Returns:
        List of parsed JSON objects, in ``os.listdir`` order.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # os.path.join works whether or not dirname ends with a separator.
        path = os.path.join(dirname, filename)
        # The context manager closes each handle as soon as it is parsed
        # instead of leaving thousands of files open until garbage collection.
        with open(path, 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

def generate_clean_df(all_files):
    """Flatten parsed paper dicts into a DataFrame with one row per paper.

    Args:
        all_files: Iterable of paper dicts with ``'paper_id'``,
            ``'metadata'`` (holding ``'title'`` and ``'authors'``),
            ``'abstract'``, ``'body_text'`` and ``'bib_entries'`` keys.

    Returns:
        pandas.DataFrame with the columns ``paper_id``, ``title``,
        ``authors``, ``affiliations`` (authors with affiliations),
        ``abstract``, ``text``, ``bibliography``, ``raw_authors`` and
        ``raw_bibliography``. The two ``raw_*`` columns hold the original
        unformatted objects.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography']

    cleaned_files = []

    for file in tqdm(all_files):
        paper_id = file['paper_id']
        metadata = file['metadata']
        authors = metadata['authors']

        cleaned_files.append([
            paper_id,
            metadata['title'],
            format_authors(authors),
            format_authors(authors, with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            authors,
            file['bib_entries'],
        ])

    # The original also called clean_df.head() here and discarded the
    # result; that no-op has been dropped.
    return pd.DataFrame(cleaned_files, columns=col_names)

> ### Biorxiv- Exploration:


In [5]:
# Directory holding the bioRxiv/medRxiv files; list its entries and report
# how many there are (each entry is counted as one article).
biorxiv_dir = '2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 803


In [6]:
# Parse every bioRxiv/medRxiv file into a dict and collect them in all_files.
all_files = []

for filename in filenames:
    # os.path.join works whether or not biorxiv_dir ends with a separator.
    filename = os.path.join(biorxiv_dir, filename)
    # Close each handle right after parsing instead of leaking it.
    with open(filename, 'rb') as handle:
        file = json.load(handle)
    all_files.append(file)

In [7]:
# Use the first parsed paper as the running example and show its top-level keys.
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


> ### Biorxiv- Abstract:

In [8]:
# The abstract is a list of paragraph dicts ('section', 'text', plus
# 'cite_spans' and 'ref_spans').
pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'word count: 194 22 Text word count: 5168 23 24 25 author/funder. '
          'All rights reserved. No reuse allowed without permission. Abstract '
          '27 The positive stranded RNA genomes of picornaviruses comprise a '
          'single large open reading 28 frame flanked by 5′ and 3′ '
          'untranslated regions (UTRs). Foot-and-mouth disease virus (FMDV) 29 '
          'has an unusually large 5′ UTR (1.3 kb) containing five structural '
          'domains. These include the 30 internal ribosome entry site (IRES), '
          'which facilitates initiation of translation, and the cis-acting 31 '
          'replication element (cre). Less well characterised structures are a '
          '5′ terminal 360 nucleotide 32 stem-loop, a variable length '
          'poly-C-tract of approximately 100-200 nucleotides and a series of '
          '33 two to four tandemly repeated pseudoknots (PKs). We investigated

> ### Biorxiv- body text:

In [9]:
# Inspect the shape of body_text: its container type, how many entries it
# has, and which keys the first entry carries.
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 20
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


In [10]:
# Show the first two body paragraphs; depth=3 collapses deeper nesting
# (the individual span dicts) to "{...}".
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [],
  'ref_spans': [{...}],
  'section': '',
  'text': 'VP3, and VP0 (which is further processed to VP2 and VP4 during '
          'virus assembly) (6). The P2 64 and P3 regions encode the '
          'non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro '
          'and 4 structural protein-coding region is replaced by reporter '
          'genes, allow the study of genome 68 replication without the '
          'requirement for high containment (9, 10) ( figure 1A ).'},
 {'cite_spans': [{...}, {...}, {...}, {...}, {...}, {...}],
  'ref_spans': [],
  'section': '70',
  'text': 'The FMDV 5′ UTR is the largest known picornavirus UTR, comprising '
          'approximately 1300 71 nucleotides and containing several highly '
          'structured regions. The first 360 nucleotides at the 5′ 72 end are '
          'predicted to fold into a single large stem loop termed the '
          'S-fragment, followed by a The PKs were originally predicted 

In [11]:
# Step-by-step walk through of the grouping that format_body performs.
# (section, text) pairs in document order.
texts = [(di['section'], di['text']) for di in file['body_text']]
# One empty accumulator per distinct section title.
texts_di = {di['section']: "" for di in file['body_text']}
# Append each paragraph to its section's accumulator; no separator is
# inserted between paragraphs of the same section.
for section, text in texts:
    texts_di[section] += text

# The distinct section titles, in order of first appearance.
pprint(list(texts_di.keys()))

['',
 '70',
 '120',
 '135',
 '136',
 '144',
 '301',
 'Function of the PKs in replication is dependent on downstream interactions '
 'and 350',
 '368',
 '468',
 '479']


In [12]:
# Assemble the final text: each section title and its accumulated text,
# each followed by a blank line.
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

# Preview only the first 3000 characters.
print(body[:3000])



VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).

70

The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, p

In [13]:
# Same preview produced through the format_body helper, for comparison with
# the manual steps in the previous cells.
print(format_body(file['body_text'])[:3000])



VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).

70

The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, p

> ### Biorxiv- Metadata:

In [14]:
# Keys available in the first paper's metadata.
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


In [15]:
# Title of the first paper, printed as stored in the metadata.
print(all_files[0]['metadata']['title'])

The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3


In [16]:
# Raw author records of the first paper; show the first three.
authors = all_files[0]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {},
  'email': '',
  'first': 'Joseph',
  'last': 'Ward',
  'middle': ['C'],
  'suffix': ''},
 {'affiliation': {},
  'email': '',
  'first': 'Lidia',
  'last': 'Lasecka-Dykes',
  'middle': [],
  'suffix': ''},
 {'affiliation': {},
  'email': '',
  'first': 'Chris',
  'last': 'Neil',
  'middle': [],
  'suffix': ''}]


In [17]:
# Format every author's name and affiliation individually, separated by a
# blank line, to check the helpers on real records.
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Joseph C Ward
Affiliation: 

Name: Lidia Lasecka-Dykes
Affiliation: 

Name: Chris Neil
Affiliation: 

Name: Oluwapelumi Adeyemi
Affiliation: 

Name: Sarah 
Affiliation: 

Name:  Gold
Affiliation: 

Name: Niall Mclean
Affiliation: 

Name: Caroline Wright
Affiliation: 

Name: Morgan R Herod
Affiliation: 

Name: David Kealy
Affiliation: 

Name: Emma 
Affiliation: 

Name: Warner 
Affiliation: 

Name: Donald P King
Affiliation: 

Name: Tobias J Tuthill
Affiliation: 

Name: David J Rowlands
Affiliation: 

Name: Nicola J 
Affiliation: 

Name: Stonehouse A#
Affiliation: 



In [18]:
# Metadata of the fifth paper, limited to four levels of nesting.
pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': 'Chinese Academy of Sciences',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Xiaoyang',
              'last': 'Ji',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'Chinese Academy of Sciences',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Chunming',
              'last': 'Zhang',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'Chinese Academy of Sciences',
                              'laboratory': 'State Key Laboratory of Computer '
                                            'Architecture',
                              'location': {...}},
              'email': '',
              'first': 'Yubo',
              'last': 'Zhai',
 

In [19]:
# Compare both output modes of format_authors on the fifth paper's authors.
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhonghai Zhang, Chunli Zhang, Yiqing Xue, Guangming Tan, Gang Niu

Formatting with affiliation:
Xiaoyang Ji (Chinese Academy of Sciences, Beijing, China), Chunming Zhang (Chinese Academy of Sciences, Beijing, China), Yubo Zhai (Chinese Academy of Sciences, Beijing, China), Zhonghai Zhang, Chunli Zhang (Phil Rivers Technology, Beijing, China), Yiqing Xue (Chinese Academy of Sciences, Beijing, China), Guangming Tan (Chinese Academy of Sciences, Beijing, China), Gang Niu (Chinese Academy of Sciences, Beijing, China)


> ### Biorxiv- bibliography:

In [20]:
# bib_entries is a dict; keep only its values (the entry dicts) and show
# the first two, limited to four levels of nesting.
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [{'first': 'T', 'last': 'Jackson', 'middle': [], 'suffix': ''},
              {'first': 'T', 'last': 'Tuthill', 'middle': [...], 'suffix': ''},
              {'first': 'D', 'last': 'Rowlands', 'middle': [...], 'suffix': ''},
              {'first': 'N',
               'last': 'Stonehouse',
               'middle': [...],
               'suffix': ''}],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': 'Genetic economy in 598 picornaviruses: Foot-and-mouth disease '
           'virus replication exploits alternative precursor 599 cleavage '
           'pathways',
  'venue': 'PLOS Pathog',
  'volume': '13',
  'year': 2017},
 {'authors': [{'first': 'N',
               'last': 'Sanderson',
               'middle': [...],
               'suffix': ''},
              {'first': 'N', 'last': 'Knowles', 'middle': [...], 'suffix': ''},
              {'first': 'D', 'last': 'King', 'middle': [...], 'suffix': ''},
              {'first': 'E', 'last': 'Cottam', 

In [21]:
# Bibliography authors use the same first/middle/last fields, so
# format_authors can be reused for them without affiliations.
format_authors(bibs[1]['authors'], with_affiliation=False)

'N D Sanderson, N J Knowles, D P King, E M Cottam'

In [22]:
# Format the first five bibliography entries into one "; "-separated string.
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Genetic economy in 598 picornaviruses: Foot-and-mouth disease virus replication exploits alternative precursor 599 cleavage pathways, T Jackson, T J Tuthill, D J Rowlands, N J Stonehouse, PLOS Pathog, 2017; A universal protocol to 602 generate consensus level genome sequences for foot-and-mouth disease virus and other 603 positive-sense polyadenylated RNA viruses using the Illumina MiSeq, N D Sanderson, N J Knowles, D P King, E M Cottam, BMC Genomics, 2014; Library preparation for highly accurate population 606 sequencing of RNA viruses, A Acevedo, R Andino, Nat Protoc, 2014; IDBA-UD: a de novo assembler for 608 single-cell and metagenomic sequencing data with highly uneven depth, Y Peng, Hcm Leung, S M Yiu, Fyl Chin, , 2012; Basic local alignment 611 search tool, S F Altschul, W Gish, W Miller, E W Myers, D J Lipman, J Mol Biol, 1990


> ## Biorxiv- Generate CSV:

In [27]:
# Build one row of formatted fields per paper.
# NOTE(review): this repeats the loop inside generate_clean_df, which the
# later cells call for the other subsets.
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        # Authors without, then with, affiliations.
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        # Unformatted originals, kept alongside the formatted fields.
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

100%|███████████████████████████████████████████████████████████████████████████████| 803/803 [00:02<00:00, 286.91it/s]


In [28]:
# Column names, in the same order as the entries of each `features` row.
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

# Assemble the rows into a DataFrame and preview the first rows.
clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",Abstract\n\nword count: 194 22 Text word count...,"\n\nVP3, and VP0 (which is further processed t...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic..."
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, † , ...","Hanchu Zhou (City University of Hong Kong, Hon...",,Introduction\n\nThe 2019-nCoV epidemic has spr...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H..."
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (University of Georgia, 30602, A...",Abstract\n\nInfectious bronchitis (IB) causes ...,"Introduction\n\nInfectious bronchitis (IB), wh...",Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen..."
3,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (University of Southampton, UK), ...",Abstract\n\nBackground: A novel coronavirus (2...,"Introduction\n\nIn December 2019, a cluster of...",A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel..."
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Chinese Academy of Sciences, Beij...",Abstract\n\nFaced with the current large-scale...,Introduction\n\nThe sudden outbreak of the new...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum..."


In [32]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs('2020-03-13/biorxiv', exist_ok=True)
clean_df.to_csv('2020-03-13/biorxiv/biorxiv_clean.csv', index=False)

In [33]:
# PMC custom-license subset: parse every file in the directory, flatten the
# papers into a DataFrame and preview the first rows.
pmc_dir = '2020-03-13/pmc_custom_license/pmc_custom_license/'
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.head()

100%|█████████████████████████████████████████████████████████████████████████████| 1426/1426 [00:03<00:00, 359.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1426/1426 [00:05<00:00, 238.08it/s]


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,002f09dfc9a1323a15bf72e349d8b733ac97a2aa,,,,,\n\nT he modern word camel is derived from the...,Ancient and modern DNA reveal dynamics of dome...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Ancient..."
1,0036e8891c93ae63611bde179ada1e03e8577dea,Stable Occupancy of the Crimean-Congo Hemorrha...,"Florine E M Scholte, Brian L Hua, Jessica R Sp...",Florine E M Scholte (National Center for Emerg...,Abstract\n\nCrimean-Congo hemorrhagic fever vi...,\n\nto Western Europe with the assistance of m...,Seroepidemiological studies of Crimean-Congo h...,"[{'first': 'Florine', 'middle': ['E M'], 'last...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Seroepi..."
2,00573277e6be50669016f770bc28ec2da0639a8f,Asymptomatic Severe Acute Respiratory Syndrome...,,,,\n\nWe identified a nurse who was asymptomatic...,Coronavirus as a possible cause of severe acut...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Coronav..."
3,00683d59d56123ae85e080d00ef1b3edd3f7405d,A Rift Valley fever (RVF) epidemic affecting a...,"Raphaëlle Métras, Marc Baguelin, W John Edmund...","Raphaëlle Métras, Marc Baguelin, W John Edmund...",Abstract\n\nThe first cases occurred after hea...,R ift Valley fever (RVF) is a zoonotic arbovir...,Rift Valley fever virus (Bunyaviridae: Phlebov...,"[{'first': 'Raphaëlle', 'middle': [], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Rift Va..."
4,0104f6ceccf92ae8567a0102f89cbb976969a774,BMC Medical Genetics Association of HLA class ...,"Marie Lin, Hsiang-Kuang Tseng, Jean A Trejaut,...","Marie Lin (Mackay Memorial Hospital, Taipei, T...",Abstract\n\nThe human leukocyte antigen (HLA) ...,"\n\npatient group, a further significant incre...",for surveillance of severe acute respiratory s...,"[{'first': 'Marie', 'middle': [], 'last': 'Lin...","{'BIBREF0': {'ref_id': 'b0', 'title': 'for sur..."


In [34]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs('2020-03-13/biorxiv', exist_ok=True)
pmc_df.to_csv('2020-03-13/biorxiv/clean_pmc.csv', index=False)

In [36]:
# Commercial-use subset: parse every file in the directory, flatten the
# papers into a DataFrame and preview the first rows.
comm_dir = '2020-03-13/comm_use_subset/comm_use_subset/'
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)
comm_df.head()

100%|█████████████████████████████████████████████████████████████████████████████| 9000/9000 [00:33<00:00, 266.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 9000/9000 [01:09<00:00, 129.40it/s]


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,Supplementary Information An eco-epidemiologic...,"Julien Mélade, Nicolas Wieseke 4#, Beza Ramazi...","Julien Mélade (2 rue Maxime Rivière, 97490 Sai...",,\n\n- Figure S1 : Phylogeny of all sequences b...,"NDV/HQ266603/Chicken/1992, , , None; MuV/FJ375...","[{'first': 'Julien', 'middle': [], 'last': 'Mé...","{'BIBREF32': {'ref_id': 'b32', 'title': 'NDV/H..."
1,00142f93c18b07350be89e96372d240372437ed9,immunity to pathogens taught by specialized hu...,"Elisabetta Padovan, Marina Cella, Shahram Sale...","Elisabetta Padovan, Marina Cella, Shahram Sale...",Abstract\n\nDendritic cells (DCs) are speciali...,\n\niNTRODUCTiON Human beings are constantly e...,The dendritic cell system and its role in immu...,"[{'first': 'Elisabetta', 'middle': [], 'last':...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The den..."
2,0022796bb2112abd2e6423ba2d57751db06049fb,Public Health Responses to and Challenges for ...,"Elvina Viennet, Scott A Ritchie, Craig R Willi...",Elvina Viennet (The Australian National Univer...,Abstract\n\nDengue has a negative impact in lo...,Introduction\n\nPathogens and vectors can now ...,"The global distribution and burden of dengue, ...","[{'first': 'Elvina', 'middle': [], 'last': 'Vi...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The glo..."
3,00326efcca0852dc6e39dc6b7786267e1bc4f194,a section of the journal Frontiers in Pediatri...,"Jan Hau Lee, Oguz Dursun, Phuc Huu Phan, Yek K...","Jan Hau Lee, Oguz Dursun, Phuc Huu Phan, Yek K...","Abstract\n\nFifteen years ago, United Nations ...",\n\nIn addition to preventative care and nutri...,"Global, regional, and national levels of neona...","[{'first': 'Jan', 'middle': ['Hau'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Global,..."
4,00352a58c8766861effed18a4b079d1683fec2ec,MINI REVIEW Function of the Deubiquitinating E...,"Molly Hodul, Caroline L Dahlberg, Peter Juo, C...","Molly Hodul (Tufts University, Boston, MA, Uni...",Abstract\n\nPosttranslational modification of ...,INTRODUCTION\n\nUbiquitination is a widely use...,Regulation of AMPA receptor trafficking and sy...,"[{'first': 'Molly', 'middle': [], 'last': 'Hod...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Regulat..."


In [37]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs('2020-03-13/biorxiv', exist_ok=True)
comm_df.to_csv('2020-03-13/biorxiv/clean_comm_use.csv', index=False)

In [38]:
# Non-commercial-use subset: parse every file in the directory, flatten the
# papers into a DataFrame and preview the first rows.
noncomm_dir = '2020-03-13/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)
noncomm_df.head()

100%|█████████████████████████████████████████████████████████████████████████████| 1973/1973 [00:05<00:00, 368.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1973/1973 [00:11<00:00, 178.25it/s]


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0036b28fddf7e93da0970303672934ea2f9944e7,The fecal microbiota and unconjugated fecal bi...,"B Ruggerone, A C Manchester, F Del Baldo, F Fr...","B Ruggerone (University of Thessaly, Karditsa,...",Abstract\n\nand Blautia (P = 0.008) significan...,\n\nhuman type 1 DM. The aim of this study was...,Significant Feline Proteinuria: a retrospectiv...,"[{'first': 'B', 'middle': [], 'last': 'Ruggero...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Signifi..."
1,005c43980edf3fcc2a4d12ee7ad630ddb651ce6e,Development of a smartphone-based rapid dual f...,"Seon-Ju Yeo, Homan Kang, Tung Duy Dao, Thi Bui...","Seon-Ju Yeo (Wonkwang University, 570-749, Iks...",Abstract\n\nAccurate and rapid diagnosis of hi...,Introduction\n\nAvian influenza (AI) viruses a...,"Avian and other zoonotic influenza, , , 2018; ...","[{'first': 'Seon-Ju', 'middle': [], 'last': 'Y...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Avian a..."
2,006be99e337c84b8758591a54f0362353b24dfde,Regulated Export of a Secretory Protein from t...,Stephen S Macintyre,Stephen S Macintyre (Case Western Reserve Univ...,"Abstract\n\n253 protein, C-reactive protein (C...",\n\nAbstract. The half-time for secretion of t...,Studies on the binding specificity of human C-...,"[{'first': 'Stephen', 'middle': ['S'], 'last':...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Studies..."
3,00a00d0edc750db4a0c299dd1ec0c6871f5a4f24,,,,,Editorial\n\nThis is an Open Access article di...,"Report of the Ebola Interim Assessment Panel, ...",[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Report ..."
4,00e5a723d44eb9f2698c38b518eff85c00f9753b,,"Vijay Harypursat, Yao-Kai Chen, Yao-Kai, Chen",Vijay Harypursat (Chongqing Public Health Medi...,,\n\nCoronaviruses have in the past been known ...,Efficient replication of the novel human betac...,"[{'first': 'Vijay', 'middle': [], 'last': 'Har...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Efficie..."


In [39]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs('2020-03-13/biorxiv', exist_ok=True)
noncomm_df.to_csv('2020-03-13/biorxiv/clean_noncomm_use.csv', index=False)

In [None]:
# Report the notebook's total wall-clock runtime, measured from `start1`.
# The former `stop = timeit.default_timer()` line was dropped: `timeit` is
# never imported, so it raised NameError, and `stop` was never read.
print(datetime.now() - start1)