# RePORTER data

[RePORTER data](https://reporter.nih.gov/exporter/): publications + funding data of NIH projects (US) from 1980 until 2022. 

Analysis time window: `1985 - 2022`.

## Imports

In [1]:
import os
import json
import zipfile
import pandas as pd

## Scraper

[pynih](https://github.com/jermwatt/pynih) is a useful library designed to extract data from the [NIH RePORTER API](https://api.reporter.nih.gov/). However, it was not utilized for the Hackathon, where zip files were manually downloaded.

## Functions

In [2]:
def create_folder(folder_path):
    # Specify the folder containing the zip files
    # Create the extraction folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

In [3]:
def create_general_folders():
    folders = []
    for folder in ['raw', 'input', 'processing', 'output']:
        folder_path = os.path.join('data/', folder)
        folders.append(create_folder(folder_path)) 
    return folders   

In [4]:
def create_extraction_folders(document):
    # Specify the folder containing the zip files
    zip_folder = f'/{document}/'
    # Specify the folder where you want to extract the files
    extracted_folder = f'data/input/{document}/'
    # Create the extraction folder if it doesn't exist
    os.makedirs(zip_folder, exist_ok=True)
    os.makedirs(extracted_folder, exist_ok=True)
    print(f'Input folder: {zip_folder}')
    print(f'Output folder: {zip_folder}')
    return zip_folder, extracted_folder

In [5]:
def unzip_files(zip_folder, extracted_folder):
    # List all zip files in the zip folder
    zip_files = [f for f in os.listdir(zip_folder) if f.endswith('.zip')]

    # Loop through zip files and extract contents
    count = 0
    for zip_file in zip_files:
        zip_path = os.path.join(zip_folder, zip_file)
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder)
        count += 1 
    print(f'{count} zips extracted successfully!')

In [6]:
def create_df_from_csv(folders, document, project_numbers=''):
    extracted_folder =  os.path.join(folders[1], document)
    # Create a DataFrame to store the data from the CSVs
    df_list = []
    # Loop through extracted CSV files and read them into the DataFrame
    for csv_file in os.listdir(extracted_folder):
        if csv_file.endswith('.csv'):
            csv_path = os.path.join(extracted_folder, csv_file)
            try:
                # Open and read the file manually with error handling
                with open(csv_path, 'r', encoding='utf-8', errors='replace') as file:
                    df = pd.read_csv(file, on_bad_lines='skip', encoding='utf-8', engine='python')
                    if 'projects' in csv_path and project_numbers != '':
                        df = df[df["CORE_PROJECT_NUM"].isin(project_numbers)]
                        # df = pd.merge(codes_df, df, on='pmid', how='left')
                df_list.append(df)
            except Exception as e:
                print(f"Error reading {csv_path}: {e}")

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(df_list, ignore_index=True)
    print(f'Number of {document}: {final_df.shape[0]}')
    return final_df


## Paths

In [7]:
folders = create_general_folders() # 'raw', 'input', 'processing', 'output'
folders

['data/raw', 'data/input', 'data/processing', 'data/output']

## Unzip files

In [8]:
zip_folder = os.path.join(folders[0], 'publications')
extracted_folder = os.path.join(folders[1], 'publications')

In [9]:
# UNZIP FILES - optional
# if any(os.scandir(zip_folder)):
#    unzip_files(zip_folder, extracted_folder)
# else:
#    print('No zips!')

## Data

### Count Data

In [10]:
json_file_path = 'data/results_for_analysis.json'

In [11]:
# Open the JSON file
with open(json_file_path, 'r') as file:
    # Load the JSON data
    data = json.load(file)

In [12]:
count_data = pd.DataFrame.from_dict(data)
count_data.head(2)

Unnamed: 0,pmcid,agg_sentence_index,agg_n_fem,agg_n_male,agg_perc_fem,agg_perc_male,agg_sample,clean_n_fem,clean_n_male,clean_perc_fem,...,article_categories,article_title,authors,copyright_info,funding,publisher_id,doi,journal_title,keywords,publication_date
0,PMC9683380,"[3, 5, 6, 7, 8]","[null, null, ""\""352\"""", null, null]","[null, null, ""\""328\"""", null, null]","[null, null, null, null, null]","[null, null, null, null, null]","[""\""##3\"""", ""\""##3\"""", ""\""70\"""", ""\""##6\"""", ""\...",[352],[328],[],...,"[""Endocrinology""]",Differential diagnostic value of plain CT scan...,"[""Zhijiang Han"", ""Mengwei Wu"", ""Peiying Wei"", ...","Copyright © 2022 Han, Wu, Wei, Zhu, Zhang, Din...",[],,10.3389/fendo.2022.1007870,Frontiers in Endocrinology,"[""adrenal gland neoplasms"", ""adrenal adenoma"",...",2022-11-09 00:00:00
1,PMC5137654,"[1, 3, 4]","[""\""30\"""", null, null]","[""\""30\"""", null, null]","[null, null, null]","[null, null, null]","[""\""60\"""", ""\""16\"""", ""\""46\""""]",[30],[30],[],...,"[""Spine""]",Posterior hemivertebra resection and monosegme...,"[""X Zhu"", ""X Wei"", ""J Chen"", ""C Li"", ""M Li"", ""...",Copyright © 2013 Royal College of Surgeons,[],650173.0,10.1308/003588414X13824511650173,Annals of The Royal College of Surgeons of Eng...,"[""Hemivertebra resection"", ""Monosegmental fusi...",


In [13]:
count_data.shape[0]

166011

In [14]:
count_data.columns

Index(['pmcid', 'agg_sentence_index', 'agg_n_fem', 'agg_n_male',
       'agg_perc_fem', 'agg_perc_male', 'agg_sample', 'clean_n_fem',
       'clean_n_male', 'clean_perc_fem', 'clean_perc_male', 'clean_sample',
       'max_n_fem', 'max_n_male', 'max_perc_fem', 'max_perc_male',
       'max_sample', 'article_categories', 'article_title', 'authors',
       'copyright_info', 'funding', 'publisher_id', 'doi', 'journal_title',
       'keywords', 'publication_date'],
      dtype='object')

### Filters

In [15]:
pmc_ids_df = pd.read_csv('data/pmcids_dois_from_counts_data.txt')
pmc_ids_df.head(2)

Unnamed: 0,pmcid,doi
0,PMC9683380,10.3389/fendo.2022.1007870
1,PMC5137654,10.1308/003588414X13824511650173


In [16]:
pmc_ids_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166011 entries, 0 to 166010
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   pmcid   166011 non-null  object
 1   doi     162158 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [17]:
pmc_ids = [x.replace('PMC', '') for x in pmc_ids_df.pmcid.unique()]
pmc_ids_df['pmcid'] = pmc_ids
print(f'Unique pmc ids: {len(pmc_ids)}')

Unique pmc ids: 166011


### Link Tables

In [18]:
link_tables = create_df_from_csv(folders, 'link_tables')
link_tables.head()

Number of link_tables: 6437869


Unnamed: 0,PMID,PROJECT_NUMBER
0,3968312,R01HL025826
1,3889259,R01HD016292
2,3918136,R01HD011011
3,3864856,T32HL007535
4,4064865,R23AM031070


In [19]:
# Convert column names to lowercase
link_tables.columns = link_tables.columns.str.lower()

In [20]:
link_tables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6437869 entries, 0 to 6437868
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   pmid            int64 
 1   project_number  object
dtypes: int64(1), object(1)
memory usage: 98.2+ MB


In [21]:
link_tables['pmid'] = link_tables['pmid'].fillna(0).astype(str)

### Publications

In [22]:
publications = create_df_from_csv(folders, 'publications')
publications.columns

Number of publications: 2755776


Index(['AFFILIATION', 'AUTHOR_LIST', 'COUNTRY', 'ISSN', 'JOURNAL_ISSUE',
       'JOURNAL_TITLE', 'JOURNAL_TITLE_ABBR', 'JOURNAL_VOLUME', 'LANG',
       'PAGE_NUMBER', 'PMC_ID', 'PMID', 'PUB_DATE', 'PUB_TITLE', 'PUB_YEAR'],
      dtype='object')

In [23]:
# Convert column names to lowercase
publications.columns = publications.columns.str.lower()

In [24]:
publications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755776 entries, 0 to 2755775
Data columns (total 15 columns):
 #   Column              Dtype  
---  ------              -----  
 0   affiliation         object 
 1   author_list         object 
 2   country             object 
 3   issn                object 
 4   journal_issue       object 
 5   journal_title       object 
 6   journal_title_abbr  object 
 7   journal_volume      object 
 8   lang                object 
 9   page_number         object 
 10  pmc_id              float64
 11  pmid                int64  
 12  pub_date            object 
 13  pub_title           object 
 14  pub_year            int64  
dtypes: float64(1), int64(2), object(12)
memory usage: 315.4+ MB


In [25]:
publications['pmc_id'] = publications['pmc_id'].fillna(0).astype(int).astype(str)

In [26]:
publications['pmid'] = publications['pmid'].fillna(0).astype(int).astype(str)

### Codes

In [27]:
merged_df1 = pd.merge(pmc_ids_df, publications, left_on='pmcid', right_on='pmc_id', how='left')
merged_df1.head()

Unnamed: 0,pmcid,doi,affiliation,author_list,country,issn,journal_issue,journal_title,journal_title_abbr,journal_volume,lang,page_number,pmc_id,pmid,pub_date,pub_title,pub_year
0,9683380,10.3389/fendo.2022.1007870,,,,,,,,,,,,,,,
1,5137654,10.1308/003588414X13824511650173,,,,,,,,,,,,,,,
2,7906844,10.5395/rde.2021.46.e7,,,,,,,,,,,,,,,
3,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6.0,PloS one,PLoS One,7.0,eng,e39725,3387267.0,22768113.0,2012.0,NOTCH1 signaling promotes human T-cell acute l...,2012.0
4,4221596,10.1038/mtna.2014.36,,,,,,,,,,,,,,,


In [28]:
print(f'Merged df: {merged_df1.shape[0]}')

Merged df: 166044


In [29]:
merged_df1.columns

Index(['pmcid', 'doi', 'affiliation', 'author_list', 'country', 'issn',
       'journal_issue', 'journal_title', 'journal_title_abbr',
       'journal_volume', 'lang', 'page_number', 'pmc_id', 'pmid', 'pub_date',
       'pub_title', 'pub_year'],
      dtype='object')

In [30]:
merged_df1.isnull().sum()

pmcid                      0
doi                     3853
affiliation           144332
author_list           143857
country               143939
issn                  144114
journal_issue         149221
journal_title         143852
journal_title_abbr    143852
journal_volume        143924
lang                  143852
page_number           145676
pmc_id                143852
pmid                  143852
pub_date              143852
pub_title             143852
pub_year              143852
dtype: int64

In [31]:
# Convert column names to lowercase
merged_df1.columns = merged_df1.columns.str.lower()

In [32]:
partial_df = merged_df1[(merged_df1['pmcid'].notna()) & (merged_df1['pmc_id'].notna())]
print(f'Partial df: {partial_df.shape[0]}')

Partial df: 22192


In [33]:
merged_df2 = pd.merge(partial_df, link_tables, on='pmid', how='left')
merged_df2.head()

Unnamed: 0,pmcid,doi,affiliation,author_list,country,issn,journal_issue,journal_title,journal_title_abbr,journal_volume,lang,page_number,pmc_id,pmid,pub_date,pub_title,pub_year,project_number
0,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,7,eng,e39725,3387267,22768113,2012,NOTCH1 signaling promotes human T-cell acute l...,2012.0,K08CA133103
1,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,7,eng,e39725,3387267,22768113,2012,NOTCH1 signaling promotes human T-cell acute l...,2012.0,P01CA068484
2,7363900,10.1038/s41398-020-00916-0,"Department of Psychiatry, University of Califo...","Wierenga, Christina E; Bischoff-Grethe, Amanda...",United States,2158-3188,1,Translational psychiatry,Transl Psychiatry,10,eng,236,7363900,32669557,2020 07 16,Increased anticipatory brain response to pleas...,2020.0,R01MH113588
3,7363900,10.1038/s41398-020-00916-0,"Department of Psychiatry, University of Califo...","Wierenga, Christina E; Bischoff-Grethe, Amanda...",United States,2158-3188,1,Translational psychiatry,Transl Psychiatry,10,eng,236,7363900,32669557,2020 07 16,Increased anticipatory brain response to pleas...,2020.0,R01MH092793
4,7363900,10.1038/s41398-020-00916-0,"Department of Psychiatry, University of Califo...","Wierenga, Christina E; Bischoff-Grethe, Amanda...",United States,2158-3188,1,Translational psychiatry,Transl Psychiatry,10,eng,236,7363900,32669557,2020 07 16,Increased anticipatory brain response to pleas...,2020.0,F32MH108311


In [34]:
len(merged_df2.pmcid.unique())

22159

In [35]:
merged_df2.columns

Index(['pmcid', 'doi', 'affiliation', 'author_list', 'country', 'issn',
       'journal_issue', 'journal_title', 'journal_title_abbr',
       'journal_volume', 'lang', 'page_number', 'pmc_id', 'pmid', 'pub_date',
       'pub_title', 'pub_year', 'project_number'],
      dtype='object')

In [36]:
codes_df = merged_df2[['pmcid', 'doi', 'pmid', 'project_number']]
codes_df.head()

Unnamed: 0,pmcid,doi,pmid,project_number
0,3387267,10.1371/journal.pone.0039725,22768113,K08CA133103
1,3387267,10.1371/journal.pone.0039725,22768113,P01CA068484
2,7363900,10.1038/s41398-020-00916-0,32669557,R01MH113588
3,7363900,10.1038/s41398-020-00916-0,32669557,R01MH092793
4,7363900,10.1038/s41398-020-00916-0,32669557,F32MH108311


In [61]:
print(f"Unique pmc-ids: {len(codes_df.pmcid.unique())}")

Unique pmc-ids: 22159


### Projects

In [37]:
projects_number = list(codes_df.project_number.unique())
print(type(projects_number))

<class 'list'>


In [38]:
print(len(projects_number))

33207


In [39]:
projects = create_df_from_csv(folders, 'projects', projects_number)  # filter: pmc_ids

Number of projects: 616399


In [40]:
projects.tail()

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,...,SUFFIX,SUPPORT_YEAR,TOTAL_COST,TOTAL_COST_SUB_PROJECT,FUNDING_MECHANISM,FUNDING_Ics,DIRECT_COST_AMT,INDIRECT_COST_AMT,ORG_IPF_CODE,OPPORTUNITY NUMBER
616394,10869852,P20,GM,5.0,N,2022-07-11,2023-05-02,2023-06-30,,P20GM130454,...,,4.0,,58761.0,RESEARCH CENTERS,,36050.0,22711.0,2021601.0,PAR-16-415
616395,10873385,R35,GM,7.0,N,2023-08-04,2023-08-01,2023-08-31,859.0,R35GM137904,...,,4.0,51729.0,,NON-SBIR/STTR RPGS,,32534.0,19195.0,2059802.0,PA-21-268
616396,10875228,R01,GM,7.0,N,2023-08-24,2023-01-01,2023-02-28,859.0,R01GM137656,...,,4.0,112546.0,,NON-SBIR/STTR RPGS,,73553.0,38993.0,6297007.0,PA-21-268
616397,10887865,K23,NS,7.0,N,2023-08-11,2022-09-03,2023-08-31,853.0,K23NS114178,...,,5.0,192275.0,,OTHER RESEARCH-RELATED,,178032.0,14243.0,1080401.0,PA-19-119
616398,10910432,R01,HL,7.0,N,2023-08-25,2023-09-01,2024-01-31,837.0,R01HL115195,...,,11.0,698308.0,,NON-SBIR/STTR RPGS,,527713.0,170595.0,490201.0,PA-21-268


In [41]:
projects.columns = projects.columns.str.lower()

In [42]:
projects.columns

Index(['application_id', 'activity', 'administering_ic', 'application_type',
       'arra_funded', 'award_notice_date', 'budget_start', 'budget_end',
       'cfda_code', 'core_project_num', 'ed_inst_type', 'foa_number',
       'full_project_num', 'subproject_id', 'funding_ics', 'fy', 'ic_name',
       'nih_spending_cats', 'org_city', 'org_country', 'org_dept',
       'org_district', 'org_duns', 'org_fips', 'org_name', 'org_state',
       'org_zipcode', 'phr', 'pi_ids', 'pi_names', 'program_officer_name',
       'project_start', 'project_end', 'project_terms', 'project_title',
       'serial_number', 'study_section', 'study_section_name', 'suffix',
       'support_year', 'total_cost', 'total_cost_sub_project',
       'funding_mechanism', 'funding_ics', 'direct_cost_amt',
       'indirect_cost_amt', 'org_ipf_code', 'opportunity number'],
      dtype='object')

### Save partial results

In [43]:
# Partial results
# publications.to_csv('data/processing/publications.csv')
# link_tables.to_csv('data/processing/link_tables.csv')
# projects.to_csv('data/processing/projects.csv')

## Results

In [44]:
link_tables.columns

Index(['pmid', 'project_number'], dtype='object')

In [45]:
publications.columns

Index(['affiliation', 'author_list', 'country', 'issn', 'journal_issue',
       'journal_title', 'journal_title_abbr', 'journal_volume', 'lang',
       'page_number', 'pmc_id', 'pmid', 'pub_date', 'pub_title', 'pub_year'],
      dtype='object')

In [46]:
merged_df2.columns # link_tables + publications

Index(['pmcid', 'doi', 'affiliation', 'author_list', 'country', 'issn',
       'journal_issue', 'journal_title', 'journal_title_abbr',
       'journal_volume', 'lang', 'page_number', 'pmc_id', 'pmid', 'pub_date',
       'pub_title', 'pub_year', 'project_number'],
      dtype='object')

In [47]:
# Remove pmc_id column (duplicate)
are_columns_equal = merged_df2["pmcid"].equals(merged_df2["pmc_id"])

if are_columns_equal:
    print("The two columns contain the same data.")
else:
    print("The two columns do not contain the same data.")

The two columns contain the same data.


In [48]:
merged_df2 = merged_df2.drop("pmc_id", axis=1)

In [49]:
merged_df2.columns # link_tables + publications

Index(['pmcid', 'doi', 'affiliation', 'author_list', 'country', 'issn',
       'journal_issue', 'journal_title', 'journal_title_abbr',
       'journal_volume', 'lang', 'page_number', 'pmid', 'pub_date',
       'pub_title', 'pub_year', 'project_number'],
      dtype='object')

In [50]:
publ_list = ['pub_' + col if 'pub_' not in col else col for col in merged_df2.columns[2:]]
merged_df2.columns = ['pmcid', 'doi'] + publ_list

In [51]:
merged_df2.columns # link_tables + publications

Index(['pmcid', 'doi', 'pub_affiliation', 'pub_author_list', 'pub_country',
       'pub_issn', 'pub_journal_issue', 'pub_journal_title',
       'pub_journal_title_abbr', 'pub_journal_volume', 'pub_lang',
       'pub_page_number', 'pub_pmid', 'pub_date', 'pub_title', 'pub_year',
       'pub_project_number'],
      dtype='object')

In [52]:
print(f"Unique pmc-ids: {len(merged_df2.pmcid.unique())}")

Unique pmc-ids: 22159


In [53]:
projects.columns

Index(['application_id', 'activity', 'administering_ic', 'application_type',
       'arra_funded', 'award_notice_date', 'budget_start', 'budget_end',
       'cfda_code', 'core_project_num', 'ed_inst_type', 'foa_number',
       'full_project_num', 'subproject_id', 'funding_ics', 'fy', 'ic_name',
       'nih_spending_cats', 'org_city', 'org_country', 'org_dept',
       'org_district', 'org_duns', 'org_fips', 'org_name', 'org_state',
       'org_zipcode', 'phr', 'pi_ids', 'pi_names', 'program_officer_name',
       'project_start', 'project_end', 'project_terms', 'project_title',
       'serial_number', 'study_section', 'study_section_name', 'suffix',
       'support_year', 'total_cost', 'total_cost_sub_project',
       'funding_mechanism', 'funding_ics', 'direct_cost_amt',
       'indirect_cost_amt', 'org_ipf_code', 'opportunity number'],
      dtype='object')

In [54]:
proj_list = ['proj_' + col if 'proj_' not in col else col for col in projects.columns]
projects.columns =  proj_list

In [55]:
projects.columns

Index(['proj_application_id', 'proj_activity', 'proj_administering_ic',
       'proj_application_type', 'proj_arra_funded', 'proj_award_notice_date',
       'proj_budget_start', 'proj_budget_end', 'proj_cfda_code',
       'proj_core_project_num', 'proj_ed_inst_type', 'proj_foa_number',
       'proj_full_project_num', 'proj_subproject_id', 'proj_funding_ics',
       'proj_fy', 'proj_ic_name', 'proj_nih_spending_cats', 'proj_org_city',
       'proj_org_country', 'proj_org_dept', 'proj_org_district',
       'proj_org_duns', 'proj_org_fips', 'proj_org_name', 'proj_org_state',
       'proj_org_zipcode', 'proj_phr', 'proj_pi_ids', 'proj_pi_names',
       'proj_program_officer_name', 'proj_project_start', 'proj_project_end',
       'proj_project_terms', 'proj_project_title', 'proj_serial_number',
       'proj_study_section', 'proj_study_section_name', 'proj_suffix',
       'proj_support_year', 'proj_total_cost', 'proj_total_cost_sub_project',
       'proj_funding_mechanism', 'proj_funding_ics

In [56]:
# Check 
print(merged_df2.pub_project_number.loc[0])
print(projects.proj_core_project_num.loc[0])

K08CA133103
F31GM009614


In [57]:
# Final results
results = pd.merge(merged_df2, projects, left_on='pub_project_number', right_on='proj_core_project_num', how='left')

In [58]:
# Check
results['pmcid'] = results['pmcid'].apply(lambda x: 'PMC' + str(x))

In [59]:
results.head(2)

Unnamed: 0,pmcid,doi,pub_affiliation,pub_author_list,pub_country,pub_issn,pub_journal_issue,pub_journal_title,pub_journal_title_abbr,pub_journal_volume,...,proj_suffix,proj_support_year,proj_total_cost,proj_total_cost_sub_project,proj_funding_mechanism,proj_funding_ics,proj_direct_cost_amt,proj_indirect_cost_amt,proj_org_ipf_code,proj_opportunity number
0,PMC3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,7,...,,1.0,140940.0,,,,,,,
1,PMC3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,7,...,,2.0,140940.0,,Other Research Related,,,,,


In [76]:
# Check pmc-ids 
print(f"Unique pmc-ids (original): {len(merged_df2.pmcid.unique())}")
print(f"Unique pmc-id (results): {len(results.pmcid.unique())}")

Unique pmc-ids (original): 22159
Unique pmc-id (results): 22159


In [64]:
results.columns

Index(['pmcid', 'doi', 'pub_affiliation', 'pub_author_list', 'pub_country',
       'pub_issn', 'pub_journal_issue', 'pub_journal_title',
       'pub_journal_title_abbr', 'pub_journal_volume', 'pub_lang',
       'pub_page_number', 'pub_pmid', 'pub_date', 'pub_title', 'pub_year',
       'pub_project_number', 'proj_application_id', 'proj_activity',
       'proj_administering_ic', 'proj_application_type', 'proj_arra_funded',
       'proj_award_notice_date', 'proj_budget_start', 'proj_budget_end',
       'proj_cfda_code', 'proj_core_project_num', 'proj_ed_inst_type',
       'proj_foa_number', 'proj_full_project_num', 'proj_subproject_id',
       'proj_funding_ics', 'proj_fy', 'proj_ic_name', 'proj_nih_spending_cats',
       'proj_org_city', 'proj_org_country', 'proj_org_dept',
       'proj_org_district', 'proj_org_duns', 'proj_org_fips', 'proj_org_name',
       'proj_org_state', 'proj_org_zipcode', 'proj_phr', 'proj_pi_ids',
       'proj_pi_names', 'proj_program_officer_name', 'proj_project

### Save final results

In [84]:
# Final results
# results.to_csv('data/output/nih_projs_publs.csv')

### Random check

In [85]:
check = pd.read_csv('data/output/nih_projs_publs.csv')
check.head()

In [78]:
publications.columns

Index(['affiliation', 'author_list', 'country', 'issn', 'journal_issue',
       'journal_title', 'journal_title_abbr', 'journal_volume', 'lang',
       'page_number', 'pmc_id', 'pmid', 'pub_date', 'pub_title', 'pub_year'],
      dtype='object')

In [79]:
projects.columns

Index(['proj_application_id', 'proj_activity', 'proj_administering_ic',
       'proj_application_type', 'proj_arra_funded', 'proj_award_notice_date',
       'proj_budget_start', 'proj_budget_end', 'proj_cfda_code',
       'proj_core_project_num', 'proj_ed_inst_type', 'proj_foa_number',
       'proj_full_project_num', 'proj_subproject_id', 'proj_funding_ics',
       'proj_fy', 'proj_ic_name', 'proj_nih_spending_cats', 'proj_org_city',
       'proj_org_country', 'proj_org_dept', 'proj_org_district',
       'proj_org_duns', 'proj_org_fips', 'proj_org_name', 'proj_org_state',
       'proj_org_zipcode', 'proj_phr', 'proj_pi_ids', 'proj_pi_names',
       'proj_program_officer_name', 'proj_project_start', 'proj_project_end',
       'proj_project_terms', 'proj_project_title', 'proj_serial_number',
       'proj_study_section', 'proj_study_section_name', 'proj_suffix',
       'proj_support_year', 'proj_total_cost', 'proj_total_cost_sub_project',
       'proj_funding_mechanism', 'proj_funding_ics

In [83]:
# Publication Y, project Y
example1 = results[(results.pub_pmid.notna()) & (results.proj_core_project_num.notna())]
print(example1['pub_pmid'])
print(example1['proj_core_project_num'])

0          22768113
1          22768113
2          22768113
3          22768113
4          22768113
             ...   
3945707    26018549
3945708    26018549
3945709    26018549
3945710    26018549
3945711    26018549
Name: pub_pmid, Length: 3944868, dtype: object
0          K08CA133103
1          K08CA133103
2          K08CA133103
3          K08CA133103
4          K08CA133103
              ...     
3945707    R01HL069057
3945708    R01HL069057
3945709    R01HL069057
3945710    R01HL069057
3945711    R01HL069057
Name: proj_core_project_num, Length: 3944868, dtype: object


In [74]:
publications[publications.pmid == '22768113']

Unnamed: 0,affiliation,author_list,country,issn,journal_issue,journal_title,journal_title_abbr,journal_volume,lang,page_number,pmc_id,pmid,pub_date,pub_title,pub_year
1552804,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,7,eng,e39725,3387267,22768113,2012,NOTCH1 signaling promotes human T-cell acute l...,2012


In [80]:
projects[projects.proj_core_project_num == 'K08CA133103']

Unnamed: 0,proj_application_id,proj_activity,proj_administering_ic,proj_application_type,proj_arra_funded,proj_award_notice_date,proj_budget_start,proj_budget_end,proj_cfda_code,proj_core_project_num,...,proj_suffix,proj_support_year,proj_total_cost,proj_total_cost_sub_project,proj_funding_mechanism,proj_funding_ics,proj_direct_cost_amt,proj_indirect_cost_amt,proj_org_ipf_code,proj_opportunity number
285985,7449856,K08,CA,1.0,,2008-05-27T00:00:00,07/01/2008,06/30/2009,398.0,K08CA133103,...,,1.0,140940.0,,,,,,,
319944,7628345,K08,CA,5.0,N,06/05/2009,07/01/2009,06/30/2010,398.0,K08CA133103,...,,2.0,140940.0,,Other Research Related,,,,,
345768,7864277,K08,CA,5.0,N,6/29/2010,7/1/2010,6/30/2011,398.0,K08CA133103,...,,3.0,140940.0,,Other Research Related,,,,,
370122,8104193,K08,CA,5.0,N,06/29/2011,07/01/2011,02/29/2012,398.0,K08CA133103,...,,4.0,140940.0,,Other Research Related,,,,,
388744,8292154,K08,CA,7.0,N,9/7/2012,7/1/2012,6/30/2013,398.0,K08CA133103,...,,5.0,140940.0,,OTHER RESEARCH-RELATED,NCI:140940\,130500.0,10440.0,,


## Edit Table