# RePORTER results

[RePORTER data](https://reporter.nih.gov/exporter/): publication and funding data for NIH projects (US) from 1980 to 2022.

Analysis time window: `1985 - 2022`.

## Imports

In [1]:
import csv
import pandas as pd

## Paths

In [2]:
# All intermediate tables live in one processing directory.
processing_dir = 'data/processing'

codes_path = f'{processing_dir}/codes.csv'
publications_path = f'{processing_dir}/publications.csv'
link_tables_path = f'{processing_dir}/link_tables.csv'
# merge_df2 = link_tables joined with publications
merge_df2_path = f'{processing_dir}/merge_df2.csv'
projects_path = f'{processing_dir}/projects.csv'

## Data

In [3]:
# Load the pre-joined link_tables + publications table.
# The first CSV column is used as the row index; low_memory=False parses the
# file in one pass instead of in chunks.
merged_df2 = pd.read_csv(
    merge_df2_path,
    index_col=0,
    low_memory=False,
)
merged_df2.shape

(69795, 14)

In [4]:
# Load the projects table.
# The first CSV column is used as the row index; low_memory=False parses the
# file in one pass instead of in chunks.
projects = pd.read_csv(
    projects_path,
    index_col=0,
    low_memory=False,
)
projects.shape

(616345, 41)

In [5]:
# Inspect the publication-table column names exactly as loaded from the CSV.
merged_df2.columns

Index(['pmcid', 'doi', 'affiliation', 'author_list', 'country', 'issn',
       'journal_issue', 'journal_title', 'journal_title_abbr', 'pmid',
       'pub_date', 'pub_title', 'pub_year', 'project_number'],
      dtype='object')

In [6]:
# Prefix the publication columns with `pub_` so they remain distinguishable
# from project columns once both tables are merged.
# A column counts as already prefixed only when its name *starts with* `pub_`;
# a name that merely contains `pub_` somewhere else still gets the prefix.
# NOTE: the first two columns are set positionally to 'pmcid' and 'doi', so
# this relies on the CSV keeping those two identifiers first.
publ_list = [
    col if col.startswith('pub_') else 'pub_' + col
    for col in merged_df2.columns[2:]
]
merged_df2.columns = ['pmcid', 'doi'] + publ_list

In [7]:
# Show the column names after the rename: apart from 'pmcid' and 'doi',
# each column is expected to carry the `pub_` prefix.
merged_df2.columns # link_tables + publications

Index(['pmcid', 'doi', 'pub_affiliation', 'pub_author_list', 'pub_country',
       'pub_issn', 'pub_journal_issue', 'pub_journal_title',
       'pub_journal_title_abbr', 'pub_pmid', 'pub_date', 'pub_title',
       'pub_year', 'pub_project_number'],
      dtype='object')

In [8]:
# Count distinct pmc-ids in the publication table; dropna=False keeps a
# missing id counted as one value, matching len(Series.unique()).
n_unique_pmcids = merged_df2['pmcid'].nunique(dropna=False)
print(f"Unique pmc-ids: {n_unique_pmcids}")

Unique pmc-ids: 22146


In [9]:
# Inspect the project-table column names exactly as loaded from the CSV.
projects.columns

Index(['application_id', 'activity', 'administering_ic', 'application_type',
       'arra_funded', 'award_notice_date', 'budget_start', 'budget_end',
       'core_project_num', 'ed_inst_type', 'foa_number', 'full_project_num',
       'subproject_id', 'funding_ics', 'fy', 'ic_name', 'nih_spending_cats',
       'org_city', 'org_country', 'org_dept', 'org_fips', 'org_name',
       'org_state', 'phr', 'pi_names', 'program_officer_name', 'project_start',
       'project_end', 'project_terms', 'project_title', 'serial_number',
       'study_section', 'study_section_name', 'support_year', 'total_cost',
       'total_cost_sub_project', 'funding_mechanism', 'direct_cost_amt',
       'indirect_cost_amt', 'org_ipf_code', 'opportunity number'],
      dtype='object')

In [10]:
# Prefix every project column with `proj_` so it remains distinguishable from
# publication columns once both tables are merged.
# A column counts as already prefixed only when its name *starts with*
# `proj_`; a name that merely contains `proj_` elsewhere still gets the prefix.
proj_list = [
    col if col.startswith('proj_') else 'proj_' + col
    for col in projects.columns
]
projects.columns = proj_list

In [11]:
# Show the column names after the rename: each one is expected to carry the
# `proj_` prefix.
projects.columns

Index(['proj_application_id', 'proj_activity', 'proj_administering_ic',
       'proj_application_type', 'proj_arra_funded', 'proj_award_notice_date',
       'proj_budget_start', 'proj_budget_end', 'proj_core_project_num',
       'proj_ed_inst_type', 'proj_foa_number', 'proj_full_project_num',
       'proj_subproject_id', 'proj_funding_ics', 'proj_fy', 'proj_ic_name',
       'proj_nih_spending_cats', 'proj_org_city', 'proj_org_country',
       'proj_org_dept', 'proj_org_fips', 'proj_org_name', 'proj_org_state',
       'proj_phr', 'proj_pi_names', 'proj_program_officer_name',
       'proj_project_start', 'proj_project_end', 'proj_project_terms',
       'proj_project_title', 'proj_serial_number', 'proj_study_section',
       'proj_study_section_name', 'proj_support_year', 'proj_total_cost',
       'proj_total_cost_sub_project', 'proj_funding_mechanism',
       'proj_direct_cost_amt', 'proj_indirect_cost_amt', 'proj_org_ipf_code',
       'proj_opportunity number'],
      dtype='object')

In [12]:
# Sanity check: print the join key found at index label 0 of each table to
# confirm both sides use the same project-number format before merging.
print(merged_df2.loc[0, 'pub_project_number'])
print(projects.loc[0, 'proj_core_project_num'])

K08CA133103
F31GM009614


In [13]:
# Final results: attach project records to every publication row.
# A left join keeps all publication rows; a publication row is repeated once
# per project row that shares its core project number, so the result can be
# far longer than the publication table.
results = merged_df2.merge(
    projects,
    how='left',
    left_on='pub_project_number',
    right_on='proj_core_project_num',
)
results.shape

(3943185, 55)

In [14]:
# Preview the first rows of the merged publication/project table.
results.head()

Unnamed: 0,pmcid,doi,pub_affiliation,pub_author_list,pub_country,pub_issn,pub_journal_issue,pub_journal_title,pub_journal_title_abbr,pub_pmid,...,proj_study_section,proj_study_section_name,proj_support_year,proj_total_cost,proj_total_cost_sub_project,proj_funding_mechanism,proj_direct_cost_amt,proj_indirect_cost_amt,proj_org_ipf_code,proj_opportunity number
0,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee E - Prevention &Control,1.0,140940.0,,0,,,,0
1,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,2.0,140940.0,,Other Research Related,,,,0
2,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,3.0,140940.0,,Other Research Related,,,,0
3,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,4.0,140940.0,,Other Research Related,,,,0
4,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee I - Transistion to Independence,5.0,140940.0,,OTHER RESEARCH-RELATED,130500.0,10440.0,,0


In [15]:
# Pull out every merged row for one publication (pmc-id 3387267) to see how
# many project records a single paper is joined to.
is_example = results['pmcid'] == 3387267
example1 = results.loc[is_example]
example1

Unnamed: 0,pmcid,doi,pub_affiliation,pub_author_list,pub_country,pub_issn,pub_journal_issue,pub_journal_title,pub_journal_title_abbr,pub_pmid,...,proj_study_section,proj_study_section_name,proj_support_year,proj_total_cost,proj_total_cost_sub_project,proj_funding_mechanism,proj_direct_cost_amt,proj_indirect_cost_amt,proj_org_ipf_code,proj_opportunity number
0,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee E - Prevention &Control,1.0,140940.0,,0,,,,0
1,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,2.0,140940.0,,Other Research Related,,,,0
2,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,3.0,140940.0,,Other Research Related,,,,0
3,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,4.0,140940.0,,Other Research Related,,,,0
4,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee I - Transistion to Independence,5.0,140940.0,,OTHER RESEARCH-RELATED,130500.0,10440.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,15.0,,371859.0,Research Projects,,,,0
115,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,15.0,,247123.0,Research Projects,,,,0
116,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,15.0,,513343.0,Research Projects,,,,0
117,3387267,10.1371/journal.pone.0039725,"Department of Medicine, Stem Cell Program and ...","Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",United States,1932-6203,6,PloS one,PLoS One,22768113,...,NCI,Subcommittee B - Comprehensiveness,15.0,,581052.0,Research Projects,,,,0


In [16]:
# List all columns of the merged table (publication columns followed by
# project columns).
results.columns

Index(['pmcid', 'doi', 'pub_affiliation', 'pub_author_list', 'pub_country',
       'pub_issn', 'pub_journal_issue', 'pub_journal_title',
       'pub_journal_title_abbr', 'pub_pmid', 'pub_date', 'pub_title',
       'pub_year', 'pub_project_number', 'proj_application_id',
       'proj_activity', 'proj_administering_ic', 'proj_application_type',
       'proj_arra_funded', 'proj_award_notice_date', 'proj_budget_start',
       'proj_budget_end', 'proj_core_project_num', 'proj_ed_inst_type',
       'proj_foa_number', 'proj_full_project_num', 'proj_subproject_id',
       'proj_funding_ics', 'proj_fy', 'proj_ic_name', 'proj_nih_spending_cats',
       'proj_org_city', 'proj_org_country', 'proj_org_dept', 'proj_org_fips',
       'proj_org_name', 'proj_org_state', 'proj_phr', 'proj_pi_names',
       'proj_program_officer_name', 'proj_project_start', 'proj_project_end',
       'proj_project_terms', 'proj_project_title', 'proj_serial_number',
       'proj_study_section', 'proj_study_section_name', 

In [17]:
# Disabled: would turn each pmc-id into a string with a 'PMC' prefix.
# Left commented out, so pmc-ids keep their loaded form.
# results['pmcid'] = results['pmcid'].apply(lambda x: 'PMC' + str(x))

In [18]:
# Compare distinct pmc-id counts before and after the merge; with a left join
# the two numbers should match. dropna=False keeps a missing id counted as one
# value, matching len(Series.unique()).
print(f"Unique pmc-ids (original): {merged_df2['pmcid'].nunique(dropna=False)}")
print(f"Unique pmc-id (results): {results['pmcid'].nunique(dropna=False)}")

Unique pmc-ids (original): 22146
Unique pmc-id (results): 22146


In [19]:
# Remove rows that are identical across all columns, keeping the last
# occurrence of each, and report how many rows were dropped.
filter_results = results.drop_duplicates(keep='last')
removed_count = len(results) - len(filter_results)
print(f'Removed items: {removed_count}')

Removed items: 4905


In [20]:
# Distinct pmc-ids remaining after de-duplication, in order of first appearance.
pmc_ids = filter_results['pmcid'].unique()

In [21]:
# Write the pmc-ids to a CSV file. writerow is called once with the whole
# array, so every id ends up on a single comma-separated line.
# newline='' lets the csv module control line endings itself.
with open('data/output/pmc_ids.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerow(pmc_ids)

### Save final results (1000 rows)

In [22]:
# Keep only the first 1000 de-duplicated rows as a small sample.
# NOTE: despite the `_100` suffix in the name, this holds up to 1000 rows.
filter_results_100 = filter_results.iloc[:1000]
filter_results_100.shape

(1000, 55)

In [23]:
# Final results: export of the 1000-row sample is currently disabled;
# uncomment the line below to write it to data/output.
# filter_results_100.to_csv('data/output/nih_results-1000.csv')