# RePORTER data

[RePORTER data](https://reporter.nih.gov/exporter/): publications + funding data of NIH projects (US) from 1980 until 2022. 

Analysis time window: `1985 - 2022`.

## Imports

In [1]:
import os
import json
import zipfile
import pandas as pd

## Scraper

[pynih](https://github.com/jermwatt/pynih) is a useful library designed to extract data from the [NIH RePORTER API](https://api.reporter.nih.gov/). However, it was not utilized for the Hackathon, where zip files were manually downloaded.

## Functions

In [2]:
def create_folder(folder_path):
    # Specify the folder containing the zip files
    # Create the extraction folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

In [3]:
def create_general_folders():
    folders = []
    for folder in ['raw', 'input', 'processing', 'output']:
        folder_path = os.path.join('data/', folder)
        folders.append(create_folder(folder_path)) 
    return folders   

In [4]:
def create_extraction_folders(document):
    # Specify the folder containing the zip files
    zip_folder = f'/{document}/'
    # Specify the folder where you want to extract the files
    extracted_folder = f'data/input/{document}/'
    # Create the extraction folder if it doesn't exist
    os.makedirs(zip_folder, exist_ok=True)
    os.makedirs(extracted_folder, exist_ok=True)
    print(f'Input folder: {zip_folder}')
    print(f'Output folder: {zip_folder}')
    return zip_folder, extracted_folder

In [5]:
def unzip_files(zip_folder, extracted_folder):
    # List all zip files in the zip folder
    zip_files = [f for f in os.listdir(zip_folder) if f.endswith('.zip')]

    # Loop through zip files and extract contents
    count = 0
    for zip_file in zip_files:
        zip_path = os.path.join(zip_folder, zip_file)
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder)
        count += 1 
    print(f'{count} zips extracted successfully!')

In [6]:
def create_df_from_csv(folders, document, project_numbers=''):
    extracted_folder =  os.path.join(folders[1], document)
    # Create a DataFrame to store the data from the CSVs
    df_list = []
    # Loop through extracted CSV files and read them into the DataFrame
    for csv_file in os.listdir(extracted_folder):
        if csv_file.endswith('.csv'):
            csv_path = os.path.join(extracted_folder, csv_file)
            try:
                # Open and read the file manually with error handling
                with open(csv_path, 'r', encoding='utf-8', errors='replace') as file:
                    df = pd.read_csv(file, on_bad_lines='skip', encoding='utf-8', engine='python')
                    # if 'projects' in csv_path and project_numbers != '':
                        # df = df[df["CORE_PROJECT_NUM"].isin(project_numbers)]
                        # df = pd.merge(codes_df, df, on='pmid', how='left')
                df_list.append(df)
            except Exception as e:
                print(f"Error reading {csv_path}: {e}")

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(df_list, ignore_index=True)
    print(f'Number of {document}: {final_df.shape[0]}')
    return final_df


## Paths

In [7]:
folders = create_general_folders() # 'raw', 'input', 'processing', 'output'
folders

['data/raw', 'data/input', 'data/processing', 'data/output']

## Unzip files

In [8]:
zip_folder = os.path.join(folders[0], 'publications')
extracted_folder = os.path.join(folders[1], 'publications')

In [9]:
# UNZIP FILES - optional
# if any(os.scandir(zip_folder)):
#    unzip_files(zip_folder, extracted_folder)
# else:
#    print('No zips!')

## Data

### Count Data

In [10]:
json_file_path = 'data/results_for_analysis.json'

In [11]:
# Open the JSON file
with open(json_file_path, 'r') as file:
    # Load the JSON data
    data = json.load(file)

In [12]:
count_data = pd.DataFrame.from_dict(data)
count_data.head(2)

Unnamed: 0,pmcid,agg_sentence_index,agg_n_fem,agg_n_male,agg_perc_fem,agg_perc_male,agg_sample,clean_n_fem,clean_n_male,clean_perc_fem,...,article_categories,article_title,authors,copyright_info,funding,publisher_id,doi,journal_title,keywords,publication_date
0,PMC9683380,"[3, 5, 6, 7, 8]","[null, null, ""\""352\"""", null, null]","[null, null, ""\""328\"""", null, null]","[null, null, null, null, null]","[null, null, null, null, null]","[""\""##3\"""", ""\""##3\"""", ""\""70\"""", ""\""##6\"""", ""\...",[352],[328],[],...,"[""Endocrinology""]",Differential diagnostic value of plain CT scan...,"[""Zhijiang Han"", ""Mengwei Wu"", ""Peiying Wei"", ...","Copyright © 2022 Han, Wu, Wei, Zhu, Zhang, Din...",[],,10.3389/fendo.2022.1007870,Frontiers in Endocrinology,"[""adrenal gland neoplasms"", ""adrenal adenoma"",...",2022-11-09 00:00:00
1,PMC5137654,"[1, 3, 4]","[""\""30\"""", null, null]","[""\""30\"""", null, null]","[null, null, null]","[null, null, null]","[""\""60\"""", ""\""16\"""", ""\""46\""""]",[30],[30],[],...,"[""Spine""]",Posterior hemivertebra resection and monosegme...,"[""X Zhu"", ""X Wei"", ""J Chen"", ""C Li"", ""M Li"", ""...",Copyright © 2013 Royal College of Surgeons,[],650173.0,10.1308/003588414X13824511650173,Annals of The Royal College of Surgeons of Eng...,"[""Hemivertebra resection"", ""Monosegmental fusi...",


In [13]:
count_data.shape[0]

166011

In [14]:
count_data.columns

Index(['pmcid', 'agg_sentence_index', 'agg_n_fem', 'agg_n_male',
       'agg_perc_fem', 'agg_perc_male', 'agg_sample', 'clean_n_fem',
       'clean_n_male', 'clean_perc_fem', 'clean_perc_male', 'clean_sample',
       'max_n_fem', 'max_n_male', 'max_perc_fem', 'max_perc_male',
       'max_sample', 'article_categories', 'article_title', 'authors',
       'copyright_info', 'funding', 'publisher_id', 'doi', 'journal_title',
       'keywords', 'publication_date'],
      dtype='object')

### Filters

In [15]:
pmc_ids_df = pd.read_csv('data/pmcids_dois_from_counts_data.txt')
pmc_ids_df.head(2)

Unnamed: 0,pmcid,doi
0,PMC9683380,10.3389/fendo.2022.1007870
1,PMC5137654,10.1308/003588414X13824511650173


In [16]:
pmc_ids_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166011 entries, 0 to 166010
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   pmcid   166011 non-null  object
 1   doi     162158 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [17]:
pmc_ids = [x.replace('PMC', '') for x in pmc_ids_df.pmcid.unique()]
pmc_ids_df['pmcid'] = pmc_ids
print(f'Unique pmc ids: {len(pmc_ids)}')

Unique pmc ids: 166011


### Link Tables

In [18]:
# link_tables = create_df_from_csv(folders, 'link_tables')
link_tables = pd.read_csv('data/processing/link_tables.csv')
link_tables.head()

Unnamed: 0.1,Unnamed: 0,pmid,project_number
0,0,3968312,R01HL025826
1,1,3889259,R01HD016292
2,2,3918136,R01HD011011
3,3,3864856,T32HL007535
4,4,4064865,R23AM031070


In [19]:
# Convert column names to lowercase
link_tables.columns = link_tables.columns.str.lower()

In [20]:
link_tables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6437869 entries, 0 to 6437868
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   unnamed: 0      int64 
 1   pmid            int64 
 2   project_number  object
dtypes: int64(2), object(1)
memory usage: 147.4+ MB


In [21]:
link_tables['pmid'] = link_tables['pmid'].fillna(0).astype(str)

### Publications

In [28]:
# publications = create_df_from_csv(folders, 'publications')
all_publications = pd.read_csv('data/processing/publications.csv', index_col=None, engine='python')
all_publications.columns

In [24]:
all_publications.head()

NameError: name 'publication' is not defined

In [23]:
# remove publication with only different year of publication
publications = all_publications[all_publications.duplicated(subset='column_name') | df.duplicated(subset='column_name', keep=False)]
publications.shape

SyntaxError: invalid syntax (4058105336.py, line 2)

In [None]:
# Convert column names to lowercase
publications.columns = publications.columns.str.lower()

In [None]:
publications.info()

In [None]:
publications['pmc_id'] = publications['pmc_id'].fillna(0).astype(int).astype(str)

In [None]:
publications['pmid'] = publications['pmid'].fillna(0).astype(int).astype(str)

### Codes

In [None]:
merged_df1 = pd.merge(pmc_ids_df, publications, left_on='pmcid', right_on='pmc_id', how='left')
merged_df1.head()

In [None]:
print(f'Merged df: {merged_df1.shape[0]}')

In [None]:
merged_df1.columns

In [None]:
merged_df1.isnull().sum()

In [None]:
# Convert column names to lowercase
merged_df1.columns = merged_df1.columns.str.lower()

In [None]:
partial_df = merged_df1[(merged_df1['pmcid'].notna()) & (merged_df1['pmc_id'].notna())]
print(f'Partial df: {partial_df.shape[0]}')

In [None]:
merged_df2 = pd.merge(partial_df, link_tables, on='pmid', how='left')
merged_df2.head()

In [None]:
len(merged_df2.pmcid.unique())

In [None]:
merged_df2.columns

In [None]:
codes_df = merged_df2[['pmcid', 'doi', 'pmid', 'project_number']]
codes_df.head()

In [None]:
print(f"Unique pmc-ids: {len(codes_df.pmcid.unique())}")

### Projects

In [None]:
projects_number = list(codes_df.project_number.unique())
print(type(projects_number))

In [None]:
print(len(projects_number))

In [None]:
# projects = create_df_from_csv(folders, 'projects', projects_number)  # filter: pmc_ids
projects = pd.read_csv('data/processing/projects.csv')
projects.shape[0]

In [None]:
projects.columns

In [None]:
-

In [None]:
df = df[df["CORE_PROJECT_NUM"].isin(project_numbers)]

In [None]:
projects.tail()

In [None]:
projects.columns = projects.columns.str.lower()

In [None]:
projects.columns

### Save partial results

In [None]:
# Partial results
# publications.to_csv('data/processing/publications.csv')
# link_tables.to_csv('data/processing/link_tables.csv')
# projects.to_csv('data/processing/projects.csv')

## Results

In [None]:
link_tables.columns

In [None]:
publications.columns

In [None]:
merged_df2.columns # link_tables + publications

In [None]:
# Remove pmc_id column (duplicate)
are_columns_equal = merged_df2["pmcid"].equals(merged_df2["pmc_id"])

if are_columns_equal:
    print("The two columns contain the same data.")
else:
    print("The two columns do not contain the same data.")

In [None]:
merged_df2 = merged_df2.drop("pmc_id", axis=1)

In [None]:
merged_df2.columns # link_tables + publications

In [None]:
publ_list = ['pub_' + col if 'pub_' not in col else col for col in merged_df2.columns[2:]]
merged_df2.columns = ['pmcid', 'doi'] + publ_list

In [None]:
merged_df2.columns # link_tables + publications

In [None]:
print(f"Unique pmc-ids: {len(merged_df2.pmcid.unique())}")

In [None]:
projects.columns

In [None]:
proj_list = ['proj_' + col if 'proj_' not in col else col for col in projects.columns]
projects.columns =  proj_list

In [None]:
projects.columns

In [None]:
# Check 
print(merged_df2.pub_project_number.loc[0])
print(projects.proj_core_project_num.loc[0])

In [None]:
# Final results
results = pd.merge(merged_df2, projects, left_on='pub_project_number', right_on='proj_core_project_num', how='left')

In [None]:
# Check
results['pmcid'] = results['pmcid'].apply(lambda x: 'PMC' + str(x))

In [None]:
results.head(2)

In [None]:
# Check pmc-ids 
print(f"Unique pmc-ids (original): {len(merged_df2.pmcid.unique())}")
print(f"Unique pmc-id (results): {len(results.pmcid.unique())}")

In [None]:
results.columns

### Save final results

In [None]:
# Final results
# results.to_csv('data/output/nih_projs_publs.csv')

### Random check

In [None]:
check = pd.read_csv('data/output/nih_projs_publs.csv')
check.head()

In [None]:
publications.columns

In [None]:
projects.columns

In [None]:
# Publication Y, project Y
example1 = results[(results.pub_pmid.notna()) & (results.proj_core_project_num.notna())]
print(example1['pub_pmid'])
print(example1['proj_core_project_num'])

In [None]:
publications[publications.pmid == '22768113']

In [None]:
projects[projects.proj_core_project_num == 'K08CA133103']

## Edit Table