In [1]:
import pandas as pd
import numpy as np
import os, sys
from tqdm import tqdm

# Matching CORDIS and NIH info

In [2]:
cordis_and_counts=pd.read_csv("data/cordis_and_counts.csv",sep=";",index_col=0)
cordis_and_counts.shape

(3436, 52)

In [3]:
nih_columns_redux=["pmcid","doi","pub_affiliation",
"pub_author_list", "pub_country", "pub_issn",
"pub_journal_issue", "pub_journal_title", "pub_journal_title_abbr",
"pub_pmid", "pub_date", "pub_title",
"pub_year", "pub_project_number", "proj_total_cost",
"proj_project_start", "proj_project_end"]

In [4]:
nih_results_1000=pd.read_csv("data/nih_results-1000.csv",index_col=0)
nih_results_1000.shape

(1000, 55)

In [5]:
columns_nih = nih_results_1000.columns.tolist()
len(columns_nih)

55

In [6]:
columns_cordis = cordis_and_counts.columns.tolist()
len(columns_cordis)

52

In [39]:
df_compare_cols=pd.DataFrame(
    np.array(
        [
            columns_cordis,
            ["pmcid"] + 
            [np.nan] * 17 + 
            [np.nan,
             np.nan,
             np.nan,
             np.nan,
             np.nan,
             "doi",
             np.nan,
             np.nan,
             np.nan,
             "proj_core_project_num",
             np.nan,
             np.nan,
             "proj_project_title",
             "proj_project_start",
             "proj_project_end",
             "proj_total_cost",
             np.nan,
             np.nan,
             "proj_nih_spending_cats",
             "proj_award_notice_date",
             "proj_funding_ics"
            ] + 
            [np.nan] * 6 +
            ["pub_title",
             "pub_author_list",
             "pub_journal_title",
             np.nan,
             "proj_ic_name",
             "pub_date",
             "pub_journal_title_abbr"
            ],
        ]
    ).T,
    columns=["cordis", "nih"],
)
df_compare_cols

Unnamed: 0,cordis,nih
0,pmcid,pmcid
1,agg_sentence_index,
2,agg_n_fem,
3,agg_n_male,
4,agg_perc_fem,
5,agg_perc_male,
6,agg_sample,
7,clean_n_fem,
8,clean_n_male,
9,clean_perc_fem,


In [8]:
# df_compare_cols.to_csv("compare_cordis_nih.csv")

In [40]:
list_final_nih=[]
for c,n in zip(df_compare_cols["cordis"],df_compare_cols["nih"]):
    if n=="nan":
        list_final_nih.append("nan")
    elif c.endswith("cordis"):
        list_final_nih.append(c.removesuffix("cordis")+"nih")
    else:
        list_final_nih.append(c)
#         list_final_nih.append(c+"_nih")
    

In [41]:
df_compare_cols["nih_new"]=list_final_nih

In [42]:
df_compare_cols.columns=["cordis","nih_old","nih_new"]

In [43]:
df_compare_cols

Unnamed: 0,cordis,nih_old,nih_new
0,pmcid,pmcid,pmcid
1,agg_sentence_index,,
2,agg_n_fem,,
3,agg_n_male,,
4,agg_perc_fem,,
5,agg_perc_male,,
6,agg_sample,,
7,clean_n_fem,,
8,clean_n_male,,
9,clean_perc_fem,,


# Create new table with nih_new columns

In [53]:
nih_old_names=np.delete(arr=df_compare_cols["nih_old"].unique(),
                        obj=np.where(df_compare_cols["nih_old"].unique()=="nan")
                       )
nih_old_names

array(['pmcid', 'doi', 'proj_core_project_num', 'proj_project_title',
       'proj_project_start', 'proj_project_end', 'proj_total_cost',
       'proj_nih_spending_cats', 'proj_award_notice_date',
       'proj_funding_ics', 'pub_title', 'pub_author_list',
       'pub_journal_title', 'proj_ic_name', 'pub_date',
       'pub_journal_title_abbr'], dtype=object)

In [54]:
nih_new_names=np.delete(arr=df_compare_cols["nih_new"].unique(),
                        obj=np.where(df_compare_cols["nih_new"].unique()=="nan")
                       )
nih_new_names

array(['pmcid', 'doi', 'project_id_nih', 'title_projects_nih',
       'startdate_nih', 'enddate_nih', 'totalcost_nih', 'topics_nih',
       'ecsignaturedate_nih', 'frameworkprogramme_nih',
       'title_publications_nih', 'authors_nih', 'journal_title_nih',
       'funding_body', 'publication_date_nih', 'acronym_publications_nih'],
      dtype=object)

In [None]:
nih_results_1000_new_cols = nih_results_1000[nih_old_names]
nih_results_1000_new_cols.columns = nih_new_names

In [81]:
pmcid_plus_pmc=["PMC"+str(i) for i in nih_results_1000_new_cols["pmcid"]]
nih_results_1000_new_cols["pmcid"]=pmcid_plus_pmc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nih_results_1000_new_cols["pmcid"]=pmcid_plus_pmc


In [90]:
# From MM/DD/YYY to YYYY-MM-DD
"-".join(np.array("07/01/2008".split("/"))[[2,1,0]])

'2008-01-07'

In [103]:
nih_results_1000_new_cols["frameworkprogramme_nih"] = [
    str(i)[:-1] for i in nih_results_1000_new_cols["frameworkprogramme_nih"]
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nih_results_1000_new_cols["frameworkprogramme_nih"]=np.array([str(i)[:-1] for i in nih_results_1000_new_cols["frameworkprogramme_nih"]])


In [104]:
nih_results_1000_new_cols.head()

Unnamed: 0,pmcid,doi,project_id_nih,title_projects_nih,startdate_nih,enddate_nih,totalcost_nih,topics_nih,ecsignaturedate_nih,frameworkprogramme_nih,title_publications_nih,authors_nih,journal_title_nih,funding_body,publication_date_nih,acronym_publications_nih
0,PMC3387267,10.1371/journal.pone.0039725,K08CA133103,Zebrafish Chemical and Classical Genetics Appr...,07/01/2008,06/30/2013,140940.0,Biotechnology;Cancer;Childhood Leukemia;Clinic...,2008-05-27T00:00:00,NCI:140940,NOTCH1 signaling promotes human T-cell acute l...,"Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",PloS one,NATIONAL CANCER INSTITUTE,2012,PLoS One
1,PMC3387267,10.1371/journal.pone.0039725,K08CA133103,Zebrafish Chemical and Classical Genetics Appr...,07/01/2008,06/30/2013,140940.0,Biotechnology;Clinical Research - Extramural;H...,06/05/2009,NCI:140940,NOTCH1 signaling promotes human T-cell acute l...,"Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",PloS one,NATIONAL CANCER INSTITUTE,2012,PLoS One
2,PMC3387267,10.1371/journal.pone.0039725,K08CA133103,Zebrafish Chemical and Classical Genetics Appr...,7/1/2008,6/30/2013,140940.0,Biotechnology;Clinical Research;Cancer;Hematol...,6/29/2010,NCI:140940,NOTCH1 signaling promotes human T-cell acute l...,"Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",PloS one,NATIONAL CANCER INSTITUTE,2012,PLoS One
3,PMC3387267,10.1371/journal.pone.0039725,K08CA133103,Zebrafish Chemical and Classical Genetics Appr...,07/01/2008,02/29/2012,140940.0,Rare Diseases;Genetics;Human Genome;Hematology...,06/29/2011,NCI:140940,NOTCH1 signaling promotes human T-cell acute l...,"Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",PloS one,NATIONAL CANCER INSTITUTE,2012,PLoS One
4,PMC3387267,10.1371/journal.pone.0039725,K08CA133103,Zebrafish Chemical and Classical Genetics Appr...,7/1/2008,6/30/2013,140940.0,Biotechnology; Cancer; Childhood Leukemia; Cli...,9/7/2012,NCI:140940,NOTCH1 signaling promotes human T-cell acute l...,"Ma, Wenxue; Gutierrez, Alejandro; Goff, Daniel...",PloS one,NATIONAL CANCER INSTITUTE,2012,PLoS One


# Join CORDIS and NIH data

In [105]:
cordis_and_counts.shape

(3436, 52)

In [106]:
nih_results_1000_new_cols.shape

(1000, 16)

In [None]:
nih_results_1000_new_cols.join(other=cordis_and_counts,on=["pmcid"],how="outer")