In [28]:
### Read in data file

In [29]:
library(data.table)
library(dplyr)

In [30]:
# Location of the exported relationship table.
datafile <- "data/export_all.csv"

# Read every line as data first so the top-left cell can be inspected.
df <- fread(datafile, header = FALSE, stringsAsFactors = FALSE)
df <- as.data.frame(df)

# When that cell is the literal "pid1" (presumably a header label -- confirm
# against the export format), read the file again with its first line used
# as column names.
if (df[1, 1] == "pid1") {
    df <- fread(datafile, header = TRUE, stringsAsFactors = FALSE)
    df <- as.data.frame(df)
}


In [31]:
# Per-person attributes; each one appears once for the trainee and once for
# the mentor.
fields <- c("ID", "FirstName", "MiddleName", "LastName", "Institution")

# Full column layout: trainee fields, mentor fields, then the four
# relationship columns.
columnNames <- c(
    paste0("Trainee.", fields),
    paste0("Mentor.", fields),
    "relationshipCode", "relationshipType", "startYear", "stopYear")

# Apply the names by position and show the first rows.
names(df) <- columnNames
head(df)


Unnamed: 0,Trainee.ID,Trainee.FirstName,Trainee.MiddleName,Trainee.LastName,Trainee.Institution,Mentor.ID,Mentor.FirstName,Mentor.MiddleName,Mentor.LastName,Mentor.Institution,relationshipCode,relationshipType,startYear,stopYear
1,1,Stephen,V,David,Oregon Health and Science University,184,Shihab,A,Shamma,"École Normale Supérieure, Paris",2,postdoc,2005,2012
2,1,Stephen,V,David,Oregon Health and Science University,3,Jack,L,Gallant,"University of California, Berkeley",1,student,1998,2005
3,3,Jack,L,Gallant,"University of California, Berkeley",16,David,C,Van Essen,"Washington University, Saint Louis",2,postdoc,0,0
4,3,Jack,L,Gallant,"University of California, Berkeley",26,Wendell,R (Tex),Garner,Yale University,1,student,0,0
5,3,Jack,L,Gallant,"University of California, Berkeley",595,Joy,,Hirsch,Columbia University,1,student,0,0
6,667145,Christian,T,Totten,"University of Florida, Gainesville",497187,Michael,D,Annable,"University of Florida, Gainesville",1,student,0,2005


In [32]:
# Show the number of rows and columns of df.
dim(df)

In [33]:
# Load the company table; the first line of the file supplies the column names.
comp <- fread("data/schultz_companies.csv", header = TRUE, stringsAsFactors = FALSE)
comp <- as.data.frame(comp)

# Show its dimensions and first rows.
dim(comp)
head(comp)

Unnamed: 0,Trainee.ID,Company.Name,Trainee.MiddleName,Trainee.LastName,Trainee.Institution,Founder.ID,Founder.FirstName,Founder.MiddleName,Founder.LastName,Founder.Institution,relationshipCode,relationshipType,YearFounded,stopYear
1,AD,Scientist.com (formerly Assay Depot),,,,633266,Andrew,B,Martin,,,company,2007,
2,XP,XenoPort Inc,,,,663570,Mark,A,Gallop,,,company,1999,
3,ET,Ensemble Therapeutics,,,,57073,David,R,Liu,,,company,2004,
4,PER,Permeon Biologics,,,,57073,David,R,Liu,,,company,2011,
5,EDT,Editas Medicine,,,,57073,David,R,Liu,,,company,2014,
6,SYR,Syros Pharmaceuticals,,,,477556,Nathanael,S.,Gray,,,company,2012,


In [34]:
# Give the company table the same column names as df (assigned by position),
# then stack the company rows ahead of the rows already in df.
names(comp) <- names(df)
df <- rbind(comp, df)
#head(df)

In [35]:
### remove duplicate entries (Trainee - Mentor pairs)

# Keep the first row of each Trainee.ID / Mentor.ID pair.  .keep_all = TRUE is
# required: without it distinct() (dplyr >= 0.5.0) returns only the two key
# columns, dropping the name, year and relationship columns that getPerson()
# and getChildren() read.
df <- df %>% distinct(Trainee.ID, Mentor.ID, .keep_all = TRUE)


In [36]:
### in cases where year is missing, guess it or select randomly

# Fill in the start year of a single relationship record.
#
# x: named vector with elements "startYear" and "stopYear" (numeric, or
#    strings that parse as numbers).
#
# A year counts as missing when it is 0, NA or not parseable as a number.
#   - start present               -> x is returned unchanged
#   - start missing, stop present -> startYear becomes stopYear - 5
#   - both missing                -> startYear is drawn at random from 1985:2016
#
# Returns x with "startYear" possibly replaced; no other element is touched.
setYear <- function(x){
    # suppressWarnings: unparseable text simply becomes NA, which is then
    # handled as "missing" instead of aborting the if() test.
    start.year <- suppressWarnings(as.numeric(x["startYear"]))
    stop.year <- suppressWarnings(as.numeric(x["stopYear"]))

    if( is.na(start.year) || start.year == 0 ) {
        if( is.na(stop.year) || stop.year == 0 ) {
            x["startYear"] <- sample(1985:2016,1)
        } else {
            x["startYear"] <- stop.year - 5
        }
    }
    return(x)
}
# Fill missing start years column-wise, using the same guessing rule as
# setYear(): stopYear - 5 when a stop year is known, otherwise a random year
# from 1985:2016.  A year is treated as missing when it is 0, NA or not
# numeric.
#
# This replaces apply(df, 1, setYear): apply() first coerces the data frame
# with as.matrix(), which turns every column into text (numeric columns are
# run through format(), which can pad them with spaces), so ID comparisons
# and year ordering further down would operate on strings.  Only startYear
# is modified here; all other columns keep their types.
df$startYear <- local({
    start.year <- suppressWarnings(as.numeric(as.character(df$startYear)))
    stop.year <- suppressWarnings(as.numeric(as.character(df$stopYear)))

    no.start <- is.na(start.year) | start.year == 0
    no.stop <- is.na(stop.year) | stop.year == 0

    from.stop <- no.start & !no.stop
    at.random <- no.start & no.stop

    start.year[from.stop] <- stop.year[from.stop] - 5
    start.year[at.random] <- sample(1985:2016, sum(at.random), replace = TRUE)
    start.year
})


In [37]:
### define root person ID
# ID of the person whose tree is built; it is passed to getPerson() further down.
root.ID <- 52763 ### Peter Schultz

# Alternative roots kept for testing; uncomment one to build a different tree.
#root.ID <- 62876 ### test on Virginia Cornish
#root.ID <- 63525 ### test on Hening Lin
#root.ID <- 54496  ### test on Chris Walsh 
#root.ID <- 4338 ### Robert Woodward
#root.ID <- 9005 ### George Whitesides (161 direct children)
# Echo the selected ID.
root.ID


In [38]:
# Build the list describing one person (tree node).
#
# ID:               Trainee.ID of the person to look up.
# df:               relationship table with columns Trainee.ID,
#                   Trainee.FirstName, Trainee.MiddleName, Trainee.LastName,
#                   Mentor.ID and startYear.
# mentor.ID:        when given, only the row for this trainee/mentor pair is
#                   used; when NULL, the first row for the trainee is used.
# include.children: when TRUE, the node gets a `children` element built by
#                   getChildren() one level deeper.
# level:            depth stored on the node.
#
# Returns a list with ID, FirstName, MiddleName, LastName, level, startYear,
# name, optionally url (only when a last name is present) and children.
# Every scalar field is taken from the first matching row.
getPerson <- function(ID, df, mentor.ID = NULL, include.children=TRUE, level=1 ) {
    keep <- c("Trainee.ID","Trainee.FirstName","Trainee.MiddleName","Trainee.LastName","startYear")

    # which() discards NA comparison results; plain logical indexing would
    # turn every NA into an all-NA row in the selection.
    if( is.null(mentor.ID) ) {
        rows <- which(df$Trainee.ID == ID)
    } else {
        rows <- which(df$Trainee.ID == ID & df$Mentor.ID == mentor.ID)
    }
    p <- df[rows, keep]

    # First value of a column as a length-1 string (NA when there is none).
    first.chr <- function(v) {
        if( length(v) > 0 ) as.character(v[1]) else NA_character_
    }

    person <- list()
    person$ID <- first.chr(p$Trainee.ID)
    if( is.na(person$ID) && length(ID) == 1 && !is.na(ID) ) {
        # No trainee row for this ID (e.g. a root who only appears as a
        # mentor): keep the requested ID so the node is still identifiable.
        person$ID <- if( is.numeric(ID) ) {
            format(ID, scientific = FALSE, trim = TRUE)
        } else {
            as.character(ID)
        }
    }
    person$FirstName <- first.chr(p$Trainee.FirstName)
    person$MiddleName <- first.chr(p$Trainee.MiddleName)
    person$LastName <- first.chr(p$Trainee.LastName)
    person$level <- level
    # Scalar like the other fields, even when several rows match.
    person$startYear <- first.chr(p$startYear)

    if( is.na(person$LastName) ) {
        # No last name: the ID doubles as the display name and no url is set.
        person$name <- person$ID
    } else {
        # Display name = initials; missing name parts are skipped so they do
        # not contribute the text "NA".
        initials <- substr(c(person$FirstName, person$MiddleName, person$LastName), 1, 1)
        person$name <- paste(initials[!is.na(initials)], collapse="")
        person$url <- paste0('http://academictree.org/chemistry/peopleinfo.php?pid=', person$ID)
    }

    if( include.children ) {
        if( is.na(person$ID) ) {
            # Nothing to search for; avoids recursing on an NA key.
            person$children <- list()
        } else {
            person$children <- getChildren( person$ID, df, level = level + 1 )
        }
    }

    return(person)
}

# Build the list of direct trainees of one mentor.
#
# root.ID: ID of the mentor whose trainees are collected.
# df:      relationship table (needs Trainee.ID, Mentor.ID, startYear,
#          relationshipType plus the columns getPerson() reads).
# level:   depth stored on each child node.
#
# Returns a list of person nodes (see getPerson()), one per row whose
# Mentor.ID matches, ordered by start year with the latest first.  Each node
# carries its relationshipType in `type` and, recursively, its own children.
# An empty list is returned when there are no matching rows.
getChildren <- function(root.ID, df, level = 0) {
    children <- list()
    if( length(root.ID) == 0 || is.na(root.ID[1]) ) { return(children) }

    # The mentor is matched through the ID as stored in the Trainee.ID column;
    # when the mentor has no trainee row of their own, fall back to the ID
    # that was passed in.
    root <- getPerson(root.ID, df, include.children=FALSE)
    mentor.key <- if( length(root$ID) != 1 || is.na(root$ID) ) root.ID else root$ID

    # which() drops rows whose Mentor.ID is NA; plain logical indexing would
    # return them as all-NA child rows.
    children.df <- df[which(df$Mentor.ID == mentor.key),]
    if( nrow(children.df) == 0 ) { return(children) }

    # Order on the numeric year so the result does not depend on whether the
    # column is stored as number, character or factor.
    start.num <- suppressWarnings(as.numeric(as.character(children.df$startYear)))
    children.df <- children.df[order(start.num, decreasing = TRUE),]

    children <- lapply(seq_len(nrow(children.df)), function(i) {
        child.id <- children.df[i,"Trainee.ID"]
        # mentor.ID pins the lookup to this particular trainee/mentor row, so
        # the child's startYear is the one for this relationship.
        child <- getPerson(child.id, df, level=level, mentor.ID=root.ID)
        child$type <- children.df[i,"relationshipType"]
        child
    })
    return(children)
}

In [39]:
# Show the rows whose trainee ID is "GNF".
df[df$Trainee.ID == "GNF", ]
# Check whether the first of those rows has a missing last name.
is.na(df[df$Trainee.ID == "GNF", "Trainee.LastName"][1])

Unnamed: 0,Trainee.ID,Trainee.FirstName,Trainee.MiddleName,Trainee.LastName,Trainee.Institution,Mentor.ID,Mentor.FirstName,Mentor.MiddleName,Mentor.LastName,Mentor.Institution,relationshipCode,relationshipType,startYear,stopYear
22,GNF,Genomics Institute of the Novartis Research Foundation,,,,52763,Schultz,G,Peter,,,company,1999,


In [40]:
# Print the time before and after the build to show how long it takes.
date()
# Build the nested person/children list starting from root.ID.
root <- getPerson(root.ID, df)
date()


In [41]:
library(jsonlite)
# Serialise the tree as indented JSON; auto_unbox writes length-1 vectors as
# scalars instead of one-element arrays.
json <- jsonlite::toJSON(root, pretty = TRUE, auto_unbox = TRUE)
write(json, file = "output/output_PGS.json")
