In [1]:
### Read in data file

In [2]:
library(data.table)

: package 'data.table' was built under R version 3.2.5

In [3]:
### Load the relationship export into a plain data.frame.
### The export may or may not start with a header row; a header row is
### recognised by its first field being "pid1". Only the first row is parsed
### to make that decision, so the full file is read once instead of twice.
datafile <- "data/export_all.csv"
firstRow <- fread(datafile, header = FALSE, nrows = 1, stringsAsFactors = FALSE)
hasHeader <- nrow(firstRow) > 0 &&
    identical(as.character(firstRow[[1]][1]), "pid1")
df <- as.data.frame(fread(datafile, header = hasHeader, stringsAsFactors = FALSE))


Read 99.2% of 523970 rowsRead 523969 rows and 14 (of 14) columns from 0.074 GB file in 00:00:03


In [4]:
### Name the 14 columns: five per-person fields for the trainee, the same five
### for the mentor, then four fields describing the relationship itself.
fields <- c("ID", "FirstName", "MiddleName", "LastName", "Institution")
columnNames <- c(
    paste0("Trainee.", fields),
    paste0("Mentor.", fields),
    "relationshipCode",
    "relationshipType",
    "startYear",
    "stopYear"
)

names(df) <- columnNames
head(df)


Unnamed: 0,Trainee.ID,Trainee.FirstName,Trainee.MiddleName,Trainee.LastName,Trainee.Institution,Mentor.ID,Mentor.FirstName,Mentor.MiddleName,Mentor.LastName,Mentor.Institution,relationshipCode,relationshipType,startYear,stopYear
1,2,Benjamin,Y,Hayden,University of Rochester,3,Jack,L,Gallant,"University of California, Berkeley",1,student,2000,2005
2,4,Benjamin,,Willmore,University of Oxford,3,Jack,L,Gallant,"University of California, Berkeley",2,postdoc,2003,2006
3,6,Ryan,,Prenger,Lawrence Livermore Laboratory,3,Jack,L,Gallant,"University of California, Berkeley",1,student,2002,2008
4,18761,Alan,P,Koretsky,National Institute of Neurological Disorders and Stroke,9,Melvin,P.,Klein,"University of California, Berkeley",1,student,0,1984
5,10,C,Edward,Connor,Johns Hopkins University,16,David,C,Van Essen,"Washington University, Saint Louis",2,postdoc,0,0
6,3,Jack,L,Gallant,"University of California, Berkeley",16,David,C,Van Essen,"Washington University, Saint Louis",2,postdoc,0,0


In [5]:
### Show the number of rows and columns of the loaded table
dim(df)

In [6]:
### Fill in a missing start year for a single relationship row.
###
### x is a named vector holding at least "startYear" and "stopYear"; a value
### of 0 means the year is unknown.
###   - start year known           -> x is returned untouched
###   - start unknown, stop known  -> start year is guessed as stopYear - 5
###   - both unknown               -> start year is drawn at random from 1985:2016
### Returns x with the "startYear" entry filled in.
setYear <- function(x) {
    if (as.numeric(x["startYear"]) != 0) {
        return(x)
    }

    stop_value <- as.numeric(x["stopYear"])
    if (stop_value == 0) {
        # nothing to anchor the guess on: pick any year in the window
        x["startYear"] <- sample(1985:2016, 1)
    } else {
        x["startYear"] <- stop_value - 5
    }
    return(x)
}
### Fill in missing start years column-wise instead of row-wise with apply().
### apply() on a data.frame goes through as.matrix(), which converts every
### column to text and pads numeric columns to a common width, so IDs and
### years stop being numbers. Here only startYear is modified and all other
### columns keep their types.
### Rule: an unknown start year (0) becomes stopYear - 5 when the stop year is
### known, otherwise a random year drawn from 1985:2016. NA years are treated
### as unknown, the same as 0.
df <- local({
    startYears <- as.numeric(df$startYear)
    stopYears <- as.numeric(df$stopYear)
    startUnknown <- is.na(startYears) | startYears == 0
    stopUnknown <- is.na(stopYears) | stopYears == 0

    fromStop <- startUnknown & !stopUnknown
    atRandom <- startUnknown & stopUnknown
    startYears[fromStop] <- stopYears[fromStop] - 5
    startYears[atRandom] <- sample(1985:2016, sum(atRandom), replace = TRUE)

    filled <- df
    filled$startYear <- startYears
    filled
})


In [7]:
### define root person ID
root.ID <- 52763 ### Peter Schultz

### alternative roots used for testing; uncomment one line to override
#root.ID <- 62876 ### test on Virginia Cornish
#root.ID <- 63525 ### test on Hening Lin
#root.ID <- 54496  ### test on Chris Walsh 
#root.ID <- 4338 ### Robert Woodward
#root.ID <- 9005 ### George Whitesides (161 direct children)
### echo the selected ID
root.ID


In [8]:
### Build the record for one person, optionally with all of their trainees.
###
### Arguments:
###   ID                person ID to look up
###   df                relationship table with Trainee.*, Mentor.* and
###                     startYear columns
###   mentor.ID         when given, only rows linking ID to this mentor are used
###   include.children  when TRUE, trainees are attached recursively as $children
###   level             depth stored on the record; children get level + 1
###
### Returns a list with ID, FirstName, MiddleName, LastName, level, startYear,
### name (initials), url and, when requested, children.
###
### A person who never appears as a trainee (e.g. a root who is only listed as
### a mentor) is described from the Mentor.* columns instead; if the ID is not
### in the table at all, the ID passed in is kept and the name fields are NA.
getPerson <- function(ID, df, mentor.ID = NULL, include.children=TRUE, level=1 ) {
    isMatch <- df$Trainee.ID == ID
    if( !is.null(mentor.ID) ) {
        isMatch <- isMatch & df$Mentor.ID == mentor.ID
    }
    # which() drops NA comparisons; a logical index that contains NA would
    # otherwise pull all-NA rows into the result.
    rows <- which(isMatch)

    role <- "Trainee"
    first <- rows[1]
    if( length(rows) == 0 ) {
        role <- "Mentor"
        first <- which(df$Mentor.ID == ID)[1]   # NA when the ID is unknown
    }
    field <- function(suffix) {
        as.character(df[[paste0(role, ".", suffix)]][first])
    }

    person <- list()
    person$ID <- field("ID")
    if( length(person$ID) != 1 || is.na(person$ID) ) {
        # never hand an NA id to getChildren or the url
        person$ID <- as.character(ID)
    }
    person$FirstName <- field("FirstName")
    person$MiddleName <- field("MiddleName")
    person$LastName <- field("LastName")
    person$level <- level
    # NOTE(review): this keeps one entry per matching row (unlike the name
    # fields, which use the first row only) -- confirm the consumer of the
    # JSON expects a vector when a person has several matching rows.
    person$startYear <- as.character(df$startYear[rows])
    person$name <- paste(c(
        substr(person$FirstName, 1, 1),
        substr(person$MiddleName, 1, 1),
        substr(person$LastName, 1, 1) ), collapse="")
    person$url <- paste0('http://academictree.org/chemistry/peopleinfo.php?pid=', person$ID)
    if( include.children ) {
        person$children <- getChildren( person$ID, df, level = level + 1 )
    }

    return(person)
}

### Collect the trainees of one mentor as a list of person records.
###
### Arguments:
###   root.ID  ID of the mentor whose trainees are wanted
###   df       relationship table with Mentor.ID, Trainee.ID, startYear and
###            relationshipType columns
###   level    level value passed on to each trainee record
###
### Returns a list with one getPerson() record per relationship row, ordered
### by startYear (latest first); each record also carries $type, the row's
### relationshipType. Returns an empty list when the mentor has no trainees.
###
### Rows are matched on root.ID directly, so a mentor who never appears as a
### trainee is handled too, and which() keeps NA comparisons from selecting
### all-NA rows.
getChildren <- function(root.ID, df, level = 0) {
    rows <- which(df$Mentor.ID == root.ID)
    if( length(rows) == 0 ) { return(list()) }

    children.df <- df[rows, , drop = FALSE]
    children.df <- children.df[order(children.df$startYear, decreasing = TRUE), , drop = FALSE]

    # one record per relationship row; each child brings its own subtree
    lapply(seq_len(nrow(children.df)), function(i) {
        child <- getPerson(children.df[i, "Trainee.ID"], df,
                           level = level, mentor.ID = root.ID)
        child$type <- children.df[i, "relationshipType"]
        child
    })
}

In [9]:
### Build the nested person structure starting at root.ID. The date() calls
### before and after show timestamps so the run time can be read off.
date()
root <- getPerson(root.ID, df)
date()


In [10]:
library(jsonlite)
### Serialise the tree as pretty-printed JSON (length-one vectors are written
### as scalars) and save it to the output file.
json <- jsonlite::toJSON(root, pretty = TRUE, auto_unbox = TRUE)
write(json, file = "output/output_PGS.json")
