# hubble network

In [572]:
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import os
%matplotlib inline

In [573]:
def read_einstein():
    with open('allFellowsList.html') as handle:
        page = handle.read()
        years = page.split('<tr><th>')[1:]
        years[-1]=years[-1].split('</table>')[0]

        fellows = np.concatenate([parse_einstein_year(year) for year in years[:]],dtype='U10000')
        ## deal with the fermi fellows, all like 4 of them
        index = np.argwhere(fellows[:,-2]=='')[0,0]
        fermi_year = fellows[index,0]
        fellowtype = fellows[index,-3].split('>')[1].split(' ')[0]
        index+=1
        while fellows[index,0] == fermi_year:
            fellows[index,1] = fellowtype
            index+=1
        fellows = fellows[fellows[:,-1]!='']
        ## append the 2 institutions people (all two of them)
        for index in np.argwhere(fellows[:,1]=='')[:,0]: fellows[index-1][-1]+=' ; '+fellows[index][-1]
        fellows = fellows[fellows[:,-2]!='']
        
    return fellows

def parse_einstein_year(blob):
    blob=blob.replace('<tr>','').replace('<td>','').replace('</td></tr>','').replace('\t','').replace('  ','')
    blob = blob.split('<TR>')[0]
    year = blob[:4]
    fellows = blob.split('\n')[1:]
    fellowtype = fellows[0].split('>')[1].split(' ')[0]
    fellows = fellows[1:]
    fellows = [[year,fellowtype]+fellow.split('</td>')+['N/A'] for fellow in fellows if len(fellow)]
    return fellows
    

einstein_fellows = read_einstein()
#einstein_fellows

In [574]:
def read_sagan():
    fellows = [ parse_sagan_year(fname) for fname in sorted(os.listdir('sagan')) if 'sw' not in fname] 
    return np.concatenate(fellows,dtype='U10000')
        
def parse_sagan_year(fname):

    year = fname[:4]
    with open(os.path.join('sagan',fname)) as handle:
        blob = handle.read().split('footer')[0].split('Postdoctoral Fellowship Recipients')
        fellowtype=blob[0][-100:].split('\n')[-1].strip()
        blob = blob[1]
        postdocs = blob.split('panel')[2].strip().split('<h5>')[1:]
        #print(year,fellowtype,len(postdocs))
        postdocs = [[year,fellowtype] + parse_sagan_postdoc(postdoc) for postdoc in postdocs]
        #print('--------------------------------------------------------')
    return postdocs

    
def parse_sagan_postdoc(postdoc):
    name = postdoc.split('</h5>')[0].strip()
    location,abstract = postdoc.split('<h6>')[1].split('</h6>')
    location = location.strip()
    abstract = '\n'.join(abstract.split('</p>')[:-1]).replace('<p>','').strip().replace('  ','').replace('\n\n','\n')
    return [name,location,abstract]



sagan_fellows = read_sagan()

#sagan_fellows[:,:-1]

In [575]:
def read_hubble():

    fellows_pre17 = parse_hubble_pre17()
    fellows_post17 = parse_hubble_post17()

    return np.concatenate(fellows_pre17+fellows_post17,dtype='U10000')

def parse_hubble_pre17():

    with open('2017-and-prior-fellows','r') as handle:
        blob = handle.read()
        years = blob.split('<section id=')[4:]

    return [parse_hubble_pre17_year(year) for year in years]

def parse_hubble_pre17_year(blob):
    year = blob.split('data-name="section-')[1].split('class')[0].strip()[:-1]
    fellowtype='Hubble'
    fellows = blob.split('<tbody>')[1].replace('\n','').replace('\t','').replace('</tr>','').replace('</td>','').split('<tr>')[1:]

    return [[year,fellowtype]+parse_hubble_pre17_fellow(fellow) for fellow in fellows]

def parse_hubble_pre17_fellow(fellow):
    name,src,dst = fellow.split('<td>')[1:]

    ## sometimes they have /tbodys in them??
    dst = dst.split('</tbody>')[0]

    if '<' in dst: 
        print(dst)
        import pdb; pdb.set_trace()

    ## name, destination, and "abstract"
    return [name,dst,f'Came from: {src}. N/A']

def parse_hubble_post17():
    return [parse_hubble_post17_year(fname) for fname in sorted(os.listdir('hubble')) if '.sw' not in fname]

def parse_hubble_post17_year(fname):
    with open(os.path.join('hubble',fname),'r') as handle:
        year = fname[:4]
        fellows = handle.read().split('main-content')[2].split('<section id=')[2:]
    
    return [[year]+parse_hubble_post17_fellow(fellow) for fellow in fellows]

def parse_hubble_post17_fellow(blob):
    blob = blob.replace('\n','').replace('\t','').replace('  ','').replace('</p>','').replace('</section>','').replace('&nbsp;','')
    ## remove the footer
    blob = blob.split('<div class="credits dark-background-stsci-dark-blue">')[0]

    ## get the name
    name = blob.split('</h2>')[0][-100:].split('>')[1]

    ## seek to the name in the blob
    blob = blob[blob.index(name)+len(name):].replace('</div>','').split('Fellow')
    fellowtype,blob = blob[0],'Fellow'.join(blob[1:])
    fellowtype = fellowtype.split('>')[-1]

    blob = blob.split(r'Host Institution:')[1]

    ## [6:] at the end to get rid of &nbsp;
    dst = blob.split('<p>')[0].strip()[len('</strong>'):].strip()
    if '&nbsp;' in dst: dst = dst[len('&nbsp;'):]

    abstract = blob.split('<p>')[1:]
    abstract = '\n'.join(abstract)

    #print(abstract)
    #print('-----------------')

    return [fellowtype,name,dst,abstract]



hubble_fellows = read_hubble()

hubble_fellows[:,:-1]

array([['2017', 'Hubble', 'Rachael Beaton', 'Princeton University'],
       ['2017', 'Hubble', 'Ivan Cabrera Ziri Castro',
        'Harvard College Observatory'],
       ['2017', 'Hubble', 'Ena Choi', 'Columbia University'],
       ...,
       ['2022', 'Einstein ', 'David Vartanyan', 'Carnegie Observatories'],
       ['2022', 'Sagan ', 'Michael Wong',
        'Carnegie Earth and Planets Laboratory'],
       ['2022', 'Sagan ', 'Zhoujian Zhang',
        'University of California, Santa Cruz']], dtype='<U10000')

In [576]:
nsf_fellows = np.load('../2022-07-30-NSF_AAPF_map_viz/nsf_fellows.npy').astype('U10000')
#nsf_fellows

In [603]:
def clean_locations(fellows):
    locations = fellows[:,-2]
    for i in range(len(locations)):
        location = locations[i]
        fellows[i,-2] = clean_location(location)

## this is going to be ugly
def clean_location(location):
    cfa = 'Harvard-Smithsonian Center for Astrophysics'
    location = location.strip()
    location = location.replace('CalTech','California Institute of Technology')
    location = location.replace('Caltech','California Institute of Technology')
    location = location.replace('Institue','Institute')
    location = location.replace(' in New York City','')
    location = location.replace(' universities','')
    location = location.replace('and at','and')
    location = location.replace(
        'Kavli Institute for Theoretical Physics at University of California, Santa Barbara,',
        'University of California, Santa Barbara')
    location = location.replace('and then at','and')
    location = location.replace(
        'Harvard-Smithsonian Center for Astrophysics, NRAO - Charlottesville, and University of Virginia',
        'Harvard-Smithsonian Center for Astrophysics and NRAO - Charlottesville and University of Virginia',)
    location = location.replace(
        'Center for Astrophysics',
        'Harvard-Smithsonian Center for Astrophysics')
    location = location.replace(
        'Harvard-Smithsonian Harvard-Smithsonian',
        'Harvard-Smithsonian')
    location = location.replace(' at Harvard University','')
    location = location.replace(' (CfA)','')
    location = location.replace(
        'Harvard College Observatory',
        'Harvard-Smithsonian Center for Astrophysics')
    location = location.replace( 'Harvard University',cfa)
    location = location.replace(
        'Harvard College Observatories',
        'Harvard-Smithsonian Center for Astrophysics')
    location = location.replace(' &ndash; Tucson','')
    location = location.replace(
        'National Optical Astronomical Observatories','National Optical Astronomical Observatory')
    location = location.replace('UCO/','')
    location = location.replace('Unversity','')
    location = location.replace(
        'Institute for Theoretical Physics at University of California, Santa Barbara',
        'University of California, Santa Barbara')
    location = location.replace(' in New Haven, Connecticut','')
    location = location.replace('Case Western University','Case Western Reserve University')

    if location == 'Smithsonian Astrophysical Observatory': location = cfa

    if location == 'Princeton University Center for Theoretical Science': location = 'Princeton University'

    location = location.replace('Urbana- ','Urbana-')
    
    ## conditionals
    if location == 'Cerro Tololo InterAmerican Observatory (CTIO) of National Optical Astronomy Observatories (NOAO), in La Serena, Chile':
        location = 'Cerro Tololo InterAmerican Observatory'
    

    if ('Carnegie Instit' in location and 'Obs' not in location or
        'Carnegie Earth and Planets Laboratory' == location or
        'Department of Terrestrial Magnetism' in location):
        location = 'Carnegie Institute of Washington'

    mit = 'Massachusetts Institute of Technology'
    if mit in location and 'Kavli' in location: location = mit

    uoa = 'University of Arizona'
    if uoa in location: location = uoa

    cwru = 'Case Western Reserve University'
    if 'CWRU' in location: location = cwru

    if 'Virginia' in location and 'National Radio Astronomy Observatory' in location: 
        location = location.replace('National Radio Astronomy Observatory','NRAO - Charlottesville')
    if 'National Radio Astronomy Observatory' in location and 'Socorro' in location: 
        location = location.replace('National Radio Astronomy Observatory at Socorro','NRAO - Socorro')
        location = location.replace('National Radio Astronomy Observatory in Socorro','NRAO - Socorro')
    
    lasp = 'Laboratory for Atmospheric and Space Physics (LASP) at '
    if lasp in location: location = location.replace(lasp,'')

    if 'RIT' in location: location = 'Rochester Institute of Technology'

    if ' and ' in location: location = location.replace(' and ','; ')

    if 'Center for Adaptive Optics ' in location: location  = 'University of California, Santa Cruz'

    if '/' in location and 'Chicago' in location: location = location.split('/')[0]

    location = location.replace('/','; ')

    if 'University of Chicago' in location and 'Cosmological' in location: location = 'University of Chicago'

    location = location.replace('Cornell;','Cornell University;')
    location = location.replace('Princeton','Princeton University')

    if location == 'Harvard-Smithsonian Center for Astrophysics, NRAO - Charlottesville,; University of Virginia':
        location = 'Harvard-Smithsonian Center for Astrophysics; NRAO - Charlottesville; University of Virginia'

    if location == 'High Altitude Observatory of National Center for Atmospheric Research in Boulder, Colorado':
        'High Altitude Observatory of National Center for Atmospheric Research in Boulder, Colorado'
    
    if 'Boulder' in location: location ='University of Colorado, Boulder'

    if 'Hawaii' in location: location = 'University of Hawaii, Institute for Astronomy'

    if 'Institute for Advanced' in location: location = 'Princeton University'
    if 'Institute of Advanced' in location: location = 'Princeton University'
    if 'Lawrence Berk' in location: location = 'Lawrence Berkeley National Laboratory'

    location = location.replace('NASA Ames','NASA Ames Research Center')
    location = location.replace('NASA Ames NASA Ames','NASA Ames')
    location = location.replace('Research Center Research Center','Research Center')

    while '(' in location:
        index = location.index('(')
        findex = location[index+1:].index(')')
        location = location[:index-1] + location[index+findex+2:]
    
    location = location.replace('University of California-','University of California, ')
    location = location.replace('University of California at','University of California,')
    location = location.replace('University of California ','University of California, ')

    if 'University of California, Irvine' in location: location = 'University of California, Irvine'


    location = location.replace('KITP-UCSB','University of California, Santa Barbara')
    if location == 'Northwestern University; University of California, Santa': location += ' Barbara'

    if 'University of Massachusetts' in location: location = 'University of Massachusetts, Amherst'

    if 'Yale' in location: location = location.replace('Yale','Yale University')
    
    if 'Madison' in location: location = 'University of Wisconsin, Madison'

    if 'Observatories' in location and 'Carnegie' in location: location = 'Carnegie Observatories'

    if 'MIT' in location: location = mit
    
    location = location.replace('National Optical Astronomical Observatory','National Optical Astronomy Observatory')

    location = location.replace('The ','')

    if 'University of Texas' in location: location = 'University of Texas, Austin'
    if 'University of Wisconsin' in location: location = 'University of Wisconsin, Madison'

    if 'University of Illinois' in location: location = 'University of Illinois, Urbana-Champaign'
    if 'University of Colorado' in location: location = 'University of Colorado, Boulder'
    if 'University of Maryland' in location: location = 'University of Maryland, College Park'

    location = location.replace('Angles','Angeles')

    location = location.replace('Boston University of','Boston University')

    if 'Rutgers' in location: location = 'Rutgers University'

    if 'Accelerator' in location: location = 'SLAC National Laboratory'


    location = location.replace('State University of','State University')

    location = location.replace('University University','University')
    location = location.strip()
    return location

In [606]:
locations = set([])
total = 0
for j,fellows in enumerate([hubble_fellows,nsf_fellows,einstein_fellows,sagan_fellows]):
    total += len(fellows)
    clean_locations(fellows)
    these_locs = np.hstack([fellow.split('; ') for fellow in fellows[:,-2]])
    for i,loc in enumerate(these_locs):
        if 'Northwestern' in loc and 'California' in loc: print(j,i,loc)
    locations = locations.union(set(these_locs))
print(total,len(locations))

919 83


In [605]:

locations

{'*Fellow at Large',
 'Adler Planetarium',
 'American Museum of Natural History',
 'Arizona State University',
 'Boston University',
 'Bowling Green State University',
 'California Institute of Technology',
 'Carnegie Institute of Washington',
 'Carnegie Observatories',
 'Case Western Reserve University',
 'Cerro Tololo InterAmerican Observatory',
 'Columbia University',
 'Cornell University',
 'Dartmouth College',
 'Georgia State University',
 'Goddard Space Flight Center',
 'Harvard-Smithsonian Center for Astrophysics',
 'Iowa State University',
 'Jet Propulsion Laboratory',
 'Johns Hopkins University',
 'Kitt Peak National Observatory',
 'Lawrence Berkeley National Laboratory',
 'Lawrence Livermore National Laboratory',
 'Lick Observatory, University of California',
 'Los Alamos National Laboratory',
 'Lowell Observatory',
 'Massachusetts Institute of Technology',
 'Michigan State University',
 'NASA Ames Research Center',
 'NRAO - Charlottesville',
 'NRAO - Socorro',
 'National Cen