# Code for Transatlantic Ph.D. Graduates' Placement Flows

In [0]:
#uncomment all lines below in this cell to replicate my installation on colab

#!pip install qeds fiona geopandas xgboost gensim folium pyLDAvis descartes psaw pyarrow
#!apt-get install libproj-dev proj-data proj-bin  
#!apt-get install libgeos-dev  
#!pip install cython  
#!pip install cartopy 
#!brew install proj geos
#!pip3 uninstall shapely
#!pip3 install --upgrade cython numpy pyshp six
#!pip3 install shapely --no-binary shapely
#!pip3 install git+https://github.com/SciTools/cartopy.git --no-binary cartopy

In [0]:
import pandas as pd 
import os
import IPython

import json
# Parse the raw JSON records and wrap them in a DataFrame.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() without
# an encoding falls back to the platform's locale encoding and can mis-decode
# any non-ASCII characters in the file on some systems.
with open("to_from_data.json", "r", encoding="utf-8") as read_file:
    rankings = json.load(read_file)
data = pd.DataFrame(rankings)

In [0]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import qeds
qeds.themes.mpl_style();

import cartopy.crs as ccrs #cartopy documentation: https://scitools.org.uk/cartopy/docs/latest/
import cartopy.feature as cfeature

#import geopandas as gpd
#from shapely.geometry import Point

In [0]:
# Read the placement records straight into a DataFrame.
# Adjust the path if the JSON file is not in the working directory.
inst_data = pd.read_json(
    "to_from_data.json"
)

In [0]:
# Drop observations that are missing any of the four geocoordinates used
# later to draw the flows (destination and origin latitude/longitude).
geo_columns = ["to_latitude", "to_longitude", "latitude", "longitude"]
inst_data = inst_data.dropna(subset=geo_columns)

In [0]:
# Convert the start-date column to datetimes, then keep only the rows whose
# start date falls in the year 2019.
parsed_start = pd.to_datetime(inst_data["startdate"])
inst_data = inst_data.assign(startdate=parsed_start)
inst_data = inst_data.loc[inst_data["startdate"].dt.year == 2019]

In [0]:
# Shape of the array of distinct field names (note 33 unique categories, names).
inst_data["name"].unique().shape

In [0]:
# Distinct recruiter descriptions (note 10 unique recruiter_types, descriptions).
inst_data["description"].unique()

In [0]:
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
# load it from the "punkt_tab" package while older ones use "punkt", so fetch
# both to keep the tokenizer working on either version.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

def desc_reviser(string):
    """Lower-case a description and split it into word tokens.

    Parameters
    ----------
    string : str
        The text to tokenize.

    Returns
    -------
    list
        The tokens produced by ``nltk.tokenize.word_tokenize`` on the
        lower-cased text.
    """
    lowered = string.lower()
    return list(nltk.tokenize.word_tokenize(lowered))

#desc_reviser(inst_data.description[6])

In [0]:
# Tokenize every recruiter description and store the token lists alongside the data.
desc_tokens = inst_data["description"].map(desc_reviser).tolist()
inst_data["desc_tokens"] = desc_tokens

In [0]:
# Collapse the recruiter descriptions into revised groups, based on the tokens
# of each description (first matching rule wins):
#   1. contains "academic"            -> keep the original description
#   2. contains "government"          -> keep the original description
#   3. contains "private" or "other"  -> one merged private/other label
#   4. none of the above              -> NaN
# Each keyword gets its own membership test: an expression such as
# `"private" or "other" in tokens` is always truthy (the non-empty literal
# "private" short-circuits the `or`), which would make rule 4 unreachable.
token_lists = inst_data["desc_tokens"]
is_academic = token_lists.map(lambda tokens: "academic" in tokens).astype(bool)
is_government = token_lists.map(lambda tokens: "government" in tokens).astype(bool)
is_private_or_other = token_lists.map(
    lambda tokens: "private" in tokens or "other" in tokens
).astype(bool)

# Start from NaN everywhere, then fill from lowest to highest priority so the
# academic/government rules override the merged private/other label.
revised_description = pd.Series(np.nan, index=inst_data.index, dtype="object")
revised_description.loc[is_private_or_other] = "Private business or organization; Other type of organization"
keeps_original = is_academic | is_government
revised_description.loc[keeps_original] = inst_data.loc[keeps_original, "description"]

# Single column assignment instead of element-wise chained indexing, which
# pandas does not guarantee to write through to the frame.
inst_data["revised_description"] = revised_description

In [0]:
# Create a numeric key per revised recruiter type; it is used for grouping and
# is printed in the map titles. Python's built-in hash() is salted per
# interpreter session for strings, so its values change on every kernel
# restart. pandas' hashing is deterministic, which keeps the keys (and the
# titles) reproducible across runs.
inst_data['revised_recruiter_hash'] = pd.util.hash_pandas_object(
    inst_data["revised_description"], index=False
)

In [0]:
# Restrict the analysis to the primary fields with informative maps.
informative_fields = [
    "Development; Growth",
    "Microeconomics",
    "Macroeconomics; Monetary",
    "Econometrics",
    "Political Economy",
    "Theory",
    "Behavioral Economics",
    "Finance",
    "Industrial Organization",
    "Labor; Demographic Economics",
]
in_informative_field = inst_data["name"].isin(informative_fields)
inst_data = inst_data.loc[in_informative_field]

In [0]:
# Pair each placement's endpoints as (longitude, latitude) tuples:
# "from" uses the longitude/latitude columns, "to" uses the to_* columns.
inst_data["from_coordinates"] = [
    (lon, lat) for lon, lat in zip(inst_data["longitude"], inst_data["latitude"])
]
inst_data["to_coordinates"] = [
    (lon, lat) for lon, lat in zip(inst_data["to_longitude"], inst_data["to_latitude"])
]

In [0]:
# Map each category_id to the rows of inst_data that carry it, in order of
# first appearance of the ids.
data_subsets = {
    category: inst_data[inst_data["category_id"] == category]
    for category in inst_data["category_id"].unique()
}

## Code for Maps by Applicant's Primary Field

In [0]:
# Draw one map per primary field (category_id). Every placement is a geodesic
# line from the origin coordinates to the destination coordinates.
for j in inst_data.category_id.unique():
    field_rows = data_subsets[j]

    proj1 = ccrs.PlateCarree()
    fig = plt.figure(figsize=(80, 80))  # revised from (25, 20)
    # Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html
    ax = fig.add_subplot(projection=proj1)

    # ax.set_global() would set the extent to the whole globe instead; that keeps
    # countries discernible but makes lines that are close together very small.
    # The extent below covers continental North America and mainland Europe, using
    # https://en.wikipedia.org/wiki/Lists_of_extreme_points : westernmost longitude
    # of continental North America, easternmost longitude of mainland Europe,
    # southernmost point of continental North America, northernmost point of
    # continental North America.
    ax.set_extent([-168.084722, 66.618056, 7.208889, 72], crs=proj1)
    ax.coastlines()
    ax.add_feature(cfeature.BORDERS)
    ax.set_title("category_id_" + str(j) + ": " + field_rows["name"].unique()[0], fontsize=32)  # added fontsize

    # Build the transform once per figure and read all coordinates in one pass
    # rather than looking each value up by label for every row.
    geodetic = ccrs.Geodetic()
    flows = inst_data.loc[field_rows.index, ["longitude", "latitude", "to_longitude", "to_latitude"]]
    for origin_lon, origin_lat, dest_lon, dest_lat in flows.itertuples(index=False, name=None):
        # Red when the origin longitude is >= the destination longitude, blue otherwise.
        colors = "red" if origin_lon >= dest_lon else "blue"
        ax.plot([origin_lon, dest_lon], [origin_lat, dest_lat], transform=geodetic, color=colors)

    # Render and release each figure before starting the next one, so the
    # 80x80-inch canvases do not all stay in memory until the cell finishes.
    plt.show()
    plt.close(fig)
        

## Code for Maps by Applicant's Primary Field and Recruiter Types

In [0]:
# Draw one map per (primary field, revised recruiter type) pair. Every
# placement is a geodesic line from the origin coordinates to the destination
# coordinates.
for j in inst_data.category_id.unique():
    field_rows = data_subsets[j]

    for k in field_rows.revised_recruiter_hash.unique():
        # Filter once and reuse the result for both the title and the lines.
        recruiter_rows = field_rows[field_rows.revised_recruiter_hash == k]

        proj1 = ccrs.PlateCarree()
        fig = plt.figure(figsize=(80, 80))  # revised from (25, 20)
        # Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html
        ax = fig.add_subplot(projection=proj1)

        # ax.set_global() would set the extent to the whole globe instead; that keeps
        # countries discernible but makes lines that are close together very small.
        # The extent below covers continental North America and mainland Europe, using
        # https://en.wikipedia.org/wiki/Lists_of_extreme_points : westernmost longitude
        # of continental North America, easternmost longitude of mainland Europe,
        # southernmost point of continental North America, northernmost point of
        # continental North America.
        ax.set_extent([-168.084722, 66.618056, 7.208889, 72], crs=proj1)
        ax.coastlines()
        ax.add_feature(cfeature.BORDERS)

        # NOTE data_description_aa_dsouza has method description instead of
        # revised_description, needs to be fixed there.
        # str() keeps the concatenation valid even if the description is not a string (e.g. NaN).
        title = (
            "category_id_" + str(j) + ": " + field_rows["name"].unique()[0]
            + "; revised_recruiter_hash_" + str(k) + ": "
            + str(recruiter_rows["revised_description"].unique()[0])
        )
        ax.set_title(title, fontsize=32)

        # Build the transform once per figure and read all coordinates in one
        # pass rather than looking each value up by label for every row.
        geodetic = ccrs.Geodetic()
        flows = inst_data.loc[recruiter_rows.index, ["longitude", "latitude", "to_longitude", "to_latitude"]]
        for origin_lon, origin_lat, dest_lon, dest_lat in flows.itertuples(index=False, name=None):
            # Red when the origin longitude is >= the destination longitude, blue otherwise.
            colors = "red" if origin_lon >= dest_lon else "blue"
            ax.plot([origin_lon, dest_lon], [origin_lat, dest_lat], transform=geodetic, color=colors)

        # Render and release each figure before starting the next one, so the
        # 80x80-inch canvases do not all stay in memory until the cell finishes.
        plt.show()
        plt.close(fig)
