**Please run required sections of notebook as necessary.**

In [0]:
# Install third-party packages into the current environment (only needs to run once per environment).
! pip install qeds fiona geopandas xgboost gensim folium pyLDAvis descartes psaw pyarrow

In [0]:
!apt-get install libproj-dev proj-data proj-bin  
!apt-get install libgeos-dev  
!pip install cython  
!pip install cartopy  

In [0]:
# Install proj and geos using brew
!brew install proj geos
# If shapely is already installed, first uninstall it
!pip3 uninstall shapely
# Install the cython, numpy, pyshp, and six dependencies using pip
!pip3 install --upgrade cython numpy pyshp six
# Install shapely from source with pip (the binary may not be linked to the correct version of geos
!pip3 install shapely --no-binary shapely
# Finally, install cartopy using pip. The version on pypi does not yet work with the most recent
# release of proj, so instead install cartopy from source using the most recent version on github
!pip3 install git+https://github.com/SciTools/cartopy.git --no-binary cartopy

In [0]:
import pandas as pd 
import os
import IPython

import json
# Parse the raw JSON file once, then wrap the parsed records in a DataFrame.
with open("to_from_data.json", mode = "r") as json_file:
    rankings = json.load(json_file)

data = pd.DataFrame(data = rankings)

In [0]:
import numpy as np
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import qeds
# Apply the qeds matplotlib style (presumably adjusts matplotlib's defaults for later plots - confirm in the qeds docs).
qeds.themes.mpl_style();

import cartopy.crs as ccrs #cartopy documentation: https://scitools.org.uk/cartopy/docs/latest/
import cartopy.feature as cfeature

#import geopandas as gpd
#from shapely.geometry import Point

In [0]:
# Load the same JSON file through pandas' own reader (adjust the path to your working directory as necessary).
inst_data = pd.read_json(path_or_buf = "to_from_data.json")

In [0]:
# Display the frame read with pd.read_json; per the author's note it should match `data` loaded via json.load.
inst_data #same as data ^

In [0]:
# Observations missing any origin or destination coordinate are discarded.
geo_columns = ["to_latitude", "to_longitude", "latitude", "longitude"]
inst_data = inst_data.dropna(axis = "index", how = "any", subset = geo_columns)

In [0]:
# Parse the start dates so the year can be extracted with the .dt accessor.
inst_data["startdate"] = pd.to_datetime(inst_data["startdate"]) #convert object to datetime

# Keep only rows whose start date falls in 2019. The explicit .copy() makes the filtered
# frame independent, so the column assignments in later cells (desc_tokens,
# revised_description, ...) write to it directly instead of raising SettingWithCopyWarning.
inst_data = inst_data[inst_data["startdate"].dt.year == 2019].copy()

In [0]:
# Shape of the array of distinct values in the "name" column.
inst_data["name"].unique().shape #note 33 unique categories, names

In [0]:
# Distinct recruiter descriptions present in the data.
inst_data["description"].unique() #note 10 unique recruiter_types, descriptions

In [0]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it up under the
# name "punkt_tab" while older ones use "punkt", so request both; nltk.download only
# reports (does not raise) when a resource name is unknown to the installed version.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

def desc_reviser(string):
    """Lower-case ``string`` and split it into word tokens with NLTK.

    Parameters
    ----------
    string : str
        Text to tokenize (here: a recruiter description).

    Returns
    -------
    list
        The tokens produced by ``nltk.tokenize.word_tokenize`` on the
        lower-cased text.
    """
    lowered = string.lower()
    return list(nltk.tokenize.word_tokenize(lowered))

#desc_reviser(inst_data.description[6])

In [0]:
# Tokenize every recruiter description and store the token lists next to the raw text.
desc_tokens = list(map(desc_reviser, inst_data["description"]))
inst_data["desc_tokens"] = desc_tokens

In [0]:
# Collapse the recruiter descriptions into coarser groups:
#   * academic and government descriptions are kept verbatim,
#   * private and "other" organizations are merged under one label,
#   * anything else becomes NaN.
MERGED_PRIVATE_OTHER = "Private business or organization; Other type of organization"


def _revise_description(description, tokens):
    """Return the revised recruiter label for one row.

    Parameters
    ----------
    description : str
        The original recruiter description.
    tokens : list of str
        Lower-cased word tokens of ``description``.

    Returns
    -------
    str or float
        ``description`` itself for academic/government recruiters, the merged
        private/other label, or ``np.nan`` when no keyword matches.
    """
    # "academic" must be tested before "other": one academic description also
    # contains the word "other" ("... (other than econ, business, or ag econ)").
    if "academic" in tokens or "government" in tokens:
        return description
    # Each keyword needs its own membership test; `"private" or "other" in tokens`
    # is always truthy because the non-empty string "private" is evaluated on its own.
    if "private" in tokens or "other" in tokens:
        return MERGED_PRIVATE_OTHER
    return np.nan


# Build the whole column in one assignment rather than writing through chained
# indexing (inst_data.revised_description[i] = ...), which pandas warns about and
# which may fail to update the frame.
inst_data["revised_description"] = [
    _revise_description(description, tokens)
    for description, tokens in zip(inst_data["description"], inst_data["desc_tokens"])
]

In [0]:
# Rows whose original description is exactly "Other type of organization".
inst_data.loc[inst_data["description"] == "Other type of organization"]

In [0]:
# Check that the conditionals run in the right order: this academic description contains the word "other".
inst_data.loc[inst_data["description"] == "Academic organization (other than econ, business, or ag econ)"]

In [0]:
# Integer id for each revised recruiter type. Python's built-in hash() of a str is salted
# per interpreter session, so ids derived from it (and the plot titles that print them)
# would change on every kernel restart. pd.factorize instead numbers the labels in order
# of first appearance (missing values get -1), which is reproducible for the same data.
inst_data['revised_recruiter_hash'] = pd.factorize(inst_data["revised_description"])[0]

In [0]:
# Distinct values of the origin institution's rank.
inst_data.loc[:, "rank"].unique() #append .shape for the count

In [0]:
# Pair each longitude with its latitude as (lon, lat) tuples, for origin and destination.
from_lon_lat = zip(inst_data["longitude"], inst_data["latitude"])
to_lon_lat = zip(inst_data["to_longitude"], inst_data["to_latitude"])
inst_data["from_coordinates"] = list(from_lon_lat)
inst_data["to_coordinates"] = list(to_lon_lat)

In [0]:
#category ids present in the data; author's observation (made before dropping NaN and restricting analysis): no category 28?
inst_data["category_id"].unique()
#inst_data[inst_data.category_id == 28]

In [0]:
# Split the observations into one DataFrame per applicant category, keyed by category_id.
data_subsets = {
    category: inst_data[inst_data.category_id == category]
    for category in inst_data.category_id.unique()
}

## Maps by Applicant Primary Field (Names) - using scatter-plot arrow proxy

In [0]:
#Note: the points converging in the Gulf of Guinea lie at (0, 0), the intersection of the Equator and the Prime Meridian.

#Missing coordinates were probably coded as (0, 0); rows with actual NaN coordinates were already dropped above.

#FIXME some institutions such as IMF, CBO, BEA are coded as (0, 0); for an example, uncomment and run the next line
#inst_data.loc[inst_data["to_latitude"] == 0]

In [0]:
# One world map per applicant primary field (category_id): a blue circle marks the origin,
# a red triangle the destination, and a line drawn with the Geodetic transform joins them.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every artist
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
    ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

    ax.coastlines()
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.BORDERS)
    ax.add_feature(cfeature.LAND)
    ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0], fontsize = 32)

    # Read the coordinates straight from the subset rows (the same rows of inst_data the
    # index lookups used to fetch one value at a time).
    for lon, lat, to_lon, to_lat in subset[coord_cols].itertuples(index = False, name = None):
        ax.scatter(lon, lat, transform = geodetic, color = "blue", marker = "o") #origin
        ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic) #no color given: follows the axes colour cycle
        #destination marker stands in for an arrow head; ideally a cartopy adaptation of basemap https://stackoverflow.com/questions/45512429/python-basemap-drawgreatcircle-with-arrow-end-cap?rq=1
        ax.scatter(to_lon, to_lat, transform = geodetic, color = "red", marker = "^")

    # Render and release this figure before the next one is created; otherwise every
    # 80x80-inch figure stays open until the cell ends and matplotlib warns about too many open figures.
    plt.show()



## Maps by Applicant Primary Field (Names) - using annotation arrow proxy

In [0]:
#with arrows but not great circle plot
# One Mercator map per applicant primary field; each observation is a curved red arrow
# from the origin institution to the destination.
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
    ax = fig.add_subplot(projection = ccrs.Mercator()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

    ax.coastlines()
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.BORDERS)
    ax.add_feature(cfeature.LAND)
    ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0], fontsize = 32)

    # annotate() needs a matplotlib transform, so convert the lon/lat CRS for this axes.
    transform = ccrs.PlateCarree()._as_mpl_transform(ax)

    for lon, lat, to_lon, to_lat in subset[coord_cols].itertuples(index = False, name = None):
        # xytext is the arrow tail (origin), xy the arrow head (destination).
        ax.annotate(" ", xy = (to_lon, to_lat), xytext = (lon, lat), arrowprops = dict(arrowstyle = "->", connectionstyle="arc3, rad = -0.8", color = "red"), xycoords = transform, ha = 'right', va = 'top', annotation_clip = False)

    # Render and release this figure now so the 80x80-inch figures do not pile up until the cell ends.
    plt.show()



## Maps by Recruiter Type and Applicant Primary Field 

In [0]:
# Spot check: observations of category 1 whose recruiter_type equals 1.
category_one = data_subsets[1]
category_one.loc[category_one.recruiter_type == 1]

In [0]:
# One map per (applicant field, recruiter type) pair: blue circle = origin,
# red triangle = destination, joined by a line drawn with the Geodetic transform.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every artist
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    for k in subset.recruiter_type.unique():
        group = subset[subset.recruiter_type == k] #filtered once, used for both the title and the rows
        fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
        ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

        ax.coastlines()
        ax.add_feature(cfeature.OCEAN)
        ax.add_feature(cfeature.BORDERS)
        ax.add_feature(cfeature.LAND)
        ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0] + "; recruiter_type_" + str(k) + ": " + group["description"].unique()[0], fontsize = 32)

        for lon, lat, to_lon, to_lat in group[coord_cols].itertuples(index = False, name = None):
            ax.scatter(lon, lat, transform = geodetic, color = "blue", marker = "o") #origin
            ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic) #no color given: follows the axes colour cycle
            #destination marker stands in for an arrow head; ideally a cartopy adaptation of basemap https://stackoverflow.com/questions/45512429/python-basemap-drawgreatcircle-with-arrow-end-cap?rq=1
            ax.scatter(to_lon, to_lat, transform = geodetic, color = "red", marker = "^")

        # Render and release this figure before the next one; the nested loop would
        # otherwise keep every 80x80-inch figure open until the cell ends.
        plt.show()
     

##  Interactive Map - no arrows

In [0]:
import plotly.graph_objects as go

In [0]:
# Stack origin and destination institutions into one table with a shared schema,
# then drop repeated (latitude, longitude, institution_name, rank) rows.
shared_columns = ["latitude", "longitude", "institution_name", "rank"]

from_location_data = inst_data[["latitude", "longitude", "from_institution_name", "rank"]]
to_location_data = inst_data[["to_latitude", "to_longitude", "to_name", "to_rank"]]

location_data_1 = from_location_data.rename(columns = {"from_institution_name": "institution_name"})
location_data_2 = to_location_data.rename(
    columns = {
        "to_latitude": "latitude",
        "to_longitude": "longitude",
        "to_name": "institution_name",
        "to_rank": "rank",
    }
)

location_data = (
    pd.concat([location_data_1, location_data_2], ignore_index = True)
    .loc[:, shared_columns]
    .drop_duplicates(ignore_index = True)
)

In [0]:
# Display the combined, de-duplicated table of origin and destination institutions.
location_data

In [0]:
# Start from an empty plotly figure; the following cells add traces and layout to it.
fig = go.Figure()

In [0]:
# One marker per institution. `text` must be a 1-D sequence of strings (one per point), so
# build "name (rank: r)" labels instead of passing the two-column DataFrame; with
# hoverinfo = "text" the hover box then shows institution name and rank.
hover_text = (
    location_data["institution_name"].astype(str)
    + " (rank: "
    + location_data["rank"].astype(str)
    + ")"
)

fig.add_trace(go.Scattergeo(lon = location_data["longitude"], lat = location_data["latitude"], hoverinfo = "text", text = hover_text, mode = "markers", marker = dict(size = 2, color = "rgb(255, 0, 0)", line = dict(width = 3, color = "rgba(68, 68, 68, 0)"))))

In [0]:
# Add one thin red line trace per observation, from the origin to the destination.
flow_endpoints = zip(
    inst_data["longitude"],
    inst_data["to_longitude"],
    inst_data["latitude"],
    inst_data["to_latitude"],
)
for lon_from, lon_to, lat_from, lat_to in flow_endpoints:
    flow_line = go.Scattergeo(
        lon = [lon_from, lon_to],
        lat = [lat_from, lat_to],
        mode = "lines",
        line = dict(width = 1, color = "red"),
    )
    fig.add_trace(flow_line)

#opacity = float(df["count"][i]) / float(df["count"].max()) 

In [0]:
# Title, hidden legend and an equirectangular base map with light-grey land.
geo_style = dict(
    projection_type = "equirectangular",
    showland = True,
    landcolor = "rgb(243, 243, 243)",
    countrycolor = "rgb(204, 204, 204)",
)
fig.update_layout(title_text = "Interactive Map", showlegend = False, geo = geo_style)

#to add markers https://plotly.com/python/marker-style/

#FIXME institution name and rank instead of lat, lon

## Ranking Scatterplot

In [0]:
import seaborn as sns

In [0]:
# Keep rows that have both an origin and a destination rank. Note this starts from `data`
# (the frame built from json.load), not from the 2019-filtered `inst_data`.
data_clean_rank = data.dropna(axis = "index", subset = ["rank", "to_rank"])

In [0]:
# Scatter of destination rank against origin rank with seaborn's regression fit.
fig, ax = plt.subplots(figsize = (10, 10))
rank_scat = sns.regplot(data = data_clean_rank, x = "rank", y = "to_rank", ax = ax)

# Recolor the first line on the axes in red.
regression_line = rank_scat.get_lines()[0]
regression_line.set_color("red")

## Maps by Applicant Primary Field (Names) - East to West: Red; West to East: Black - Great Circle Plots

In [0]:
# note longitude 180 == -180 

In [0]:
# One global map per applicant primary field. A line is red when the origin longitude is
# greater than or equal to the destination longitude, black otherwise.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every line
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
    ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

    ax.set_global() #sets extent of map to be global, choose this to avoid indiscernible plot in terms of country, note lines close together now very small, comment for zoom default
    ax.coastlines()
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.BORDERS)
    ax.add_feature(cfeature.LAND)
    ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0], fontsize = 32)

    for lon, lat, to_lon, to_lat in subset[coord_cols].itertuples(index = False, name = None):
        colors = "red" if lon >= to_lon else "black"
        ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic, color = colors)

    # Render and release this figure now so the 80x80-inch figures do not pile up until the cell ends.
    plt.show()
        

## B/W Maps by Applicant Primary Field (Names) - East to West: Red; West to East: Blue - Great Circle Plots

In [0]:
# Outline-only variant (coastlines and borders, no ocean/land fill). A line is red when the
# origin longitude is greater than or equal to the destination longitude, blue otherwise.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every line
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
    ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

    ax.set_global() #sets extent of map to be global, choose this to avoid indiscernible plot in terms of country, note lines close together now very small, comment for zoom default
    ax.coastlines()
    ax.add_feature(cfeature.BORDERS)
    ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0], fontsize = 32)

    for lon, lat, to_lon, to_lat in subset[coord_cols].itertuples(index = False, name = None):
        colors = "red" if lon >= to_lon else "blue"
        ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic, color = colors)

    # Render and release this figure now so the 80x80-inch figures do not pile up until the cell ends.
    plt.show()
        

## Maps by Applicant Primary Field (Names) - East to West: Red; West to East: Black - using annotation arrow proxy

In [0]:
#with arrows but not great circle plot
# One Mercator map per applicant primary field with curved arrows from origin to
# destination. An arrow is red when the origin longitude is greater than or equal to the
# destination longitude, black otherwise.
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
    ax = fig.add_subplot(projection = ccrs.Mercator()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

    ax.coastlines()
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.BORDERS)
    ax.add_feature(cfeature.LAND)
    ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0], fontsize = 32)

    # annotate() needs a matplotlib transform, so convert the lon/lat CRS for this axes.
    transform = ccrs.PlateCarree()._as_mpl_transform(ax)

    for lon, lat, to_lon, to_lat in subset[coord_cols].itertuples(index = False, name = None):
        colors = "red" if lon >= to_lon else "black"
        # xytext is the arrow tail (origin), xy the arrow head (destination).
        ax.annotate(" ", xy = (to_lon, to_lat), xytext = (lon, lat), arrowprops = dict(arrowstyle = "->", connectionstyle="arc3, rad = -0.8", color = colors), xycoords = transform, ha = 'right', va = 'top', annotation_clip = False)

    # Render and release this figure now so the 80x80-inch figures do not pile up until the cell ends.
    plt.show()



## B/W Maps by Applicant Primary Field (Names) - East to West: Red; West to East: Blue - using annotation arrow proxy

In [0]:
#with arrows but not great circle plot
# Outline-only Mercator maps (coastlines and borders) with curved arrows from origin to
# destination. An arrow is red when the origin longitude is greater than or equal to the
# destination longitude, blue otherwise.
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    fig = plt.figure(figsize=(80, 80))
    ax = fig.add_subplot(projection = ccrs.Mercator()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

    ax.coastlines()
    ax.add_feature(cfeature.BORDERS)
    ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0], fontsize = 32)

    # annotate() needs a matplotlib transform, so convert the lon/lat CRS for this axes.
    transform = ccrs.PlateCarree()._as_mpl_transform(ax)

    for lon, lat, to_lon, to_lat in subset[coord_cols].itertuples(index = False, name = None):
        colors = "red" if lon >= to_lon else "blue"
        # xytext is the arrow tail (origin), xy the arrow head (destination).
        ax.annotate(" ", xy = (to_lon, to_lat), xytext = (lon, lat), arrowprops = dict(arrowstyle = "->", connectionstyle="arc3, rad = -0.8", color = colors), xycoords = transform, ha = 'right', va = 'top', annotation_clip = False)

    # Render and release this figure now so the 80x80-inch figures do not pile up until the cell ends.
    plt.show()


## Maps by Revised Recruiter Type and Applicant Primary Field

In [0]:
# One global map per (applicant field, revised recruiter type) pair: blue circle = origin,
# red triangle = destination, joined by a line drawn with the Geodetic transform.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every artist
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    for k in subset.revised_recruiter_hash.unique():
        group = subset[subset.revised_recruiter_hash == k] #filtered once, used for both the title and the rows
        fig = plt.figure(figsize=(80, 80)) #on GitHub (25, 20)
        ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

        ax.set_global() #sets extent of map to be global, choose this to avoid indiscernible plot in terms of country, note lines close together now very small, comment for zoomed in default extent
        ax.coastlines()
        ax.add_feature(cfeature.OCEAN)
        ax.add_feature(cfeature.BORDERS)
        ax.add_feature(cfeature.LAND)
        ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0] + "; revised_recruiter_hash_" + str(k) + ": " + group["revised_description"].unique()[0], fontsize = 32)

        for lon, lat, to_lon, to_lat in group[coord_cols].itertuples(index = False, name = None):
            ax.scatter(lon, lat, transform = geodetic, color = "blue", marker = "o") #origin
            ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic) #no color given: follows the axes colour cycle
            #destination marker stands in for an arrow head; ideally a cartopy adaptation of basemap https://stackoverflow.com/questions/45512429/python-basemap-drawgreatcircle-with-arrow-end-cap?rq=1
            ax.scatter(to_lon, to_lat, transform = geodetic, color = "red", marker = "^")

        # Render and release this figure before the next one; the nested loop would
        # otherwise keep every 80x80-inch figure open until the cell ends.
        plt.show()
     

## Maps by Revised Recruiter Type and Applicant Primary Field - East to West: Red; West to East: Black

In [0]:
# One global map per (applicant field, revised recruiter type) pair. A line is red when the
# origin longitude is greater than or equal to the destination longitude, black otherwise.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every line
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    for k in subset.revised_recruiter_hash.unique():
        group = subset[subset.revised_recruiter_hash == k] #filtered once, used for both the title and the rows
        fig = plt.figure(figsize=(80, 80))
        ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

        ax.set_global() #sets extent of map to be global, choose this to avoid indiscernible plot in terms of country, note lines close together now very small, comment for zoomed in default extent
        ax.coastlines()
        ax.add_feature(cfeature.OCEAN)
        ax.add_feature(cfeature.BORDERS)
        ax.add_feature(cfeature.LAND)
        ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0] + "; revised_recruiter_hash_" + str(k) + ": " + group["revised_description"].unique()[0], fontsize = 32)

        for lon, lat, to_lon, to_lat in group[coord_cols].itertuples(index = False, name = None):
            colors = "red" if lon >= to_lon else "black"
            ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic, color = colors)

        # Render and release this figure before the next one; the nested loop would
        # otherwise keep every 80x80-inch figure open until the cell ends.
        plt.show()



## B/W Maps by Revised Recruiter Type and Applicant Primary Field - East to West: Red; West to East: Blue

In [0]:
# Outline-only variant (coastlines and borders) per (applicant field, revised recruiter type)
# pair. A line is red when the origin longitude is greater than or equal to the
# destination longitude, blue otherwise.
geodetic = ccrs.Geodetic() #built once; the same CRS object serves every line
coord_cols = ["longitude", "latitude", "to_longitude", "to_latitude"]

for j in inst_data.category_id.unique():
    subset = data_subsets[j]
    for k in subset.revised_recruiter_hash.unique():
        group = subset[subset.revised_recruiter_hash == k] #filtered once, used for both the title and the rows
        fig = plt.figure(figsize=(80, 80))
        ax = fig.add_subplot(projection = ccrs.PlateCarree()) #Projection list: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html

        ax.set_global() #sets extent of map to be global, choose this to avoid indiscernible plot in terms of country, note lines close together now very small, comment for zoomed in default extent
        ax.coastlines()
        ax.add_feature(cfeature.BORDERS)
        ax.set_title("category_id_" + str(j) + ": " + subset["name"].unique()[0] + "; revised_recruiter_hash_" + str(k) + ": " + group["revised_description"].unique()[0], fontsize = 32)

        for lon, lat, to_lon, to_lat in group[coord_cols].itertuples(index = False, name = None):
            colors = "red" if lon >= to_lon else "blue"
            ax.plot([lon, to_lon], [lat, to_lat], transform = geodetic, color = colors)

        # Render and release this figure before the next one; the nested loop would
        # otherwise keep every 80x80-inch figure open until the cell ends.
        plt.show()

