In [19]:
import json
import pandas as pd
import numpy as np

In [20]:
# Sets country names in CSV to match world.json names (GeoJSON):
def clean_names(tbl):
    """Rewrite UNHCR country spellings to the names used in world.json.

    Every (UNHCR spelling, GeoJSON name) pair below is applied with
    ``tbl.replace(old, new)``, one after another in the listed order.
    The relabelled table is returned.
    """
    renames = (
        ('Bolivia (Plurinational State of)', "Bolivia"),
        ("Iran (Islamic Rep. of)", "Iran"),
        ("Rep. of Korea", "South Korea"),
        ("Rep. of Moldova", "Moldova"),
        ("Russian Federation", "Russia"),
        ("Dem. Rep. of the Congo", "Democratic Republic of the Congo"),
        ("United Rep. of Tanzania", "Tanzania"),
        ("Syrian Arab Rep.", "Syria"),
        ("Serbia and Kosovo (S/RES/1244 (1999))", "Serbia"),
        ("Lao People's Dem. Rep.", "Lao PDR"),
        ("Dem. People's Rep. of Korea", "Dem. Rep. Korea"),
        ("Viet Nam", "Vietnam"),
        ("The former Yugoslav Republic of Macedonia", "Macedonia"),
        ('Venezuela (Bolivarian Republic of)', "Venezuela"),
        ('China, Hong Kong SAR', "Hong Kong"),
        ('Brunei Darussalam', "Brunei"),
        ('China, Macao SAR', "Macao"),
        ('Micronesia (Federated States of)', "Micronesia"),
        ('United States of America', "United States"),
    )
    for unhcr_name, geojson_name in renames:
        tbl = tbl.replace(unhcr_name, geojson_name)
    return tbl

# Country to Country Refugee Statistics:
- Binding JSON Data
- Using `clean_unhcr_refugee.csv`
- This dataset covers the UNHCR's "Persons of Concern" categories:
   - Refugees (incl. refugee-like situations)
   - Asylum-seekers (pending cases)
   - Returned refugees
   - Internally displaced persons (IDPs)
   - Returned IDPs
   - Stateless persons
   - Others of concern
- Source: http://popstats.unhcr.org/en/persons_of_concern

In [20]:
# Load the GeoJSON world map; every feature receives an "all_refugees" list below.
with open("world.json") as f:
    data = json.load(f)

# Clean the country names BEFORE building the lookup set and the index.
# clean_names() works through DataFrame.replace, which rewrites cell values only,
# so cleaning after set_index("asylum") would leave both the index and asylum_set
# in the raw UNHCR spelling and the GeoJSON names would never match them.
refugee_table = pd.read_csv("clean_unhcr_refugee.csv", delimiter=',', encoding="latin-1")
refugee_table = clean_names(refugee_table)
asylum_set = set(refugee_table["asylum"].unique())
# One row per (asylum, year) with summed columns, indexed by asylum country.
total = refugee_table.groupby(["asylum", "year"]).sum().reset_index().set_index("asylum")

tmp = refugee_table.drop("year", axis = 1).groupby(["asylum", "origin"]).sum().reset_index().sort_values(["origin","refugees"], ascending =False)
tmp = tmp[tmp["asylum"]!=tmp["origin"]]
# Top 5 places refugees are leaving to:
asylum = tmp.groupby("origin").head(5)

refugees = refugee_table.drop("year", axis = 1).groupby(["asylum", "origin"]).sum().reset_index().sort_values(["asylum","refugees"], ascending =False)
refugees = refugees[refugees["asylum"]!=refugees["origin"]]
# Top 5 places refugees are coming from:
refugees = refugees.groupby("asylum").head(5)

for country in data["features"]:
    country_name = country["properties"]["name"]
    if country_name not in asylum_set:
        country["all_refugees"] = []
        continue
    # .loc[[name]] always returns a DataFrame, so a country with a single year
    # still yields a one-element list instead of a bare scalar.
    rows = total.loc[[country_name]]
    country["all_refugees"] = [
        {"year": year, "refugees": count}
        for year, count in zip(rows["year"].tolist(), rows["refugees"].tolist())
    ]

with open("refugee_world.json", "w") as f:
    json.dump(data, f)

FileNotFoundError: File b'clean_unhcr_refugee.csv' does not exist

# Top 5 Sources of Refugees (inbound) and Destinations for Refugees (outbound)

In [21]:
# Load the GeoJSON world map; every feature receives the four top-5 lists below.
with open("world.json") as f:
    data = json.load(f)

refugee_table = pd.read_csv("unhcr_refugee.csv", delimiter=',', encoding="latin-1")
refugee_table = clean_names(refugee_table)
asylum_set = set(refugee_table["asylum"].unique())

tmp = refugee_table.drop("year", axis = 1).groupby(["asylum", "origin"]).sum().reset_index().sort_values(["origin","refugees"], ascending =False)
tmp = tmp[tmp["asylum"]!=tmp["origin"]]
# Top 5 places refugees are leaving to (indexed by origin country):
asylum = tmp.groupby("origin").head(5).set_index("origin")

refugees = refugee_table.drop("year", axis = 1).groupby(["asylum", "origin"]).sum().reset_index().sort_values(["asylum","refugees"], ascending =False)
refugees = refugees[refugees["asylum"]!=refugees["origin"]]
# Top 5 places refugees are coming from (indexed by asylum country):
refugees = refugees.groupby("asylum").head(5).set_index("asylum")

for country in data["features"]:
    props = country["properties"]
    # Start every feature with all four keys so none is left undefined when a
    # country is absent from one (or both) of the top-5 tables.
    country["inbound_countries"] = []
    country["inbound_num"] = []
    country["outbound_countries"] = []
    country["outbound_num"] = []

    # Prefer the long GeoJSON name, fall back to the short one.
    if props["name_long"] in asylum_set:
        country_name = props["name_long"]
    elif props["name"] in asylum_set:
        country_name = props["name"]
    else:
        # Country not in UNHCR List:
        continue

    # .loc[[name]] always returns a DataFrame, so one-row and many-row results
    # are handled by the same code and always produce lists.
    if country_name in refugees.index:
        inbound = refugees.loc[[country_name]]
        country["inbound_countries"] = inbound["origin"].tolist()
        country["inbound_num"] = inbound["refugees"].tolist()

    if country_name in asylum.index:
        outbound = asylum.loc[[country_name]]
        country["outbound_countries"] = outbound["asylum"].tolist()
        country["outbound_num"] = outbound["refugees"].tolist()


FileNotFoundError: File b'unhcr_refugee.csv' does not exist

In [22]:
# Persist the enriched GeoJSON features held in `data`.
with open("refugee_world.json", "w") as out_file:
    json.dump(data, out_file)

# Refugee Demographic Statistics:
- Binding to JSON data

- Using `unhcr_demographics.csv`

- Breaks down "Persons of Concern" (see previous section for UNHCR definition) by gender and age

- Source: http://popstats.unhcr.org/en/demographics

In [11]:
# Build the per-country, per-year demographics table from unhcr_demographics.csv.
# The GeoJSON load below is commented out; nothing in this cell reads `data`.
# with open("world.json") as f:
#     data = json.load(f)

# Missing cells and "*" placeholders are both replaced by 0 before casting.
# NOTE(review): "*" is presumably UNHCR's marker for a suppressed value - confirm.
demo = pd.read_csv("unhcr_demographics.csv", converters = {}).fillna(0).replace("*", int(0))
# Cast every column from position 3 onwards to int32.
# NOTE(review): assumes columns 0-2 are the non-count columns - confirm against the CSV.
for i in range(3, demo.shape[1]):
    demo.iloc[:,i] = demo.iloc[:,i].astype("int32")

# Sum the columns per (Year, country of asylum/residence).
demo = demo.groupby(["Year", "Country / territory of asylum/residence"]).sum().reset_index()
# Country names as they appear before clean_names() is applied.
tmp = set(demo["Country / territory of asylum/residence"])
demo = clean_names(demo)
demo = demo.sort_values(by=["Country / territory of asylum/residence", "Year"])
# Cleaned country names, used for membership tests against the GeoJSON names.
demo_set = set(demo["Country / territory of asylum/residence"])
# Index by country so demo.loc[<name>] returns that country's rows in year order.
demo = demo.set_index("Country / territory of asylum/residence")
# Remaining columns ("Year" plus the summed columns).
col_names = demo.columns

In [13]:
# Spot-check the cleaned table: display the rows stored under "United States".
demo.loc["United States"]

Unnamed: 0_level_0,Year,Female0-4,Female5-11,Female5-17,Female12-17,Female18-59,Female60+,F:Unknown,F:Total,Male0-4,Male 5-11,Male5-17,Male12-17,Male18-59,Male60+,M:Unknown,M:Total,total
Country / territory of asylum/residence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
United States,2015,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
United States,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
United States,2017,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
# Attach the demographic columns to the GeoJSON features already held in `data`.
# world.json is deliberately not reloaded here, so an earlier cell must have
# defined `data`.

# First year with demographic data, per matched country name.
min_year = {}

# Missing cells and "*" placeholders are both replaced by 0 before casting.
demo = pd.read_csv("unhcr_demographics.csv", converters = {}).fillna(0).replace("*", int(0))
# Cast every column from position 3 onwards to int32.
for i in range(3, demo.shape[1]):
    demo.iloc[:,i] = demo.iloc[:,i].astype("int32")

demo = demo.groupby(["Year", "Country / territory of asylum/residence"]).sum().reset_index()
tmp = set(demo["Country / territory of asylum/residence"])
demo = clean_names(demo)
demo = demo.sort_values(by=["Country / territory of asylum/residence", "Year"])
demo_set = set(demo["Country / territory of asylum/residence"])
demo = demo.set_index("Country / territory of asylum/residence")
col_names = demo.columns

for country in data["features"]:
    props = country["properties"]
    # Prefer the long GeoJSON name, fall back to the short one.
    if props["name_long"] in demo_set:
        country_name = props["name_long"]
    elif props["name"] in demo_set:
        country_name = props["name"]
    else:
        # Country not in UNHCR List:
        for col in col_names:
            country[col] = []
        continue

    # .loc[[name]] always returns a DataFrame. With a plain .loc[name] a country
    # that has a single year comes back as a Series, which makes min() fail and
    # turns every field into a scalar instead of a list.
    p = demo.loc[[country_name]]
    min_year[country_name] = p["Year"].min()
    for col in col_names:
        country[col] = p[col].tolist()


In [None]:

# Persist the enriched GeoJSON features held in `data`.
with open("refugee_world.json", "w") as out_file:
    json.dump(data, out_file)

# Combined Demographic and Top 5 Data:

## 1.) Add Demographic Data:
Add demographic data to `.json` file

In [148]:
# Start from a fresh copy of the GeoJSON world map.
with open("world.json") as f:
    data = json.load(f)

# First / last year with demographic data, per matched country name.
min_year = {}
max_year = {}

# Missing cells and "*" placeholders are both replaced by 0 before casting.
demo = pd.read_csv("unhcr_demographics.csv", converters = {}).fillna(0).replace("*", int(0))
# Cast every column from position 3 onwards to int32.
for i in range(3, demo.shape[1]):
    demo.iloc[:,i] = demo.iloc[:,i].astype("int32")

demo = demo.groupby(["Year", "Country / territory of asylum/residence"]).sum().reset_index()
tmp = set(demo["Country / territory of asylum/residence"])
demo = clean_names(demo)
demo = demo.sort_values(by=["Country / territory of asylum/residence", "Year"])
demo_set = set(demo["Country / territory of asylum/residence"])
demo = demo.set_index("Country / territory of asylum/residence")
col_names = demo.columns

for country in data["features"]:
    props = country["properties"]
    # Prefer the long GeoJSON name, fall back to the short one.
    if props["name_long"] in demo_set:
        country_name = props["name_long"]
    elif props["name"] in demo_set:
        country_name = props["name"]
    else:
        # Country not in UNHCR List:
        for col in col_names:
            country[col] = []
        continue

    # .loc[[name]] always returns a DataFrame. With a plain .loc[name] a country
    # that has a single year comes back as a Series, which makes min()/max() fail
    # and turns every field into a scalar instead of a list.
    p = demo.loc[[country_name]]
    min_year[country_name] = p["Year"].min()
    max_year[country_name] = p["Year"].max()
    for col in col_names:
        country[col] = p[col].tolist()

## 2.) Add Top 5 Data:

In [149]:
# Load the country-to-country refugee table; missing cells and "*" placeholders
# are both replaced by 0.
# NOTE(review): an earlier cell reads "unhcr_refugee.csv" (no trailing "s") -
# confirm which file name is the intended one.
refugee_table = pd.read_csv("unhcr_refugees.csv", delimiter=',', encoding="latin-1").fillna(0).replace("*", int(0))
# Map the UNHCR country spellings onto the GeoJSON names.
refugee_table = clean_names(refugee_table)
# Cast every column from position 3 onwards to int32.
# NOTE(review): assumes columns 0-2 are year / asylum / origin - confirm against the CSV.
for i in range(3, refugee_table.shape[1]):
    refugee_table.iloc[:,i] = refugee_table.iloc[:,i].astype("int32")


Outbound Refugees:

In [150]:
# Keep only the rows whose origin country appears in min_year, i.e. countries
# that are present in the demographics table.
x = refugee_table[refugee_table["origin"].isin(list(min_year))]

# For each remaining row, look up the first and last demographic year of its
# origin country and test whether the row's year lies inside that window
# (both ends included). Columns are addressed by name rather than by position,
# so the result does not depend on the column order of the CSV.
origin_first_year = x["origin"].map(min_year)
origin_last_year = x["origin"].map(max_year)
outbound_in_window = x["year"].between(origin_first_year, origin_last_year)

# keep_array holds one boolean per row of x: True when the row's year is inside
# the min/max range of years from the UNHCR demographic data.
keep_array = outbound_in_window.tolist()
# old_years collects the distinct years that fell outside the window.
old_years = set(x.loc[~outbound_in_window, "year"])

In [151]:
# Rows of x whose year passed the window check (keep_array is a boolean mask).
all_refugees = x.iloc[keep_array]

# Totals per (asylum, origin) pair over the kept years, ordered so that each
# origin's largest totals come first. Years outside the demographic window
# were already excluded above.
tmp = all_refugees.drop("year", axis = 1)
tmp = tmp.groupby(["asylum", "origin"]).sum().reset_index()
tmp = tmp.sort_values(["origin","refugees"], ascending =False)

# A country is never listed as its own destination.
tmp = tmp[tmp["asylum"]!=tmp["origin"]]
# Top 5 destinations for each origin country, indexed by origin.
refugees = tmp.groupby("origin").head(5).set_index("origin")

Inbound Refugees Seeking Asylum

In [152]:
# Keep only the rows whose asylum country appears in min_year, i.e. countries
# that are present in the demographics table.
y = refugee_table[refugee_table["asylum"].isin(list(min_year))]

# For each remaining row, look up the first and last demographic year of its
# asylum country and test whether the row's year lies inside that window
# (both ends included). Columns are addressed by name rather than by position,
# so the result does not depend on the column order of the CSV.
asylum_first_year = y["asylum"].map(min_year)
asylum_last_year = y["asylum"].map(max_year)
inbound_in_window = y["year"].between(asylum_first_year, asylum_last_year)

# keep_array2 holds one boolean per row of y: True when the row's year is inside
# the min/max range of years from the UNHCR demographic data.
keep_array2 = inbound_in_window.tolist()
# old_years2 collects the distinct years that fell outside the window.
old_years2 = set(y.loc[~inbound_in_window, "year"])

In [153]:
# Rows of y whose year passed the window check (keep_array2 is a boolean mask).
all_asylum = y.iloc[keep_array2]

# Totals per (asylum, origin) pair over the kept years, ordered so that each
# asylum country's largest totals come first. Years outside the demographic
# window were already excluded above.
asylum = all_asylum.drop("year", axis = 1)
asylum = asylum.groupby(["asylum", "origin"]).sum().reset_index()
asylum = asylum.sort_values(["asylum","refugees"], ascending =False)

# A country is never listed as its own source.
asylum = asylum[asylum["asylum"]!=asylum["origin"]]

# Top 5 places refugees are coming from, indexed by asylum country:
asylum = asylum.groupby("asylum").head(5).set_index("asylum")

Create set of countries in asylum and origin columns:

In [154]:
# Every country that occurs as an origin in x or as an asylum country in y.
asylum_set = set(x["origin"].unique()).union(y["asylum"])

Add top 5 data to `.json` file:

In [155]:
# Names of the GeoJSON countries that were matched against asylum_set.
country_list = []
for country in data["features"]:
    props = country["properties"]
    # Start every feature with all four keys. Previously a country missing from
    # the outbound table was skipped entirely, which left its inbound lists
    # unwritten and its keys undefined.
    country["inbound_countries"] = []
    country["inbound_num"] = []
    country["outbound_countries"] = []
    country["outbound_num"] = []

    # Prefer the long GeoJSON name, fall back to the short one.
    if props["name_long"] in asylum_set:
        country_name = props["name_long"]
    elif props["name"] in asylum_set:
        country_name = props["name"]
    else:
        # Country not in UNHCR List:
        continue

    country_list.append(country_name)

    # Outbound: `refugees` is indexed by origin country. .loc[[name]] always
    # returns a DataFrame, so one-row and many-row results share one code path.
    if country_name in refugees.index:
        top_out = refugees.loc[[country_name]]
        top_out = top_out[top_out["refugees"] > 0]   # drop partners with no refugees
        country["outbound_num"] = top_out["refugees"].tolist()
        country["outbound_countries"] = top_out["asylum"].tolist()

    # Inbound: `asylum` is indexed by asylum country.
    if country_name in asylum.index:
        top_in = asylum.loc[[country_name]]
        top_in = top_in[top_in["refugees"] > 0]      # drop partners with no refugees
        country["inbound_num"] = top_in["refugees"].tolist()
        country["inbound_countries"] = top_in["origin"].tolist()

OPTIONAL: Test to see if data encoded in JSON makes sense

In [156]:
def test(country_num):
    """Print the top-5 fields of one GeoJSON feature as a manual sanity check.

    Args:
        country_num: position of the country inside data["features"].

    Output:
        The country's name, followed by one "<field>: <value>" line for each
        of the four top-5 fields.
    """
    feature = data["features"][country_num]
    print(feature["properties"]["name"])
    for field in ("outbound_countries", "outbound_num", "inbound_countries", "inbound_num"):
        print(field + ": " + str(feature[field]))

test(40)

Argentina
outbound_countries: ['Canada', 'Spain', 'United States', 'Germany', 'Brazil']
outbound_num: [4050, 3472, 1721, 550, 189]
inbound_countries: ['Lao PDR', 'Peru', 'Various/Unknown', 'Colombia', 'Cuba']
inbound_num: [9299, 8587, 5522, 5011, 4723]


## 3.) Add Refugee Data by Year:

In [157]:
# Remove rows where there are 0 refugees:
asylum = asylum[asylum["refugees"] > 0]
refugees = refugees[refugees["refugees"] > 0]

In [158]:
# Per-year rows re-indexed for pairwise lookup:
#   r is indexed by (origin, asylum), a is indexed by (asylum, origin).
# Each is ordered by its two index levels and then by year; the index level
# names are passed to sort_values alongside the "year" column.
r = all_refugees.set_index(["origin", "asylum"]).sort_values(["origin", "asylum", "year"])
a = all_asylum.set_index(["asylum", "origin"]).sort_values(["asylum", "origin", "year"])

In [159]:
# Total refugees per origin country and per asylum country over the kept years.
outbound_sum = all_refugees.groupby("origin")[["refugees"]].sum()
inbound_sum = all_asylum.groupby("asylum")[["refugees"]].sum()

In [160]:
def _yearly_flows(flows, top, name, partner_col):
    """Collect the year-by-year counts between `name` and each of its top partners.

    Args:
        flows: per-year table indexed by (country, partner) with "year" and
            "refugees" columns (the notebook's `r` or `a`).
        top: top-5 table indexed by country with `partner_col` and "refugees"
            columns (the notebook's `refugees` or `asylum`).
        name: country to look up.
        partner_col: column of `top` that holds the partner country names.

    Returns:
        (years, counts, peak). `years` and `counts` hold one inner list per
        partner, in the order the partners appear in `top`; `peak` is the
        largest single count, or 0 when the country has no top partners.
    """
    if name not in top.index:
        return [], [], 0
    # .loc[[name]] keeps a DataFrame even when only one partner exists.
    top_rows = top.loc[[name]]
    top_rows = top_rows[top_rows["refugees"] > 0]
    try:
        by_partner = flows.loc[name]
    except KeyError:
        return [], [], 0

    years, counts = [], []
    for partner in top_rows[partner_col].tolist():
        # Double brackets again: a partner with a single year still gives lists.
        rows = by_partner.loc[[partner]]
        years.append(rows["year"].tolist())
        counts.append(rows["refugees"].tolist())
    peak = max((max(series) for series in counts), default=0)
    return years, counts, peak


# Names of the GeoJSON countries that were matched against asylum_set.
country_list = []
# GeoJSON country names with no match in asylum_set.
missing = set()
for country in data["features"]:
    props = country["properties"]
    # Prefer the long GeoJSON name, fall back to the short one.
    if props["name_long"] in asylum_set:
        country_name = props["name_long"]
    elif props["name"] in asylum_set:
        country_name = props["name"]
    else:
        # Country not in UNHCR List:
        country["outbound_year"] = []
        country["annual_outbound"] = []
        country["inbound_year"] = []
        country["annual_inbound"] = []
        country["inbound_sum"] = 0
        country["outbound_sum"] = 0
        country["max_outbound"] = 0
        country["max_inbound"]= 0
        missing.add(props["name"])
        continue

    country_list.append(country_name)

    # Outbound Refugees: (seeking refuge)
    # A country without outbound data no longer skips the inbound section, and
    # every key is written on every path.
    years, counts, peak = _yearly_flows(r, refugees, country_name, "asylum")
    country["outbound_year"] = years
    country["annual_outbound"] = counts
    country["max_outbound"] = peak
    if years and country_name in outbound_sum.index:
        country["outbound_sum"] = int(outbound_sum.at[country_name, "refugees"])
    else:
        country["outbound_sum"] = 0

    # Inbound Refugees: (asylum)
    years, counts, peak = _yearly_flows(a, asylum, country_name, "origin")
    country["inbound_year"] = years
    country["annual_inbound"] = counts
    country["max_inbound"] = peak
    if years and country_name in inbound_sum.index:
        country["inbound_sum"] = int(inbound_sum.at[country_name, "refugees"])
    else:
        country["inbound_sum"] = 0

In [161]:
def test2(country_num):
    """Print the per-year and top-5 fields of one GeoJSON feature.

    Args:
        country_num: position of the country inside data["features"].

    Output:
        The country's name, followed by one "<label><value>" line per field.
    """
    feature = data["features"][country_num]
    print(feature["properties"]["name"])
    # (printed label, feature key) pairs, in print order.
    labelled_fields = (
        ("outbound_year: ", "outbound_year"),
        ("annual_outbound: ", "annual_outbound"),
        ("inbound_year: ", "inbound_year"),
        ("annual_inbound: ", "annual_inbound"),
        ("outbound_countries: ", "outbound_countries"),
        ("inbound_countries: ", "inbound_countries"),
        ("outbound_sum: ", "outbound_sum"),
        ("inbound_sum: ", "inbound_sum"),
        ("years:", "Year"),
        ("max_outbound: ", "max_outbound"),
        ("max_inbound: ", "max_inbound"),
    )
    for label, key in labelled_fields:
        print(label + str(feature[key]))

test2(11)

Grenada
outbound_year: [[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], [2010, 2015, 2016, 2017], [2014, 2016, 2017]]
annual_outbound: [[325, 303, 292, 304, 294, 239, 67, 64], [15, 17, 19, 20, 23, 23, 24, 24], [0, 3, 5, 6, 7, 8, 9, 15], [5, 0, 0, 0], [0, 1, 0]]
inbound_year: [[2010, 2011], [2015, 2016, 2017]]
annual_inbound: [[0, 3], [1, 1, 0]]
outbound_countries: ['Canada', 'United States', 'United Kingdom', 'Germany', 'Italy']
inbound_countries: ['Iran', 'Syria']
outbound_sum: 2112
inbound_sum: 5
years:[2010, 2011, 2015, 2016, 2017]
max_outbound: 325
max_inbound: 3


## 4.) Persons of Concern Data:

According to UNHCR, the demographics data does not align with the total persons of concern data:
> This page presents information about persons of concern broken down by sex and age, as well as by location within the country of residence (where such information is available). Note that data broken down in this way is not always available, so it may not be possible to reconcile the figures on this page with those on the Persons of Concern and Time Series pages. Such data is available since 2000.

*Source: http://popstats.unhcr.org/en/demographics*

In [162]:
# Load the persons-of-concern table; missing cells and "*" placeholders become 0.
persons = (
    pd.read_csv("persons_of_concern.csv", delimiter=',', encoding="latin-1", converters = {})
    .fillna(0)
    .replace("*", int(0))
)
persons["Total Population"] = persons["Total Population"].astype("int32")

# Sum per (country, year), then map the UNHCR spellings onto the GeoJSON names.
persons_sum = persons.groupby(["Country / territory of asylum/residence", "Year"]).sum().reset_index()
persons_sum = clean_names(persons_sum)

In [163]:
# Flag each row of persons_sum by whether its country is present in the
# demographics lookups and its year lies inside that country's
# [min_year, max_year] window (both ends included). Columns are addressed by
# name rather than by position.
persons_countries = persons_sum["Country / territory of asylum/residence"]
persons_known = persons_countries.isin(list(min_year)) & persons_countries.isin(list(max_year))
# Countries without a lookup entry map to NaN, which never satisfies between().
persons_in_window = persons_known & persons_sum["Year"].between(
    persons_countries.map(min_year), persons_countries.map(max_year)
)

# One boolean per row of persons_sum: True for rows to keep.
keep_array = persons_in_window.tolist()
# Countries that have no entry in the demographics lookups.
missing_country = set(persons_countries[~persons_known])
# Years rejected for known countries are added to the set started by the
# outbound filter cell.
old_years.update(persons_sum.loc[persons_known & ~persons_in_window, "Year"])

In [164]:
# Rows of persons_sum that passed the country/year check (keep_array is a boolean mask).
persons_clean = persons_sum.iloc[keep_array]
# Country names that survived, for membership tests against the GeoJSON names.
persons_set = set(persons_clean["Country / territory of asylum/residence"].unique())
# Index by country so persons_clean.loc[<name>] returns that country's rows.
persons_clean = persons_clean.set_index("Country / territory of asylum/residence")

In [165]:
missing_countries = set()

# Write the yearly "Total Population" figures onto each GeoJSON feature as
# "total_no_demo"; missing_countries must already exist as a set.
for country in data["features"]:
    props = country["properties"]
    # Prefer the long GeoJSON name, fall back to the short one.
    if props["name_long"] in persons_set:
        country_name = props["name_long"]
    elif props["name"] in persons_set:
        country_name = props["name"]
    else:
        # Country not in UNHCR List:
        missing_countries.add(props["name"])
        country["total_no_demo"] = []
        continue

    # .loc[[name]] always returns a DataFrame, so a country with a single year
    # still yields a one-element list instead of a bare scalar.
    p = persons_clean.loc[[country_name]]
    country["total_no_demo"] = p["Total Population"].tolist()

## Save data:

In [166]:
# Persist the fully enriched GeoJSON features held in `data`.
with open("refugee_world.json", "w") as out_file:
    json.dump(data, out_file)