In [1]:
import altair as alt
import pandas as pd

pd.set_option("display.max_columns", None)

In [2]:
migration_df = pd.read_csv("migration_flow.csv")
migration_df.sample(5)

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,REF_AREA,Reference area,CITIZENSHIP,Citizenship,FREQ,Frequency of observation,MEASURE,Measure,SEX,Sex,BIRTH_PLACE,Place of birth,EDUCATION_LEV,Education level,UNIT_MEASURE,Unit of measure,TIME_PERIOD,Time period,OBS_VALUE,Observation value,OBS_STATUS,Observation status,UNIT_MULT,Unit multiplier,DECIMALS,Decimals
107434,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,CAN,Canada,CPV,Cabo Verde,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2021.0,,10.0,,A,Normal value,0.0,Units,0,Zero
16265,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,NOR,Norway,SWE,Sweden,A,Annual,B12,Outflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2004.0,,2222.0,,A,Normal value,0.0,Units,0,Zero
215578,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,SWE,Sweden,CAF,Central African Republic,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2014.0,,10.0,,A,Normal value,0.0,Units,0,Zero
46307,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,CAN,Canada,COL,Colombia,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2003.0,,4318.0,,A,Normal value,0.0,Units,0,Zero
2159,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,KOR,Korea,TWN,Chinese Taipei,A,Annual,B12,Outflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2013.0,,1614.0,,A,Normal value,0.0,Units,0,Zero


# Data Cleanup

In [3]:
columns_to_drop = [
    "STRUCTURE",
    "STRUCTURE_ID",
    "STRUCTURE_NAME",
    "ACTION",
    "FREQ",
    "Frequency of observation",
    "SEX",
    "Sex",
    "BIRTH_PLACE",
    "Place of birth",
    "EDUCATION_LEV",
    "Education level",
    "OBS_STATUS",
    "Observation status",
    "UNIT_MULT",
    "Unit multiplier",
    "DECIMALS",
    "Decimals",
    "UNIT_MEASURE",
    "Unit of measure",
    "Time period",
    "Observation value",
]

df_cleaned = migration_df.drop(columns=columns_to_drop)

In [4]:
df_cleaned.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
24765,NOR,Norway,USA,United States,B11,Inflows of foreign population,2016.0,895.0
163558,ITA,Italy,TJK,Tajikistan,B11,Inflows of foreign population,2006.0,3.0
124487,LUX,Luxembourg,QAT,Qatar,B11,Inflows of foreign population,2009.0,0.0
7891,NOR,Norway,BIH,Bosnia and Herzegovina,B11,Inflows of foreign population,2011.0,107.0
19470,NOR,Norway,UKR,Ukraine,B11,Inflows of foreign population,2004.0,228.0


# Creating distinct datasets for inbound and outbound migration

In [5]:
inflow_df = df_cleaned[df_cleaned["Measure"] == "Inflows of foreign population"]
inflow_df.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
125646,KOR,Korea,MOZ,Mozambique,B11,Inflows of foreign population,2009.0,8.0
90507,ISL,Iceland,SLE,Sierra Leone,B11,Inflows of foreign population,2020.0,1.0
212293,CAN,Canada,RWA,Rwanda,B11,Inflows of foreign population,1998.0,211.0
42582,TUR,Türkiye,GRC,Greece,B11,Inflows of foreign population,1998.0,8018.0
207463,AUS,Australia,MRT,Mauritania,B11,Inflows of foreign population,2001.0,0.0


In [6]:
inflow_df = inflow_df.fillna(0)
inflow_df["TIME_PERIOD"] = inflow_df["TIME_PERIOD"].astype(int)

In [7]:
inflow_df = inflow_df[inflow_df["TIME_PERIOD"] != 0]

In [8]:
outflow_df = df_cleaned[df_cleaned["Measure"] == "Outflows of foreign population"]
outflow_df.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
149609,NOR,Norway,BLZ,Belize,B12,Outflows of foreign population,2019.0,0.0
105313,KOR,Korea,BRB,Barbados,B12,Outflows of foreign population,2006.0,2.0
17258,CHE,Switzerland,ATG,Antigua and Barbuda,B12,Outflows of foreign population,2013.0,3.0
40629,DNK,Denmark,MDA,Moldova,B12,Outflows of foreign population,2010.0,26.0
15431,LVA,Latvia,GIN,Guinea,B12,Outflows of foreign population,2020.0,0.0


In [9]:
iso_url = "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
iso_df = pd.read_csv(iso_url)

iso_df.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [10]:
outflow_df.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
85458,JPN,Japan,CPV,Cabo Verde,B12,Outflows of foreign population,2017.0,7.0
157276,ITA,Italy,KIR,Kiribati,B12,Outflows of foreign population,2004.0,0.0
27977,AUT,Austria,CYP,Cyprus,B12,Outflows of foreign population,2004.0,5.0
206488,FIN,Finland,BGR,Bulgaria,B12,Outflows of foreign population,2005.0,13.0
109618,NLD,Netherlands,KGZ,Kyrgyzstan,B12,Outflows of foreign population,2002.0,1.0


In [11]:
outflow_df.drop(columns=["REF_AREA", "Reference area", "MEASURE", "Measure"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outflow_df.drop(columns=["REF_AREA", "Reference area", "MEASURE", "Measure"], inplace=True)


In [12]:
outflow_df = outflow_df.merge(iso_df[["country-code", "alpha-3"]], left_on="CITIZENSHIP", right_on="alpha-3", how="right")

In [13]:
outflow_df.drop(columns=["alpha-3"], inplace=True)

In [14]:
outflow_df.sample(10)

Unnamed: 0,CITIZENSHIP,Citizenship,TIME_PERIOD,OBS_VALUE,country-code
59723,PNG,Papua New Guinea,2019.0,0.0,598
48265,MHL,Marshall Islands,2009.0,0.0,584
14705,TCD,Chad,2000.0,4.0,148
44294,LIE,Liechtenstein,2010.0,10.0,438
53193,NAM,Namibia,2001.0,0.0,516
44956,LTU,Lithuania,2004.0,52.0,440
71657,ESP,Spain,2004.0,1468.0,724
2158,ATG,Antigua and Barbuda,2011.0,0.0,28
42767,LBN,Lebanon,2014.0,18.0,422
73272,SUR,Suriname,2015.0,0.0,740


In [15]:
outflow_df.dropna(inplace=True)
outflow_df['country-code'] = outflow_df['country-code'].astype(int)

outflow_df['TIME_PERIOD'] = outflow_df['TIME_PERIOD'].astype(int)

In [16]:
outflow_df.head()

Unnamed: 0,CITIZENSHIP,Citizenship,TIME_PERIOD,OBS_VALUE,country-code
0,AFG,Afghanistan,1998,0.0,4
1,AFG,Afghanistan,2001,174.0,4
2,AFG,Afghanistan,2000,0.0,4
3,AFG,Afghanistan,1999,0.0,4
4,AFG,Afghanistan,2001,12.0,4


In [17]:
outflows_gb = outflow_df.groupby(['CITIZENSHIP', 'country-code', 'TIME_PERIOD', "Citizenship"]).sum().reset_index()
outflows_gb.head()

Unnamed: 0,CITIZENSHIP,country-code,TIME_PERIOD,Citizenship,OBS_VALUE
0,AFG,4,1995,Afghanistan,1405.0
1,AFG,4,1996,Afghanistan,1730.0
2,AFG,4,1997,Afghanistan,2226.0
3,AFG,4,1998,Afghanistan,2739.0
4,AFG,4,1999,Afghanistan,2424.0


In [18]:
outflows_gb.columns = outflows_gb.columns.astype(str)
outflows_gb.fillna(0, inplace=True)
outflows_gb["OBS_VALUE"] = outflows_gb["OBS_VALUE"].astype(int)

In [19]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [20]:
from vega_datasets import data
world = alt.topo_feature(data.world_110m.url, "countries")

base_map = alt.Chart(world).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project(
    type='naturalEarth1'
).properties(
    width=800,
    height=400
)

In [21]:
slider = alt.binding_range(min=1995, max=2022, step=1, name="Year:")
selection = alt.selection_point(
    fields=["TIME_PERIOD"], bind=slider, name="Year", value=2022
)

out_choropleth = (
    alt.Chart(world)
    .mark_geoshape()
    .encode(
        color=alt.Color(
            "OBS_VALUE:Q",
            scale=alt.Scale(scheme="blues"),
            legend=alt.Legend(title="Migration Flow"),
        ),
        tooltip=[
            alt.Tooltip("Citizenship:N", title="Country"),
            alt.Tooltip("OBS_VALUE:Q", title="Migration Flow"),
        ],
    )
    .add_params(selection)
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            data=outflows_gb,
            key="country-code",
            fields=["TIME_PERIOD", "OBS_VALUE", "Citizenship"],
        ),
    ).transform_calculate(selected_year=selection["TIME_PERIOD"])
    .transform_filter("datum.TIME_PERIOD == datum.selected_year")
    .properties(width=800, height=400)
    .project("naturalEarth1")
)


outflow_vis = base_map + out_choropleth
outflow_vis

In [22]:
outflows_gb.shape

(5630, 5)

In [23]:
outflows_gb.sample(20)

Unnamed: 0,CITIZENSHIP,country-code,TIME_PERIOD,Citizenship,OBS_VALUE
2646,KIR,296,2018,Kiribati,621
2741,LAO,418,2001,Lao People’s Democratic Republic,39
2517,KAZ,398,2001,Kazakhstan,850
1026,COG,178,2005,Congo,714
4349,SDN,729,2004,Sudan,725
479,BHR,48,2006,Bahrain,76
2774,LBN,422,2006,Lebanon,2350
699,BRB,52,2008,Barbados,34
31,AGO,24,1998,Angola,843
3459,MRT,478,2001,Mauritania,9


In [24]:
inflow_df.drop(columns=["REF_AREA", "Reference area", "MEASURE", "Measure"], inplace=True)
inflow_df = inflow_df.merge(iso_df[["country-code", "alpha-3"]], left_on="CITIZENSHIP", right_on="alpha-3", how="right")
inflow_df.drop(columns=["alpha-3"], inplace=True)
inflows_gb = inflow_df.groupby(['CITIZENSHIP', 'country-code', 'TIME_PERIOD', "Citizenship"]).sum().reset_index()
inflows_gb.head()

Unnamed: 0,CITIZENSHIP,country-code,TIME_PERIOD,Citizenship,OBS_VALUE
0,ABW,533,2014.0,Aruba,0.0
1,ABW,533,2015.0,Aruba,0.0
2,ABW,533,2016.0,Aruba,0.0
3,AFG,4,1995.0,Afghanistan,12204.0
4,AFG,4,1996.0,Afghanistan,12765.0


In [25]:
inflows_gb.dropna(inplace=True)
inflows_gb['country-code'] = inflows_gb['country-code'].astype(int)

inflows_gb['TIME_PERIOD'] = inflows_gb['TIME_PERIOD'].astype(int)

In [26]:

in_choropleth = (
    alt.Chart(world)
    .mark_geoshape()
    .encode(
        color=alt.Color(
            "OBS_VALUE:Q",
            scale=alt.Scale(scheme="reds"),
            legend=alt.Legend(title="Migration Flow"),
        ),
        tooltip=[
            alt.Tooltip("Citizenship:N", title="Country"),
            alt.Tooltip("OBS_VALUE:Q", title="Migration Flow"),
        ],
    )
    .add_params(selection)
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            data=inflows_gb,
            key="country-code",
            fields=["TIME_PERIOD", "OBS_VALUE", "Citizenship"],
        ),
    ).transform_calculate(selected_year=selection["TIME_PERIOD"])
    .transform_filter("datum.TIME_PERIOD == datum.selected_year")
    .properties(width=800, height=400)
    .project("naturalEarth1")
)


inflow_vis = base_map + in_choropleth
inflow_vis

In [27]:
inflows_gb.to_csv("Final_inflow_data.csv", index=False)
outflows_gb.to_csv("Final_outflow_data.csv", index=False)

In [29]:
inflow_vis.save("inflow_vis.json")
outflow_vis.save("outflow_vis.json")