In [1]:
import altair as alt
import pandas as pd

pd.set_option("display.max_columns", None)

In [2]:
migration_df = pd.read_csv("migration_flow.csv")
migration_df.sample(5)

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,REF_AREA,Reference area,CITIZENSHIP,Citizenship,FREQ,Frequency of observation,MEASURE,Measure,SEX,Sex,BIRTH_PLACE,Place of birth,EDUCATION_LEV,Education level,UNIT_MEASURE,Unit of measure,TIME_PERIOD,Time period,OBS_VALUE,Observation value,OBS_STATUS,Observation status,UNIT_MULT,Unit multiplier,DECIMALS,Decimals
73073,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,FIN,Finland,NIU,Niue,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2017.0,,0.0,,A,Normal value,0.0,Units,0,Zero
188301,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,LUX,Luxembourg,SLB,Solomon Islands,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2002.0,,0.0,,A,Normal value,0.0,Units,0,Zero
202418,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,KOR,Korea,HND,Honduras,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2022.0,,13.0,,A,Normal value,0.0,Units,0,Zero
177569,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,CAN,Canada,SEN,Senegal,A,Annual,B11,Inflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2020.0,,280.0,,A,Normal value,0.0,Units,0,Zero
186008,DATAFLOW,OECD.ELS.IMD:DSD_MIG@DF_MIG(1.0),International migration database,I,DNK,Denmark,MMR,Myanmar,A,Annual,B12,Outflows of foreign population,_T,Total,_Z,Not applicable,_Z,Not applicable,PS,Persons,2005.0,,1.0,,A,Normal value,0.0,Units,0,Zero


# Data Cleanup

In [3]:
columns_to_drop = [
    "STRUCTURE",
    "STRUCTURE_ID",
    "STRUCTURE_NAME",
    "ACTION",
    "FREQ",
    "Frequency of observation",
    "SEX",
    "Sex",
    "BIRTH_PLACE",
    "Place of birth",
    "EDUCATION_LEV",
    "Education level",
    "OBS_STATUS",
    "Observation status",
    "UNIT_MULT",
    "Unit multiplier",
    "DECIMALS",
    "Decimals",
    "UNIT_MEASURE",
    "Unit of measure",
    "Time period",
    "Observation value",
]

df_cleaned = migration_df.drop(columns=columns_to_drop)

In [4]:
df_cleaned.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
159919,ITA,Italy,ZAF,South Africa,B11,Inflows of foreign population,2010.0,69.0
56313,DNK,Denmark,UGA,Uganda,B11,Inflows of foreign population,2012.0,64.0
147357,ISL,Iceland,ZMB,Zambia,B12,Outflows of foreign population,2002.0,0.0
165810,AUT,Austria,CAF,Central African Republic,B11,Inflows of foreign population,2021.0,1.0
110187,AUS,Australia,GAB,Gabon,B11,Inflows of foreign population,2004.0,0.0


# Creating distinct datasets for inbound and outbound migration

In [5]:
inflow_df = df_cleaned[df_cleaned["Measure"] == "Inflows of foreign population"]
inflow_df.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
104470,DEU,Germany,LCA,Saint Lucia,B11,Inflows of foreign population,2018.0,15.0
105562,SVN,Slovenia,SYR,Syrian Arab Republic,B11,Inflows of foreign population,2014.0,15.0
93763,LUX,Luxembourg,NZL,New Zealand,B11,Inflows of foreign population,2019.0,2.0
77920,AUT,Austria,NIC,Nicaragua,B11,Inflows of foreign population,2008.0,12.0
114363,AUT,Austria,KAZ,Kazakhstan,B11,Inflows of foreign population,2001.0,18.0


In [6]:
inflow_df = inflow_df.fillna(0)
inflow_df["TIME_PERIOD"] = inflow_df["TIME_PERIOD"].astype(int)

In [7]:
inflow_df = inflow_df[inflow_df["TIME_PERIOD"] != 0]

In [8]:
outflow_df = df_cleaned[df_cleaned["Measure"] == "Outflows of foreign population"]
outflow_df.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
168285,DNK,Denmark,YUG_F,Former Yugoslavia,B12,Outflows of foreign population,2005.0,46.0
57237,ISL,Iceland,ROU,Romania,B12,Outflows of foreign population,2005.0,1.0
72807,JPN,Japan,IDN,Indonesia,B12,Outflows of foreign population,2013.0,7270.0
200423,DNK,Denmark,JPN,Japan,B12,Outflows of foreign population,2016.0,315.0
16475,NLD,Netherlands,LBY,Libya,B12,Outflows of foreign population,2007.0,21.0


In [9]:
iso_url = "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
iso_df = pd.read_csv(iso_url)

iso_df.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [10]:
outflow_df.sample(5)

Unnamed: 0,REF_AREA,Reference area,CITIZENSHIP,Citizenship,MEASURE,Measure,TIME_PERIOD,OBS_VALUE
15232,HUN,Hungary,ROU,Romania,B12,Outflows of foreign population,2007.0,2693.0
53724,NOR,Norway,CHL,Chile,B12,Outflows of foreign population,2008.0,36.0
216981,JPN,Japan,KOR,Korea,B12,Outflows of foreign population,2015.0,18826.0
211245,AUT,Austria,NZL,New Zealand,B12,Outflows of foreign population,2015.0,46.0
41362,NZL,New Zealand,KHM,Cambodia,B12,Outflows of foreign population,2013.0,69.0


In [11]:
outflow_df.drop(columns=["REF_AREA", "Reference area", "MEASURE", "Measure"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outflow_df.drop(columns=["REF_AREA", "Reference area", "MEASURE", "Measure"], inplace=True)


In [12]:
outflow_df = outflow_df.merge(iso_df[["country-code", "alpha-3"]], left_on="CITIZENSHIP", right_on="alpha-3", how="right")

In [13]:
outflow_df.drop(columns=["alpha-3"], inplace=True)

In [14]:
outflow_df.sample(10)

Unnamed: 0,CITIZENSHIP,Citizenship,TIME_PERIOD,OBS_VALUE,country-code
52617,MOZ,Mozambique,2019.0,198.0,508
13115,CMR,Cameroon,2012.0,23.0,120
82934,UZB,Uzbekistan,2016.0,74.0,860
77087,TGO,Togo,2004.0,8.0,768
30955,GNB,Guinea-Bissau,2020.0,7.0,624
29288,GRC,Greece,2002.0,4.0,300
85708,ZWE,Zimbabwe,2016.0,0.0,716
76580,TLS,Timor-Leste,2004.0,0.0,626
45021,LTU,Lithuania,2007.0,679.0,440
38383,JOR,Jordan,2017.0,1052.0,400


In [15]:
outflow_df.dropna(inplace=True)
outflow_df['country-code'] = outflow_df['country-code'].astype(int)

outflow_df['TIME_PERIOD'] = outflow_df['TIME_PERIOD'].astype(int)

In [16]:
outflow_df.head()

Unnamed: 0,CITIZENSHIP,Citizenship,TIME_PERIOD,OBS_VALUE,country-code
0,AFG,Afghanistan,1998,0.0,4
1,AFG,Afghanistan,2001,174.0,4
2,AFG,Afghanistan,2000,0.0,4
3,AFG,Afghanistan,1999,0.0,4
4,AFG,Afghanistan,2001,12.0,4


In [17]:
outflows_gb = outflow_df.groupby(['CITIZENSHIP', 'country-code', 'TIME_PERIOD', "Citizenship"]).sum().reset_index()
outflows_gb.head()

Unnamed: 0,CITIZENSHIP,country-code,TIME_PERIOD,Citizenship,OBS_VALUE
0,AFG,4,1995,Afghanistan,1405.0
1,AFG,4,1996,Afghanistan,1730.0
2,AFG,4,1997,Afghanistan,2226.0
3,AFG,4,1998,Afghanistan,2739.0
4,AFG,4,1999,Afghanistan,2424.0


In [18]:
outflows_gb.columns = outflows_gb.columns.astype(str)
outflows_gb.fillna(0, inplace=True)
outflows_gb["OBS_VALUE"] = outflows_gb["OBS_VALUE"].astype(int)

In [19]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [20]:
from vega_datasets import data
world = alt.topo_feature(data.world_110m.url, "countries")

base_map = alt.Chart(world).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project(
    type='naturalEarth1'
).properties(
    width=800,
    height=400
)

In [23]:
min_year = outflows_gb["TIME_PERIOD"].min()
max_year = outflows_gb["TIME_PERIOD"].max()

wide_data = outflows_gb.pivot(
    index='country-code',
    columns='TIME_PERIOD',
    values='OBS_VALUE'
).reset_index()


country_data = outflows_gb[['country-code', 'Citizenship']].drop_duplicates('country-code')
wide_data = wide_data.merge(country_data, on='country-code', how='left')


wide_data.columns = [str(c) for c in wide_data.columns]

year_param = alt.param(
    name='Year',
    bind=alt.binding_range(min=1995, max=2022, step=1),
    value=max_year  # default to last year
)

world = alt.topo_feature(data.world_110m.url, "countries")

base_map = alt.Chart(world).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project(type='naturalEarth1').properties(width=800, height=400)

year_fields = [str(y) for y in range(1995, 2022 + 1)]
fields = ['Citizenship'] + year_fields

# Choropleth
out_choropleth = (
    alt.Chart(world)
    .mark_geoshape()
    .encode(
        color=alt.Color(
            'value:Q',
            scale=alt.Scale(scheme='blues'),
            legend=alt.Legend(title='Migration Flow')
        ),
        tooltip=[
            alt.Tooltip("Citizenship:N", title="Country"),
            alt.Tooltip("value:Q", title="Migration Flow")
        ]
    )
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            data=wide_data,
            key="country-code",
            fields=fields
        )
    )
    .transform_calculate(
        # Dynamically select the column based on the chosen year
        value='datum[toString(Year)]'
    )
    .add_params(year_param)
    .properties(width=800, height=400)
    .project("naturalEarth1")
)

outflow_vis = base_map + out_choropleth
outflow_vis


In [25]:
inflow_df.drop(columns=["REF_AREA", "Reference area", "MEASURE", "Measure"], inplace=True)
inflow_df = inflow_df.merge(iso_df[["country-code", "alpha-3"]], left_on="CITIZENSHIP", right_on="alpha-3", how="right")
inflow_df.drop(columns=["alpha-3"], inplace=True)
inflows_gb = inflow_df.groupby(['CITIZENSHIP', 'country-code', 'TIME_PERIOD', "Citizenship"]).sum().reset_index()
inflows_gb.head()

Unnamed: 0,CITIZENSHIP,country-code,TIME_PERIOD,Citizenship,OBS_VALUE
0,ABW,533,2014.0,Aruba,0.0
1,ABW,533,2015.0,Aruba,0.0
2,ABW,533,2016.0,Aruba,0.0
3,AFG,4,1995.0,Afghanistan,12204.0
4,AFG,4,1996.0,Afghanistan,12765.0


In [26]:
inflows_gb.dropna(inplace=True)
inflows_gb['country-code'] = inflows_gb['country-code'].astype(int)

inflows_gb['TIME_PERIOD'] = inflows_gb['TIME_PERIOD'].astype(int)

In [27]:
inflow_wide_data = inflows_gb.pivot(
    index='country-code',
    columns='TIME_PERIOD',
    values='OBS_VALUE'
).reset_index()

inflow_country_data = inflows_gb[['country-code', 'Citizenship']].drop_duplicates('country-code')
inflow_wide_data = inflow_wide_data.merge(inflow_country_data, on='country-code', how='left')

inflow_wide_data.columns = [str(c) for c in inflow_wide_data.columns]
inflow_fields = ['Citizenship'] + year_fields

inflow_choropleth = (
    alt.Chart(world)
    .mark_geoshape()
    .encode(
        color=alt.Color(
            'value:Q',
            scale=alt.Scale(scheme='reds'),
            legend=alt.Legend(title='Migration Flow')
        ),
        tooltip=[
            alt.Tooltip("Citizenship:N", title="Country"),
            alt.Tooltip("value:Q", title="Migration Flow")
        ]
    )
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            data=inflow_wide_data,
            key="country-code",
            fields=inflow_fields
        )
    )
    .transform_calculate(
        value='datum[toString(Year)]'
    )
    .add_params(year_param)
    .properties(width=800, height=400)
    .project("naturalEarth1")
)

inflow_vis = base_map + inflow_choropleth
inflow_vis

In [None]:
inflows_gb.to_csv("Final_inflow_data.csv", index=False)
outflows_gb.to_csv("Final_outflow_data.csv", index=False)

In [None]:
inflow_vis.save("inflow_vis.json")
outflow_vis.save("outflow_vis.json")