# **Net Migration in the World**
#### Analyzing Trends, Drops, and Spikes Through Political & Economic Events
##### Team Dorothy Thomas: Fari Santoso, Jing Huang, Joseph Sanchez, Misato Okamoto

##### **This section includes Deliverables A and B**

# **Deliverable A and B**

### 0. Imports

In [29]:
%pip install wbdata
%pip install plotly
!pip install plotly
import plotly.express as px

import wbdata
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as gp

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### 1. Population DataFrame Function

In [30]:
# Install necessary packages
!pip install pandas_datareader pandas requests lxml --quiet

# Import required libraries
import pandas as pd
import warnings
from pandas_datareader import wb

def get_indicator_code(sex, age_range):
    """
    Build the World Bank population indicator code for one age bracket and sex.

    @param: sex (str) : Sex suffix of the indicator, e.g. "MA" or "FE"
    @param: age_range (tuple) : (low, high) integer bounds, e.g. (5, 9)
    @return: Indicator string such as "SP.POP.0509.FE"; any bracket whose
             upper bound is 80 or more maps to the open-ended "80UP" bracket
    """
    upper = age_range[1]
    if upper < 80:
        lower = age_range[0]
        # Both bounds are zero-padded to two digits and concatenated.
        bracket = f"{lower:02d}{upper:02d}"
    else:
        bracket = "80UP"
    return f"SP.POP.{bracket}.{sex}"

def population_df(countries, start_yr, end_yr, age_ranges):
    """
    Download male and female population series for each requested age range
    and arrange them as one wide table.

    @param: countries (list) : List of country codes (e.g., ["USA", "IND"])
    @param: start_yr (int) : Starting year
    @param: end_yr (int) : End year
    @param: age_ranges (list of tuples) : List of age ranges (e.g., [(0, 4), (5, 9)])
    @return: DataFrame indexed by "Country" with one column per
             "<low>-<high>_<Male|Female>_<year>" combination; an empty
             DataFrame when no indicator could be downloaded
    """
    sex_labels = {"MA": "Male", "FE": "Female"}
    frames = []

    for bracket in age_ranges:
        for sex, label in sex_labels.items():
            indicator = get_indicator_code(sex, bracket)
            try:
                # wb.download emits a FutureWarning; silence it for this call only.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=FutureWarning)
                    raw = wb.download(
                        indicator=indicator,
                        country=countries,
                        start=start_yr,
                        end=end_yr
                    )
                # Non-numeric cells become NaN instead of raising.
                numeric = raw.apply(pd.to_numeric, errors='coerce')
                readable_name = f"{bracket[0]}-{bracket[1]}_{label}"
                frames.append(numeric.rename(columns={indicator: readable_name}))
            except Exception as e:
                # Best effort: report the failed indicator and keep going.
                print(f"Error fetching {indicator}: {e}")

    # Nothing was fetched, so there is nothing to pivot.
    if not frames:
        return pd.DataFrame()

    long_df = pd.concat(frames, axis=1).reset_index()
    wide_df = long_df.pivot(index="country", columns="year")
    # Collapse the (series name, year) column pairs into single strings.
    wide_df.columns = [f"{series}_{yr}" for series, yr in wide_df.columns]
    wide_df.index.rename("Country", inplace=True)
    return wide_df.sort_index()

# Example usage: fetch a sample of five-year brackets (not every bracket is
# listed) for three countries over 2010-2020.
age_ranges = [(0, 4), (5, 9), (10, 14), (20, 24), (30, 34), (40, 44), (50, 54), (65, 69)]
df = population_df(["USA", "IND", "JPN"], 2010, 2020, age_ranges)
# Bare expression so the notebook renders the resulting table.
df

Unnamed: 0_level_0,0-4_Male_2010,0-4_Male_2011,0-4_Male_2012,0-4_Male_2013,0-4_Male_2014,0-4_Male_2015,0-4_Male_2016,0-4_Male_2017,0-4_Male_2018,0-4_Male_2019,...,65-69_Female_2011,65-69_Female_2012,65-69_Female_2013,65-69_Female_2014,65-69_Female_2015,65-69_Female_2016,65-69_Female_2017,65-69_Female_2018,65-69_Female_2019,65-69_Female_2020
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
India,67558455,67233033,66960821,66609159,65911331,65048361,64352535,63599613,62849630,62332993,...,12538533,12926744,13328840,13895517,14662200,15484923,16362651,17311858,18182743,18896130
Japan,2781164,2763649,2733976,2699257,2661847,2623720,2589849,2554051,2507852,2451241,...,4210758,4405102,4635838,4886056,5164954,5228847,5021032,4727380,4436301,4203855
United States,10536218,10452040,10328270,10191513,10098990,10062473,10050450,10014399,9937540,9819024,...,6757239,7130722,7461598,7777314,8118197,8364764,8481932,8611326,8795614,9080960


### 2. Population (Counts) Function

In [31]:
import wbdata
import datetime as dt

def population(year, sex, age_range, place):
    """
    Look up a World Bank population count for one five-year age bracket.

    Arguments:
      - year: int, the calendar year to query
      - sex: str in {"male", "female", "people"}
      - age_range: str bracket label, one of "0-4", "5-9", ..., "75-79", "80+"
      - place: country code passed to wbdata, e.g., "USA"

    Returns the first non-null value the API reports for the indicator.
    For sex == "people", returns the male value plus the female value.
    Returns None when a needed value is missing.
    Raises ValueError when 'age_range' or 'sex' is not an accepted label.
    """
    # Bracket label -> age fragment of the indicator code ("5-9" -> "0509").
    age_codes = {
        f"{low}-{low + 4}": f"{low:02d}{low + 4:02d}" for low in range(0, 80, 5)
    }
    age_codes["80+"] = "80UP"
    sex_suffix = {"male": "MA", "female": "FE"}

    # Validate inputs before touching the network.
    if not (age_range in age_codes and sex in ("male", "female", "people")):
        raise ValueError("Invalid input. Check 'age_range' or 'sex' parameter.")

    def lookup(sex_key):
        """Fetch the value for this bracket and the given sex, or None."""
        indicator = f"SP.POP.{age_codes[age_range]}.{sex_suffix[sex_key]}"
        jan_first = dt.datetime(year, 1, 1)
        # Same start and end date: the query covers only the requested year.
        records = wbdata.get_data(
            indicator,
            country=place,
            date=(jan_first, jan_first)
        )
        if not records or not isinstance(records, list):
            return None
        return next(
            (row["value"] for row in records if row.get("value") is not None),
            None,
        )

    if sex == "people":
        males = lookup("male")
        females = lookup("female")
        if males is None or females is None:
            return None
        return males + females

    return lookup(sex)

def report_population(year, sex, age_range, place):
    """
    Build a one-line question-and-answer string for a population query:
    "In [year], how many [people/males/females] aged [low] to [high] were living in [place]?"
    followed by the count, or "Data not available" when population() returns None.
    """
    count = population(year, sex, age_range, place)

    # Plural wording for each accepted sex value.
    plural_labels = {"male": "males", "female": "females", "people": "people"}
    sex_label = plural_labels[sex]

    # Turn "20-24" into "20 to 24" and "80+" into "80 and above".
    if "-" in age_range:
        low, high = age_range.split("-")
        age_text = low + " to " + high
    elif age_range.endswith("+"):
        age_text = age_range.rstrip("+") + " and above"
    else:
        age_text = age_range

    # Thousands separators for real counts, a fixed message otherwise.
    count_str = "Data not available" if count is None else format(count, ",")

    return f"In {year}, how many {sex_label} aged {age_text} were living in {place}? --> {count_str}"

# Example test call: print the formatted answer for US males aged 20-24 in 2018.
print(report_population(2018, "male", "20-24", "USA"))

In 2018, how many males aged 20 to 24 were living in USA? --> 11,246,062


### 3. Population Pyramid Function

##### Below is the general function for drawing a population pyramid. It generates a population pyramid from the given DataFrame.

In [32]:
import plotly.graph_objects as gp
import pandas as pd

def population_pyramid(df):
    """
    Draw a mirrored horizontal bar chart ("population pyramid") of net migration.

    @param df: DataFrame with columns
        {"age_range", "male_net_migration", "female_net_migration"}
    @return: None (the figure is displayed with fig.show())
    """
    ages = df["age_range"]
    males = df["male_net_migration"]
    females = df["female_net_migration"]

    # Largest value on each side; these become the outer axis ticks.
    male_peak = max(males)
    female_peak = max(females)

    # Male values are negated so their bars extend to the left of zero.
    male_bar = gp.Bar(
        y=ages,
        x=-males,
        name="Male",
        orientation="h",
        marker_color="blue",
    )
    # Female values stay positive and extend to the right.
    female_bar = gp.Bar(
        y=ages,
        x=females,
        name="Female",
        orientation="h",
        marker_color="green",
    )

    fig = gp.Figure()
    for bar in (male_bar, female_bar):
        fig.add_trace(bar)

    # Three ticks only: both extremes and zero. The left tick is labelled
    # with the positive male maximum even though it sits at a negative x.
    x_axis = {
        "title": "Net Migration",
        "tickvals": [-male_peak, 0, female_peak],
        "ticktext": [f"{male_peak:,}", "0", f"{female_peak:,}"],
    }
    y_axis = {"title": "Age Range", "categoryorder": 'category ascending'}

    fig.update_layout(
        title="Population Pyramid of Net Migration",
        xaxis=x_axis,
        yaxis=y_axis,
        barmode="relative",
    )

    fig.show()


# Example usage with hard-coded sample values (one row per age bracket).
# Note: this rebinds the notebook-level name `df`.
df = pd.DataFrame({
    "age_range": ["0-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71+"],
    "male_net_migration": [500, 600, 700, 800, 900, 800, 700, 550],
    "female_net_migration": [550, 650, 750, 850, 950, 850, 750, 600]
})

population_pyramid(df)

##### Below is the population pyramid of the United States in the year 2020. We chose this year because the US had its lowest net migration in 2020.

In [33]:
import plotly.offline as py
import plotly.graph_objs as go

py.init_notebook_mode(connected=True)

# Define age ranges in five-year intervals up to 80+ ("0004", "0509", ..., "7579", "80UP").
age_ranges = []
for i in range(0, 80, 5):
    age_ranges.append(f"{i:02d}{i+4:02d}")
age_ranges.append("80UP")

# Construct indicator dictionaries for population counts, mapping each
# indicator code to a readable column label.
# These indicator codes are examples and may need to be verified.
male_variables = {f"SP.POP.{age_range}.MA": f"Males {age_range}" for age_range in age_ranges}
female_variables = {f"SP.POP.{age_range}.FE": f"Females {age_range}" for age_range in age_ranges}

variables = {}
variables.update(male_variables)
variables.update(female_variables)

# Fetch the data for the United States.
# The country code for the United States is "USA".
df = wbdata.get_dataframe(variables, country="USA", parse_dates=True)

# Choose the year to plot, written as January 1 of that year.
# Change this value to plot another year; the chart title follows it.
year = '2020-01-01'
if year not in df.index:
    raise ValueError(f"The requested year {year} is not available in the data. Available dates: {df.index.unique()}")

# Build the population pyramid.
# The x-axis shows the raw indicator values (head counts), not thousands.
layout = go.Layout(
    title=f"Population Pyramid of the United States ({year[:4]})",
    barmode='overlay',
    yaxis=go.layout.YAxis(range=[0, 90], title='Age'),
    xaxis=go.layout.XAxis(title='Population (number of people)')
)

# Vertical position of each bar: the bracket's lower age bound plus one
# (the leading two digits of "80UP" give 80).
bar_positions = [int(s[:2]) + 1 for s in age_ranges]

# Create two bar traces: one for males and one for females.
# Female values are negated so their bars extend to the left of zero.
bars = [
    go.Bar(
        x=df.loc[year].filter(regex="Males").values,
        y=bar_positions,
        orientation='h',
        name='Men',
        marker=dict(color='blue'),
        hoverinfo='skip'
    ),
    go.Bar(
        x=-df.loc[year].filter(regex="Females").values,
        y=bar_positions,
        orientation='h',
        name='Women',
        marker=dict(color='red'),
        hoverinfo='skip'
    )
]

py.iplot(dict(data=bars, layout=layout))

### 4. World Map

##### Below is the code that creates and displays a choropleth map of net migration data around the world. We chose the year 2018 randomly from the years before the COVID-19 pandemic took place. 

In [35]:
def plot_net_migration_map(year):
    """
    Show a world choropleth of net migration (indicator SM.POP.NETM) for one year.

    Countries are placed on the map by the World Bank country `id`, which is
    passed to plotly as an ISO-3 code, while hover labels use the World Bank
    country name (so 'TUR' is located correctly and shown as 'Turkiye').

    @param year: int, the year to plot
    @return: None (the figure is displayed with fig.show())
    @raises ValueError: when no net migration rows exist for the given year
    """
    # Net migration for every country, one row per (country, date).
    migration = (
        wbdata.get_dataframe(
            {"SM.POP.NETM": "Net Migration"}, country="all", parse_dates=True
        )
        .dropna()
        .reset_index()  # columns: ['country', 'date', 'Net Migration']
    )

    # Keep only the rows whose date falls in the requested year.
    migration["Year"] = migration["date"].dt.year
    selected = migration[migration["Year"] == year]
    if selected.empty:
        raise ValueError(f"No net migration data found for {year}. Try another year.")

    # World Bank country listing, reduced to the code and name columns.
    country_codes = pd.DataFrame(wbdata.get_countries())[['id', 'name']]

    # Attach each row's country code by matching on the country name.
    # A left join keeps rows even when no matching name is found.
    merged = selected.merge(
        country_codes,
        how='left',
        left_on='country',
        right_on='name',
    )

    # Diverging scale: blue for the low end, white in the middle, red for the high end.
    diverging_scale = [
        (0.0, "lightblue"),
        (0.5, "white"),
        (1.0, "lightcoral"),
    ]

    fig = px.choropleth(
        merged,
        locations="id",
        locationmode="ISO-3",
        color="Net Migration",
        hover_name="country",
        color_continuous_scale=diverging_scale,
        # Fixed colour range; values beyond +/-10000 take the end colours.
        range_color=(-10000, 10000),
        title=f"Net Migration by Country in {year}",
        labels={"Net Migration": "Number of Migrants"},
        projection="natural earth",
    )

    fig.update_layout(
        geo={"showframe": False, "showcoastlines": True},
        height=600,
    )
    fig.show()
# Render the net migration map for 2018.
plot_net_migration_map(2018)

##### Below is the choropleth map of net migration data of 2020, during COVID-19.

In [36]:
# Render the net migration map for 2020.
plot_net_migration_map(2020)