The following section was an initial attempt to bring in the annual percent change in GDP for each country, along with net migration and population. Net migration and population are used to calculate net migration per capita (net migration / population).

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
# The later cells select the net migration, population, and GDP percent
# change rows from this table by their 'Series Code' values.
file_path = 'Resources/gdp_percent_change_migration.csv'
data = pd.read_csv(file_path)
# Preview the first rows to confirm the file was parsed as expected
data.head()

Assessing and cleaning the dataset.

In [None]:
# Print the column dtypes and non-null counts before any cleaning
data.info()

In [4]:
# Strip whitespace from column names
# (so later lookups such as 'Series Code' and 'Country Code' match exactly)
data.columns = data.columns.str.strip()

# Clean and reshape the dataset
def reshape_data(df, value_name):
    """Reshape a wide, one-column-per-year table into long format.

    Every column other than "Country Code" and "Country Name" is melted into
    a ("Year", value_name) pair. A melted row is kept only when its source
    column label contains a four-digit run, which is taken as the year; rows
    that come from any other column (for example "Series Code") are dropped.

    Args:
        df: Wide DataFrame with "Country Code" and "Country Name" columns
            plus one column per year.
        value_name: Name given to the melted value column.

    Returns:
        A long DataFrame with the columns "Country Code", "Country Name",
        "Year" (integer) and ``value_name``. Values are returned as found;
        numeric conversion is left to the caller.
    """
    df_long = df.melt(
        id_vars=["Country Code", "Country Name"],
        var_name="Year",
        value_name=value_name,
    )
    # Cast labels to str first so integer column labels (e.g. 2000) work too;
    # expand=False yields a Series rather than a one-column DataFrame.
    years = df_long["Year"].astype(str).str.extract(r"(\d{4})", expand=False)
    has_year = years.notna()
    # Build the result in one step instead of assigning into a filtered
    # frame, which can trigger pandas' SettingWithCopyWarning.
    return df_long.loc[has_year].assign(Year=years[has_year].astype(int))

# Split the raw table into one frame per indicator, keyed on 'Series Code'
series_code = data['Series Code']
net_migration = data[series_code == 'SM.POP.NETM']
population = data[series_code == 'SP.POP.TOTL']
gdp_percent_change = data[series_code == 'NY.GDP.MKTP.KD.ZG']

# Convert each indicator frame from wide (one column per year) to long format
net_migration_long, population_long, gdp_change_long = (
    reshape_data(frame, label)
    for frame, label in (
        (net_migration, "Net Migration"),
        (population, "Population"),
        (gdp_percent_change, "GDP Percent Change"),
    )
)


Dataset merge and cleaning of new dataframe.

In [None]:
# Join the three long tables on country and year; the inner joins keep only
# the country-years present in all three
merge_keys = ["Country Code", "Country Name", "Year"]
merged_data = net_migration_long.merge(population_long, on=merge_keys, how="inner")
merged_data = merged_data.merge(gdp_change_long, on=merge_keys, how="inner")

# Coerce the value columns to numbers; anything unparseable becomes NaN
value_columns = ["Net Migration", "Population", "GDP Percent Change"]
merged_data[value_columns] = merged_data[value_columns].apply(
    pd.to_numeric, errors="coerce"
)

# Net migration per capita = net migration / population
merged_data["Net Migration per Capita"] = merged_data["Net Migration"].div(
    merged_data["Population"]
)

# Keep only the rows where both plotted quantities are available
clean_data = merged_data.dropna(subset=["Net Migration per Capita", "GDP Percent Change"])

# Keep rows whose Country Code is exactly three uppercase letters.
# NOTE(review): this pattern accepts any three-letter code, so on its own it
# cannot tell aggregate/region codes apart from country codes.
iso_country_codes = clean_data[clean_data['Country Code'].str.match(r'^[A-Z]{3}$')]



Plot a dual-axis time series of GDP percent change and net migration per capita for each country.

In [None]:
# One dual-axis time-series figure per country
unique_countries = iso_country_codes["Country Name"].unique()

for country in unique_countries:
    country_data = iso_country_codes[iso_country_codes["Country Name"] == country]
    years = country_data["Year"]

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Left y-axis: GDP percent change, drawn and labelled in blue
    ax1.plot(years, country_data["GDP Percent Change"], label="GDP Percent Change", color="blue")
    ax1.set_xlabel("Year")
    ax1.set_ylabel("GDP Percent Change (%)", color="blue")
    ax1.tick_params(axis="y", labelcolor="blue")

    # Right y-axis on the same x-axis: net migration per capita, in green
    ax2 = ax1.twinx()
    ax2.plot(years, country_data["Net Migration per Capita"], label="Net Migration per Capita", color="green")
    ax2.set_ylabel("Net Migration per Capita", color="green")
    ax2.tick_params(axis="y", labelcolor="green")

    # Title and grid are applied through pyplot's current axes
    plt.title(f"{country} - GDP Percent Change and Net Migration per Capita Over Time")
    plt.grid(True)

    plt.show()


The above code is functional, but the output showed that the dataframe still includes data for regions and not just individual countries. The code below consolidates the steps above and adds an explicit list of country codes, which is used to build a new dataframe containing only country data.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'Resources/gdp_percent_change_migration.csv'
data = pd.read_csv(file_path)

# Strip whitespace from column names
data.columns = data.columns.str.strip()

# ISO 3166-1 alpha-3 codes of the countries to keep; rows with any other
# 'Country Code' (such as regional aggregates) are dropped below.
# 'ATG' (Antigua and Barbuda) and 'MKD' (North Macedonia) were missing from
# the earlier version of this list and have been added.
iso_country_codes_list = [
    'AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD',
    'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA', 'BDI',
    'CPV', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'COD', 'CRI', 'CIV',
    'HRV', 'CUB', 'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST',
    'SWZ', 'ETH', 'FJI', 'FIN', 'FRA', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GTM', 'GIN',
    'GNB', 'GUY', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM',
    'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR',
    'LBY', 'LIE', 'LTU', 'LUX', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MRT', 'MUS', 'MEX',
    'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NZL', 'NIC',
    'NER', 'NGA', 'MKD', 'NOR', 'OMN', 'PAK', 'PLW', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'POL', 'PRT',
    'QAT', 'ROU', 'RUS', 'RWA', 'KNA', 'LCA', 'VCT', 'WSM', 'SMR', 'STP', 'SAU', 'SEN', 'SRB', 'SYC',
    'SLE', 'SGP', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SWE', 'CHE',
    'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 'TGO', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TUV', 'UGA',
    'UKR', 'ARE', 'GBR', 'USA', 'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'YEM', 'ZMB', 'ZWE',
]

# Filter for actual countries
data = data[data['Country Code'].isin(iso_country_codes_list)]

# Clean and reshape the dataset
def reshape_data(df, value_name):
    """Reshape a wide, one-column-per-year table into long format.

    Every column other than "Country Code" and "Country Name" is melted into
    a ("Year", value_name) pair. A melted row is kept only when its source
    column label contains a four-digit run, which is taken as the year; rows
    that come from any other column (for example "Series Code") are dropped.

    Args:
        df: Wide DataFrame with "Country Code" and "Country Name" columns
            plus one column per year.
        value_name: Name given to the melted value column.

    Returns:
        A long DataFrame with the columns "Country Code", "Country Name",
        "Year" (integer) and ``value_name``. Values are returned as found;
        numeric conversion is left to the caller.
    """
    df_long = df.melt(
        id_vars=["Country Code", "Country Name"],
        var_name="Year",
        value_name=value_name,
    )
    # Cast labels to str first so integer column labels (e.g. 2000) work too;
    # expand=False yields a Series rather than a one-column DataFrame.
    years = df_long["Year"].astype(str).str.extract(r"(\d{4})", expand=False)
    has_year = years.notna()
    # Build the result in one step instead of assigning into a filtered
    # frame, which can trigger pandas' SettingWithCopyWarning.
    return df_long.loc[has_year].assign(Year=years[has_year].astype(int))

# Pick out each indicator's rows by its 'Series Code'
net_migration = data.loc[data['Series Code'].eq('SM.POP.NETM')]
population = data.loc[data['Series Code'].eq('SP.POP.TOTL')]
gdp_percent_change = data.loc[data['Series Code'].eq('NY.GDP.MKTP.KD.ZG')]

# Turn each wide indicator frame into long (country, year, value) rows
gdp_change_long = reshape_data(gdp_percent_change, "GDP Percent Change")
population_long = reshape_data(population, "Population")
net_migration_long = reshape_data(net_migration, "Net Migration")

# Inner-join population and GDP change onto net migration, matching on
# country and year
merged_data = net_migration_long
for other_long in (population_long, gdp_change_long):
    merged_data = merged_data.merge(
        other_long,
        on=["Country Code", "Country Name", "Year"],
        how="inner",
    )

# Force the value columns to numeric; entries that cannot be parsed become NaN
for column in ("Net Migration", "Population", "GDP Percent Change"):
    merged_data[column] = pd.to_numeric(merged_data[column], errors="coerce")

# Net migration per capita = net migration / population
per_capita = merged_data["Net Migration"] / merged_data["Population"]
merged_data["Net Migration per Capita"] = per_capita

# Discard rows lacking either of the two quantities analysed later
required_columns = ["Net Migration per Capita", "GDP Percent Change"]
clean_data = merged_data.dropna(subset=required_columns)

# Draw one dual-axis time-series figure per country in clean_data
unique_countries = clean_data["Country Name"].unique()

for country in unique_countries:
    is_country = clean_data["Country Name"] == country
    country_data = clean_data[is_country]
    year = country_data["Year"]
    gdp_change = country_data["GDP Percent Change"]
    migration_rate = country_data["Net Migration per Capita"]

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Primary (left) axis carries GDP percent change in blue
    ax1.set_xlabel("Year")
    ax1.set_ylabel("GDP Percent Change (%)", color="blue")
    ax1.tick_params(axis="y", labelcolor="blue")
    ax1.plot(year, gdp_change, label="GDP Percent Change", color="blue")

    # Secondary (right) axis shares the x-axis and carries net migration
    # per capita in green
    ax2 = ax1.twinx()
    ax2.set_ylabel("Net Migration per Capita", color="green")
    ax2.tick_params(axis="y", labelcolor="green")
    ax2.plot(year, migration_rate, label="Net Migration per Capita", color="green")

    # Title and grid are applied through pyplot's current axes
    plt.title(f"{country} - GDP Percent Change and Net Migration per Capita Over Time")
    plt.grid(True)

    plt.show()


To focus on the most impactful results, the countries are screened by the strength of the correlation between GDP percent change and net migration per capita, as measured by the Pearson r value.

In [None]:
from scipy.stats import pearsonr
import pandas as pd

# Correlate GDP percent change with net migration per capita, country by country
result_columns = ["Country Name", "Pearson r", "P-value"]
results = []

# groupby(sort=False) visits countries in order of first appearance and
# splits the frame once, instead of re-filtering it for every country
for country, country_data in clean_data.groupby("Country Name", sort=False):
    # Pearson requires at least two data points
    # NOTE(review): with exactly two points r is always +1 or -1, so such
    # countries can crowd the top of the ranking; consider a higher minimum.
    if len(country_data) < 2:
        continue

    r, p_value = pearsonr(
        country_data["GDP Percent Change"],
        country_data["Net Migration per Capita"]
    )
    results.append({
        "Country Name": country,
        "Pearson r": r,
        "P-value": p_value
    })

# Passing the column names keeps the expected columns even when no country
# qualified, so the sort below cannot fail with a KeyError on an empty frame
correlation_df = pd.DataFrame(results, columns=result_columns)

# Sort the DataFrame by the Pearson r column in descending order
sorted_correlation_df = correlation_df.sort_values(by="Pearson r", ascending=False)

# Display the top 5 rows
top_5 = sorted_correlation_df.head(5)
print(top_5)

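Expanding that list to include only the top 5 and bottom 5 r values. Note that this cell works from the older GDP and net migration files in 'Old Material' rather than the dataframe built above.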

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Load the datasets from 'Old Material' directory
# (skiprows=4 skips the four lines that precede the header row in these files)
gdp_data = pd.read_csv('Old Material/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2.csv', skiprows=4)
migration_data = pd.read_csv('Old Material/API_SM.POP.NETM_DS2_en_csv_v2_33.csv', skiprows=4)

# Keep relevant columns (Country Code, Country Name, and yearly data)
gdp_data_clean = gdp_data.drop(columns=["Indicator Name", "Indicator Code"], errors='ignore')
migration_data_clean = migration_data.drop(columns=["Indicator Name", "Indicator Code"], errors='ignore')

# Melt datasets to reshape them into long format
gdp_melted = gdp_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                 var_name="Year", value_name="GDP")
migration_melted = migration_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                             var_name="Year", value_name="Net Migration")

# Merge datasets on Country Code, Country Name, and Year
merged_data = pd.merge(gdp_melted, migration_melted,
                       on=["Country Name", "Country Code", "Year"],
                       how="inner")

# Convert Year, GDP, and Net Migration to numeric; unparseable entries
# (including non-year column labels) become NaN
merged_data["Year"] = pd.to_numeric(merged_data["Year"], errors="coerce")
merged_data["GDP"] = pd.to_numeric(merged_data["GDP"], errors="coerce")
merged_data["Net Migration"] = pd.to_numeric(merged_data["Net Migration"], errors="coerce")

# Drop rows with NaN values (in GDP or Net Migration)
merged_data = merged_data.dropna(subset=["GDP", "Net Migration"])

# Calculate correlation for each country.
# The two value columns are selected explicitly so that apply() does not
# operate on the grouping column, which pandas 2.2+ warns about.
country_correlations = (
    merged_data.groupby("Country Name")[["GDP", "Net Migration"]]
    .apply(lambda group: group["GDP"].corr(group["Net Migration"]))
    .reset_index()
)

# Rename columns for clarity
country_correlations.columns = ["Country Name", "Correlation (GDP vs. Net Migration)"]

# Sort to get the top 5 highest and bottom 5 lowest correlation countries
top_5_countries = country_correlations.nlargest(5, "Correlation (GDP vs. Net Migration)")
bottom_5_countries = country_correlations.nsmallest(5, "Correlation (GDP vs. Net Migration)")

# Combine the top 5 and bottom 5 into a single DataFrame
top_bottom_countries = pd.concat([top_5_countries, bottom_5_countries])

# Print only the top and bottom 5 r values
print(top_bottom_countries)


Create xy scatter plots, with regression lines and r values, for the countries with the 5 highest and 5 lowest r values.

In [None]:
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt

# Rank countries by Pearson r. Undefined (NaN) coefficients are ignored so
# they can never be picked when fewer than ten countries have a valid r.
ranked = correlation_df.dropna(subset=["Pearson r"])
top_5_countries = ranked.sort_values(by="Pearson r", ascending=False).head(5)["Country Name"]
bottom_5_countries = ranked.sort_values(by="Pearson r", ascending=True).head(5)["Country Name"]

# Combine the two sets of countries; drop_duplicates avoids plotting a
# country twice when the top and bottom lists overlap
selected_countries = pd.concat([top_5_countries, bottom_5_countries]).drop_duplicates()

# Filter the clean_data DataFrame for only the selected countries
selected_data = clean_data[clean_data["Country Name"].isin(selected_countries)]

# Generate xy plots with regression lines for each of the selected countries
for country in selected_countries:
    country_data = selected_data[selected_data["Country Name"] == country]
    x = country_data["GDP Percent Change"]
    y = country_data["Net Migration per Capita"]

    # Calculate Pearson correlation coefficient
    r, _ = pearsonr(x, y)

    # Fit a first-degree (straight-line) least-squares regression
    slope, intercept = np.polyfit(x, y, 1)
    regression_line = slope * x + intercept

    # Create the scatter plot with the fitted line on top
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, alpha=0.7)
    ax.plot(x, regression_line, color="red")

    # The country name is drawn as a bold figure title in plain text.
    # Putting it inside mathtext ($\bf{...}$) drops the spaces and
    # mis-renders punctuation in names such as "United States".
    fig.suptitle(country, fontweight="bold")
    ax.set_title(f"GDP Percent Change vs. Net Migration per Capita\n(r = {r:.2f})")
    # Leave room above the axes so the two titles do not overlap
    fig.subplots_adjust(top=0.85)

    ax.set_xlabel("GDP Percent Change (%)")
    ax.set_ylabel("Net Migration per Capita")
    ax.grid(True)

    # Show the plot
    plt.show()
