This Jupyter notebook contains the initial data analysis of the relationship between GDP and migration.  The analysis uses annual GDP, and the initial visualizations made clear that the data would be hard to interpret, since almost every country had significant GDP growth over time.

Preparing the notebook and loading the GDP and migration data.

In [10]:

import pandas as pd

# Read the raw GDP and net-migration tables from the 'Old Material' folder.
# The first four lines of each file are skipped, so the fifth line supplies the
# column headers. (File names look like World Bank indicator exports — TODO confirm.)
GDP_CSV_PATH = 'Old Material/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2.csv'
MIGRATION_CSV_PATH = 'Old Material/API_SM.POP.NETM_DS2_en_csv_v2_33.csv'
LEADING_ROWS_TO_SKIP = 4

gdp_data = pd.read_csv(GDP_CSV_PATH, skiprows=LEADING_ROWS_TO_SKIP)
migration_data = pd.read_csv(MIGRATION_CSV_PATH, skiprows=LEADING_ROWS_TO_SKIP)


Clean and reshape the datasets

In [11]:
# Keep only the identifier columns and the per-year value columns.
# The indicator metadata is dropped with errors="ignore" so a file without those
# columns does not fail, and any blank-header column (pandas names these
# "Unnamed: N") is removed by pattern instead of by a hard-coded position, which
# would raise KeyError as soon as the export gains or loses a year column.
gdp_data_clean = (
    gdp_data.drop(columns=["Indicator Name", "Indicator Code"], errors="ignore")
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)
migration_data_clean = (
    migration_data.drop(columns=["Indicator Name", "Indicator Code"], errors="ignore")
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)

# Melt datasets from wide (one column per year) to long (one row per country-year)
gdp_melted = gdp_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                 var_name="Year", value_name="GDP")
migration_melted = migration_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                             var_name="Year", value_name="Net Migration")

# Merge datasets on Country Code, Country Name, and Year; the inner join keeps
# only country-years present in both tables.
merged_data = pd.merge(gdp_melted, migration_melted,
                       on=["Country Name", "Country Code", "Year"],
                       how="inner")

# Convert Year, GDP, and Net Migration to numeric; anything unparseable becomes NaN
merged_data["Year"] = pd.to_numeric(merged_data["Year"], errors="coerce")
merged_data["GDP"] = pd.to_numeric(merged_data["GDP"], errors="coerce")
merged_data["Net Migration"] = pd.to_numeric(merged_data["Net Migration"], errors="coerce")

# Drop rows with NaN values (in GDP or Net Migration)
merged_data = merged_data.dropna(subset=["GDP", "Net Migration"])

# Optional: uncomment to save the merged data to a file for further analysis
#merged_data.to_csv("merged_gdp_migration_data.csv", index=False)


Calculating the correlation between the two datasets (GDP and net migration) for each country.

In [None]:
# Pearson correlation between GDP and net migration, one value per country.
# Selecting the two value columns before .apply keeps the grouping column out of
# the frames handed to the lambda, which avoids the pandas >= 2.2 deprecation
# warning about apply operating on grouping columns.
# NOTE(review): a country with only two observations yields r = +/-1 by
# construction; consider a minimum-observation filter before ranking.
country_correlations = (
    merged_data.groupby("Country Name")[["GDP", "Net Migration"]]
    .apply(lambda group: group["GDP"].corr(group["Net Migration"]))
    .reset_index()
)

# Rename columns for clarity
country_correlations.columns = ["Country Name", "Correlation (GDP vs. Net Migration)"]

# Report what was actually done: nothing is written to disk in this cell, so the
# message no longer claims that CSV files were saved.
print(f"Processing complete! Computed GDP vs. net migration correlations "
      f"for {len(country_correlations)} entries (results kept in memory, not saved).")


Generating plotted visualizations for each country.

In [None]:
import matplotlib.pyplot as plt

# Split the merged table into one sub-frame per country
grouped_data = merged_data.groupby("Country Name")

# Draw one scatter figure per country: GDP on x, net migration on y
for country_name, country_rows in grouped_data:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(country_rows["GDP"], country_rows["Net Migration"], alpha=0.6)
    ax.set_title(f"GDP vs. Net Migration for {country_name}")
    ax.set_xlabel("GDP (Current USD)")
    ax.set_ylabel("Net Migration")
    ax.grid(True)
    fig.tight_layout()
    plt.show()  # Render this country's figure before moving on to the next


In [None]:
import matplotlib.pyplot as plt

# Split the merged table into one sub-frame per country
grouped_data = merged_data.groupby("Country Name")

# For every country, draw GDP and net migration against year on twin y-axes
for country_name, country_rows in grouped_data:
    fig, gdp_ax = plt.subplots(figsize=(10, 6))
    years = country_rows["Year"]

    # Left-hand axis: GDP in blue
    gdp_ax.plot(years, country_rows["GDP"], color="tab:blue", label="GDP")
    gdp_ax.set_xlabel("Year")
    gdp_ax.set_ylabel("GDP (Current USD)", color="tab:blue")
    gdp_ax.tick_params(axis="y", labelcolor="tab:blue")

    # Right-hand axis sharing the same x-axis: net migration in orange
    migration_ax = gdp_ax.twinx()
    migration_ax.plot(years, country_rows["Net Migration"], color="tab:orange", label="Net Migration")
    migration_ax.set_ylabel("Net Migration", color="tab:orange")
    migration_ax.tick_params(axis="y", labelcolor="tab:orange")

    # The twin axes are the current axes at this point, so the title and the
    # grid setting are applied to them directly.
    migration_ax.set_title(f"GDP and Net Migration Over Time for {country_name}")
    fig.tight_layout()
    migration_ax.grid(visible=False)  # Adjust grid as needed

    # Render the figure
    plt.show()


Calculating the Pearson correlation coefficient (r) for each country and listing the five highest and five lowest r values.

In [None]:
import pandas as pd

# Load the two datasets (the first four lines of each file are skipped)
gdp_data = pd.read_csv('Old Material/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2.csv', skiprows=4)
migration_data = pd.read_csv('Old Material/API_SM.POP.NETM_DS2_en_csv_v2_33.csv', skiprows=4)

# Keep only the identifier columns and the per-year value columns.
# Blank-header columns (named "Unnamed: N" by pandas) are removed explicitly so
# they never become bogus "Year" values after melting, instead of relying on the
# later dropna to discard them.
gdp_data_clean = (
    gdp_data.drop(columns=["Indicator Name", "Indicator Code"], errors='ignore')
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)
migration_data_clean = (
    migration_data.drop(columns=["Indicator Name", "Indicator Code"], errors='ignore')
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)

# Melt datasets to have 'Year' as a column
gdp_melted = gdp_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                 var_name="Year", value_name="GDP")
migration_melted = migration_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                             var_name="Year", value_name="Net Migration")

# Merge datasets on Country Code, Country Name, and Year
merged_data = pd.merge(gdp_melted, migration_melted,
                       on=["Country Name", "Country Code", "Year"],
                       how="inner")

# Convert Year, GDP, and Net Migration to numeric; unparseable values become NaN
merged_data["Year"] = pd.to_numeric(merged_data["Year"], errors="coerce")
merged_data["GDP"] = pd.to_numeric(merged_data["GDP"], errors="coerce")
merged_data["Net Migration"] = pd.to_numeric(merged_data["Net Migration"], errors="coerce")

# Drop rows with NaN values (in GDP or Net Migration)
merged_data = merged_data.dropna(subset=["GDP", "Net Migration"])

# Pearson correlation per country. Selecting the value columns before .apply
# keeps the grouping column out of each group frame, avoiding the pandas >= 2.2
# deprecation warning about apply operating on grouping columns.
# NOTE(review): a country with only two observations yields r = +/-1 by
# construction and can dominate the ranking below; consider a minimum-
# observation filter.
country_correlations = (
    merged_data.groupby("Country Name")[["GDP", "Net Migration"]]
    .apply(lambda group: group["GDP"].corr(group["Net Migration"]))
    .reset_index()
)

# Rename columns for clarity
country_correlations.columns = ["Country Name", "Correlation (GDP vs. Net Migration)"]

# Pick the five highest and five lowest correlations
top_5_countries = country_correlations.nlargest(5, "Correlation (GDP vs. Net Migration)")
bottom_5_countries = country_correlations.nsmallest(5, "Correlation (GDP vs. Net Migration)")

# Combine the top 5 and bottom 5 into a single DataFrame
top_bottom_countries = pd.concat([top_5_countries, bottom_5_countries])

# Display the result
print(top_bottom_countries)


Generating x–y scatter plots with regression lines for the countries with the five highest and five lowest Pearson coefficients.  This analysis was completed after our slides were nearly complete, so it is not included in the presentation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Load the datasets from 'Old Material' directory (first four lines of each file skipped)
gdp_data = pd.read_csv('Old Material/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2.csv', skiprows=4)
migration_data = pd.read_csv('Old Material/API_SM.POP.NETM_DS2_en_csv_v2_33.csv', skiprows=4)

# Keep only the identifier columns and the per-year value columns.
# Blank-header columns (named "Unnamed: N" by pandas) are removed explicitly so
# they never become bogus "Year" values after melting.
gdp_data_clean = (
    gdp_data.drop(columns=["Indicator Name", "Indicator Code"], errors='ignore')
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)
migration_data_clean = (
    migration_data.drop(columns=["Indicator Name", "Indicator Code"], errors='ignore')
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)

# Melt datasets to reshape them into long format
gdp_melted = gdp_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                 var_name="Year", value_name="GDP")
migration_melted = migration_data_clean.melt(id_vars=["Country Name", "Country Code"],
                                             var_name="Year", value_name="Net Migration")

# Merge datasets on Country Code, Country Name, and Year
merged_data = pd.merge(gdp_melted, migration_melted,
                       on=["Country Name", "Country Code", "Year"],
                       how="inner")

# Convert Year, GDP, and Net Migration to numeric; unparseable values become NaN
merged_data["Year"] = pd.to_numeric(merged_data["Year"], errors="coerce")
merged_data["GDP"] = pd.to_numeric(merged_data["GDP"], errors="coerce")
merged_data["Net Migration"] = pd.to_numeric(merged_data["Net Migration"], errors="coerce")

# Drop rows with NaN values (in GDP or Net Migration)
merged_data = merged_data.dropna(subset=["GDP", "Net Migration"])

# Pearson correlation per country. Selecting the value columns before .apply
# keeps the grouping column out of each group frame, avoiding the pandas >= 2.2
# deprecation warning about apply operating on grouping columns.
# NOTE(review): a country with only two observations yields r = +/-1 by
# construction and can dominate the ranking below; consider a minimum-
# observation filter.
country_correlations = (
    merged_data.groupby("Country Name")[["GDP", "Net Migration"]]
    .apply(lambda group: group["GDP"].corr(group["Net Migration"]))
    .reset_index()
)

# Rename columns for clarity
country_correlations.columns = ["Country Name", "Correlation (GDP vs. Net Migration)"]

# Sort to get the top 5 highest and bottom 5 lowest correlation countries
top_5_countries = country_correlations.nlargest(5, "Correlation (GDP vs. Net Migration)")
bottom_5_countries = country_correlations.nsmallest(5, "Correlation (GDP vs. Net Migration)")

# Combine the top 5 and bottom 5 into a single list
selected_countries = pd.concat([top_5_countries, bottom_5_countries])["Country Name"]

# Filter the merged dataset to include only the selected countries
selected_data = merged_data[merged_data["Country Name"].isin(selected_countries)]


def _padded_limits(values, pad_fraction=0.05):
    """Return (low, high) axis limits that extend the data range on both sides.

    The padding is a fraction of the data range and is always added outward.
    Scaling the min and max themselves (min * 0.95, max * 1.05) moves the limit
    inward whenever the value is negative, which clips points for series such as
    net migration that routinely go below zero.
    """
    low, high = values.min(), values.max()
    pad = (high - low) * pad_fraction
    if pad == 0:
        # Constant series: fall back to a pad based on magnitude, or 1.0 at zero
        pad = abs(low) * pad_fraction or 1.0
    return low - pad, high + pad


# Generate xy plots with regression lines for only the selected countries
for country in selected_countries:
    country_data = selected_data[selected_data["Country Name"] == country]
    x = country_data["GDP"]
    y = country_data["Net Migration"]

    # Calculate Pearson correlation coefficient (the p-value is not used)
    r, _ = pearsonr(x, y)

    # Fit a degree-1 polynomial (least-squares straight line) to the points
    slope, intercept = np.polyfit(x, y, 1)
    regression_line = slope * x + intercept

    # constrained_layout reserves room for the figure-level title above the axes title
    fig, ax = plt.subplots(figsize=(8, 6), constrained_layout=True)
    ax.scatter(x, y, alpha=0.7)
    ax.plot(x, regression_line, color="red")

    # Pad the axes outward by 5% of the data range so no point sits on the frame
    ax.set_xlim(*_padded_limits(x))
    ax.set_ylim(*_padded_limits(y))

    # The country name is drawn as plain bold text rather than mathtext: math
    # mode ignores spaces, so multi-word names would be run together.
    fig.suptitle(country, fontweight="bold")
    ax.set_title(f"GDP vs. Net Migration\n(r = {r:.2f})")
    ax.set_xlabel("GDP (Current US$)")
    ax.set_ylabel("Net Migration")
    ax.grid(True)

    # Show the plot, then release the figure so the loop does not accumulate them
    plt.show()
    plt.close(fig)
