## Step 1: Importing Libraries

In [5]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set visualization styles
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

print("Libraries imported successfully")

Libraries imported successfully


## Step 2: Load Data

In [22]:
# Data Loading and Initial Inspection
print(" Loading and Inspecting Data...")
df_raw = pd.read_csv('../data/raw/supply_raw.csv')
print(f"Dataset Shape: {df_raw.shape}")
print(f"\nColumns: {df_raw.columns.tolist()}")
print(f"\nFirst 5 rows:")
display(df_raw.head())
print(f"\nData Types:")
print(df_raw.dtypes)
print(f"\nBasic Statistics:")
display(df_raw.describe())

 Loading and Inspecting Data...
Dataset Shape: (233, 12)

Columns: ['#', 'Country (or dependency)', 'Population 2025', 'Yearly Change', 'Net Change', 'Density (P/KmÂ²)', 'Land Area (KmÂ²)', 'Migrants (net)', 'Fert. Rate', 'Median Age', 'Urban Pop %', 'World Share']

First 5 rows:


Unnamed: 0,#,Country (or dependency),Population 2025,Yearly Change,Net Change,Density (P/KmÂ²),Land Area (KmÂ²),Migrants (net),Fert. Rate,Median Age,Urban Pop %,World Share
0,1,India,1463865525,0.89%,12929734,492,2973190,"â495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,â0.23%,"â3,225,184",151,9388211,"â268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"â39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"â1,235,336",3.5,20.6,34.4%,3.10%



Data Types:
#                            int64
Country (or dependency)     object
Population 2025             object
Yearly Change               object
Net Change                  object
Density (P/KmÂ²)            object
Land Area (KmÂ²)            object
Migrants (net)              object
Fert. Rate                 float64
Median Age                 float64
Urban Pop %                 object
World Share                 object
dtype: object

Basic Statistics:


Unnamed: 0,#,Fert. Rate,Median Age
count,233.0,233.0,233.0
mean,117.0,2.306094,31.896996
std,67.405489,1.134316,9.860936
min,1.0,0.69,14.5
25%,59.0,1.47,22.9
50%,117.0,1.94,32.8
75%,175.0,2.95,39.8
max,233.0,5.94,57.4


## Step 3: Data Cleaning and Preprocessing

In [23]:
#Detailed Data Quality Assessment

print(" Data Quality Assessment...")
print("\n1. Missing Values:")
missing_data = df_raw.isnull().sum()
missing_percentage = (missing_data / len(df_raw)) * 100
missing_df = pd.DataFrame({
    'Missing Values': missing_data,
    'Percentage (%)': missing_percentage
})
display(missing_df[missing_df['Missing Values'] > 0])

print("\n2. Duplicate Rows:")
print(f"Number of duplicate rows: {df_raw.duplicated().sum()}")

print("\n3. Unique Values per Column:")
for col in df_raw.columns:
    unique_count = df_raw[col].nunique()
    print(f"{col}: {unique_count} unique values")


 Data Quality Assessment...

1. Missing Values:


Unnamed: 0,Missing Values,Percentage (%)
Urban Pop %,23,9.871245



2. Duplicate Rows:
Number of duplicate rows: 0

3. Unique Values per Column:
#: 233 unique values
Country (or dependency): 233 unique values
Population 2025: 233 unique values
Yearly Change: 177 unique values
Net Change: 231 unique values
Density (P/KmÂ²): 164 unique values
Land Area (KmÂ²): 224 unique values
Migrants (net): 233 unique values
Fert. Rate: 159 unique values
Median Age: 173 unique values
Urban Pop %: 191 unique values
World Share: 165 unique values


In [24]:

# Data Cleaning Function
def clean_migration_data(df):
    df_clean = df.copy()
    
    # First, let's see what columns we actually have
    print("Original columns in raw data:")
    for col in df_clean.columns:
        print(f"  '{col}'")
    
    # Clean column names - handle special characters properly
    df_clean.columns = [col.strip() for col in df_clean.columns]
    
    # Rename columns with simple names to avoid encoding issues
    rename_dict = {}
    for col in df_clean.columns:
        if 'Country' in col:
            rename_dict[col] = 'Country'
        elif 'Population 2025' in col:
            rename_dict[col] = 'Population'
        elif 'Migrants' in col:
            rename_dict[col] = 'Net_Migrants'
        elif 'Fert.' in col:
            rename_dict[col] = 'Fertility_Rate'
        elif 'Median Age' in col:
            rename_dict[col] = 'Median_Age'
        elif 'Urban Pop' in col:
            rename_dict[col] = 'Urban_Pop_Percent'
        elif 'Density' in col:
            rename_dict[col] = 'Density'
        elif 'Land Area' in col:
            rename_dict[col] = 'Land_Area'
        elif 'World Share' in col:
            rename_dict[col] = 'World_Share'
    
    df_clean = df_clean.rename(columns=rename_dict)
    
    # Keep only the columns we need
    keep_columns = ['Country', 'Population', 'Yearly Change', 'Net Change', 
                   'Density', 'Land_Area', 'Net_Migrants', 'Fertility_Rate', 
                   'Median_Age', 'Urban_Pop_Percent', 'World_Share']
    
    # Filter to only keep columns that exist
    existing_columns = [col for col in keep_columns if col in df_clean.columns]
    df_clean = df_clean[existing_columns]
    
    # Clean numeric columns
    def clean_numeric(series):
        if series.dtype == 'object':
            series = series.astype(str)
            series = series.str.replace('창혞혪', '-', regex=False)
            series = series.str.replace(',', '')
            series = series.str.replace('%', '')
            # Remove any non-numeric characters except minus and decimal
            series = series.str.replace(r'[^\d\.\-]', '', regex=True)
            # Handle empty strings
            series = series.replace('', np.nan)
        return pd.to_numeric(series, errors='coerce')
    
    # Apply cleaning to all numeric columns
    numeric_cols = ['Population', 'Yearly Change', 'Net Change', 'Density', 
                   'Land_Area', 'Net_Migrants', 'Fertility_Rate', 'Median_Age', 
                   'Urban_Pop_Percent', 'World_Share']
    
    for col in numeric_cols:
        if col in df_clean.columns:
            df_clean[col] = clean_numeric(df_clean[col])
    
    # Create derived metrics
    df_clean['Migration_Rate_per_1000'] = (df_clean['Net_Migrants'] / df_clean['Population']) * 1000
    
    # Add continent classification
    continent_mapping = {
        'Asia': ['India', 'China', 'Japan', 'Indonesia', 'Pakistan', 'Bangladesh', 
                'Philippines', 'Vietnam', 'Thailand', 'Iran', 'Turkey', 'Myanmar',
                'South Korea', 'Afghanistan', 'Yemen', 'Uzbekistan', 'Malaysia'],
        'Europe': ['Russia', 'Germany', 'United Kingdom', 'France', 'Italy', 'Spain',
                  'Poland', 'Ukraine', 'Romania', 'Netherlands', 'Belgium'],
        'North America': ['United States', 'Canada', 'Mexico'],
        'Africa': ['Nigeria', 'Ethiopia', 'Egypt', 'DR Congo', 'Tanzania', 'South Africa',
                  'Kenya', 'Uganda', 'Sudan', 'Algeria'],
        'South America': ['Brazil', 'Argentina', 'Colombia', 'Peru', 'Chile'],
        'Oceania': ['Australia', 'New Zealand', 'Papua New Guinea']
    }
    
    country_to_continent = {}
    for continent, countries in continent_mapping.items():
        for country in countries:
            country_to_continent[country] = continent
    
    df_clean['Continent'] = df_clean['Country'].map(country_to_continent)
    df_clean['Continent'] = df_clean['Continent'].fillna('Other')
    
    print("\nCleaned columns:")
    print(df_clean.columns.tolist())
    print(f"\nCleaned data shape: {df_clean.shape}")
    
    return df_clean

print(" Cleaning data...")
df = clean_migration_data(df_raw)

# Display cleaned data info
print("\n Data cleaned successfully")
print(f"Sample of cleaned data:")
display(df.head())
print("\n")

 Cleaning data...
Original columns in raw data:
  '#'
  'Country (or dependency)'
  'Population 2025'
  'Yearly Change'
  'Net Change'
  'Density (P/KmÂ²)'
  'Land Area (KmÂ²)'
  'Migrants (net)'
  'Fert. Rate'
  'Median Age'
  'Urban Pop %'
  'World Share'

Cleaned columns:
['Country', 'Population', 'Yearly Change', 'Net Change', 'Density', 'Land_Area', 'Net_Migrants', 'Fertility_Rate', 'Median_Age', 'Urban_Pop_Percent', 'World_Share', 'Migration_Rate_per_1000', 'Continent']

Cleaned data shape: (233, 13)

 Data cleaned successfully
Sample of cleaned data:


Unnamed: 0,Country,Population,Yearly Change,Net Change,Density,Land_Area,Net_Migrants,Fertility_Rate,Median_Age,Urban_Pop_Percent,World_Share,Migration_Rate_per_1000,Continent
0,India,1463865525,0.89,12929734,492,2973190,495753,1.94,28.8,37.1,17.78,0.33866,Asia
1,China,1416096094,0.23,3225184,151,9388211,268126,1.02,40.1,67.5,17.2,0.189342,Asia
2,United States,347275807,0.54,1849236,38,9147420,1230663,1.62,38.5,82.8,4.22,3.543763,North America
3,Indonesia,285721236,0.79,2233305,158,1811570,39509,2.1,30.4,59.6,3.47,0.138278,Asia
4,Pakistan,255219554,1.57,3950390,331,770880,1235336,3.5,20.6,34.4,3.1,4.840287,Asia






In [25]:
# Generate Dynamic Report Header
total_countries = len(df)
total_population = df['Population'].sum() / 1e9
top_immigration = df.nlargest(1, 'Net_Migrants').iloc[0]
top_emigration = df.nsmallest(1, 'Net_Migrants').iloc[0]

markdown_content = f"""
#  Global Migration Analysis Report

**Analysis Date:** {pd.Timestamp.now().strftime('%Y-%m-%d')}
**Total Countries Analyzed:** {total_countries:,}
**Global Population:** {total_population:.2f} billion

## Executive Summary
This report analyzes migration patterns across {total_countries} countries. 
Key findings show that **{top_immigration['Country']}** leads in immigration with {top_immigration['Net_Migrants']/1e6:.1f} million net migrants, 
while **{top_emigration['Country']}** experiences the highest emigration with {abs(top_emigration['Net_Migrants'])/1e6:.1f} million net migrants.

---
"""

print(markdown_content)


#  Global Migration Analysis Report

**Analysis Date:** 2025-12-05
**Total Countries Analyzed:** 233
**Global Population:** 8.23 billion

## Executive Summary
This report analyzes migration patterns across 233 countries. 
Key findings show that **Ukraine** leads in immigration with 1.7 million net migrants, 
while **Brunei** experiences the highest emigration with 0.0 million net migrants.

---



## Step 4: Visualization 1

In [26]:
# Top Migration Countries with Dynamic Explanation

print("Creating Visualization 1: Top Migration Countries...")

top_immigration_10 = df.nlargest(10, 'Net_Migrants')
top_emigration_10 = df.nsmallest(10, 'Net_Migrants')

fig1 = make_subplots(rows=1, cols=2, subplot_titles=('Top Immigration Countries', 'Top Emigration Countries'))

fig1.add_trace(go.Bar(x=top_immigration_10['Net_Migrants']/1e6, y=top_immigration_10['Country'],
                     orientation='h', marker_color='#2E86AB', name='Immigration'),
              row=1, col=1)

fig1.add_trace(go.Bar(x=abs(top_emigration_10['Net_Migrants'])/1e6, y=top_emigration_10['Country'],
                     orientation='h', marker_color='#A23B72', name='Emigration'),
              row=1, col=2)

fig1.update_layout(height=500, showlegend=False, title_text="Migration Leaders Analysis")
fig1.show()


Creating Visualization 1: Top Migration Countries...


# Dynamic explanation for Visualization 1

###  Visualization 1: Migration Leaders

**What this shows:** 
- **Left panel:** Top 10 countries gaining population through immigration
- **Right panel:** Top 10 countries losing population through emigration
- **Unit:** Millions of net migrants

**Key Insight:** Ukraine leads immigration with 1.7M migrants, while Brunei leads emigration with 0.0M migrants.

**Pattern:** Developed economies typically show net immigration, while developing countries often show net emigration.


## Step 5: Visualization 2

In [27]:
# Migration vs Urbanization with Trendline

print(" Creating Visualization 2: Migration vs Urbanization...")

fig2 = px.scatter(df, x='Urban_Pop_Percent', y='Migration_Rate_per_1000',
                 size='Population', hover_name='Country',
                 title='Migration Rate vs Urbanization Level',
                 labels={'Urban_Pop_Percent': 'Urban Population (%)',
                        'Migration_Rate_per_1000': 'Migration Rate per 1000'},
                 trendline='ols')

fig2.update_layout(width=800, height=500)
fig2.show()

# Calculate correlation for dynamic text
urban_migration_corr = df['Urban_Pop_Percent'].corr(df['Migration_Rate_per_1000'])

 Creating Visualization 2: Migration vs Urbanization...


 ###  Visualization 2: Urbanization and Migration

**What this shows:** 
- **X-axis:** Percentage of population living in urban areas
- **Y-axis:** Migration rate per 1000 population
- **Size:** Country population (larger circles = more people)
- **Color:** Continent classification

**Statistical Insight:** 
The correlation between urbanization and migration rate is **0.191**.
This positive correlation suggests that more urbanized countries tend to have higher migration rates.

**Interpretation:** Countries with higher urbanization (typically developed nations) tend to attract more migrants, reflecting economic opportunities in urban centers.

## Step 6: Visualization 3 - Migration vs Density with Dynamic Explanation

In [28]:
# Migration vs Density with Dynamic Explanation
print("Creating Visualization 3: Migration vs Population Density...")

# Check column names
print(f"Available columns: {df.columns.tolist()}")
print(f"Density column exists: {'Density' in df.columns}")

# Filter out NaN values
density_df = df.dropna(subset=['Density', 'Migration_Rate_per_1000'])

fig3 = px.scatter(density_df, 
                 x='Density', 
                 y='Migration_Rate_per_1000',
                 size='Population', 
                 color='Net_Migrants',
                 hover_name='Country', 
                 log_x=True,
                 title='Migration Rate vs Population Density',
                 labels={'Density': 'Population Density (log scale)',
                        'Migration_Rate_per_1000': 'Migration Rate per 1000'},
                 color_continuous_scale='RdBu',
                 color_continuous_midpoint=0)

fig3.update_layout(width=800, height=500)
fig3.show()

# Calculate density statistics for dynamic text
density_migration_corr = np.log1p(density_df['Density']).corr(density_df['Migration_Rate_per_1000'])
avg_density = density_df['Density'].mean()

Creating Visualization 3: Migration vs Population Density...
Available columns: ['Country', 'Population', 'Yearly Change', 'Net Change', 'Density', 'Land_Area', 'Net_Migrants', 'Fertility_Rate', 'Median_Age', 'Urban_Pop_Percent', 'World_Share', 'Migration_Rate_per_1000', 'Continent']
Density column exists: True



### Visualization 3: Population Density and Migration

**What this shows:** 
- **X-axis:** Population density (log scale for better visualization)
- **Y-axis:** Migration rate per 1000 population
- **Size:** Country population (larger circles = more people)
- **Color:** Net migration (blue = immigration, red = emigration)

**Statistical Insight:** 
The correlation between population density (log) and migration rate is **0.116**.
Average population density across all countries: **488 people/km²**.

**Key Finding:** Medium-density countries often show the highest immigration rates, suggesting there's an optimal density level for attracting migrants—not too crowded, but developed enough.

## Step 7: Visualization 4 - Migration by Continent with Dynamic Explanation

In [29]:
#  Visualization 4 - Migration by Continent with Dynamic Explanation
print(" Creating Visualization 4: Migration by Continent...")

continent_data = df.groupby('Continent').agg({
    'Net_Migrants': 'sum',
    'Population': 'sum',
    'Country': 'count'
}).round(2)

continent_data['Migration_per_Capita'] = (continent_data['Net_Migrants'] / continent_data['Population']) * 1000

fig4 = make_subplots(rows=2, cols=2, 
                     subplot_titles=('Total Net Migration (Millions)', 'Migration per 1000 People',
                                    'Number of Countries', 'Total Population (Billions)'))

# Panel 1: Total Net Migration
colors_panel1 = ['#2E86AB' if x > 0 else '#A23B72' for x in continent_data['Net_Migrants']]
fig4.add_trace(
    go.Bar(x=continent_data.index, 
           y=continent_data['Net_Migrants']/1e6,
           marker_color=colors_panel1,
           text=continent_data['Net_Migrants'].apply(lambda x: f'{x/1e6:.1f}M'),
           textposition='auto'),
    row=1, col=1
)

# Panel 2: Migration per Capita
fig4.add_trace(
    go.Bar(x=continent_data.index, 
           y=continent_data['Migration_per_Capita'],
           marker_color='#F18F01',
           text=continent_data['Migration_per_Capita'].round(1),
           textposition='auto'),
    row=1, col=2
)

# Panel 3: Number of Countries
fig4.add_trace(
    go.Bar(x=continent_data.index, 
           y=continent_data['Country'],
           marker_color='#73AB84',
           text=continent_data['Country'],
           textposition='auto'),
    row=2, col=1
)

# Panel 4: Total Population
fig4.add_trace(
    go.Bar(x=continent_data.index, 
           y=continent_data['Population']/1e9,
           marker_color='#99C1B9',
           text=continent_data['Population'].apply(lambda x: f'{x/1e9:.1f}B'),
           textposition='auto'),
    row=2, col=2
)

fig4.update_layout(height=600, showlegend=False, title_text="Continental Migration Analysis")
fig4.show()

# Find top immigration/emigration continents for dynamic text
top_imm_continent = continent_data['Net_Migrants'].idxmax()
top_emm_continent = continent_data['Net_Migrants'].idxmin()
top_rate_continent = continent_data['Migration_per_Capita'].idxmax()

 Creating Visualization 4: Migration by Continent...



### Visualization 4: Continental Migration Patterns

**What this shows (four panels):**
1. **Total Net Migration:** Overall migration balance per continent (blue = gain, red = loss)
2. **Migration per Capita:** Migration rate adjusted for population size
3. **Number of Countries:** How many countries in each continent
4. **Total Population:** Relative population sizes

**Key Continental Insights:**
- **Asia** has the highest net immigration: 3.6M
- **Oceania** has the highest net emigration: 0.2M
- **Migration Intensity:** Europe has the highest migration rate per capita: 5.8 per 1000

**Regional Pattern:** Migration flows primarily from developing continents to developed ones, reflecting economic disparities.
"""



## FINAL MIGRATION ANALYSIS SUMMARY

## Global Overview
- **Total Countries Analyzed:** 233
- **Global Population:** 8.23 billion
- **Average Migration Rate:** 4.4 migrants per 1000 population

## Migration Distribution
- **Countries with Net Immigration:** 232 (99.6%)
- **Countries with Net Emigration:** 0 (0.0%)
- **Balanced Countries:** 1

## Top Performers
| Category | Country | Value |
|----------|---------|-------|
| **Highest Immigration** | Ukraine | 1.7M |
| **Highest Emigration** | Brunei | 0.0M |
| **Highest Migration Rate** | Marshall Islands | 46.8/1000 |

## Key Findings
1. **Urbanization Correlation:** Migration rate correlates with urbanization level (r = 0.191)
2. **Density Pattern:** Medium-density countries attract the most migrants
3. **Continental Flow:** Clear migration from Oceania to Asia

## Data Quality Notes
- Data cleaned and validated
- Special characters removed from numeric fields
- All calculations use cleaned, standardized data
- Missing values handled appropriately



