# Solution

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

# Seed NumPy's legacy global RNG so any random draws are repeatable.
# NOTE(review): no cell visible in this notebook draws random numbers yet.
SEED = 42
np.random.seed(SEED)

## Problem Analysis

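We are given, for each country $i$, its population $N_i$, mean height $\mu_i$, and height standard deviation $\sigma_i$. The goal is to estimate the mean and standard deviation of height for the combined population.

Because the countries differ greatly in size, each one must be weighted by its population share $w_i = N_i / \sum_j N_j$:

- Global mean: $\mu = \sum_i w_i \mu_i$
- Global variance (law of total variance): $\sigma^2 = \sum_i w_i \sigma_i^2 + \sum_i w_i (\mu_i - \mu)^2$, i.e. within-country variance plus between-country variance.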

In [2]:
# Country-level data, one row per country:
# (name, population, mean height, height standard deviation).
# Heights are presumably in cm, matching the units printed by later cells.
country_rows = [
    ('USA',          331_000_000, 175.3, 8.5),
    ('China',      1_412_000_000, 169.5, 7.2),
    ('India',      1_408_000_000, 164.7, 7.8),
    ('Indonesia',    273_000_000, 162.3, 6.9),
    ('Brazil',       213_000_000, 170.7, 8.1),
    ('Pakistan',     225_000_000, 165.1, 7.5),
    ('Nigeria',      211_000_000, 168.9, 8.3),
    ('Bangladesh',   166_000_000, 162.0, 7.1),
    ('Russia',       146_000_000, 176.5, 8.9),
    ('Mexico',       130_000_000, 169.4, 7.6),
]

# Transpose the rows into the per-column lists the DataFrame is built from.
countries, populations, mean_heights, std_heights = map(list, zip(*country_rows))

data = {
    'country': countries,
    'population': populations,
    'mean_height': mean_heights,
    'std_height': std_heights,
}

df = pd.DataFrame(data)

# Show the full table (it is only ten rows) followed by the combined headcount.
total_people = df['population'].sum()
print("Country-Level Statistics:")
print(df.to_string(index=False))
print(f"\nTotal population: {total_people:,}")


Country-Level Statistics:
   country  population  mean_height  std_height
       USA   331000000        175.3         8.5
     China  1412000000        169.5         7.2
     India  1408000000        164.7         7.8
 Indonesia   273000000        162.3         6.9
    Brazil   213000000        170.7         8.1
  Pakistan   225000000        165.1         7.5
   Nigeria   211000000        168.9         8.3
Bangladesh   166000000        162.0         7.1
    Russia   146000000        176.5         8.9
    Mexico   130000000        169.4         7.6

Total population: 4,515,000,000


## Naive approach: simple averaging

In [5]:
# Naive approach: treat every country as a single, equally weighted
# observation and average the per-country figures directly. Kept only as a
# baseline for comparison -- it ignores population size.
naive_mean = df.loc[:, 'mean_height'].mean()
naive_std = df.loc[:, 'std_height'].mean()

print("\n=== NAIVE APPROACH (INCORRECT) ===")
print(
    f"Naive global mean height: {naive_mean:.2f} cm",
    f"Naive global std height: {naive_std:.2f} cm",
    sep="\n",
)



=== NAIVE APPROACH (INCORRECT) ===
Naive global mean height: 168.44 cm
Naive global std height: 7.79 cm


## Correct Approach (Population-Weighted)

In [6]:
# Correct approach: population-weighted statistics
total_population = df['population'].sum()


def _pop_weighted_avg(values):
    """Return the population-weighted average of a per-country Series."""
    return (df['population'] * values).sum() / total_population


# Part A: weighted mean of the country means
weighted_mean = _pop_weighted_avg(df['mean_height'])

# Part B: weighted variance, split into two components.
# Within-country: weighted average of each country's own variance (std squared).
within_variance = _pop_weighted_avg(df['std_height'] ** 2)

# Between-country: weighted spread of the country means around the global mean.
squared_deviation = (df['mean_height'] - weighted_mean) ** 2
between_variance = _pop_weighted_avg(squared_deviation)

# Total variance is the sum of the two components; std is its square root.
global_variance = within_variance + between_variance
global_std = np.sqrt(global_variance)

report_lines = [
    "\n=== CORRECT APPROACH (WEIGHTED) ===",
    f"Weighted global mean height: {weighted_mean:.2f} cm",
    f"Weighted global std height: {global_std:.2f} cm",
    "\nVariance decomposition:",
    f"  Within-country variance: {within_variance:.2f}",
    f"  Between-country variance: {between_variance:.2f}",
    f"  Total variance: {global_variance:.2f}",
]
print(*report_lines, sep="\n")



=== CORRECT APPROACH (WEIGHTED) ===
Weighted global mean height: 167.75 cm
Weighted global std height: 8.54 cm

Variance decomposition:
  Within-country variance: 58.56
  Between-country variance: 14.43
  Total variance: 72.98


## Comparison and Impact

In [7]:
# Signed error of each naive estimate relative to the weighted result
# (positive = naive value is too high, negative = too low).
mean_bias = naive_mean - weighted_mean
std_bias = naive_std - global_std

# Size of each error as a percentage of the corresponding weighted value.
mean_rel_error_pct = abs(mean_bias / weighted_mean) * 100
std_rel_error_pct = abs(std_bias / global_std) * 100

print("\n=== IMPACT OF USING NAIVE APPROACH ===")
print(f"Mean estimation error: {mean_bias:+.2f} cm ({mean_rel_error_pct:.1f}% relative error)")
print(f"Std estimation error: {std_bias:+.2f} cm ({std_rel_error_pct:.1f}% relative error)")



=== IMPACT OF USING NAIVE APPROACH ===
Mean estimation error: +0.69 cm (0.4% relative error)
Std estimation error: -0.75 cm (8.8% relative error)


## Conclusion

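The naive mean overestimates the global mean here (168.44 cm vs. 167.75 cm) because it gives every country equal weight: less populous countries with taller populations (e.g., Russia) count as much as far more populous countries with shorter populations (e.g., India).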

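The naive std significantly underestimates the global spread (7.79 cm vs. 8.54 cm) because averaging the per-country standard deviations ignores the between-country variance entirely (14.43 cm² of the 72.98 cm² total).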

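The errors in this example are modest (0.69 cm for the mean, 0.75 cm for the standard deviation); in realistic scenarios with more heterogeneous countries, these errors can be 2–5 cm for means and 1–3 cm for standard deviations.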