In [3]:
import ee
import geemap
import pandas as pd
import matplotlib.pyplot as plt

ee.Initialize()

# Load Rwanda sector boundaries
sectors = ee.FeatureCollection("projects/ml-for-earth-observation/assets/rwa_sector")

# Print basic info about the sectors dataset
print(f"Number of sectors --> {sectors.size().getInfo()}")
print(f"First sector properties --> {sectors.first().toDictionary().getInfo()}")

Number of sectors --> 416
First sector properties --> {'Dist_ID': 11, 'District': 'Nyarugenge', 'Name': 'Gitega', 'Prov_ID': 1, 'Province': 'Kigali City', 'Sect_ID': 1101}


In [None]:
# Define the time period
start_date = '2020-01-01'
end_date = '2025-01-31'

# Load Sentinel-5P collections for different pollutants
no2 = ee.ImageCollection(
    'COPERNICUS/S5P/NRTI/L3_NO2').select('NO2_column_number_density')
co = ee.ImageCollection(
    'COPERNICUS/S5P/NRTI/L3_CO').select('CO_column_number_density')
so2 = ee.ImageCollection(
    'COPERNICUS/S5P/NRTI/L3_SO2').select('SO2_column_number_density')
o3 = ee.ImageCollection(
    'COPERNICUS/S5P/NRTI/L3_O3').select('O3_column_number_density')
aer = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_AER_AI") 
ch4 = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_CH4")
# Filter collections by date and Rwanda boundary
rwandaExtent = sectors.geometry().bounds()
no2 = no2.filterDate(start_date, end_date).filterBounds(rwandaExtent)
co = co.filterDate(start_date, end_date).filterBounds(rwandaExtent)
so2 = so2.filterDate(start_date, end_date).filterBounds(rwandaExtent)
o3 = o3.filterDate(start_date, end_date).filterBounds(rwandaExtent)
aer = aer.filterDate(start_date, end_date).filterBounds(rwandaExtent)
ch4 = ch4.filterDate(start_date, end_date).filterBounds(rwandaExtent)


# Print collection sizes
print(f"NO2 collection size: {no2.size().getInfo()}")
print(f"CO collection size: {co.size().getInfo()}")
print(f"SO2 collection size: {so2.size().getInfo()}")
print(f"O3 collection size: {o3.size().getInfo()}")
print(f"AER collection size: {aer.size().getInfo()}")
print(f"CH4 collection size: {ch4.size().getInfo()}")


# Function to calculate mean value of a pollutant for each sector


def calculate_pollutant_by_sector(image_collection, band_name):
    # Get the mean image
    mean_image = image_collection.mean()

    # Use reduceRegions to get mean value for each sector
    sector_means = mean_image.reduceRegions(
        collection=sectors,
        reducer=ee.Reducer.mean(),
        scale=1000  # 5km resolution
    )

    return sector_means


# Calculate for each pollutant
no2_by_sector = calculate_pollutant_by_sector(no2, 'NO2_column_number_density')
co_by_sector = calculate_pollutant_by_sector(co, 'CO_column_number_density')
so2_by_sector = calculate_pollutant_by_sector(so2, 'SO2_column_number_density')
o3_by_sector = calculate_pollutant_by_sector(o3, 'O3_column_number_density')
aer_by_sector = calculate_pollutant_by_sector(aer, 'absorbing_aerosol_index')
ch4_by_sector = calculate_pollutant_by_sector(ch4, 'CH4_column_volume_mixing_ratio_dry_air')

# Export the first few features to check the structure
print("NO2 by sector sample:")
print(aer_by_sector.limit(3).getInfo())

NO2 collection size: 980
CO collection size: 608
SO2 collection size: 649
O3 collection size: 1150
AER collection size: 5154
CH4 collection size: 4613
NO2 by sector sample:
{'type': 'FeatureCollection', 'columns': {'Dist_ID': 'Float', 'District': 'String', 'Name': 'String', 'Prov_ID': 'Float', 'Province': 'String', 'Sect_ID': 'Float', 'absorbing_aerosol_index': 'Float', 'sensor_altitude': 'Float', 'sensor_azimuth_angle': 'Float', 'sensor_zenith_angle': 'Float', 'solar_azimuth_angle': 'Float', 'solar_zenith_angle': 'Float', 'system:index': 'String'}, 'features': [{'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[30.04570996035956, -1.9413336267672765], [30.045821495227276, -1.941409933357362], [30.04585720399942, -1.9414323846165285], [30.04586610914853, -1.941450353517211], [30.045875014297046, -1.9414952447684195], [30.045897321250663, -1.9415266725156022], [30.045924124858033, -1.9415446160439096], [30.04595983360299, -1.9415536061212209], [30.046013352617848, -1.

In [15]:
# Function to export features to DataFrame
def ee_to_df(fc):
    features = fc.getInfo()['features']
    return pd.DataFrame([f['properties'] for f in features])


# Convert to pandas DataFrames
no2_df = ee_to_df(no2_by_sector)
co_df = ee_to_df(co_by_sector)
so2_df = ee_to_df(so2_by_sector)
o3_df = ee_to_df(o3_by_sector)
aer_df = ee_to_df(aer_by_sector)
ch4_df = ee_to_df(ch4_by_sector)


# Merge data into a single DataFrame
sector_df = no2_df[['Name', 'Dist_ID', 'District',
                    'Prov_ID', 'Province', 'Sect_ID', 'mean']]
sector_df.rename(columns={'mean': 'NO2_mean'}, inplace=True)

# Add other pollutants
sector_df['CO_mean'] = co_df['mean']
sector_df['SO2_mean'] = so2_df['mean']
sector_df['O3_mean'] = o3_df['mean']
sector_df['AER_mean'] = aer_df['mean']
sector_df['CH4_mean'] = ch4_df['mean']


# Preview the dataset
print(sector_df.head())
print(f"Total sectors with data: {len(sector_df)}")

# # Save to CSV
# sector_df.to_csv('rwanda_air_quality_by_sector_2023.csv', index=False)
# print("Data saved to rwanda_air_quality_by_sector_2023.csv")

KeyError: 'mean'

In [8]:
# write df to csv
no2_monthly_df.to_csv('rwanda_no2_monthly_2023.csv', index=False)

In [None]:
def calculate_monthly_pollutant_stats_by_sector(image_collection, band_name, year, month):
    # Format dates
    month_str = f"{month:02d}"
    start_date = f"{year}-{month_str}-01"

    # Calculate end date (handle December correctly)
    if month == 12:
        end_date = f"{year+1}-01-01"
    else:
        end_date = f"{year}-{month+1:02d}-01"

    # Filter collection to month
    monthly_collection = image_collection.filterDate(start_date, end_date)

    # Check if collection is empty
    collection_size = monthly_collection.size().getInfo()
    if collection_size == 0:
        print(f"No data for {band_name} in {year}-{month_str}")
        # Create empty dataframe with right structure
        empty_df = pd.DataFrame(columns=[
            'Dist_ID', 'District', 'Name', 'Prov_ID', 'Province', 'Sect_ID',
            f'{band_name}_mean', f'{band_name}_min', f'{band_name}_max',
            'year', 'month', 'date'
        ])
        return empty_df

    try:
        # Calculate statistics
        mean_image = monthly_collection.mean()

        # Use reduceRegions to get mean value for each sector
        sector_stats = mean_image.reduceRegions(
            collection=sectors,
            reducer=ee.Reducer.mean(),
            scale=1000
        )

        # Convert to DataFrame
        stats_df = ee_to_df(sector_stats)

        # Rename columns
        if 'mean' in stats_df.columns:
            stats_df.rename(
                columns={'mean': f'{band_name}_mean'}, inplace=True)

        # Add min/max columns with placeholder values if not present
        if f'{band_name}_min' not in stats_df.columns:
            stats_df[f'{band_name}_min'] = stats_df[f'{band_name}_mean']
        if f'{band_name}_max' not in stats_df.columns:
            stats_df[f'{band_name}_max'] = stats_df[f'{band_name}_mean']

        # Add date
        stats_df['year'] = year
        stats_df['month'] = month
        stats_df['date'] = f"{year}-{month_str}"

        return stats_df

    except Exception as e:
        print(f"Error processing {band_name} for {year}-{month_str}: {e}")
        # Get all sector IDs
        all_sectors_df = ee_to_df(sectors)

        # Create DataFrame with zeros for all sectors
        empty_df = pd.DataFrame(
            all_sectors_df[['Dist_ID', 'District', 'Name', 'Prov_ID', 'Province', 'Sect_ID']])
        empty_df[f'{band_name}_mean'] = 0
        empty_df[f'{band_name}_min'] = 0
        empty_df[f'{band_name}_max'] = 0
        empty_df['year'] = year
        empty_df['month'] = month
        empty_df['date'] = f"{year}-{month_str}"

        return empty_df

start_date = '2018-01'
end_date = '2025-01'

# Get data for each pollutant
no2_all = calculate_pollutant_stats_by_sector_range(
    no2, 'NO2', start_date, end_date)
co_all = calculate_pollutant_stats_by_sector_range(
    co, 'CO', start_date, end_date)
so2_all = calculate_pollutant_stats_by_sector_range(
    so2, 'SO2', start_date, end_date)
o3_all = calculate_pollutant_stats_by_sector_range(
    o3, 'O3', start_date, end_date)

# Print summaries
for pollutant, df in [("NO2", no2_all), ("CO", co_all), ("SO2", so2_all), ("O3", o3_all)]:
    months = df['date'].nunique()
    sectors = df['Sect_ID'].nunique()
    print(f"{pollutant}: {months} months, {sectors} sectors, {len(df)} total records")

# Create combined dataset by merging on sector ID and date
df_combined = no2_all.merge(
    co_all[['Sect_ID', 'date', 'CO_mean', 'CO_min', 'CO_max']],
    on=['Sect_ID', 'date'], how='outer')

df_combined = df_combined.merge(
    so2_all[['Sect_ID', 'date', 'SO2_mean', 'SO2_min', 'SO2_max']],
    on=['Sect_ID', 'date'], how='outer')

df_combined = df_combined.merge(
    o3_all[['Sect_ID', 'date', 'O3_mean', 'O3_min', 'O3_max']],
    on=['Sect_ID', 'date'], how='outer')

# Save combined dataset
df_combined.to_csv('rwanda_air_quality_2020_2025.csv', index=False)
print(f"Combined data saved with {len(df_combined)} records")

Processing 2018-01...
No data for NO2 in 2018-01
Processing 2018-02...
No data for NO2 in 2018-02
Processing 2018-03...
No data for NO2 in 2018-03
Processing 2018-04...
No data for NO2 in 2018-04
Processing 2018-05...
No data for NO2 in 2018-05
Processing 2018-06...
No data for NO2 in 2018-06
Processing 2018-07...
No data for NO2 in 2018-07
Processing 2018-08...
No data for NO2 in 2018-08
Processing 2018-09...
No data for NO2 in 2018-09
Processing 2018-10...
No data for NO2 in 2018-10
Processing 2018-11...
No data for NO2 in 2018-11
Processing 2018-12...
No data for NO2 in 2018-12
Processing 2019-01...
No data for NO2 in 2019-01
Processing 2019-02...
No data for NO2 in 2019-02
Processing 2019-03...
No data for NO2 in 2019-03
Processing 2019-04...
No data for NO2 in 2019-04
Processing 2019-05...
No data for NO2 in 2019-05
Processing 2019-06...
No data for NO2 in 2019-06
Processing 2019-07...
No data for NO2 in 2019-07
Processing 2019-08...
No data for NO2 in 2019-08
Processing 2019-09..

AttributeError: 'int' object has no attribute 'getInfo'

In [4]:
# Load Rwanda sector boundaries
# sectors = ee.FeatureCollection("projects/ml-for-earth-observation/assets/rwa_sector")

# Print basic info about the sectors dataset
print(f"Number of sectors: {sectors.size().getInfo()}")
print(f"First sector properties: {sectors.first().toDictionary().getInfo()}")

Number of sectors: 416
First sector properties: {'Dist_ID': 11, 'District': 'Nyarugenge', 'Name': 'Gitega', 'Prov_ID': 1, 'Province': 'Kigali City', 'Sect_ID': 1101}


In [5]:
def ee_to_df(fc):
    features = fc.getInfo()['features']
    return pd.DataFrame([f['properties'] for f in features])

# Function to calculate monthly pollutant stats by sector
def calculate_monthly_pollutant_stats_by_sector(image_collection, band_name, year, month):
    # Format dates
    month_str = f"{month:02d}"
    start_date = f"{year}-{month_str}-01"

    # Calculate end date (handle December correctly)
    if month == 12:
        end_date = f"{year+1}-01-01"
    else:
        end_date = f"{year}-{month+1:02d}-01"

    # Filter collection to month
    monthly_collection = image_collection.filterDate(start_date, end_date)

    # Check if collection is empty
    collection_size = monthly_collection.size().getInfo()
    if collection_size == 0:
        print(f"No data for {band_name} in {year}-{month_str}")
        # Return empty DataFrame with correct structure
        return pd.DataFrame(columns=[
            'Dist_ID', 'District', 'Name', 'Prov_ID', 'Province', 'Sect_ID',
            f'{band_name}_mean', f'{band_name}_min', f'{band_name}_max',
            'year', 'month', 'date'
        ])

    try:
        # Calculate statistics
        mean_image = monthly_collection.mean()

        # Use reduceRegions to get mean value for each sector
        sector_stats = mean_image.reduceRegions(
            collection=sectors,
            reducer=ee.Reducer.mean(),
            scale=1200
        )

        # Convert to DataFrame
        stats_df = ee_to_df(sector_stats)

        # Rename columns
        if 'mean' in stats_df.columns:
            stats_df.rename(
                columns={'mean': f'{band_name}_mean'}, inplace=True)

        # Add date columns
        stats_df['year'] = year
        stats_df['month'] = month
        stats_df['date'] = f"{year}-{month_str}"

        return stats_df

    except Exception as e:
        print(f"Error processing {band_name} for {year}-{month_str}: {e}")
        return pd.DataFrame()  # Return empty DataFrame

In [6]:

# Function to get pollutant stats for a date range

def calculate_pollutant_stats_by_sector_range(image_collection, band_name, start_year, end_year):
    all_results = []

    # Loop through each year and month
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            # Skip future months in 2025
            if year == 2025 and month > 1:
                continue

            print(f"Processing {band_name}: {year}-{month:02d}...")

            # Get stats for current month
            month_stats = calculate_monthly_pollutant_stats_by_sector(
                image_collection, band_name, year, month)

            if not month_stats.empty:
                all_results.append(month_stats)

    # Combine all months
    if all_results:
        combined_df = pd.concat(all_results)
        return combined_df
    else:
        # Return empty DataFrame with correct columns
        return pd.DataFrame(columns=[
            'Dist_ID', 'District', 'Name', 'Prov_ID', 'Province', 'Sect_ID',
            f'{band_name}_mean', 'year', 'month', 'date'
        ])

In [7]:
def collect_air_quality_data(start_year, end_year):
    # Define date strings for filtering
    start_date = f"{start_year}-01-01"
    end_date = f"{end_year}-01-31"

    # Load pollutant collections
    no2 = ee.ImageCollection(
        'COPERNICUS/S5P/NRTI/L3_NO2').select('NO2_column_number_density')
    co = ee.ImageCollection(
        'COPERNICUS/S5P/NRTI/L3_CO').select('CO_column_number_density')
    so2 = ee.ImageCollection(
        'COPERNICUS/S5P/NRTI/L3_SO2').select('SO2_column_number_density')
    o3 = ee.ImageCollection(
        'COPERNICUS/S5P/NRTI/L3_O3').select('O3_column_number_density')
    aer = ee.ImageCollection(
        "COPERNICUS/S5P/OFFL/L3_AER_AI").select('absorbing_aerosol_index')
    ch4 = ee.ImageCollection(
        "COPERNICUS/S5P/OFFL/L3_CH4").select('CH4_column_volume_mixing_ratio_dry_air')

    # Filter by date and location
    rwandaExtent = sectors.geometry().bounds()
    no2 = no2.filterDate(start_date, end_date).filterBounds(rwandaExtent)
    co = co.filterDate(start_date, end_date).filterBounds(rwandaExtent)
    so2 = so2.filterDate(start_date, end_date).filterBounds(rwandaExtent)
    o3 = o3.filterDate(start_date, end_date).filterBounds(rwandaExtent)
    aer = aer.filterDate(start_date, end_date).filterBounds(rwandaExtent)
    ch4 = ch4.filterDate(start_date, end_date).filterBounds(rwandaExtent)

    # Print collection sizes
    print(f"NO2 collection size: {no2.size().getInfo()}")
    print(f"CO collection size: {co.size().getInfo()}")
    print(f"SO2 collection size: {so2.size().getInfo()}")
    print(f"O3 collection size: {o3.size().getInfo()}")
    print(f"AER collection size: {aer.size().getInfo()}")
    print(f"CH4 collection size: {ch4.size().getInfo()}")

    # Get data for each pollutant
    print("\nCollecting monthly data for all pollutants...")
    no2_all = calculate_pollutant_stats_by_sector_range(
        no2, 'NO2', start_year, end_year)
    co_all = calculate_pollutant_stats_by_sector_range(
        co, 'CO', start_year, end_year)
    so2_all = calculate_pollutant_stats_by_sector_range(
        so2, 'SO2', start_year, end_year)
    o3_all = calculate_pollutant_stats_by_sector_range(
        o3, 'O3', start_year, end_year)
    aer_all = calculate_pollutant_stats_by_sector_range(
        aer, 'AER', start_year, end_year)
    ch4_all = calculate_pollutant_stats_by_sector_range(
        ch4, 'CH4', start_year, end_year)

    # Print summaries
    print("\nData collection summary:")
    for pollutant, df in [("NO2", no2_all), ("CO", co_all), ("SO2", so2_all),
                          ("O3", o3_all), ("AER", aer_all), ("CH4", ch4_all)]:
        if not df.empty:
            months = df['date'].nunique()
            sectorsx = df['Sect_ID'].nunique()
            print(
                f"{pollutant}: {months} months, {sectorsx} sectors, {len(df)} total records")
        else:
            print(f"{pollutant}: No data collected")

    # Return all the datasets
    return {
        'NO2': no2_all,
        'CO': co_all,
        'SO2': so2_all,
        'O3': o3_all,
        'AER': aer_all,
        'CH4': ch4_all
    }

In [9]:

def merge_air_quality_data(pollutant_datasets, start_year, end_year):
    """
    Merge multiple pollutant datasets into a single DataFrame.
    
    Args:
        pollutant_datasets: Dictionary with pollutant names as keys and DataFrames as values
        start_year: Starting year for the filename
        end_year: Ending year for the filename
        
    Returns:
        Combined DataFrame with all pollutants
    """
    print("\nMerging datasets...")

    # Create a list of pollutant names and their corresponding DataFrames
    pollutant_data = [
        (name, df) for name, df in pollutant_datasets.items()
    ]

    # Filter to only non-empty DataFrames
    non_empty_pollutant_data = [(name, df)
                                for name, df in pollutant_data if not df.empty]

    if len(non_empty_pollutant_data) > 0:
        # Start with the first non-empty dataset
        first_name, merged_df = non_empty_pollutant_data[0]

        # Merge remaining datasets one by one
        for pollutant, df in non_empty_pollutant_data[1:]:
            cols_to_use = ['Sect_ID', 'date', f'{pollutant}_mean']

            # Make sure the required columns exist
            if all(col in df.columns for col in cols_to_use):
                merged_df = merged_df.merge(
                    df[cols_to_use], on=['Sect_ID', 'date'], how='outer')
            else:
                print(
                    f"Warning: Required columns not found in {pollutant} dataset. Columns: {df.columns.tolist()}")

        # Save combined dataset
        filename = f'rwanda_air_quality_{start_year}_{end_year}.csv'
        merged_df.to_csv(filename, index=False)
        print(
            f"\nCombined data saved to {filename} with {len(merged_df)} records")
        return merged_df
    else:
        print("No data to merge")
        return pd.DataFrame()

In [10]:
data = collect_air_quality_data(2020, 2025)

NO2 collection size: 3538
CO collection size: 3119
SO2 collection size: 3182
O3 collection size: 3706
AER collection size: 26296
CH4 collection size: 24795

Collecting monthly data for all pollutants...
Processing NO2: 2020-01...
Processing NO2: 2020-02...
Processing NO2: 2020-03...
Processing NO2: 2020-04...
Processing NO2: 2020-05...
Processing NO2: 2020-06...
Processing NO2: 2020-07...
Processing NO2: 2020-08...
Processing NO2: 2020-09...
Processing NO2: 2020-10...
Processing NO2: 2020-11...
Processing NO2: 2020-12...
Processing NO2: 2021-01...
Processing NO2: 2021-02...
Processing NO2: 2021-03...
Processing NO2: 2021-04...
Processing NO2: 2021-05...
Processing NO2: 2021-06...
Processing NO2: 2021-07...
Processing NO2: 2021-08...
Processing NO2: 2021-09...
Processing NO2: 2021-10...
Processing NO2: 2021-11...
Processing NO2: 2021-12...
Processing NO2: 2022-01...
Processing NO2: 2022-02...
Processing NO2: 2022-03...
Processing NO2: 2022-04...
Processing NO2: 2022-05...
Processing NO2

In [None]:
dataset = merge_air_quality_data(data, 2020, 2025)


Merging datasets...

Combined data saved to rwanda_air_quality_2020_2025.csv with 25376 records


: 

In [None]:
dataset.to_csv('extracted.csv', index=False)

NO2 collection size: 3538
CO collection size: 3119
SO2 collection size: 3182
O3 collection size: 3706
AER collection size: 26296
CH4 collection size: 24795

Collecting monthly data for all pollutants...
Processing NO2: 2020-01...
Processing NO2: 2020-02...
Processing NO2: 2020-03...
Processing NO2: 2020-04...
Processing NO2: 2020-05...
Processing NO2: 2020-06...
Processing NO2: 2020-07...
Processing NO2: 2020-08...
Processing NO2: 2020-09...
Processing NO2: 2020-10...
Processing NO2: 2020-11...
Processing NO2: 2020-12...
Processing NO2: 2021-01...
Processing NO2: 2021-02...
Processing NO2: 2021-03...
Processing NO2: 2021-04...
Processing NO2: 2021-05...
Processing NO2: 2021-06...
Processing NO2: 2021-07...
Processing NO2: 2021-08...
Processing NO2: 2021-09...
Processing NO2: 2021-10...
Processing NO2: 2021-11...
Processing NO2: 2021-12...
Processing NO2: 2022-01...
Processing NO2: 2022-02...
Processing NO2: 2022-03...
Processing NO2: 2022-04...
Processing NO2: 2022-05...
Processing NO2

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects