* Clean the Kayrros Methane Watch data
* Bulk downloaded data for all sources from https://methanewatch.kayrros.com/map
* Note: 
    ** there is a form to fill out to download the csv
    ** if using firefox you need to switch off enhanced tracking for the download to work 

* cleaning:
    - Datetime format is inconsistent
    - Many rows have blank values for emission rates, so these rows are deleted
    - EMIT data does not have a country name (just lat, lon)
    - Standardise the sector names, sensor names and units to be consistent with IMEO MARS data


In [1]:
#----------------------------------------------------------------------------------------------------------
# sensor, sector, country, date, emission rate (kg/h), uncertainty (kg/h), latitude, longitude, comments, platform | 
#----------------------------------------------------------------------------------------------------------

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [3]:
# --- notebook-wide constants ---

# Multiplier applied to the 'tons/hour' columns to express them in kg/h.
tonnes2kg     = 1000.0

# Provenance of the raw export; both values are written into the 'comments'
# column of the cleaned table.
download_url  = 'https://methanewatch.kayrros.com/map'
download_date = '11032025'

# Folder holding the raw export; the cleaned file is written next to it.
directory     = 'satellite/data/kayrros'

# Raw input and cleaned output file names, both stamped with the download date.
fname_in      = f'Methane_Watch_asset_data_{download_date}.csv'
fname_out     = f'Methane_Watch_asset_data_cleaned_{download_date}.csv'

In [4]:
# Load the raw Methane Watch export and strip stray whitespace from the
# column headers so the names used below match exactly.
mw = pd.read_csv(os.path.join(directory, fname_in))
mw.columns = mw.columns.str.strip()

# Map the source categories onto the IMEO MARS sector names
# ('Oil and Gas', 'Coal', 'Waste').
sector_names = {
    'og': 'Oil and Gas',
    'coal': 'Coal',
    'human': 'Waste',
}
mw['Category'] = mw['Category'].replace(sector_names)

# Rows without an emission rate are detections that were never quantified;
# drop them.
mw = mw.dropna(subset=['Emission Rate (tons/hour)'])

# Rename to the standard column names (IMEO MARS is used as the standard):
# lat, lon, date, country, emissions (kg/h), uncertainty (kg/h), sensor, ...
# The two rate columns are renamed to kg/h here and converted further down.
standard_names = {
    'Country': 'country',
    'Satellite': 'sensor',
    'Emission Rate (tons/hour)': 'emission rate (kg/h)',
    'Uncertainty (tons/hour)': 'uncertainty (kg/h)',
    'Source Latitude Rough': 'latitude',
    'Source Longitude Rough': 'longitude',
    'Date': 'date',
    'Category': 'sector',
}
mw = mw.rename(columns=standard_names)

# The raw dates come in more than one layout (e.g. 2023-02-24T04:11:58);
# parse them element-wise, then write them all back as dd/mm/YYYY strings.
parsed_dates = pd.to_datetime(mw['date'], format='mixed')
mw['date'] = parsed_dates.dt.strftime('%d/%m/%Y')

# Convert both rate columns from tons/hour to kg/h.
# NOTE(review): assumes the source 'tons' are metric tonnes -- confirm.
for rate_column in ('emission rate (kg/h)', 'uncertainty (kg/h)'):
    mw[rate_column] = mw[rate_column] * tonnes2kg

# Keep the MARS naming convention for sensors.
sensor_names = {
    'Sentinel - 5P': 'Sentinel-5P/TROPOMI - ESA',
    'EMIT': 'EMIT - NASA',
}
mw['sensor'] = mw['sensor'].replace(sensor_names)

# Use a single spelling for the United States.
usa_aliases = {
    'United States': 'USA',
    'United States of America': 'USA',
}
mw['country'] = mw['country'].replace(usa_aliases)

# Columns that are not part of the standard format.
columns_to_delete = ['Identifier']
mw = mw.drop(columns=columns_to_delete)

# Provenance columns, identical for every row.
mw['comments'] = f'Downloaded from {download_url} on the {download_date}. Cleaned using eda_kayrros_methane_watch.ipynb'
mw['platform'] = 'Kayrros Methane Watch'

# Quick sanity check of the standardised categories and the table size.
for checked_column in ('sector', 'sensor'):
    print(mw[checked_column].unique())

print(mw.shape)

['Waste' 'Oil and Gas' 'Coal']
['Sentinel-5P/TROPOMI - ESA' 'EMIT - NASA']
(5060, 10)


Get country name from lat, lon

In [5]:
import geopandas as gpd
from shapely.geometry import Point

# Path to the country-boundaries shapefile.
# NOTE(review): the file name suggests the Natural Earth 1:110m admin-0
# countries layer -- confirm the source and resolution.
fname = 'satellite/data/naturalearth_country_shapefiles/ne_110m_admin_0_countries.shp' 

# Load the country boundaries shapefile
# `world` is read as a module-level global by get_country_from_lat_lng(),
# which uses its 'geometry' and 'NAME' columns.
world = gpd.read_file(fname)

def get_country_from_lat_lng(latitude, longitude):
    """Return the name of the country whose polygon contains a point.

    The point is looked up in the module-level ``world`` GeoDataFrame and the
    ``NAME`` attribute of the first matching row is returned.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the point. They are passed to shapely as
        ``Point(longitude, latitude)`` (x = longitude, y = latitude).
        Presumably decimal degrees in the same CRS as ``world`` -- TODO
        confirm.

    Returns
    -------
    str or None
        ``NAME`` of the first row of ``world`` whose geometry contains the
        point, or ``None`` when a coordinate is missing or no geometry
        contains the point.
    """
    # A missing coordinate can never fall inside a polygon; return early
    # rather than building a point with NaN coordinates.
    if pd.isnull(latitude) or pd.isnull(longitude):
        return None

    point = Point(longitude, latitude)

    # One vectorised containment test over every country polygon, instead of
    # a Python-level iterrows() loop that builds a Series per country per
    # point. Rows with a missing geometry simply evaluate to False.
    inside = world.geometry.contains(point)
    if not inside.any():
        return None

    # Keep the original "first match in row order" behaviour.
    return world.loc[inside, 'NAME'].iloc[0]

get_countries = True  # switch on/off

if get_countries:

    # Fill in the country from the coordinates, but only for rows that have
    # no country yet (null or empty string); existing names are kept as-is.
    mw['country'] = mw.apply(
        lambda row: get_country_from_lat_lng(row['latitude'], row['longitude'])
        if pd.isnull(row['country']) or row['country'] == '' else row['country'],
        axis=1)

    # The looked-up names come straight from the shapefile's NAME field and
    # never went through the USA normalisation applied to the original
    # 'country' values, so apply the same mapping here to keep one spelling
    # per country across the whole table.
    mw['country'] = mw['country'].replace(
        {'United States': 'USA', 'United States of America': 'USA'})

    # NOTE(review): the cleaned file is only written when get_countries is
    # True -- confirm that skipping the export with the switch off is intended.
    out_file = os.path.join(directory, fname_out)

    mw.to_csv(out_file, index=False)