In [1]:
import pandas as pd
import numpy as np

import geopandas as gpd
import h3.api.numpy_int as h3

import os,sys,glob

In [2]:
def h3_idx(row, res):
    """Return a copy of ``row`` extended with the H3 index of its coordinates.

    Parameters
    ----------
    row : pandas.Series or dict-like
        Record exposing ``'lat'`` and ``'lng'`` entries and a ``copy()`` method.
    res : int
        H3 resolution forwarded to ``h3.geo_to_h3``.

    Returns
    -------
    Same type as ``row``
        A shallow copy of ``row`` with one extra entry, ``h3_index_{res}``.
    """
    # Work on a copy: pandas documents that a function handed to ``apply``
    # must not mutate the object it receives.
    row = row.copy()
    row[f'h3_index_{res}'] = h3.geo_to_h3(row['lat'], row['lng'], resolution=res)
    return row

In [3]:
def h3_to_lat_lon(h3_index):
    """Convert an H3 index into the coordinate pair given by ``h3.h3_to_geo``.

    The result is a two-entry ``pd.Series`` labelled ``'lat'`` and ``'lng'``.
    """
    coords = h3.h3_to_geo(h3_index)
    latitude, longitude = coords
    return pd.Series({'lat': latitude, 'lng': longitude})

# Create H3 Index for Stations

In [4]:
# Load the training table; the "date" column is parsed as datetimes.
data = pd.read_csv(
    "../data/Train.csv",
    parse_dates=["date"],
)

In [5]:
# One row per distinct (lat, lng) pair, carrying the mean pm25 of that pair.
average_pm25_stations = (
    data.groupby(['lat', 'lng'])['pm25']
    .mean()
    .reset_index()
)
# Attach EPSG:4326 point geometries built from the lng/lat columns.
stations = gpd.GeoDataFrame(
    average_pm25_stations,
    geometry=gpd.points_from_xy(
        average_pm25_stations.lng,
        average_pm25_stations.lat,
        crs=4326,
    ),
)

In [6]:
# Preview the first rows of the station table before the H3 columns are added.
stations.head()

Unnamed: 0,lat,lng,pm25,geometry
0,45.171919,9.488997,22.470256,POINT (9.48900 45.17192)
1,45.281956,8.988563,22.4687,POINT (8.98856 45.28196)
2,45.324517,9.134517,22.591867,POINT (9.13452 45.32452)
3,45.354286,9.329243,25.406741,POINT (9.32924 45.35429)
4,45.359261,9.202665,23.111943,POINT (9.20266 45.35926)


In [7]:
# Add one h3_index_<res> column per resolution by running h3_idx on every row.
for resolution_value in (4, 7, 8):
    stations = stations.apply(h3_idx, axis=1, res=resolution_value)

In [8]:
# Preview the station table again, now including the h3_index_* columns.
stations.head()

Unnamed: 0,lat,lng,pm25,geometry,h3_index_4,h3_index_7,h3_index_8
0,45.171919,9.488997,22.470256,POINT (9.48900 45.17192),595031057019961343,608541854341791743,613045453958676479
1,45.281956,8.988563,22.4687,POINT (8.98856 45.28196),595031048430026751,608541845735079935,613045445349867519
2,45.324517,9.134517,22.591867,POINT (9.13452 45.32452),595031048430026751,608541843201720319,613045442820702207
3,45.354286,9.329243,25.406741,POINT (9.32924 45.35429),595031048430026751,608541844594229247,613045444217405439
4,45.359261,9.202665,23.111943,POINT (9.20266 45.35926),595031048430026751,608541844443234303,613045444055924735


In [10]:
# Persist only the H3 index columns and the geometry as GeoJSON.
stations[['h3_index_4', 'h3_index_7', 'h3_index_8', 'geometry']].to_file(
    '../data/stations.geojson',
    driver='GeoJSON',
)

# Merge H3 Index with Train data

In [11]:
# Remove the averaged pm25 and the geometry in place so that only lat/lng and
# the H3 index columns remain for the merge.
# NOTE: in-place and not idempotent; re-running this cell raises KeyError.
stations.drop(columns=['pm25', 'geometry'], inplace=True)

In [12]:
# Attach the station-level H3 indexes to every training row that shares the
# same (lat, lng) pair.
merged_df = pd.merge(
    left=stations,
    right=data,
    how='inner',
    on=['lat', 'lng'],
)

In [13]:
# Drop the raw coordinate columns in place.
# NOTE: not idempotent; re-running this cell raises KeyError.
merged_df.drop(columns=['lat', 'lng'], inplace=True)

In [14]:
# Write the H3-indexed training table to disk without the row index.
merged_df.to_csv(
    "../data/Train_with_idx.csv",
    index=False,
)

In [15]:
from dask.distributed import LocalCluster
import dask.dataframe as dd
import numpy as np
import dask_geopandas
from shapely.geometry import Polygon

In [16]:
# Start a local Dask cluster.
# NOTE(review): timeout='60s' is forwarded to LocalCluster; presumably a
# connection timeout — confirm against the dask.distributed docs.
cluster = LocalCluster(timeout='60s')

2024-03-07 17:57:53,858 - tornado.application - ERROR - Uncaught exception GET /status/ws (10.1.0.193)
HTTPServerRequest(protocol='http', host='localhost:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='10.1.0.193')
Traceback (most recent call last):
  File "/scicore/home/roeoesli/valipo0000/training/anaconda3/envs/py39/lib/python3.9/site-packages/tornado/websocket.py", line 938, in _accept_connection
    open_result = handler.open(*handler.open_args, **handler.open_kwargs)
  File "/scicore/home/roeoesli/valipo0000/training/anaconda3/envs/py39/lib/python3.9/site-packages/tornado/web.py", line 3301, in wrapper
    return method(self, *args, **kwargs)
  File "/scicore/home/roeoesli/valipo0000/training/anaconda3/envs/py39/lib/python3.9/site-packages/bokeh/server/views/ws.py", line 149, in open
    raise ProtocolError("Token is expired.")
bokeh.protocol.exceptions.ProtocolError: Token is expired.


In [17]:
# Obtain a client bound to the cluster created above.
client = cluster.get_client()

In [18]:
# Read ../data/stations.geojson back into a GeoDataFrame (the same path the
# station H3 indexes and geometry were written to earlier in this notebook).
stns = gpd.read_file('../data/stations.geojson')

In [19]:
# Open every parquet file matching the glob as a single Dask DataFrame.
# NOTE(review): the "2019-0*" pattern presumably selects months 01-09 of 2019
# only — confirm that excluding 10-12 is intended.
df = dd.read_parquet("../independent-variables/pm25_cams_data/pm25_cams_2019-0*.parquet")

In [20]:
# Preview the last rows of the Dask frame.
df.tail()

Unnamed: 0,date,pm2p5,h3_index
104,2019-09-26,28.871744,595031031250157567
105,2019-09-27,31.613932,595031031250157567
106,2019-09-28,23.507812,595031031250157567
107,2019-09-29,20.043295,595031031250157567
108,2019-09-30,19.431641,595031031250157567


In [22]:
# H3 index_4 value of every station row, as a plain Python list.
# The list has one entry per station row, so values may repeat.
h3_idxs = stns['h3_index_4'].tolist()

In [23]:
# Keep only the rows whose h3_index matches one of the stations' h3_index_4 values.
dff = df[df['h3_index'].isin(h3_idxs)]

In [24]:
# Preview the last rows of the filtered frame.
dff.tail()

Unnamed: 0,date,pm2p5,h3_index
104,2019-09-26,28.871744,595031031250157567
105,2019-09-27,31.613932,595031031250157567
106,2019-09-28,23.507812,595031031250157567
107,2019-09-29,20.043295,595031031250157567
108,2019-09-30,19.431641,595031031250157567


In [30]:
# Materialise the filtered Dask frame in memory.
b = dff.compute()

In [31]:
# Expand each h3_index into 'lat' / 'lng' columns using h3_to_lat_lon.
b[['lat', 'lng']] = b['h3_index'].apply(h3_to_lat_lon)

In [33]:
# Wrap the in-memory frame as a GeoDataFrame with EPSG:4326 points built from
# its lng/lat columns.
c = gpd.GeoDataFrame(
    b,
    geometry=gpd.points_from_xy(b.lng, b.lat, crs=4326),
)

In [34]:
# Dump the points to GeoJSON.
c.to_file(
    '../data/test55.geojson',
    driver='GeoJSON',
)

In [40]:
# Ad-hoc probe: look up the resolution-3 parent of one hard-coded H3 index.
h3.h3_to_parent(595030971120615423,res=3)

590527423032852479

In [25]:
# Add 'lat' and 'lng' columns to the Dask frame by applying h3_to_lat_lon to
# each h3_index; `meta` declares the names and dtypes of the resulting columns.
meta = {'lat': float, 'lng': float}
dff[['lat', 'lng']] = dff['h3_index'].apply(h3_to_lat_lon, meta=meta)

In [26]:
# Build point geometries from the lng/lat columns, declare them as EPSG:4326,
# then reproject to EPSG:3035.
gddf = dff.set_geometry(
    dask_geopandas.points_from_xy(dff, 'lng', 'lat')
).set_crs('epsg:4326').to_crs(3035)

In [27]:
# Match each station to the nearest row of the computed gddf frame. Both sides
# are in EPSG:3035, and the separation is stored in the "distance" column.
a = gpd.sjoin_nearest(
    stns.to_crs(3035),
    gddf.compute(),
    how='inner',
    distance_col="distance",
)

In [28]:
# List the columns produced by the nearest-neighbour join.
a.columns

Index(['h3_index_4', 'h3_index_7', 'h3_index_8', 'geometry', 'index_right',
       'date', 'pm2p5', 'h3_index', 'lat', 'lng', 'distance'],
      dtype='object')

In [29]:
# Preview the first joined rows.
a.head()

Unnamed: 0,h3_index_4,h3_index_7,h3_index_8,geometry,index_right,date,pm2p5,h3_index,lat,lng,distance
0,595031057019961343,608541854341791743,613045453958676479,POINT (4280763.876 2451244.394),106,2019-06-29,12.291179,595031048430026751,45.245226,9.189416,24940.35007
1,595031048430026751,608541845735079935,613045445349867519,POINT (4241519.870 2463854.691),106,2019-06-29,12.291179,595031048430026751,45.245226,9.189416,16308.612274
2,595031048430026751,608541843201720319,613045442820702207,POINT (4253040.795 2468432.849),106,2019-06-29,12.291179,595031048430026751,45.245226,9.189416,9800.889219
3,595031048430026751,608541844594229247,613045444217405439,POINT (4268358.833 2471578.626),106,2019-06-29,12.291179,595031048430026751,45.245226,9.189416,16339.795863
4,595031048430026751,608541844443234303,613045444055924735,POINT (4258430.994 2472228.411),106,2019-06-29,12.291179,595031048430026751,45.245226,9.189416,12695.851303


In [49]:
# Materialise the reprojected Dask GeoDataFrame.
# NOTE: this rebinds `b`, which an earlier cell assigned from dff.compute().
b = gddf.compute()

In [50]:
# Ad-hoc check: select the rows whose h3_index equals one hard-coded value.
b[b.h3_index == 599534641614946303]

Unnamed: 0,date,pm2p5,h3_index,lat,lng,geometry
