# Pluck a random handful of SSEN's feeders for all time
So we can analyse some data more easily. We're using SSEN's data because it's the first dataset we got into our production pipeline, so the first one we had a full time range for.

In [2]:
import geopandas as gpd

Find the random sample from the first month of data.

We want to split this sample evenly across SSEN's two license areas, so we use two bbox queries

In [None]:
southern_england_bbox = [-2.2, 50.7, 1.77, 51.8]
scotland_bbox = [-8.65, 54.5, -0.65, 60.84]
england_feb = gpd.read_parquet("s3://weave.energy/beta/smart-meter/2024-02.parquet", bbox=southern_england_bbox)
scotland_feb = gpd.read_parquet("s3://weave.energy/beta/smart-meter/2024-02.parquet", bbox=scotland_bbox)
england_feeders = sorted(england_feb["lv_feeder_unique_id"].dropna().drop_duplicates().sample(50))
scotland_feeders = sorted(scotland_feb["lv_feeder_unique_id"].dropna().drop_duplicates().sample(50))
del england_feb, scotland_feb

In [None]:
all_time_england = gpd.read_parquet("s3://weave.energy/beta/smart-meter",
                           columns=["secondary_substation_unique_id", "lv_feeder_unique_id", "aggregated_device_count_active", "total_consumption_active_import", "data_collection_log_timestamp", "geometry"],
                           filters=[("lv_feeder_unique_id", "in", england_feeders)])
all_time_scotland = gpd.read_parquet("s3://weave.energy/beta/smart-meter",
                           columns=["secondary_substation_unique_id", "lv_feeder_unique_id", "aggregated_device_count_active", "total_consumption_active_import", "data_collection_log_timestamp", "geometry"],
                           filters=[("lv_feeder_unique_id", "in", scotland_feeders)])

AttributeError: 'GeoDataFrame' object has no attribute 'concat'

In [14]:
import pandas as pd
all_time = gpd.GeoDataFrame(pd.concat([all_time_england, all_time_scotland], ignore_index=True), crs="EPSG:4326")
all_time.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1328722 entries, 0 to 1328721
Data columns (total 6 columns):
 #   Column                           Non-Null Count    Dtype              
---  ------                           --------------    -----              
 0   secondary_substation_unique_id   1328722 non-null  object             
 1   lv_feeder_unique_id              1328722 non-null  object             
 2   aggregated_device_count_active   1320934 non-null  float64            
 3   total_consumption_active_import  1320934 non-null  float64            
 4   data_collection_log_timestamp    1328722 non-null  datetime64[ms, UTC]
 5   geometry                         1328722 non-null  geometry           
dtypes: datetime64[ms, UTC](1), float64(2), geometry(1), object(2)
memory usage: 60.8+ MB


In [15]:
all_time_england.sort_values(["data_collection_log_timestamp", "lv_feeder_unique_id"], inplace=True)
all_time_scotland.sort_values(["data_collection_log_timestamp", "lv_feeder_unique_id"], inplace=True)

all_time_england["latitude"] = all_time_england["geometry"].y
all_time_england["longitude"] = all_time_england["geometry"].x
all_time_scotland["latitude"] = all_time_scotland["geometry"].y
all_time_scotland["longitude"] = all_time_scotland["geometry"].x

all_time_england.to_csv("ssen-50-england-feeders.csv", index=False, columns=["data_collection_log_timestamp", "secondary_substation_unique_id", "lv_feeder_unique_id", "latitude", "longitude", "aggregated_device_count_active", "total_consumption_active_import"])
all_time_scotland.to_csv("ssen-50-scotland-feeders.csv", index=False, columns=["data_collection_log_timestamp", "secondary_substation_unique_id", "lv_feeder_unique_id", "latitude", "longitude", "aggregated_device_count_active", "total_consumption_active_import"])

In [16]:
expected_rows_per_feeder = (all_time["data_collection_log_timestamp"].max() - all_time["data_collection_log_timestamp"].min()).total_seconds() / (30 * 60)
expected_rows_per_feeder

14063.0

In [17]:
import pandas as pd
with pd.option_context('display.max_rows', 500):
    display(all_time.groupby("lv_feeder_unique_id").agg({"data_collection_log_timestamp": "nunique"}).sort_values(by="data_collection_log_timestamp", ascending=False).reset_index())

Unnamed: 0,lv_feeder_unique_id,data_collection_log_timestamp
0,521300134002,13966
1,611300623001,13918
2,611400302001,13918
3,640700802701,13918
4,641000424001,13918
5,461800210003,13918
6,430000354403,13918
7,521700532006,13918
8,400526003,13870
9,520000462004,13870


In [18]:
all_time[all_time["data_collection_log_timestamp"] == "2024-07-12 12:00:00"].explore()