# Pluck a random handful of SSEN's feeders for all time
So we can analyse some data more easily. We're using SSEN's data because it's the first dataset we got into our production pipeline, so the first one we had a full time range for.

In [2]:
import pandas as pd

Find the random sample from the first month of data. SSEN use the `dataset_id` column as a unique feeder identifier (other DNOs do it differently) so that makes an easy way to find feeders.

In [3]:
feb = pd.read_parquet("s3://weave.energy/data/staging/ssen/2024-02.parquet")
feeders = sorted(feb["dataset_id"].dropna().drop_duplicates().sample(100))
del feb

In [4]:
all_time = pd.read_parquet("s3://weave.energy/data/staging/ssen",
                           columns=["dataset_id", "secondary_substation_id", "secondary_substation_name", "lv_feeder_id", "lv_feeder_name", "aggregated_device_count_active", "total_consumption_active_import", "data_collection_log_timestamp"],
                           filters=[("dataset_id", "in", feeders)],
                           engine="pyarrow")
all_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184306 entries, 0 to 1184305
Data columns (total 8 columns):
 #   Column                           Non-Null Count    Dtype              
---  ------                           --------------    -----              
 0   dataset_id                       1184306 non-null  object             
 1   secondary_substation_id          1184306 non-null  object             
 2   secondary_substation_name        1184306 non-null  category           
 3   lv_feeder_id                     1184306 non-null  object             
 4   lv_feeder_name                   1184306 non-null  category           
 5   aggregated_device_count_active   1176393 non-null  float64            
 6   total_consumption_active_import  1176393 non-null  float64            
 7   data_collection_log_timestamp    1184306 non-null  datetime64[ms, UTC]
dtypes: category(2), datetime64[ms, UTC](1), float64(2), object(3)
memory usage: 60.6+ MB


In [5]:
all_time.sort_values(["data_collection_log_timestamp", "dataset_id"], inplace=True)
all_time.to_csv("ssen-10-feeders.csv", index=False)

In [12]:
expected_rows_per_feeder = (all_time["data_collection_log_timestamp"].max() - all_time["data_collection_log_timestamp"].min()).total_seconds() / (30 * 60)
expected_rows_per_feeder

12385.0

In [23]:
with pd.option_context('display.max_rows', 500):
    display(all_time.groupby("dataset_id").agg({"data_collection_log_timestamp": "nunique"}).sort_values(by="data_collection_log_timestamp", ascending=False).reset_index())

Unnamed: 0,dataset_id,data_collection_log_timestamp
0,551300434005,12288
1,252600320005,12288
2,190500324006,12240
3,551600806503,12240
4,521800701601,12240
5,550600347001,12240
6,610000146005,12240
7,251800120004,12240
8,310200107004,12192
9,400800630001,12192
