# DS5460 Final Project - Milestone 1

### Ingesting and Representing the Data - Task 3

Author: Donna Nguyen

Date: 03/08/25

In [1]:
# Step 1: Download the file
!wget -O dataset.tar.gz "https://storage.googleapis.com/gridopt-dataset/dataset_release_1/pglib_opf_case4661_sdet_1.tar.gz"

--2025-03-10 17:14:41--  https://storage.googleapis.com/gridopt-dataset/dataset_release_1/pglib_opf_case4661_sdet_1.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.111.207, 142.251.176.207, 142.251.177.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.111.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9327811105 (8.7G) [application/octet-stream]
Saving to: ‘dataset.tar.gz’


2025-03-10 17:16:19 (90.7 MB/s) - ‘dataset.tar.gz’ saved [9327811105/9327811105]



In [None]:
# Step 2: Extract the tar.gz file
!tar -xvzf dataset.tar.gz
# example_15000 to example_29999

In [3]:
import os

# Directory that holds the extracted example JSON files
json_dir = "gridopt-dataset-tmp/dataset_release_1/pglib_opf_case4661_sdet/group_1/"

# Collect the JSON file names, then order them alphabetically
json_files = [entry for entry in os.listdir(json_dir) if entry.endswith(".json")]
json_files.sort()

# Show the first 10 names as a quick sanity check
print(json_files[:10])

['example_15000.json', 'example_15001.json', 'example_15002.json', 'example_15003.json', 'example_15004.json', 'example_15005.json', 'example_15006.json', 'example_15007.json', 'example_15008.json', 'example_15009.json']


In [4]:
import os
import json
import pandas as pd

# Directory containing the extracted JSON examples
json_dir = "gridopt-dataset-tmp/dataset_release_1/pglib_opf_case4661_sdet/group_1/"

# Sorted names of every JSON file in that directory
json_files = [name for name in os.listdir(json_dir) if name.endswith(".json")]
json_files.sort()

# One record (dict) per processed file
data_list = []

# Only the first 100 files are read while testing, to avoid memory issues
for json_name in json_files[:100]:
    with open(os.path.join(json_dir, json_name), "r") as handle:
        payload = json.load(handle)

    # 'grid' and its 'edges' -> 'ac_line' entry are required (KeyError if
    # absent); the remaining lookups fall back to an empty default.
    grid = payload['grid']
    data_list.append({
        "filename": json_name,
        "grid_context": grid.get('context', []),
        "ac_line_features": grid['edges']['ac_line'].get('features', []),
        "solution": payload.get('solution', {}),
        "metadata": payload.get('metadata', {}),
    })

# Assemble the records into a DataFrame (one row per file)
df = pd.DataFrame(data_list)

# Preview the first rows
print(df.head())

             filename grid_context  \
0  example_15000.json  [[[100.0]]]   
1  example_15001.json  [[[100.0]]]   
2  example_15002.json  [[[100.0]]]   
3  example_15003.json  [[[100.0]]]   
4  example_15004.json  [[[100.0]]]   

                                    ac_line_features  \
0  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
1  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
2  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
3  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
4  [[-0.5235987755982988, 0.5235987755982988, 0.0...   

                                            solution  \
0  {'nodes': {'bus': [[-0.14185378316994993, 1.05...   
1  {'nodes': {'bus': [[-0.12853802530831018, 1.05...   
2  {'nodes': {'bus': [[-0.14024548483627264, 1.05...   
3  {'nodes': {'bus': [[-0.13184310827285547, 1.04...   
4  {'nodes': {'bus': [[-0.13965248977969102, 1.05...   

                            metadata  
0  {'objective': 2252269.9743585344}  
1   {'objective': 2

In [5]:
# Handle nested lists
# grid_context and ac_line_features are nested lists. For a readable preview,
# strip the outer wrapper from grid_context and keep only the first 5 rows of
# ac_line_features. (For a real flattening, pd.json_normalize() or expanding
# ac_line_features into multiple columns are the options to consider.)
#
# The shortened values go into a separate preview frame: overwriting `df` would
# drop all but 5 ac_line rows from the data that is exported afterwards, and
# would peel another nesting level off grid_context on every re-run of the cell.
df_preview = df.assign(
    grid_context=df["grid_context"].apply(lambda x: x[0] if isinstance(x, list) and x else None),
    ac_line_features=df["ac_line_features"].apply(lambda x: x[:5] if isinstance(x, list) else None),
)

print(df_preview.head())

             filename grid_context  \
0  example_15000.json    [[100.0]]   
1  example_15001.json    [[100.0]]   
2  example_15002.json    [[100.0]]   
3  example_15003.json    [[100.0]]   
4  example_15004.json    [[100.0]]   

                                    ac_line_features  \
0  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
1  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
2  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
3  [[-0.5235987755982988, 0.5235987755982988, 0.0...   
4  [[-0.5235987755982988, 0.5235987755982988, 0.0...   

                                            solution  \
0  {'nodes': {'bus': [[-0.14185378316994993, 1.05...   
1  {'nodes': {'bus': [[-0.12853802530831018, 1.05...   
2  {'nodes': {'bus': [[-0.14024548483627264, 1.05...   
3  {'nodes': {'bus': [[-0.13184310827285547, 1.04...   
4  {'nodes': {'bus': [[-0.13965248977969102, 1.05...   

                            metadata  
0  {'objective': 2252269.9743585344}  
1   {'objective': 2

In [6]:
# Tentative export: write the DataFrame to CSV without the index column
df.to_csv(path_or_buf="extracted_data.csv", index=False)

### Data Explanation - Task 1
Author: Anne Tumlin

Date: 03/10/25

Donna, I reviewed how you were ingesting the data above. I like the way you load it. However, I have recreated the dataframes so that each feature under grid is extracted as its own column. For reference, I am looking at the library structure demonstrated in the paper and basing the column layout on it.

In [14]:
import os
import json

json_dir = "gridopt-dataset-tmp/dataset_release_1/pglib_opf_case4661_sdet/group_1/"
json_files = [name for name in os.listdir(json_dir) if name.endswith(".json")]
json_files.sort()

# Load the first JSON file to inspect its structure and content
sample_file = os.path.join(json_dir, json_files[0])
with open(sample_file, "r") as handle:
    sample_data = json.load(handle)

# Walk down to grid -> nodes; a missing level falls back to an empty dict
grid_data = sample_data.get("grid", {})
nodes_data = grid_data.get("nodes", {})

# Entries stored under grid.nodes for each node type (empty list if absent)
nodes_bus_data = nodes_data.get("bus", [])
nodes_generator_data = nodes_data.get("generator", [])
nodes_load_data = nodes_data.get("load", [])
nodes_shunt_data = nodes_data.get("shunt", [])

# Number of entries per node type
print(f"'bus' contains {len(nodes_bus_data)} entries")
print(f"'generator' contains {len(nodes_generator_data)} entries")
print(f"'load' contains {len(nodes_load_data)} entries")
print(f"'shunt' contains {len(nodes_shunt_data)} entries")

# Pretty-print the first entry of every node type that has at least one entry
sample_sections = [
    ("\nSample 'bus' Entry:", nodes_bus_data),
    ("\nSample 'generator' Entry:", nodes_generator_data),
    ("\nSample 'load' Entry:", nodes_load_data),
    ("\nSample 'shunt' Entry:", nodes_shunt_data),
]
for heading, entries in sample_sections:
    if entries:
        print(heading)
        print(json.dumps(entries[0], indent=4))


'bus' contains 4661 entries
'generator' contains 724 entries
'load' contains 2683 entries
'shunt' contains 696 entries

Sample 'bus' Entry:
[
    345.0,
    2.0,
    0.9,
    1.1
]

Sample 'generator' Entry:
[
    100.0,
    0.33204999999999996,
    0.0159,
    0.6481999999999999,
    0.0,
    -1.5799,
    1.5799,
    1.0,
    0.0,
    3401.833,
    0.0
]

Sample 'load' Entry:
[
    1.1588552611735161,
    0.5067666236890527
]

Sample 'shunt' Entry:
[
    0.9426000000000001,
    0.0
]


Below I am creating the dataframes for grid.nodes features. This will be four dataframes in total containing the grid.nodes features for buses, generators, loads, and shunts. 

In [15]:
import os
import json
import pandas as pd

# Only the first 100 JSON files are used for now (CHANGE AFTER TESTING)
json_files = sorted(name for name in os.listdir(json_dir) if name.endswith(".json"))[:100]

# Feature rows accumulated across all files, one list per node type
bus_data_list, generator_data_list, load_data_list, shunt_data_list = [], [], [], []

for json_name in json_files:
    with open(os.path.join(json_dir, json_name), "r") as handle:
        sample_data = json.load(handle)

    # grid -> nodes; a missing level falls back to an empty dict
    nodes_data = sample_data.get("grid", {}).get("nodes", {})

    # Append this file's rows for each node type (nothing is added if absent)
    bus_data_list += nodes_data.get("bus", [])
    generator_data_list += nodes_data.get("generator", [])
    load_data_list += nodes_data.get("load", [])
    shunt_data_list += nodes_data.get("shunt", [])

# Column names, in the order the values appear in each feature row
bus_columns = ["base_kv", "bus_type", "vmin", "vmax"]
generator_columns = [
    "mbase", "pg", "pmin", "pmax", "qg", "qmin", "qmax", "vg",
    "cost_squared", "cost_linear", "cost_offset",
]
load_columns = ["pd", "qd"]
shunt_columns = ["bs", "gs"]

# One DataFrame per node type
bus_df = pd.DataFrame(bus_data_list, columns=bus_columns)
generator_df = pd.DataFrame(generator_data_list, columns=generator_columns)
load_df = pd.DataFrame(load_data_list, columns=load_columns)
shunt_df = pd.DataFrame(shunt_data_list, columns=shunt_columns)

# Schema summary of every frame (info() prints itself and returns None)
info_sections = [
    ("Bus DataFrame Info:", bus_df),
    ("\nGenerator DataFrame Info:", generator_df),
    ("\nLoad DataFrame Info:", load_df),
    ("\nShunt DataFrame Info:", shunt_df),
]
for heading, frame in info_sections:
    print(heading)
    print(frame.info())

# First rows of every frame
head_sections = [
    ("\nFirst few rows of Bus Data:", bus_df),
    ("\nFirst few rows of Generator Data:", generator_df),
    ("\nFirst few rows of Load Data:", load_df),
    ("\nFirst few rows of Shunt Data:", shunt_df),
]
for heading, frame in head_sections:
    print(heading)
    print(frame.head())

Bus DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466100 entries, 0 to 466099
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   base_kv   466100 non-null  float64
 1   bus_type  466100 non-null  float64
 2   vmin      466100 non-null  float64
 3   vmax      466100 non-null  float64
dtypes: float64(4)
memory usage: 14.2 MB
None

Generator DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72400 entries, 0 to 72399
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mbase         72400 non-null  float64
 1   pg            72400 non-null  float64
 2   pmin          72400 non-null  float64
 3   pmax          72400 non-null  float64
 4   qg            72400 non-null  float64
 5   qmin          72400 non-null  float64
 6   qmax          72400 non-null  float64
 7   vg            72400 non-null  float64
 8   cost_squared 

Let's look at the statistics of the grid.nodes features to fill in the data dictionary (Reminder: this is only for the first 100 JSON files). 

In [17]:
# Function to compute min, max, range, mean, and standard deviation for each column of a data frame (features)
def compute_statistics(df):
    """Summarise every column of ``df`` with basic descriptive statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns "Min", "Max", "Range"
        (Max - Min), "Mean" and "Std Dev" (as returned by ``DataFrame.std``).
    """
    col_min = df.min()
    col_max = df.max()
    stats_df = pd.DataFrame({
        "Min": col_min,
        "Max": col_max,
        # The range was announced in the header comment but never computed.
        "Range": col_max - col_min,
        "Mean": df.mean(),
        "Std Dev": df.std()
    })
    return stats_df

In [18]:
# Summary statistics for each grid.nodes DataFrame
bus_stats, generator_stats, load_stats, shunt_stats = (
    compute_statistics(frame) for frame in (bus_df, generator_df, load_df, shunt_df)
)

# Print each table under its title, followed by a trailing newline
stats_sections = [
    ("Bus Data Statistics:\n", bus_stats),
    ("Generator Data Statistics:\n", generator_stats),
    ("Load Data Statistics:\n", load_stats),
    ("Shunt Data Statistics:\n", shunt_stats),
]
for title, stats in stats_sections:
    print(title, stats, "\n")

Bus Data Statistics:
            Min    Max  Range        Mean       Std Dev
base_kv   11.0  500.0  489.0  196.812916  1.266471e+02
bus_type   1.0    3.0    2.0    1.135593  3.429826e-01
vmin       0.9    0.9    0.0    0.900000  6.165519e-12
vmax       1.1    1.1    0.0    1.100000  9.277922e-12 

Generator Data Statistics:
                   Min         Max       Range         Mean      Std Dev
mbase          0.1200    616.8400    616.7200   124.009613   119.596786
pg            -0.1050     16.6629     16.7679     1.360034     1.412612
pmin          -2.0000     13.6581     15.6581     0.487361     1.056602
pmax           0.1200     19.6677     19.5477     2.232706     2.021774
qg            -0.8500      1.5800      2.4300     0.164285     0.409292
qmin          -5.6160     -0.0055      5.6105    -1.211466     1.114564
qmax           0.1000      5.6160      5.5160     1.540036     1.022814
vg             1.0000      1.0000      0.0000     1.000000     0.000000
cost_squared   0.0000    

Now I am going to work through the grid.edges features.

In [22]:

# Feature rows accumulated across all files, one list per edge type
ac_line_data_list, transformer_data_list = [], []

for json_name in json_files:
    with open(os.path.join(json_dir, json_name), "r") as handle:
        sample_data = json.load(handle)

    # grid -> edges; a missing level falls back to an empty dict
    edges_data = sample_data.get("grid", {}).get("edges", {})

    # Append this file's feature rows for each edge type (nothing if absent)
    ac_line_data_list += edges_data.get("ac_line", {}).get("features", [])
    transformer_data_list += edges_data.get("transformer", {}).get("features", [])

# Column names, in the order the values appear in each feature row
ac_line_columns = [
    "angmin", "angmax", "b_fr", "b_to", "br_r", "br_x", "rate_a", "rate_b", "rate_c",
]
transformer_columns = [
    "angmin", "angmax", "br_r", "br_x", "rate_a", "rate_b", "rate_c", "tap", "shift", "b_fr", "b_to",
]

ac_line_df = pd.DataFrame(ac_line_data_list, columns=ac_line_columns)
transformer_df = pd.DataFrame(transformer_data_list, columns=transformer_columns)

# Schema summary of both frames (info() prints itself and returns None)
edge_info_sections = [
    ("AC Line DataFrame Info:", ac_line_df),
    ("Transformer DataFrame Info:", transformer_df),
]
for heading, frame in edge_info_sections:
    print(heading)
    print(frame.info(), "\n")

# First rows of both frames
edge_head_sections = [
    ("\nFirst few rows of AC Line Data:", ac_line_df),
    ("\nFirst few rows of Transformer Data:", transformer_df),
]
for heading, frame in edge_head_sections:
    print(heading)
    print(frame.head())

AC Line DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466800 entries, 0 to 466799
Data columns (total 9 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   angmin  466800 non-null  float64
 1   angmax  466800 non-null  float64
 2   b_fr    466800 non-null  float64
 3   b_to    466800 non-null  float64
 4   br_r    466800 non-null  float64
 5   br_x    466800 non-null  float64
 6   rate_a  466800 non-null  float64
 7   rate_b  466800 non-null  float64
 8   rate_c  466800 non-null  float64
dtypes: float64(9)
memory usage: 32.1 MB
None 

Transformer DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132900 entries, 0 to 132899
Data columns (total 11 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   angmin  132900 non-null  float64
 1   angmax  132900 non-null  float64
 2   br_r    132900 non-null  float64
 3   br_x    132900 non-null  float64
 4   rate_a  132900 non-null  flo

In [23]:
# Summary statistics for the grid.edges DataFrames
ac_line_stats, transformer_stats = (
    compute_statistics(frame) for frame in (ac_line_df, transformer_df)
)

# Print each table under its title, followed by a trailing newline
for title, stats in (
    ("AC-Line Data Statistics:\n", ac_line_stats),
    ("Transformer Data Statistics:\n", transformer_stats),
):
    print(title, stats, "\n")

AC-Line Data Statistics:
              Min        Max     Range      Mean       Std Dev
angmin -0.523599  -0.523599   0.00000 -0.523599  4.222294e-12
angmax  0.523599   0.523599   0.00000  0.523599  4.222294e-12
b_fr    0.000000   1.343250   1.34325  0.023031  7.747057e-02
b_to    0.000000   1.343250   1.34325  0.023031  7.747057e-02
br_r    0.000000   1.104500   1.10450  0.012116  3.676370e-02
br_x    0.000010   1.580000   1.57999  0.037743  8.219344e-02
rate_a  0.089700  30.969800  30.88010  2.606962  3.175603e+00
rate_b  0.100000  37.163700  37.06370  2.796723  3.412650e+00
rate_c  0.100000  46.454700  46.35470  3.157193  4.105513e+00 

Transformer Data Statistics:
              Min        Max     Range      Mean       Std Dev
angmin -0.523599  -0.523599   0.00000 -0.523599  1.168292e-12
angmax  0.523599   0.523599   0.00000  0.523599  1.168292e-12
br_r   -0.004670   0.010000   0.01467  0.004524  4.337811e-03
br_x   -0.013520   0.234690   0.24821  0.077965  3.234880e-02
rate_a  0.01

Now, we will make the dataframe for the objective (total generation cost). 

In [25]:
# One single-element row per file, holding that file's objective value
objective_data_list = []

for json_name in json_files:
    with open(os.path.join(json_dir, json_name), "r") as handle:
        sample_data = json.load(handle)

    # metadata -> objective; None when either level is missing
    objective_value = sample_data.get("metadata", {}).get("objective")
    objective_data_list.append([objective_value])

objective_columns = ["total_cost"]
objective_df = pd.DataFrame(objective_data_list, columns=objective_columns)

# Schema summary (info() prints itself and returns None)
print("Objective DataFrame Info:")
print(objective_df.info(), "\n")

# Preview the first rows
print("\nFirst few rows of Objective Data:")
print(objective_df.head())

Objective DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_cost  100 non-null    float64
dtypes: float64(1)
memory usage: 928.0 bytes
None 


First few rows of Objective Data:
     total_cost
0  2.252270e+06
1  2.256933e+06
2  2.268723e+06
3  2.266447e+06
4  2.232219e+06
