In [None]:
import requests
# import requests_cache
from dataclasses import dataclass, asdict
import numpy as np
import pandas as pd
import os, sys
import random
import math
import bisect
import arrow
import pytz
from datetime import datetime, date, timedelta
from timezonefinder import TimezoneFinder
from matplotlib import pyplot as plt, dates
from matplotlib.ticker import *
from carbon_api_client import *
from matplotlib_helper import *
from typing import List, Any
from dateutil import tz
from enum import Enum
from pprint import pprint
import functools

In [None]:
# Notebook-wide switches, read as globals by the functions defined below.
# When True, dates are anchored to UTC; otherwise get_carbon_intensity_data
# looks up the region's local timezone from its coordinates.
use_utc_time_of_day = True
# When True, the plotting functions also write their figure to a PNG file.
enable_savefig = False

In [None]:
# requests_cache.install_cache('http_cache', backend='filesystem')

In [None]:
class Statistics(Enum):
    """Names of summary statistics, each mapped to a lowercase string identifier.

    NOTE(review): the values look like pandas aggregation names ('mean', 'std', ...),
    but nothing visible in this notebook consumes the enum -- confirm intended use.
    """
    MEAN = 'mean'
    MEDIAN = 'median'
    STD = 'std'
    MIN = 'min'
    MAX = 'max'
    SIZE = 'size'

In [None]:
def round_down(dt: datetime, round_to: timedelta) -> datetime:
    """Floor ``dt`` to a multiple of ``round_to``, counted from ``datetime.min``.

    Microseconds are always discarded and the tzinfo of ``dt`` is preserved.
    """
    truncated = dt.replace(microsecond=0)
    # datetime.min is naive, so the elapsed time is measured on a naive copy.
    elapsed = truncated.replace(tzinfo=None) - datetime.min
    overshoot_seconds = elapsed.total_seconds() % round_to.total_seconds()
    return truncated - timedelta(seconds=overshoot_seconds)

In [None]:
# (vendor-or-grid label, region name) -> (latitude, longitude) in decimal degrees.
M_PUBLIC_CLOUD_LOCATION = {
    ('AWS', 'us-west-1'): (37.00578, -121.56828),
    ('AWS', 'us-west-2'): (45.840410, -119.289460),
    ('AWS', 'us-east-1'): (39.983334, -82.983330),
    ('AWS', 'us-east-2'): (39.040283, -77.485165),
    # Coordinates are not needed by the Azure carbon API, which relies on the region name.
    # Note: the entries below are Azure regions, but they are kept here so they can be
    # looked up by coordinates when calling into the c3lab API.
    ('Azure', 'eastus'):            (37.3719, -79.8164),
    ('Azure', 'eastus2'):           (36.6681, -78.3889),
    ('Azure', 'southcentralus'):    (29.4167, -98.5),
    ('Azure', 'westus2'):           (47.233, -119.852),
    ('Azure', 'westus3'):           (33.448376, -112.074036),
    ('Azure', 'centralus'):         (41.5908, -93.6208),
    ('Azure', 'eastus2euap'):       (36.6681, -78.3889),
    ('Azure', 'northcentralus'):    (41.8819, -87.6278),
    ('Azure', 'westus'):            (37.783, -122.417),
    ('Azure', 'centraluseuap'):     (41.5908, -93.6208),
    ('Azure', 'westcentralus'):     (40.890, -110.234),
    # Non-cloud entries, keyed by grid label and a city name.
    ('SPP', 'kansus'):              (39.071971, -94.663875),
    ('NY', 'syracuse'):             (43.0415, -76.14),
    ('PACW', 'modford'):            (42.3265, -122.8756),
    ('BPA', 'eugene'):              (44.0521, -123.0868),
}
def get_location_for_public_cloud(cloud_vendor, region):
    '''Return the (latitude, longitude) registered for a region, or (nan, nan) if unknown.'''
    unknown_location = (math.nan, math.nan)
    return M_PUBLIC_CLOUD_LOCATION.get((cloud_vendor, region), unknown_location)

In [None]:
# Azure region names grouped by geography; the lists are combined in later
# cells to build the (vendor, region) pairs that get pulled and plotted.

# US/Canada (the Canadian regions are currently disabled)
azure_regions_americas = [
    'westus',
    'westus2',
    'westcentralus',
    'westus3',
    'eastus',
    'centralus',
    'southcentralus',
    # 'canadacentral',
    # 'canadaeast',
]

# Europe
azure_regions_europe = [
    'uksouth',
    'francecentral',
    'germanywestcentral',
    'northeurope',
    'norwayeast',
    'swedencentral',
    'westeurope',
]

# Australia
azure_regions_aus = [
    'australiaeast',
    'australiasoutheast',
]

In [None]:
# Maps each Azure region name to an ISO / grid-region identifier string.
# NOTE(review): presumably these are the identifiers used by the carbon data
# provider; nothing visible in this notebook reads the mapping -- confirm.
m_azure_regions_to_isos = {
    # US/Canada
    'westus': 'CAISO_NORTH',
    'westus2': 'GCPD',
    'westcentralus': 'PACE',
    'westus3': 'AZPS',
    'eastus': 'PJM_ROANOKE',
    'centralus': 'MISO_MASON_CITY',
    'southcentralus': 'ERCOT_SANANTONIO',
    'canadacentral': 'IESO_NORTH',
    'canadaeast': 'HQ',
    # Europe
    'uksouth': 'UK',
    'francecentral': 'FR',
    'germanywestcentral': 'DE',
    'northeurope': 'IE',
    'norwayeast': 'NO',
    'swedencentral': 'SE',
    'westeurope': 'NL',
    # Australia
    'australiaeast': 'NEM_NSW',
    'australiasoutheast': 'NEM_VIC',
}

In [None]:
%%script false --no-raise-error

# run_count = 0

def plot_timeseries(data_array, plot_axis=None, timestamp_column_name='timestamp', prefix=None, use_relative_time=False, color=None, index=0):
    """Plot every non-timestamp field of a list of dataclass records against time.

    Args:
        data_array: Sequence of dataclass instances; each is converted with ``asdict``.
        plot_axis: Axes to draw on; defaults to the current axes.
        timestamp_column_name: Name of the field holding the x value.
        prefix: Optional label prefix (used alone when only one field is plotted).
        use_relative_time: If True, x is seconds elapsed since the first record.
        color: Line color passed to matplotlib.
        index: Starting index passed to ``get_linestyle``; incremented per field.

    Returns:
        A list with one ``Axes.plot`` result per plotted field (empty for empty input).
    """
    records = [asdict(entry) for entry in data_array]
    if not records:
        # Nothing to plot; the original code raised IndexError here.
        return []
    timestamps = [record[timestamp_column_name] for record in records]
    if use_relative_time:
        start_time = timestamps[0]
        x = [(t - start_time).total_seconds() for t in timestamps]
    else:
        x = timestamps
    data_keys = [key for key in records[0].keys() if key != timestamp_column_name]
    if data_keys and plot_axis is None:
        plot_axis = plt.gca()
    lines = []
    for key in data_keys:
        data_series = [record[key] for record in records]
        label = (('%s - ' % prefix if prefix else '') + key) if len(data_keys) > 1 else (prefix if prefix else '')
        line = plot_axis.plot(x, data_series, color=color, linestyle=get_linestyle(index), label=label, marker=None)
        if not use_relative_time:
            # Relative x values are plain floats with no tzinfo, so date
            # formatting is only applied to absolute timestamps.
            plot_axis.xaxis_date(tz=timestamps[0].tzinfo)
        index += 1
        lines.append(line)
    return lines

In [None]:
def plot_pd_timeseries(df: pd.DataFrame, errors=None, plot_axis=None, label="", use_relative_time=False, color=None, index=0):
    """Plot a DataFrame with ``DataFrame.plot``, optionally with error bars.

    Args:
        df: Frame to plot; its index is the x axis.
        errors: Optional y-error values forwarded as ``yerr`` (drawn with capsize 2).
        plot_axis: Axes to draw on; defaults to the current axes.
        label: Label forwarded to ``DataFrame.plot``.
        use_relative_time: Not supported; must be falsy.
        color: Line color; when None the next color from ``get_next_color`` is used.
        index: Index passed to ``get_linestyle`` to pick the line style.

    Returns:
        Whatever ``DataFrame.plot`` returns.

    Raises:
        NotImplementedError: If ``use_relative_time`` is truthy.
    """
    if use_relative_time:
        raise NotImplementedError()
    if plot_axis is None:
        plot_axis = plt.gca()
    if color is None:
        color = get_next_color()
    plot_kwargs = dict(ax=plot_axis, x_compat=True, color=color, linestyle=get_linestyle(index), label=label)
    if errors is not None:
        plot_kwargs.update(yerr=errors, capsize=2)
    return df.plot(**plot_kwargs)


In [None]:
def call_gsf_carbon_api_prediction(cloud_vendor: str, region: str, start: arrow.Arrow = None, end: arrow.Arrow = None) -> CarbonIntensityData:
    """Fetch the current carbon-intensity forecast for a region from the GSF carbon-aware API.

    Args:
        cloud_vendor: Vendor label stored on the returned object.
        region: Region name sent to the API as ``location``.
        start: Optional start of the forecast range (``dataStartAt``).
        end: Optional end of the forecast range; the request uses ``end`` minus 5 minutes.

    Returns:
        A ``CarbonIntensityData`` holding the forecast both as a list of
        ``TimestampdValue`` and as a pandas series.

    Raises:
        AssertionError: If the HTTP request fails or the response does not
            contain exactly one forecast element.
    """
    url_get_carbon_intensity = 'https://carbon-aware-api.azurewebsites.net/emissions/forecasts/current'
    # requests omits parameters whose value is None, so an unset start/end
    # simply leaves that bound out of the query.
    response = requests.get(url_get_carbon_intensity, params={
        'location': [region],
        'dataStartAt': start,
        'dataEndAt': end.shift(minutes=-5) if end is not None else None,
        # The API parameter is `windowSize`; the previous spelling `windowsSize`
        # was not a recognised parameter name.
        'windowSize': 5,
    })
    assert response.ok, "GSF carbon intensity prediction lookup failed (%d): %s" % (response.status_code, response.text)
    response_json = response.json()
    assert len(response_json) == 1, "Expected exactly one forecast, got %d" % len(response_json)

    forecast_entries = response_json[0]['forecastData']
    locations = {entry['location'] for entry in forecast_entries}
    timeseries = [
        TimestampdValue(arrow.get(entry['timestamp']).datetime, float(entry['value']))
        for entry in forecast_entries
    ]
    ds = create_pd_series([e.timestamp for e in timeseries], [e.value for e in timeseries])
    # Sorted so the combined name does not depend on set iteration order.
    iso = ','.join(sorted(locations))

    return CarbonIntensityData(cloud_vendor, region, iso, timeseries, ds)

In [None]:
def call_gsf_carbon_api(cloud_vendor: str, region: str, start: arrow.Arrow, end: arrow.Arrow, get_prediction=False) -> CarbonIntensityData:
    """Fetch carbon-intensity data for a region from the GSF carbon-aware API.

    Args:
        cloud_vendor: Vendor label stored on the returned object.
        region: Region name sent to the API as ``location``.
        start: Start of the requested range (``time``).
        end: End of the requested range (``toTime``).
        get_prediction: If True, delegate to ``call_gsf_carbon_api_prediction``.

    Returns:
        A ``CarbonIntensityData`` with ratings converted from lb/MWh to g/kWh.

    Raises:
        AssertionError: If the HTTP request fails.
    """
    if get_prediction:
        return call_gsf_carbon_api_prediction(cloud_vendor, region, start, end)

    # 1 lb = 453.59237 g and 1 MWh = 1000 kWh, so lb/MWh * 0.45359237 = g/kWh.
    # (The earlier "/ 2.2" approximation was about 0.2% too high.)
    lb_per_mwh_to_g_per_kwh = 453.59237 / 1000

    url_get_carbon_intensity = 'https://carbon-aware-api.azurewebsites.net/emissions/bylocations'
    response = requests.get(url_get_carbon_intensity, params={
        'location': [region],
        'time': start,
        'toTime': end,
    })
    assert response.ok, "GSF carbon intensity lookup failed (%d): %s" % (response.status_code, response.text)
    response_json = response.json()

    locations = {entry['location'] for entry in response_json}
    timeseries = [
        TimestampdValue(arrow.get(entry['time']).datetime, float(entry['rating']) * lb_per_mwh_to_g_per_kwh)
        for entry in response_json
    ]
    ds = create_pd_series([e.timestamp for e in timeseries], [e.value for e in timeseries])
    # Sorted so the combined name does not depend on set iteration order.
    iso = ','.join(sorted(locations))

    return CarbonIntensityData(cloud_vendor, region, iso, timeseries, ds)

In [None]:
def get_carbon_intensity_data(cloud_vendor, region, date:date = None, timerange:timedelta = timedelta(weeks=1),
                              use_utc_time_of_day = True, get_prediction=False,
                              desired_renewable_ratio: float = None) -> CarbonIntensityData:
    """Fetch carbon-intensity data for one region over ``[date, date + timerange)``.

    Every region is currently served by ``call_sysnet_carbon_intensity_api``
    using the coordinates from ``get_location_for_public_cloud``.
    The GSF path (``call_gsf_carbon_api``) is intentionally not used here; the
    earlier vendor-based dispatch was already disabled by a temporary override.

    Args:
        cloud_vendor: Vendor label of the region.
        region: Region name.
        date: Start of the range; defaults to the date one week ago.
        timerange: Length of the range.
        use_utc_time_of_day: Anchor ``date`` to UTC when True, otherwise to the
            local timezone found from the region's coordinates.
        get_prediction: Must be False; predictions are not supported on this path.
        desired_renewable_ratio: Forwarded unchanged to the API client.

    Returns:
        The ``CarbonIntensityData`` tagged with vendor and region, or None when
        the API client returns None.

    Raises:
        ValueError: If the region has no known coordinates, or no timezone can
            be determined for them.
        AssertionError: If ``get_prediction`` is not False.
    """
    print(cloud_vendor, region)
    (latitude, longitude) = get_location_for_public_cloud(cloud_vendor, region)
    if math.isnan(latitude) or math.isnan(longitude):
        # Fail fast instead of sending NaN coordinates to the lookups below.
        raise ValueError(f'Unsupported region {cloud_vendor}:{region}')
    if date is None:
        date = arrow.get().shift(weeks=-1).date()
    if use_utc_time_of_day:
        timezone = pytz.UTC
    else:
        timezone_str = TimezoneFinder().timezone_at(lng=longitude, lat=latitude)
        if timezone_str is None:
            raise ValueError(f'Cannot determine timezone for {cloud_vendor}:{region}')
        timezone = pytz.timezone(timezone_str)
    date = arrow.get(date, tzinfo=timezone)
    assert get_prediction is False, 'No prediction support for AWS regions yet'
    ci_data = call_sysnet_carbon_intensity_api(latitude, longitude, date, date + timerange, desired_renewable_ratio=desired_renewable_ratio)
    if ci_data is None:
        return None
    ci_data.cloud_vendor = cloud_vendor
    ci_data.region = region
    return ci_data

In [None]:
def print_carbon_intensity_stats(l_time_series: List["TimestampdValue"]):
    """Print the mean, minimum and maximum of the ``value`` field of a time series.

    An empty series prints a placeholder line instead of raising from the
    numpy min/max reductions.
    """
    l_carbon_intensity = [e.value for e in l_time_series]
    if not l_carbon_intensity:
        print('Avg/Min/Max carbon intensity: n/a (no data)')
        return
    print('Avg/Min/Max carbon intensity: %.2f/%.2f/%.2f' % (
        np.mean(l_carbon_intensity),
        np.min(l_carbon_intensity),
        np.max(l_carbon_intensity),
    ))

In [None]:
def find_overlap_diff_of_carbon_intensities(time_series_1: List["TimestampdValue"], time_series_2: List["TimestampdValue"]) -> List[float]:
    """Return ``series2 - series1`` evaluated at every timestamp present in either series.

    At a timestamp a series does not contain, its most recent earlier sample is
    used (or its first sample when the timestamp precedes all of them).

    Both inputs are sequences of objects with ``timestamp`` and ``value``
    attributes and must already be ordered by timestamp, because the lookup
    relies on ``bisect``. If either series is empty the result is an empty list.
    """
    if len(time_series_1) == 0 or len(time_series_2) == 0:
        return []

    def _value_lookup(series):
        """Build a function mapping a timestamp to the applicable sample value."""
        timestamps = [e.timestamp for e in series]
        first_position = {}
        for position, timestamp in enumerate(timestamps):
            # Keep the first occurrence, matching list.index() semantics.
            first_position.setdefault(timestamp, position)

        def lookup(timestamp):
            if timestamp in first_position:
                return series[first_position[timestamp]].value
            # Otherwise fall back to the previous sample, clamped to the first.
            return series[max(bisect.bisect(timestamps, timestamp) - 1, 0)].value

        return timestamps, lookup

    s1_timestamps, value_1_at = _value_lookup(time_series_1)
    s2_timestamps, value_2_at = _value_lookup(time_series_2)
    union_timestamps = sorted(set(s1_timestamps).union(s2_timestamps))
    return [value_2_at(timestamp) - value_1_at(timestamp) for timestamp in union_timestamps]

In [None]:
def find_overlap_interval_of_carbon_intensities(time_series_1: List["TimestampdValue"], time_series_2: List["TimestampdValue"]) -> \
        List[tuple[datetime, datetime]]:
    """Find the intervals where the first series is at or below the second.

    Both series are sampled at the union of their timestamps, using the most
    recent sample at or before each timestamp (or the first sample when the
    timestamp precedes all of them). An interval opens at the first timestamp
    where ``value1 <= value2`` and closes at the next timestamp where that no
    longer holds. An interval still open at the end of the data is closed at
    the last timestamp, unless it would have zero length.

    The inputs are sorted by timestamp internally so that sample values stay
    aligned with the timestamps used for the lookup. If either series is empty
    the result is an empty list.
    """
    print("Finding overlap in intervals")
    series_1 = sorted(time_series_1, key=lambda e: e.timestamp)
    series_2 = sorted(time_series_2, key=lambda e: e.timestamp)
    s1_timestamps = [e.timestamp for e in series_1]
    s2_timestamps = [e.timestamp for e in series_2]
    overlap_intervals: List[tuple[datetime, datetime]] = []
    if series_1 and series_2:
        union_timestamps = sorted(set(s1_timestamps).union(s2_timestamps))
        interval_start = None
        for curr_timestamp in union_timestamps:
            index1 = max(bisect.bisect(s1_timestamps, curr_timestamp) - 1, 0)
            index2 = max(bisect.bisect(s2_timestamps, curr_timestamp) - 1, 0)
            if series_1[index1].value <= series_2[index2].value:
                if interval_start is None:
                    interval_start = curr_timestamp
            elif interval_start is not None:
                overlap_intervals.append((interval_start, curr_timestamp))
                interval_start = None
        # Do not lose an overlap that lasts until the end of the data.
        if interval_start is not None and interval_start < union_timestamps[-1]:
            overlap_intervals.append((interval_start, union_timestamps[-1]))
    print("done")
    return overlap_intervals

In [None]:
def plot_overlap_interval_cdf(overlap_intervals: List[tuple[datetime, datetime]], label: str) -> None:
    """Plot the CDF of the interval durations, expressed in hours, under ``label``."""
    seconds_per_hour = timedelta(hours=1).total_seconds()
    durations_in_hours = []
    for interval in overlap_intervals:
        duration = interval[1] - interval[0]
        durations_in_hours.append(duration.total_seconds() / seconds_per_hour)
    plot_cdf_array(durations_in_hours, label)

In [None]:
def format_cloud_region_name(cloud_region: tuple[str, str], iso: str) -> str:
    """Build the display name "<vendor> <region> (<iso>)" for a cloud region."""
    vendor = cloud_region[0]
    region = cloud_region[1]
    return '{} {} ({})'.format(vendor, region, iso)

In [None]:
def pull_carbon_intensity_data(cloud_vendor_and_regions: list[tuple], start_date: datetime, end_date: datetime,
                               get_prediction=False,
                               desired_renewable_ratio: float = None):
    """Fetch carbon-intensity data for each (vendor, region) over [start_date, end_date).

    Regions for which no data comes back are left out. Each fetched series is
    resampled to 15-minute intervals and its summary statistics are printed.
    Reads the notebook-level ``use_utc_time_of_day`` flag.

    Returns:
        A dict mapping (cloud_vendor, region) to its ``CarbonIntensityData``.
    """
    print(f'Pulling carbon intensity data in range [{arrow.get(start_date)}, {arrow.get(end_date)}]')
    timerange = end_date - start_date
    data_by_region = {}
    for region_key in cloud_vendor_and_regions:
        (cloud_vendor, region) = region_key
        ci_data = get_carbon_intensity_data(
            cloud_vendor, region,
            date=start_date,
            timerange=timerange,
            use_utc_time_of_day=use_utc_time_of_day,
            get_prediction=get_prediction,
            desired_renewable_ratio=desired_renewable_ratio,
        )
        if ci_data:
            ci_data.set_timeseries_interval('15min')
            data_by_region[(cloud_vendor, region)] = ci_data
            print_carbon_intensity_stats(ci_data.timeseries)
    return data_by_region

In [None]:

def plot_carbon_intensity_time_series(all_region_time_series_data: dict[tuple, CarbonIntensityData], start_date: datetime, end_date: datetime, aggregate_by: timedelta = None, errorbar: bool = False):
    """Plot carbon intensity over time for every region on one new figure.

    Args:
        all_region_time_series_data: Maps (cloud_vendor, region) to its data.
        start_date: Start of the slice taken from each region's series.
        end_date: End of that slice.
        aggregate_by: If given, samples are grouped by their time offset modulo
            this period and the per-group mean is plotted instead of raw data.
        errorbar: With ``aggregate_by``, draw the per-group standard deviation
            as error bars.

    Reads the notebook globals ``use_utc_time_of_day`` and ``enable_savefig``;
    ``desired_renewable_ratio`` is read too when it is defined and is then
    mentioned in the title. The figure is saved to a PNG when ``enable_savefig``
    is true.
    """
    plt.figure(figsize=(12, 4.8))
    period_origin = datetime.min.replace(tzinfo=tz.UTC)
    # Number of groups in the most recently aggregated region; used to scale tick labels.
    group_count = None
    for (cloud_vendor, region), carbon_intensity_data in all_region_time_series_data.items():
        prefix = f'{cloud_vendor} {region} ({carbon_intensity_data.iso})'
        print(f'Region: {prefix}')
        time_series_data = carbon_intensity_data.timeseries_pd
        print('Available data range: ', time_series_data.index.min(), time_series_data.index.max())
        df = time_series_data.loc[start_date:end_date].to_frame(prefix)
        if aggregate_by is None:
            plot_pd_timeseries(df, use_relative_time=False)
            continue
        # Fold all samples onto one period so matching offsets share a group.
        grouped = df.assign(
            time=lambda x: (x.index.to_pydatetime() - period_origin) % aggregate_by
        ).groupby('time', as_index=True)
        means = grouped.mean()
        group_count = means.index.size
        if errorbar:
            plot_pd_timeseries(means, errors=grouped.std())
        else:
            plot_pd_timeseries(means)
    window_size = end_date - start_date
    ax = plt.gca()
    if aggregate_by is not None:
        xlabel = 'Time in period'
        if group_count:
            # Bind the count now; the formatter only runs when the figure is drawn.
            ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: str(x / group_count * aggregate_by)))
    else:
        if window_size.total_seconds() == timedelta(days=1).total_seconds():
            date_formatter_string = "%H:%M"
            xlabel = f'Time of day ({"UTC" if use_utc_time_of_day else "local"})'
        else:
            date_formatter_string = "%Y/%m/%d"
            xlabel = 'Date'
        ax.xaxis.set_major_formatter(dates.DateFormatter(date_formatter_string))
    plt.xlabel(xlabel)
    plt.ylabel('Carbon intensity (gCO2/kWh)')
    title_metric = 'carbon intensity'
    title = f'{window_size.days}-day {title_metric} in [{start_date.strftime("%Y-%m-%d")}, {end_date.strftime("%Y-%m-%d")})'
    # The ratio is set in a later cell, so tolerate it not being defined yet.
    renewable_ratio = globals().get('desired_renewable_ratio')
    if renewable_ratio is not None:
        title += f' ({renewable_ratio*100}% renewable)'
    if aggregate_by is not None:
        title += f', grouped into {aggregate_by}'
    plt.title(title)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=4)
    plt.xticks(rotation=15)
    plt.tight_layout()
    savefig_filename = 'carbon-intensity.timeseries.%s-%s.png' % (start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d"))
    if enable_savefig:
        plt.savefig(savefig_filename)
In [None]:
def plot_carbon_intensity_overlap_cdf(all_region_time_series_data: dict[tuple, CarbonIntensityData], l_cloud_region_pairs: list[tuple[tuple, tuple]], start_date: datetime, end_date: datetime):
    """Plot, for each region pair, the CDF of how long the first region stays at or below the second.

    Args:
        all_region_time_series_data: Maps (cloud_vendor, region) to its data.
        l_cloud_region_pairs: Pairs of (cloud_vendor, region) keys to compare.
        start_date: Start of the range, used in the title and file name.
        end_date: End of the range, used in the title and file name.

    The figure is saved to a PNG when the notebook global ``enable_savefig`` is true.
    """
    plt.figure()
    for (cloud_region1, cloud_region2) in l_cloud_region_pairs:
        carbon_data1 = all_region_time_series_data[cloud_region1]
        carbon_data2 = all_region_time_series_data[cloud_region2]
        region1_name = format_cloud_region_name(cloud_region1, carbon_data1.iso)
        region2_name = format_cloud_region_name(cloud_region2, carbon_data2.iso)
        overlap_intervals = find_overlap_interval_of_carbon_intensities(carbon_data1.timeseries,
                                                                        carbon_data2.timeseries)
        print("plotting CDF")
        plot_overlap_interval_cdf(overlap_intervals, f'{region1_name} < {region2_name}')
        print("done")
    plt.xlabel('Overlap (hours)')
    plt.ylabel('CDF')
    plt.title('Carbon intensity overlap in [%s,%s)' % (start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')))
    plt.grid()
    # Single legend placed below the axes (an earlier default-position legend call was redundant).
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2))
    plt.tight_layout()
    savefig_filename = 'carbon-intensity.overlap.%s-%s.png' % (start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d"))
    if enable_savefig:
        plt.savefig(savefig_filename)

In [None]:
# Defaults, so later cells still run when the optional selection cells are disabled.
# (vendor, region) pairs to pull data for.
cloud_vendor_and_regions = []
# Pairs of (vendor, region) keys to compare against each other.
l_cloud_region_pairs = []

In [None]:
%%script false --no-raise-error

# Optional selection: every Azure region in the Americas and Europe lists.
# cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_americas[0:2]))
# cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_americas))
cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_americas + azure_regions_europe))

In [None]:
%%script false --no-raise-error

# Optional selection: a hand-picked subset of Azure regions plus the pairs to compare.
# cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_americas))
# cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_europe))

cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), [
    'centralus',
    'westus',
    # 'germanywestcentral', # too high
    # 'francecentral',  # too low
    'uksouth',
    # 'northeurope',
]))

# NOTE(review): 'eastus' appears in the pairs below but not in the region list above,
# so a lookup for it would fail unless it is pulled elsewhere -- confirm.
l_cloud_region_pairs = [
    (('Azure', 'eastus'), ('Azure', 'uksouth')),
    (('Azure', 'eastus'), ('Azure', 'westus')),
]

In [None]:
%%script false --no-raise-error

# Forecast window: the next 5-minute boundary in UTC, plus one day.
# `tz.UTC` (dateutil) is used because `timezone` is not among this notebook's imports.
utcnow = arrow.get(round_down(datetime.now(tz.UTC), timedelta(minutes=5))).shift(minutes=5)
print(utcnow)
all_prediction_data = pull_carbon_intensity_data(cloud_vendor_and_regions, utcnow, utcnow.shift(days=1), get_prediction=True)
plot_carbon_intensity_time_series(all_prediction_data, utcnow.datetime, utcnow.shift(days=1).datetime, errorbar=False)

In [None]:
# %%script false --no-raise-error

# Active selection of (vendor, region) pairs. Each pair must match a key of
# M_PUBLIC_CLOUD_LOCATION exactly, including the spellings 'modford' and 'kansus'.
cloud_vendor_and_regions = [
    # ('AWS', 'us-west-1'),
    # ('AWS', 'us-west-2'),
    # ('AWS', 'us-east-1'),
    # AWS us-east-2 uses the same ISO as us-east-1
    # ('AWS', 'us-east-2'),

    # This covers all c3lab regions
    ('Azure', 'westus'),        # US-CAISO
    ('BPA', 'eugene'),          # US-BPA
    ('PACW', 'modford'),        # US-PACW
    ('NY', 'syracuse'),         # US-NY
    ('SPP', 'kansus'),          # US-SPP
    ('Azure', 'southcentralus'),# US-ERCOT
    ('Azure', 'eastus'),        # US-PJM
    ('Azure', 'centralus'),     # US-MISO
]

# l_cloud_region_pairs = [
#     (('AWS', 'us-east-1'), ('AWS', 'us-west-1')),
#     (('AWS', 'us-east-1'), ('AWS', 'us-west-2'))
# ]

In [None]:
# Pull historical data in windows of `window_size`, walking backwards from base_start_date.
window_size = timedelta(days=365)
# base_start_date = datetime.utcnow().date()
base_start_date = datetime(2023, 1, 1, tzinfo=tz.UTC)
# Forwarded to the data pull; also read as a global by plot_carbon_intensity_time_series.
desired_renewable_ratio = None
# Keyed by (start_date, end_date); each value is the per-region dict from pull_carbon_intensity_data.
d_region_time_series_data_by_offset = {}
for offset in range(1):
    # Offset 0 is the window ending at base_start_date; each further offset is one window earlier.
    start_date = arrow.get(base_start_date) + (window_size * -(1 + offset))
    end_date = start_date + window_size
    d_region_time_series_data_by_offset[(start_date, end_date)] = pull_carbon_intensity_data(cloud_vendor_and_regions, start_date, end_date, desired_renewable_ratio=desired_renewable_ratio)

In [None]:
# Single time range data, mean + errorbar
# One figure per pulled window: samples are folded onto a 1-day period and the
# per-offset mean is drawn with standard-deviation error bars.

for start_date, end_date in d_region_time_series_data_by_offset:
    all_region_time_series_data = d_region_time_series_data_by_offset[(start_date, end_date)]
    # The dict keys are Arrow objects; `.datetime` converts them for the plotting function.
    plot_carbon_intensity_time_series(all_region_time_series_data, start_date.shift(days=0).datetime, end_date.datetime, errorbar=True, aggregate_by=timedelta(days=1))
    # break

# Applies to the current (most recently created) figure only.
plt.ylim(0, None)

In [None]:
# Check for continuous time range: for each region, consecutive pulled windows
# must touch (each window starts where the previous one ended).
l_timeranges_by_region = {}
for start_date, end_date in d_region_time_series_data_by_offset:
    all_region_time_series_data = d_region_time_series_data_by_offset[(start_date, end_date)]
    for (cloud_vendor, region) in all_region_time_series_data:
        l_timeranges_by_region.setdefault((cloud_vendor, region), []).append((start_date, end_date))
for (cloud_vendor, region) in cloud_vendor_and_regions:
    # Regions for which the pull returned no data are absent from the dict;
    # treat them as having no time ranges instead of raising KeyError.
    time_ranges = sorted(l_timeranges_by_region.get((cloud_vendor, region), []))
    if len(time_ranges) == 0: continue
    last_end = time_ranges[0][1]
    for (start, end) in time_ranges[1:]:
        if start != last_end:
            raise ValueError(f"Time range for {cloud_vendor}:{region} discontinuted between {last_end} and {start}")
        last_end = end

In [None]:
# Merge multiple time range data, mean (+ errorbar)
# For each region, merge its data across all pulled windows, then plot the
# per-offset daily mean over the full covered range.

all_region_merged_carbon_data = {}
min_start_date = datetime.max.replace(tzinfo=tz.UTC)
max_end_date = datetime.min.replace(tzinfo=tz.UTC)
for (cloud_vendor, region) in cloud_vendor_and_regions:
    print(cloud_vendor, region)
    l_carbon_data = []
    for start_date, end_date in d_region_time_series_data_by_offset:
        region_data = d_region_time_series_data_by_offset[(start_date, end_date)]
        min_start_date = min(min_start_date, start_date)
        max_end_date = max(max_end_date, end_date)
        # The pull skips regions with no data, so a region may be missing from a window.
        if (cloud_vendor, region) in region_data:
            l_carbon_data.append(region_data[(cloud_vendor, region)])
    if not l_carbon_data:
        continue
    all_region_merged_carbon_data[(cloud_vendor, region)] = CarbonIntensityData.merge(l_carbon_data)
# With no merged data the date bounds are still the plain-datetime sentinels
# (no `.datetime` attribute) and there is nothing to plot.
if all_region_merged_carbon_data:
    plot_carbon_intensity_time_series(all_region_merged_carbon_data, min_start_date.datetime, max_end_date.datetime, aggregate_by=timedelta(days=1), errorbar=False)


In [None]:
%%script false --no-raise-error

# Filter select regions for plotting
# NOTE(review): this reassigns the notebook-wide `cloud_vendor_and_regions`, and the
# regions chosen here must already exist in `all_region_merged_carbon_data` -- confirm.

# cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_americas))
cloud_vendor_and_regions = list(map(lambda r: ('Azure', r), azure_regions_europe))

filtered_region_merged_carbon_data = {}
# Sentinels, narrowed below to the range actually covered by the selected series.
min_start_date = datetime.max.replace(tzinfo=tz.UTC)
max_end_date = datetime.min.replace(tzinfo=tz.UTC)
for (cloud_vendor, region) in cloud_vendor_and_regions:
    print(cloud_vendor, region)
    carbon_intensity_data = all_region_merged_carbon_data[(cloud_vendor, region)]
    min_start_date = min(min_start_date, carbon_intensity_data.timeseries_pd.index.min())
    max_end_date = max(max_end_date, carbon_intensity_data.timeseries_pd.index.max())
    filtered_region_merged_carbon_data[(cloud_vendor, region)] = carbon_intensity_data
plot_carbon_intensity_time_series(filtered_region_merged_carbon_data, min_start_date, max_end_date, aggregate_by=timedelta(days=1), errorbar=True)
# Saved unconditionally, regardless of the `enable_savefig` flag.
plt.savefig('carbon_intensity.daily_average.azure.eu.errorbar.png')

In [None]:
# Take the first pulled window and pick one region's series for the ad-hoc checks below.
k, v = next(iter(d_region_time_series_data_by_offset.items()))
ci_region_key = ('AWS', 'us-west-1')
if ci_region_key not in v:
    # The AWS entries are commented out of cloud_vendor_and_regions, so this key
    # is normally absent. Fall back to ('Azure', 'westus').
    # NOTE(review): assumes westus is an acceptable stand-in for us-west-1 -- confirm.
    ci_region_key = ('Azure', 'westus')
ci_uswest1 = v[ci_region_key]
df = ci_uswest1.timeseries_pd

In [None]:
# Slice one hour out of the series by label range on its datetime index.
print('df:', type(df), len(df))
# Wrong: a chained comparison cannot be applied element-wise
# df.loc[(pd.to_datetime(datetime(2022, 9, 30, 23, tzinfo=tz.UTC)) > df.index > pd.to_datetime(datetime(2022, 9, 30, 22, tzinfo=tz.UTC)))].index
# Okay, but only expresses one condition
# df.loc[df.index > pd.to_datetime(datetime(2022, 9, 30, 22, tzinfo=tz.UTC))].index
filtered = df.loc[datetime(2022, 9, 30, 22, tzinfo=tz.UTC):datetime(2022, 9, 30, 23, tzinfo=tz.UTC)]
print('filtered: ', type(filtered), len(filtered))

In [None]:
# Source: https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
# Demo: fold the filtered hour onto a 30-minute period and aggregate each offset.

series_name = "AWS:us-west-1"

df = filtered.to_frame(series_name)
dt = timedelta(minutes=30)
# Group key = each timestamp's offset within its 30-minute period.
df_agg = df.assign(time=lambda x: (x.index.to_pydatetime() - datetime.min.replace(tzinfo=tz.UTC)) % dt).groupby('time').agg(['min', 'max', 'mean', 'std'])
print(df_agg)
# Only the per-offset minimum is plotted.
df_agg[series_name]['min'].plot()
plt.ylim(0, 200)
plt.legend()
# df_agg.plot()

In [None]:
%%script false --no-raise-error

# Concatenate two series with different index range

# Example:
s1 = pd.Series({datetime(2022, 1, 1, 0): 0, datetime(2022, 1, 1, 1): 1})
s2 = pd.Series({datetime(2022, 1, 1, 2): 2, datetime(2022, 1, 1, 3): 3})
# Concatenate in any order, then sort by index to restore chronological order.
combined = pd.concat([s2, s1]).sort_index()
# Keep only the first entry for any timestamp that appears more than once.
dedupped = combined[~combined.index.duplicated(keep='first')]
In [None]:
%%script false --no-raise-error

# One overlap-duration CDF figure per pulled window, for the configured region pairs.
for start_date, end_date in d_region_time_series_data_by_offset:
    all_region_time_series_data = d_region_time_series_data_by_offset[(start_date, end_date)]
    plot_carbon_intensity_overlap_cdf(all_region_time_series_data, l_cloud_region_pairs, start_date, end_date)

In [None]:
def create_diff_carbon_intensity_data(ci1: CarbonIntensityData, ci2: CarbonIntensityData, diff_timeseries: list[TimestampdValue], diff_ds: pd.Series) -> CarbonIntensityData:
    """Wrap a precomputed difference series in a ``CarbonIntensityData`` labelled 'diff'.

    The region name is "(<vendor1>:<region1> - <vendor2>:<region2>)" and the
    ISO name is "<iso1> - <iso2>".
    """
    first_label = '{}:{}'.format(ci1.cloud_vendor, ci1.region)
    second_label = '{}:{}'.format(ci2.cloud_vendor, ci2.region)
    region_label = '({} - {})'.format(first_label, second_label)
    iso_label = '{} - {}'.format(ci1.iso, ci2.iso)
    return CarbonIntensityData('diff', region_label, iso_label, diff_timeseries, diff_ds)

In [None]:
def get_diff_carbon_intensity(ci1: CarbonIntensityData, ci2: CarbonIntensityData) -> CarbonIntensityData:
    """Compute ``ci1 - ci2`` over the union of both series' timestamps.

    Each series is reindexed onto the combined, sorted index with forward fill,
    so a timestamp missing from one series uses its most recent earlier value.
    Timestamps that precede the first sample of either series have no value to
    fill from and are dropped from the result.

    Returns:
        A 'diff' ``CarbonIntensityData`` built by ``create_diff_carbon_intensity_data``.
    """
    ds1 = ci1.timeseries_pd
    ds2 = ci2.timeseries_pd
    combined_index = sorted(set(ds1.index.tolist() + ds2.index.tolist()))
    ds1 = ds1.reindex(combined_index, method='ffill')
    ds2 = ds2.reindex(combined_index, method='ffill')
    # dropna() returns a new series, so its result must be kept.
    diff_ds = (ds1 - ds2).dropna()
    diff_timeseries = CarbonIntensityData.create_timeseries_from_pd(diff_ds)
    return create_diff_carbon_intensity_data(ci1, ci2, diff_timeseries, diff_ds)


In [None]:
# For each pulled window, plot the first day of the pairwise carbon-intensity
# differences, with a horizontal line at zero for reference.
for start_date, end_date in d_region_time_series_data_by_offset:
    all_region_time_series_data = d_region_time_series_data_by_offset[(start_date, end_date)]
    diff_time_series_data = {}
    for cr1, cr2 in l_cloud_region_pairs:
        ci_data_diff = get_diff_carbon_intensity(
            all_region_time_series_data[cr1],
            all_region_time_series_data[cr2]
        )
        diff_time_series_data[(ci_data_diff.cloud_vendor, ci_data_diff.region)] = ci_data_diff
    plot_end_date = start_date + timedelta(days=1)
    if diff_time_series_data:
        # The dict keys are Arrow objects; pass plain datetimes, as the other plotting cells do.
        plot_carbon_intensity_time_series(diff_time_series_data, start_date.datetime, plot_end_date.datetime)
        plt.axhline(y=0, color='k')


In [None]:
# Show which (start_date, end_date) windows have been pulled.
d_region_time_series_data_by_offset.keys()