In [41]:
from dateutil import parser
from dateutil.tz import gettz
from datetime import timedelta

import pandas as pd

from util.util_data import read_data, load_json_column, unnest_dict, get_incident_id, write_partitioned_data

## 0. Metrics

- incident_id
- Incident_Title
- incident_impact_level
    - 0: impact-none
    - 1: impact-minor
    - 2: impact-major
    - 3: impact-critical
    - 4: impact-maintenance
- Impact_Color
- {service} (0 or 1)
- {stage}_flag (0 or 1)
- {stage}_timestamp
- {stage}_description

## 1. Incident stages
### 1.1 Parse incident stages and save

In [42]:
def get_services(service_str):
    if pd.isna(service_str):
        return []
    else:        
        service_str = service_str.split(':')[-1].rstrip('.')
        service_str = service_str.replace("and", ",")
        services = service_str.split(',')
        return [service.strip() for service in services]

In [43]:
def parse_update_time(timestamp_str):
    tzinfos = {
        "PST": gettz("America/Los_Angeles"),
        "PDT": gettz("America/Los_Angeles")
    }
    timestamp_str = " ".join(timestamp_str.split(". ")[1].split(" - "))
    timestamp = parser.parse(timestamp_str, ignoretz=False, tzinfos=tzinfos)
    return timestamp.astimezone(gettz('UTC'))

def parse_updates(updates_list, stages=None):
    """
    Parse updates list to a dictionary with keys like 'Investigating_flag', 'Investigating_timestamp', 'Investigating_description. All the stage='Update' are not included in the dictionary.
    """
    if stages is None:
        stages = ["Investigating", "Identified", "Monitoring", "Resolved", "Postmortem"]
    updates_dict = {}
    
    for stage in stages:
        stage = stage.lower()
        updates_dict[f'{stage}_flag'] = int(0)
        updates_dict[f'{stage}_timestamp'] = None
        updates_dict[f'{stage}_description'] = None
    
    for update in updates_list:
        stage = update.get("Update_Title")
        if stage in stages:
            stage = stage.lower()
            updates_dict[f'{stage}_flag'] = int(1)
            updates_dict[f'{stage}_timestamp'] = parse_update_time(update.get("Update_Timestamp"))
            updates_dict[f'{stage}_description'] = update.get("Update_Body")
    return updates_dict

def get_incident_provider(incident_url):
    if pd.isna(incident_url):
        return incident_url
    else:
        return incident_url.split('/')[2].split('.')[1]


In [44]:
# load data
# df = pd.read_csv('data/raw/incident/incident_history_test.csv')

DATA_TYPE = 'incident'
DATA_INPUT_DIR = f'data/raw/{DATA_TYPE}/2024-08-31'
DATA_OUTPUT_LAYER = 'clean'

df = read_data(DATA_INPUT_DIR)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df = load_json_column(df, 'Updates')

# add a column of where the data is from
df['provider'] = df['Incident_Link'].apply(get_incident_provider)
df['provider'].value_counts()

loading: data/raw/incident/2024-08-31/anthropic/incident_history_202312_202402.csv
loading: data/raw/incident/2024-08-31/anthropic/incident_history_202309_202311.csv
loading: data/raw/incident/2024-08-31/anthropic/incident_history_202403_202405.csv
loading: data/raw/incident/2024-08-31/anthropic/incident_history_202303_202305.csv
loading: data/raw/incident/2024-08-31/anthropic/incident_history_202306_202308.csv
loading: data/raw/incident/2024-08-31/anthropic/incident_history_202406_202408.csv
loading: data/raw/incident/2024-08-31/characterai/incident_history_202312_202402.csv
loading: data/raw/incident/2024-08-31/characterai/incident_history_202309_202311.csv
loading: data/raw/incident/2024-08-31/characterai/incident_history_202403_202405.csv
loading: data/raw/incident/2024-08-31/characterai/incident_history_202406_202408.csv
loading: data/raw/incident/2024-08-31/openai/incident_history_202312_202402.csv
loading: data/raw/incident/2024-08-31/openai/incident_history_202309_202311.csv
lo

provider
openai       365
anthropic    141
character     36
Name: count, dtype: int64

In [45]:
df

Unnamed: 0,Incident_Title,Incident_Link,Incident_color,Incident_Impact,Updates,Service,provider
0,Elevated Errors,https://status.anthropic.com/incidents/qv89mzd...,#f1c40f,impact-minor,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic
1,Elevated error rate on claude.ai,https://status.anthropic.com/incidents/tmsczhz...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...",,anthropic
2,"Elevated error rate on Claude-2.0, Claude-2.1",https://status.anthropic.com/incidents/t6v85cs...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic
3,"Elevated error rate on Claude-2.0, Claude-2.1",https://status.anthropic.com/incidents/x5f4s8n...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic
4,elevated error rates on claude-instant-1.2,https://status.anthropic.com/incidents/gr7nb0d...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic
...,...,...,...,...,...,...,...
537,Outage on some Ada and Babbage models,https://status.openai.com/incidents/qz0h6b7jw268,#e86c09,impact-major,"[{'Update_Title': 'Postmortem', 'Update_Body':...",This incident affected: API.,openai
538,Cluster Outage,https://status.openai.com/incidents/7h9pjy5d1lz5,#e86c09,impact-major,"[{'Update_Title': 'Postmortem', 'Update_Body':...",This incident affected: API.,openai
539,Logged in websites unavailable,https://status.openai.com/incidents/jzk2gzhfc34g,#e86c09,impact-major,"[{'Update_Title': 'Resolved', 'Update_Body': '...",This incident affected: Playground.,openai
540,Base babbage model currently down,https://status.openai.com/incidents/2bxby42r6wv4,#e86c09,impact-major,"[{'Update_Title': 'Resolved', 'Update_Body': '...",This incident affected: API and Playground.,openai


In [46]:
# get incident id
df['incident_id'] = df['Incident_Link'].apply(get_incident_id)

# numerize incident impact level
impact_mapping = {
    'impact-none': 0,
    'impact-minor': 1,
    'impact-major': 2,
    'impact-critical': 3,
    'impact-maintenance': 4
}
# reverse mapping
impact_mapping_reverse = {v: k for k, v in impact_mapping.items()}
df['incident_impact_level'] = df['Incident_Impact'].map(impact_mapping)

# parse services
df['services'] = df['Service'].apply(get_services)
all_services = ['Playground', 'API', 'Labs', 'ChatGPT', 'api.anthropic.com', 'claude.ai', 'console.anthropic.com', 'Character.AI'] 
for service in all_services:
    df[service] = df['services'].apply(lambda x: 1 if service in x else 0)

In [47]:
df

Unnamed: 0,Incident_Title,Incident_Link,Incident_color,Incident_Impact,Updates,Service,provider,incident_id,incident_impact_level,services,Playground,API,Labs,ChatGPT,api.anthropic.com,claude.ai,console.anthropic.com,Character.AI
0,Elevated Errors,https://status.anthropic.com/incidents/qv89mzd...,#f1c40f,impact-minor,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic,qv89mzdjvtrs,1,"[claude.ai, console.anthropic.com, , api.anthr...",0,0,0,0,1,1,1,0
1,Elevated error rate on claude.ai,https://status.anthropic.com/incidents/tmsczhz...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...",,anthropic,tmsczhzhjd63,0,[],0,0,0,0,0,0,0,0
2,"Elevated error rate on Claude-2.0, Claude-2.1",https://status.anthropic.com/incidents/t6v85cs...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic,t6v85cs8j6jb,0,"[claude.ai, console.anthropic.com, , api.anthr...",0,0,0,0,1,1,1,0
3,"Elevated error rate on Claude-2.0, Claude-2.1",https://status.anthropic.com/incidents/x5f4s8n...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic,x5f4s8nvkyqq,0,"[claude.ai, console.anthropic.com, , api.anthr...",0,0,0,0,1,1,1,0
4,elevated error rates on claude-instant-1.2,https://status.anthropic.com/incidents/gr7nb0d...,#333333,impact-none,"[{'Update_Title': 'Resolved', 'Update_Body': '...","This incident affected: claude.ai, console.ant...",anthropic,gr7nb0dwyydw,0,"[claude.ai, console.anthropic.com, , api.anthr...",0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,Outage on some Ada and Babbage models,https://status.openai.com/incidents/qz0h6b7jw268,#e86c09,impact-major,"[{'Update_Title': 'Postmortem', 'Update_Body':...",This incident affected: API.,openai,qz0h6b7jw268,2,[API],0,1,0,0,0,0,0,0
538,Cluster Outage,https://status.openai.com/incidents/7h9pjy5d1lz5,#e86c09,impact-major,"[{'Update_Title': 'Postmortem', 'Update_Body':...",This incident affected: API.,openai,7h9pjy5d1lz5,2,[API],0,1,0,0,0,0,0,0
539,Logged in websites unavailable,https://status.openai.com/incidents/jzk2gzhfc34g,#e86c09,impact-major,"[{'Update_Title': 'Resolved', 'Update_Body': '...",This incident affected: Playground.,openai,jzk2gzhfc34g,2,[Playground],1,0,0,0,0,0,0,0
540,Base babbage model currently down,https://status.openai.com/incidents/2bxby42r6wv4,#e86c09,impact-major,"[{'Update_Title': 'Resolved', 'Update_Body': '...",This incident affected: API and Playground.,openai,2bxby42r6wv4,2,"[API, Playground]",1,1,0,0,0,0,0,0


In [48]:
# unnest stages
df_stages = df.copy()
df_stages['parsed_updates'] = df_stages['Updates'].apply(parse_updates)
df_stages = unnest_dict(df_stages, 'parsed_updates')

# time span
cols_timestamp = [col for col in df_stages.columns if  'timestamp' in col ]
cols_timestamp.pop() # remove the postmortem timestamp
df_stages['start_timestamp'] = df_stages[cols_timestamp].min(axis=1)
df_stages['close_timestamp'] = df_stages[cols_timestamp].max(axis=1)
df_stages['time_span'] = df_stages['close_timestamp'] - df_stages['start_timestamp']
df_stages['over_one_day'] = df_stages['time_span'] > timedelta(days=1)

# reorder columns
cols_incident_info = ['incident_id', 'Incident_Title', 'incident_impact_level', 'Incident_color', 'provider']
cols_services = all_services
cols_stages = [col for col in df_stages.columns if 'flag' in col or 'timestamp' in col or 'description' in col]
cols_timespan = ['time_span', 'over_one_day']

df_stages_reordered = df_stages[cols_incident_info + cols_services + cols_stages + cols_timespan]

# write to csv
stages_path = write_partitioned_data(df_stages_reordered, DATA_OUTPUT_LAYER, DATA_TYPE, 'incident_stages.csv')

writing data to: data/clean/incident/2024-08-31/incident_stages.csv


In [49]:
df_stages_reordered

Unnamed: 0,incident_id,Incident_Title,incident_impact_level,Incident_color,provider,Playground,API,Labs,ChatGPT,api.anthropic.com,...,resolved_flag,resolved_timestamp,resolved_description,postmortem_flag,postmortem_timestamp,postmortem_description,start_timestamp,close_timestamp,time_span,over_one_day
0,qv89mzdjvtrs,Elevated Errors,1,#f1c40f,anthropic,0,0,0,0,1,...,1,2024-02-26 18:21:00+00:00,This incident has been resolved.,0,NaT,,2024-02-26 17:09:00+00:00,2024-02-26 18:21:00+00:00,0 days 01:12:00,False
1,tmsczhzhjd63,Elevated error rate on claude.ai,0,#333333,anthropic,0,0,0,0,0,...,1,2024-02-16 20:35:00+00:00,We experienced an elevated level of errors on ...,0,NaT,,2024-02-16 20:35:00+00:00,2024-02-16 20:35:00+00:00,0 days 00:00:00,False
2,t6v85cs8j6jb,"Elevated error rate on Claude-2.0, Claude-2.1",0,#333333,anthropic,0,0,0,0,1,...,1,2024-02-13 04:36:00+00:00,This incident has been resolved.,0,NaT,,2024-02-13 03:59:00+00:00,2024-02-13 04:36:00+00:00,0 days 00:37:00,False
3,x5f4s8nvkyqq,"Elevated error rate on Claude-2.0, Claude-2.1",0,#333333,anthropic,0,0,0,0,1,...,1,2024-02-13 01:26:00+00:00,This incident has been resolved.,0,NaT,,2024-02-13 00:21:00+00:00,2024-02-13 01:26:00+00:00,0 days 01:05:00,False
4,gr7nb0dwyydw,elevated error rates on claude-instant-1.2,0,#333333,anthropic,0,0,0,0,1,...,1,2024-02-06 01:55:00+00:00,This incident has been resolved.,0,NaT,,2024-02-05 23:07:00+00:00,2024-02-06 01:55:00+00:00,0 days 02:48:00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,qz0h6b7jw268,Outage on some Ada and Babbage models,2,#e86c09,openai,0,1,0,0,0,...,1,2022-07-07 08:12:00+00:00,All models have recovered and are operational....,1,2022-07-12 01:03:00+00:00,"Summary:\nOn July 6, 2022 22:17 PDT (July 7 05...",2022-07-07 06:07:00+00:00,2022-07-07 08:12:00+00:00,0 days 02:05:00,False
538,7h9pjy5d1lz5,Cluster Outage,2,#e86c09,openai,0,1,0,0,0,...,1,2022-06-24 15:25:00+00:00,All models are operational. Thank you for you...,1,2022-06-28 03:20:00+00:00,"Summary:\nOn June 23, 2022 18:56 PDT (June 24 ...",2022-06-24 02:02:00+00:00,2022-06-24 15:25:00+00:00,0 days 13:23:00,False
539,jzk2gzhfc34g,Logged in websites unavailable,2,#e86c09,openai,1,0,0,0,0,...,1,2022-06-21 07:38:00+00:00,A fix has been implemented by our service prov...,0,NaT,,2022-06-21 07:00:00+00:00,2022-06-21 07:38:00+00:00,0 days 00:38:00,False
540,2bxby42r6wv4,Base babbage model currently down,2,#e86c09,openai,1,1,0,0,0,...,1,2022-06-20 17:06:00+00:00,Serving of babbage models is now healthy. The ...,0,NaT,,2022-06-20 16:34:00+00:00,2022-06-20 17:06:00+00:00,0 days 00:32:00,False


### 1.2 Incident Stages Overview

In [57]:
df_openai = df_stages_reordered.loc[df_stages_reordered['provider'] == 'openai']
df_anthropic = df_stages_reordered.loc[df_stages_reordered['provider'] == 'anthropic']
df_characterai = df_stages_reordered.loc[df_stages_reordered['provider'] == 'character']

In [58]:
df_summary = pd.DataFrame()
for df_stages in [df_openai, df_anthropic, df_characterai]:
    incident_count = df_stages['incident_id'].nunique()

    cols_timestamp = [col for col in df_stages.columns if  'timestamp' in col ]
    for col in cols_timestamp:
        df_stages[col] = pd.to_datetime(df_stages[col])
    start_time = df_stages[cols_timestamp].min().min()
    end_time = df_stages[cols_timestamp].max().max()

    impact_level_count = df_stages['incident_impact_level'].value_counts().sort_index().rename(index=impact_mapping_reverse)

    stage_counts = {}
    cols_flag = [col for col in df_stages.columns if  'flag' in col ]
    for col in cols_flag:
        stage = col.split('_')[0]
        stage_counts[stage] = df_stages[col].sum()

    flat_dict = {**{'Incident Count': incident_count, 'Start': start_time, 'End': end_time}, **impact_level_count, **stage_counts}
    df_stages_summary = pd.DataFrame([flat_dict])
    # df_stages_summary.to_csv('plot/table/incident_summary.csv', index=False)

    df_summary = pd.concat([df_summary, df_stages_summary])

df_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stages[col] = pd.to_datetime(df_stages[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stages[col] = pd.to_datetime(df_stages[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stages[col] = pd.to_datetime(df_stages[col])
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,Incident Count,Start,End,impact-none,impact-minor,impact-major,impact-critical,impact-maintenance,investigating,identified,monitoring,resolved,postmortem
0,365,2021-02-09 18:00:00+00:00,2024-08-28 20:36:00+00:00,52,141,125,46,1.0,259,144,225,365,29
0,141,2023-03-16 03:00:00+00:00,2024-08-30 20:30:00+00:00,44,48,43,5,1.0,96,45,51,141,2
0,36,2023-10-24 21:15:00+00:00,2024-08-17 04:46:00+00:00,2,4,11,19,,26,16,15,36,2


In [11]:
latex_code = df_stages_summary.to_latex(index=False)
latex_code

'\\begin{tabular}{rllrrrrrrrrrr}\n\\toprule\nIncident Count & Start & End & impact-none & impact-minor & impact-major & impact-critical & impact-maintenance & investigating & identified & monitoring & resolved & postmortem \\\\\n\\midrule\n322 & 2021-02-09 18:00:00+00:00 & 2024-05-24 21:04:00+00:00 & 42 & 130 & 107 & 42 & 1 & 229 & 132 & 199 & 322 & 25 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [20]:
df_stages.columns

Index(['incident_id', 'Incident_Title', 'incident_impact_level',
       'Incident_color', 'Playground', 'API', 'Labs', 'ChatGPT',
       'investigating_flag', 'investigating_timestamp',
       'investigating_description', 'identified_flag', 'identified_timestamp',
       'identified_description', 'monitoring_flag', 'monitoring_timestamp',
       'monitoring_description', 'resolved_flag', 'resolved_timestamp',
       'resolved_description', 'postmortem_flag', 'postmortem_timestamp',
       'postmortem_description', 'start_timestamp', 'close_timestamp',
       'time_span', 'over_one_day'],
      dtype='object')