In [16]:
import pandas as pd
from pathlib import Path
import os
import datetime


In [17]:
pd.set_option("display.max_colwidth", -1)
project_dir = str(Path(os.getcwd()).parent)
datasets_path = project_dir + "/datasets/"

uncleaned_df = pd.read_parquet(datasets_path + 'dirty_2020_provider_data.parquet')
uncleaned_df = uncleaned_df.drop(uncleaned_df.index[range(1)])

  pd.set_option("display.max_colwidth", -1)


In [18]:
# Convert to timestamps
uncleaned_df['event_start_time'] = pd.to_datetime(uncleaned_df['event_start_time'], unit='s')
uncleaned_df['event_end_time'] = pd.to_datetime(uncleaned_df['event_end_time'], unit='s')
uncleaned_df['first_notification'] = pd.to_datetime(uncleaned_df['first_notification'], unit='s')
uncleaned_df['last_notification'] = pd.to_datetime(uncleaned_df['last_notification'], unit='s')

# Drop duplicate rows
uncleaned_df = uncleaned_df.drop_duplicates(inplace = False).reset_index(drop=True)

# Drop duplicate event start and end times
uncleaned_df = uncleaned_df.drop_duplicates(inplace = False, subset=['event_start_time', 'service_name']).reset_index(drop=True)


# Remove failures that didn't occur in 2020
df_2020 = pd.DataFrame()
for _, row in uncleaned_df.sort_values("event_start_time").iterrows():
    if row["event_start_time"].year == 2020:
        df_2020 = df_2020.append(row, ignore_index=True)[uncleaned_df.columns.tolist()].reset_index(drop=True)  

In [19]:
# Check if event start time is later than end time
broken_event_times_index = list()
for idx, row in df_2020.sort_values("event_start_time").iterrows():
    if row["event_start_time"] > row["event_end_time"] or not row["event_start_time"] or not row["event_end_time"]:
        broken_event_times_index.append(idx)

In [23]:
broken_event_times_index
df_2020[25:26].description

25     6:08 PM PST \nWe are currently experiencing an issue provisioning new image builder and fleet streaming instances in the AP-SOUTHEAST-2 Region. 7:20 PM PST \nWe are continuing to investigate an increase in instance provisioning error rates in the AP-SOUTHEAST-2 Region. 8:32 PM PST \nWe have identified the cause of the increased provisioning error rates in the AP-SOUTHEAST-2 Region and continue working towards resolution. 8:59 PM PST \nWe continue to experience increased instance provisioning error rates due to the issue affecting EC2 within the AP-SOUTHEAST-2 Region. We continue to work towards full resolution. Existing streaming sessions and instances will continue to operate.Jan 23, 12:41 AM PST \nWe continue to experience increased instance provisioning error rates within the AP-SOUTHEAST-2 Region. We continue to work towards full resolution. Existing streaming sessions and instances will continue to operate.Jan 23,  1:51 AM PST \nWe are continuing to work towards resolution 

In [24]:
# Fixing the event start times for service id 23: http://arpio.io/behind-the-aws-sydney-outage/
df_2020[22:23].event_start_time = pd.Timestamp(year=2020, month=1, day=23, hour=16, minute=7) # PST
df_2020[22:23].event_end_time = pd.Timestamp(year=2020, month=1, day=23, hour=23, minute=20) # PST
df_2020[22:23].first_notification = pd.Timestamp(year=2020, month=1, day=23, hour=16, minute=41) # PST
df_2020[22:23].last_notification = pd.Timestamp(year=2020, month=1, day=23, hour=23, minute=20) # PST

df_2020[25:26].event_start_time = pd.Timestamp(year=2020, month=1, day=22, hour=18, minute=8) # PST
df_2020[25:26].event_end_time = pd.Timestamp(year=2020, month=1, day=23, hour=2, minute=38) # PST
df_2020[25:26].first_notification = pd.Timestamp(year=2020, month=1, day=22, hour=18, minute=8) # PST
df_2020[25:26].last_notification = pd.Timestamp(year=2020, month=1, day=23, hour=2, minute=38) # PST

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [26]:
# Save cleaned dataset
df_2020.to_parquet(datasets_path + "2020_fixed_sesaeet.parquet")

In [27]:
# Save to excel file for manual inspection
df_2020.to_excel('dirty_2020_provider_data.xlsx')