# Power Outage

**Name(s)**: Luke, Andrew

**Website Link**: (your website link)

## Code

In [23]:
import pandas as pd
import numpy as np
import os
from scipy.stats import ks_2samp

import plotly.express as px
pd.options.plotting.backend = 'plotly'

### Cleaning and EDA

In [24]:
outage = pd.read_excel("outage.xlsx", sheet_name="Masterdata")
outage.head().to_clipboard()

In [25]:
# Drop Info Rows 
df = outage.drop(range(4)).dropna(axis=1, how='all')

# Set column names
df.columns = df.iloc[0]

# Drop Unit row and variables
df = df.drop([4, 5])
df = df.drop(columns="variables")

# Combine 'OUTAGE.START.DATE' and 'OUTAGE.START.TIME'
df['OUTAGE.START'] = pd.to_datetime(df['OUTAGE.START.DATE']) + pd.to_timedelta(df['OUTAGE.START.TIME'].astype(str))

# Combine 'OUTAGE.RESTORATION.DATE' and 'OUTAGE.RESTORATION.TIME'
df['OUTAGE.RESTORATION'] = pd.to_datetime(df['OUTAGE.RESTORATION.DATE']) + pd.to_timedelta(df['OUTAGE.RESTORATION.TIME'].astype(str))

# Drop the original date and time columns
df = df.drop(['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME'], axis=1)

# Fill Missing 
df.replace("NA", np.nan, inplace=True)

# Display 
df[['RES.SALES']]

df.to_csv("outage.csv")

In [26]:
len(df["variables"][df["variables"].isna()])

KeyError: 'variables'

### Assessment of Missingness

In [33]:
# NMAR Analysis

# We believe the column CAUSE.CATEGORY.DETAIL is likely to be NMAR as the column
# revolves around a detailed description of the event categories, and too complex
# of a description may cause nothing to be marked down instead. Possible data 
# that could help make it MAR would be the uniqueness or complexity of the cause
# since more complex causes may not be easily inputed into the data.


# Missingness Dependency

df = pd.read_csv('outage.csv')

def ks_query(missing, dependent):
    mar = df.copy()
    mar['missing'] = mar[missing].isna()
    res = ks_2samp(mar.query('missing')[dependent], mar.query('not missing')[dependent])
    return res

dur_vs_cust = ks_query('OUTAGE.DURATION', 'CUSTOMERS.AFFECTED')
dur_vs_sales = ks_query('OUTAGE.DURATION', 'TOTAL.SALES')

duration_missing = df.copy()
duration_missing['missing'] = duration_missing['OUTAGE.DURATION'].isna()

px.histogram(duration_missing, x='CUSTOMERS.AFFECTED', color='missing', histnorm='probability', marginal='box',
             title="customers affected by missingness of outage duration", barmode='overlay', opacity=0.7)

# px.histogram(duration_missing, x='TOTAL.SALES', color='missing', histnorm='probability', marginal='box',
#              title="total sales by missingness of outage duration", barmode='overlay', opacity=0.7)


### Hypothesis Testing

In [39]:
# Null Hypothesis
# The duration of outages in the years 2005 comes from the same population as 2015.

# Alternative Hypothesis
# The duration of outages in the year 2015 are shorter than the duration of outages in 2005

# Test Statistic
# Difference in group means

In [50]:
n_repetitions = 500

shuffled = df[(df['OUTAGE.START'].str.startswith('2005')) | (df['OUTAGE.START'].str.startswith('2015'))].dropna()
shuffled['old_year'] = shuffled['OUTAGE.START'].str.startswith('2005')

observed_difference = shuffled.groupby('old_year')['OUTAGE.DURATION'].mean().diff().iloc[-1]

differences = []
for _ in range(n_repetitions):
    
    with_shuffled = shuffled.assign(Shuffled_Duration=np.random.permutation(shuffled['OUTAGE.DURATION']))

    group_means = (
        with_shuffled
        .groupby('old_year')
        .mean()
        .loc[:, 'Shuffled_Duration']
    )
    difference = group_means.diff().iloc[-1]
    
    differences.append(difference)

(np.array(differences) >= observed_difference).mean()

0.0