# Challenge 1
## Below are some code to help you get started with manipulating the data and plot the time series. You may use them to manually identify thief hatch open and close events.

In [3]:
# get tank header pressure

import numpy as np
import pandas as pd
from datetime import timedelta 
from glob import glob 
from scipy import stats

df_thp = []
for f in glob('data/tank_header_pressure_*.csv'):
    df_thp.append(pd.read_csv(f))
    
#concatenate the tank header pressure data
df_thp = pd.concat(df_thp)
df_thp.info()

#timeframe of data
dates = pd.to_datetime(df_thp['timestamp'])
print("Minimum and maximum dates")
print(dates.min(), dates.max())

#read the drone data
df_drone = pd.read_csv('data/drone.csv')

#read form data
df_form = pd.read_csv('data/forms.csv').sort_values(by='SubmitDate')
df_form.FACILITY_ID = df_form.FACILITY_ID.astype(int)

#read workorder data
df_workorder = pd.read_csv('data/work_order.csv')
df_workorder.facility_id = df_workorder.facility_id.fillna(0).astype(int)


ValueError: No objects to concatenate

In [None]:
# check thp for one facility that we magically knew it had open thief hatch
facility_id = 10085941

df_thp_460 = df_thp[df_thp.FACILITY_ID == facility_id].copy()
df_thp_460.timestamp = pd.to_datetime(df_thp_460.timestamp)
#df_thp_460 = df_thp_460[(np.abs(stats.zscore(df_thp_460['pressure_osi'])) < 3)]
print(df_thp_460.head())

from utils import plot_ts_open_hatch
fig = plot_ts_open_hatch(dfi=df_thp_460, fac_id=facility_id)
fig.show()

                     timestamp                      TagType  FACILITY_ID  \
112985 2021-09-22 15:12:38.513  FlareTankHeaderPressureCurr     10085941   
112986 2021-09-22 15:42:33.732  FlareTankHeaderPressureCurr     10085941   
112987 2021-09-22 15:57:35.744  FlareTankHeaderPressureCurr     10085941   
112988 2021-09-22 16:12:33.477  FlareTankHeaderPressureCurr     10085941   
112989 2021-09-22 16:27:33.575  FlareTankHeaderPressureCurr     10085941   

        pressure_osi  
112985         1.835  
112986         1.355  
112987         1.155  
112988         1.110  
112989         1.080  


In [None]:
# get drone data for facility
df_drone_460 = df_drone[df_drone.FACILITY_ID == facility_id]
df_drone_460

Unnamed: 0,ASSET,DTM,FACILITY_ID
297,Permian,2021-10-20 18:42:17.000,10085941
362,Permian,2021-09-18 17:22:58.000,10085941


In [None]:
# check tank header pressure only limited days before and after to keep plot clean and easy to read
from datetime import timedelta

t_drone_open_hatch = df_drone_460.DTM.iloc[0] # in this case, only one open hatch event detected for this facility
t_drone_open_hatch = pd.to_datetime(t_drone_open_hatch)

t_strt = t_drone_open_hatch - timedelta(days=30)
t_stop = t_drone_open_hatch + timedelta(days=30)

df_thp_460 = df_thp_460[df_thp_460.timestamp.between(t_strt, t_stop)]

df_thp_460


Unnamed: 0,timestamp,TagType,FACILITY_ID,pressure_osi
112985,2021-09-22 15:12:38.513,FlareTankHeaderPressureCurr,10085941,1.835
112986,2021-09-22 15:42:33.732,FlareTankHeaderPressureCurr,10085941,1.355
112987,2021-09-22 15:57:35.744,FlareTankHeaderPressureCurr,10085941,1.155
112988,2021-09-22 16:12:33.477,FlareTankHeaderPressureCurr,10085941,1.110
112989,2021-09-22 16:27:33.575,FlareTankHeaderPressureCurr,10085941,1.080
...,...,...,...,...
281908,2021-11-19 17:27:33.785,FlareTankHeaderPressureCurr,10085941,1.595
281909,2021-11-19 17:42:33.708,FlareTankHeaderPressureCurr,10085941,1.540
281910,2021-11-19 17:57:33.823,FlareTankHeaderPressureCurr,10085941,1.030
281911,2021-11-19 18:12:33.937,FlareTankHeaderPressureCurr,10085941,1.685


In [None]:
# get work order data for facility 10085460 

# filter rows containing key word 'hatch', keyword could be 'thief' or misspelled words
df_workorder_460 = df_workorder[df_workorder.facility_id == facility_id]
df_workorder_460 = df_workorder_460[(df_workorder_460.workOrderDescription.fillna('').str.lower().str.contains('hatch'))
                | ((df_workorder_460.workOrderResolutionDescription.fillna('').str.lower().str.contains('hatch')))
]

# filter rows with dates within drone detected open hatch date
df_workorder_460.created_date = pd.to_datetime(df_workorder_460.created_date)
df_workorder_460.workOrderActualsStartDate = pd.to_datetime(df_workorder_460.workOrderActualsStartDate)
df_workorder_460.workOrderActualsEndDate = pd.to_datetime(df_workorder_460.workOrderActualsEndDate)

t_workorder_strt = t_drone_open_hatch - timedelta(days=30)
t_workorder_stop = t_drone_open_hatch + timedelta(days=30)

df_workorder_460 = df_workorder_460[
    df_workorder_460.created_date.between(t_workorder_strt, t_workorder_stop)
    | df_workorder_460.workOrderActualsStartDate.between(t_workorder_strt, t_workorder_stop)
    | df_workorder_460.workOrderActualsEndDate.between(t_workorder_strt, t_workorder_stop)
]

df_workorder_460


Unnamed: 0,created_date,assetType,sourceSystemId,facility_id,corp_id,workOrderDescription,workOrderResolutionDescription,workOrderActualsStartDate,workOrderActualsEndDate


In [None]:
# plot pressure with drone detected open hatch time, you may add hatch open or close time from work order or Forms data or other notes to the plot.
fig = plot_ts_open_hatch(dfi=df_thp_460, fac_id=facility_id, t_drone_open_hatch=t_drone_open_hatch)
fig.show()

In [None]:
from utils import summary
id = 10085544

data=summary(drone=df_drone, form = df_form, thp = df_thp, workorder = df_workorder, id = id, start=60, stop=60)
fig = plot_ts_open_hatch(dfi=data[0][1], fac_id=id, t_drone_open_hatch=data[0][0])
fig.show()

NameError: name 'df_drone' is not defined

In [None]:
print(len(data))

data[1][0]

2


Timestamp('2020-05-10 14:01:50')

In [None]:
#create the data frame with column names
open_hatch_events_manual = pd.read_csv('results/challenge_1/open_hatch_events_manual.csv')

workorder_index = 0

manual_open = pd.to_datetime('2021-10-18 00:00:00.000')
manual_close = pd.to_datetime('2021-10-29 00:00:00.000')
drone_opened = data[0][0]
drone_closed = None
workorder_opened = None #pd.to_datetime(data[0][2]['created_date'].iloc[workorder_index])
workorder_closed = None #pd.to_datetime(data[0][2]['workOrderActualsEndDate'].iloc[workorder_index])
forms_opened = None
forms_closed = None

'''
open_hatch_events_manual['facility_id'] = None
open_hatch_events_manual['time when hatch opened(identified manually)'] = None
open_hatch_events_manual['time when hatch closed(identified manually)'] = None
open_hatch_events_manual['time when hatch was opened(from drone data)'] = None
open_hatch_events_manual['time when hatch was closed(from drone data)'] = None
open_hatch_events_manual['time when hatch opened(from work orders data)'] = None
open_hatch_events_manual['time when hatch closed(from work order data)'] = None
open_hatch_events_manual['time when hatch opened(from forms data)'] = None
open_hatch_events_manual['time when hatch closed(from forms data)'] = None
'''

temp = {'facility_id':id, 'time when hatch opened(identified manually)':manual_open,
        'time when hatch closed(identified manually)':manual_close, 
        'time when hatch was opened(from drone data)':drone_opened,
        'time when hatch was closed(from drone data)':drone_closed,
        'time when hatch opened(from work orders data)':workorder_opened,
        'time when hatch closed(from work order data)':workorder_closed,
        'time when hatch opened(from forms data)':forms_opened,
        'time when hatch closed(from forms data)':forms_closed}
temp = pd.DataFrame(data = temp, index = [0])

open_hatch_events_manual = pd.concat([open_hatch_events_manual, temp], ignore_index = True)
open_hatch_events_manual.reset_index()

done = [10085460, 10086098, 10085941]

bad_data = [10111830, 10085545, 10085544, 10085463, 10086129, 
            10206093, 10085651, 10167456, 10085637, 10085463, 
            10085529, 10085615, 10086087, 10085694, 10094669,
            10085542, 10086111, 10085510, 10111756, 10085602,
            10085544, ]

# it should include 9 columns of 
# facility_id, 
# time when hatch opened(identified manually), time when hatch closed(identified manually)
# time when hatch was opened(from drone data), time when hatch was close(from drone data), 
# time when hatch opened(from work orders data), time when hatch opened(from work order data)
# time when hatch opened(from forms data), time when hatch opened(from forms data)

list_of_unique_facilities_in_thp_data = df_thp.FACILITY_ID.unique()

relevent_drone_data = df_drone[df_drone.FACILITY_ID.isin(list_of_unique_facilities_in_thp_data)]

#print(len(list_of_unique_facilities_in_thp_data))
print(relevent_drone_data)

open_hatch_events_manual

          ASSET                      DTM  FACILITY_ID
27      Permian  2022-08-19 13:49:00.000     10085460
28      Permian  2022-08-19 13:11:00.000     10086098
55      Permian  2022-06-07 12:06:24.000     10111830
56      Permian  2022-05-24 09:09:00.000     10085545
103     Permian  2022-02-19 11:20:00.000     10085544
137     Permian  2022-01-28 16:22:58.000     10085463
290     Permian  2021-10-25 16:30:20.000     10086129
297     Permian  2021-10-20 18:42:17.000     10085941
362     Permian  2021-09-18 17:22:58.000     10085941
363   Hawkville  2021-09-17 10:50:00.000     10206093
386     Permian  2021-08-18 17:47:38.000     10085651
404     Permian  2021-08-11 13:26:21.000     10167456
647     Permian  2021-04-24 20:05:18.000     10085810
648     Permian  2021-04-24 13:19:16.000     10085637
810     Permian  2020-12-27 10:10:41.000     10085463
862     Permian  2020-10-31 12:43:04.000     10085529
906     Permian  2020-09-14 14:51:55.000     10085615
909     Permian  2020-09-14 

Unnamed: 0,facility_id,time when hatch opened(identified manually),time when hatch closed(identified manually),time when hatch was opened(from drone data),time when hatch was closed(from drone data),time when hatch opened(from work orders data),time when hatch closed(from work order data),time when hatch opened(from forms data),time when hatch closed(from forms data)
0,10085460,2022-07-27 01:59:00,2022-08-25 20:14:00,2022-08-19 13:49:00,,2022-08-24,2022-08-25,,
1,10086098,2022-07-23 09:42:00,2022-09-22 06:12:00,2022-08-19 13:11:00,,2022-08-24 00:00:00,2022-08-25 00:00:00,,
2,10085941,,2021-09-24 20:27:00,2021-09-18 17:22:58,,,,,
3,10085941,2021-10-18 00:00:00,2021-10-29 00:00:00,2021-10-20 18:42:17,,,,,


In [None]:
#open_hatch_events_manual.to_csv('results/challenge_1/open_hatch_events_manual.csv')

# Challenge 2
## Develop algorithm to classify time series data to determine whether thief hatch is open or not at any given time

In [None]:
from models import model_random, model_ground_truth
from datetime import datetime

df = df_thp_460.copy() # use df for easy reference
pred_freq = timedelta(hours=4) # Predict every 4 hours

t_strt = df.timestamp.iloc[0]
t_strt = datetime(*t_strt.timetuple()[:4]) # round datetime to nearest hours
t_strt += pred_freq 
t_strt_0 = t_strt

t_stop = df.timestamp.iloc[-1]
t_stop = datetime(*t_stop.timetuple()[:4]) # round datetime to nearest hours
t_stop -= pred_freq 

df_pred = []
while t_strt <= t_stop:

    # model_ground_truth is bpx manually labelled, you may need to label groud truth for the events you identified
    # model_random is a random model to predict whether hatch is open or not. 
    # You should replace it with your own model
    # you model may need to deal with missing data
    
    df_pred.append([t_strt, model_ground_truth(df, t_strt)] + model_random(df, t_strt))

    t_strt += pred_freq

df_pred = pd.DataFrame(df_pred, columns = ['TimeStamp', 'Status_Truth', 'Status_Predicted', 'Probability_Open_Hatch'])    
df_pred


Unnamed: 0,TimeStamp,Status_Truth,Status_Predicted,Probability_Open_Hatch
0,2021-09-22 19:00:00,0,0,0.542724
1,2021-09-22 23:00:00,0,1,0.151226
2,2021-09-23 03:00:00,0,0,0.828437
3,2021-09-23 07:00:00,0,0,0.623712
4,2021-09-23 11:00:00,0,1,0.846614
...,...,...,...,...
342,2021-11-18 19:00:00,0,0,0.965856
343,2021-11-18 23:00:00,0,1,0.618522
344,2021-11-19 03:00:00,0,1,0.232924
345,2021-11-19 07:00:00,0,0,0.891978


In [None]:
from utils import plot_prediction_validation

fig = plot_prediction_validation(df, df_pred, facility_id)
#fig.write_image(f'results/challenge_2/prediction_validation_{facility_id}.png')
fig.show()

## Now classify other time series you identified in Challenge 1. 
### The time series you classify should start roughly at 30 days before the hacth opened and till 30 days after hatch closed, which you identified manually. 


In [None]:

# calculate classification metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, precision_score, accuracy_score
conf_matrix = confusion_matrix(df_pred.Status_Truth, df_pred.Status_Predicted)
ConfusionMatrixDisplay(conf_matrix).plot()

ImportError: ConfusionMatrixDisplay.plot requires matplotlib. You can install matplotlib with `pip install matplotlib`

In [None]:
# store scores for each "facility_id, df.timestamp.min(), df.timestamp.max()"

scores = [facility_id, t_strt_0, t_stop, pred_freq, df_pred.shape[0], 
          accuracy_score(df_pred.Status_Truth, df_pred.Status_Predicted),
          recall_score(df_pred.Status_Truth, df_pred.Status_Predicted),
          precision_score(df_pred.Status_Truth, df_pred.Status_Predicted),
          f1_score(df_pred.Status_Truth, df_pred.Status_Predicted)
          ]

scores = pd.DataFrame([scores], columns = ['facility_id', 't_strt', 't_stop', 'pred_freq', 'pred_#', 'accuracy_score', 'recall_score', 'precision_score', 'f1_score'])

#scores.to_csv('results/challenge_2/model_predictions.csv', index=False)

scores

Unnamed: 0,facility_id,t_strt,t_stop,pred_freq,pred_#,accuracy_score,recall_score,precision_score,f1_score
0,10086098,2022-07-20 17:00:00,2022-09-18 08:00:00,0 days 04:00:00,358,0.488827,0.46789,0.60355,0.527132


# Challenge 3
## Use the algorithm you developed or new algorithm to identify retrospectively open thief hatches in the past that’s not detected by drone

In [None]:
from models import search_for_open_hatch_random

search_res = []

for fac_id in df_thp.FACILITY_ID.unique():

    df = df_thp[df_thp.FACILITY_ID == fac_id]

    # replace "search_for_open_hatch_random", which is a random model, with your own model
    events = search_for_open_hatch_random(df, fac_id)
    
    search_res +=events

search_res = pd.DataFrame(search_res, columns = ['facility_id', 'num_of_open_hatch_events', 'open_hatch_event_seq', 't_hacth_open', 't_hacth_clos', 'prob_has_open_hatch_event'])

#search_res.to_csv('results/challenge_3/search_results.csv', index=False)

search_res

Unnamed: 0,facility_id,num_of_open_hatch_events,open_hatch_event_seq,t_hacth_open,t_hacth_clos,prob_has_open_hatch_event
0,10085682,1,1,2023-01-22 22:19:16.377,2023-02-09 21:19:15.944,0.367200
1,10085526,2,1,2023-01-24 03:21:25.387,2023-02-05 01:21:21.667,0.703341
2,10085526,2,2,2023-01-27 06:51:22.194,2023-01-28 06:21:21.534,0.136462
3,10085845,2,1,2022-09-07 21:32:25.784,2023-02-22 00:32:27.302,0.255757
4,10085845,2,2,2022-10-05 06:47:40.923,2022-10-28 21:17:31.326,0.941616
...,...,...,...,...,...,...
143,10085488,2,1,2023-01-28 19:18:17.210,2023-02-17 08:07:00.000,0.766978
144,10085488,2,2,2023-01-24 02:07:00.000,2023-02-15 18:33:26.749,0.229701
145,20000089,2,1,2023-02-02 23:57:00.000,2023-02-11 07:58:00.000,0.785646
146,20000089,2,2,2023-02-23 18:33:11.465,2023-02-23 21:03:14.484,0.081673


In [None]:

search_res_plot = search_res.iloc[:10] # limit what and how many to plot
'''
for _, fac_id, num_of_open_hatch_events, open_hatch_event_seq, t_hacth_open, t_hacth_clos, prob_open_hatch_event in search_res_plot.itertuples():
    
    dfi = df_thp[df_thp.FACILITY_ID==fac_id].copy()

    # ensure in datetime format
    t_hacth_open = pd.to_datetime(t_hacth_open)
    t_hacth_clos = pd.to_datetime(t_hacth_clos)

    t_strt = t_hacth_open - timedelta(days=30)
    t_stop = t_hacth_clos + timedelta(days=30)
    
    dfi.timestamp = pd.to_datetime(dfi.timestamp)
    dfi = dfi[dfi.timestamp.between(t_strt, t_stop)] # limit the dataframe 

    fig = plot_ts_open_hatch(dfi=dfi, fac_id=facility_id)
    fig.update_layout(title=f'{fac_id}-[{open_hatch_event_seq:01}-{num_of_open_hatch_events:01}], hatch open: {t_hacth_open}, hatch close: {t_hacth_clos}')
    fig.write_image(f'results/challenge_3/imgs_search_results/{fac_id}-[{open_hatch_event_seq:01}-{num_of_open_hatch_events:01}].png', engine='orca')
'''

"\nfor _, fac_id, num_of_open_hatch_events, open_hatch_event_seq, t_hacth_open, t_hacth_clos, prob_open_hatch_event in search_res_plot.itertuples():\n    \n    dfi = df_thp[df_thp.FACILITY_ID==fac_id].copy()\n\n    # ensure in datetime format\n    t_hacth_open = pd.to_datetime(t_hacth_open)\n    t_hacth_clos = pd.to_datetime(t_hacth_clos)\n\n    t_strt = t_hacth_open - timedelta(days=30)\n    t_stop = t_hacth_clos + timedelta(days=30)\n    \n    dfi.timestamp = pd.to_datetime(dfi.timestamp)\n    dfi = dfi[dfi.timestamp.between(t_strt, t_stop)] # limit the dataframe \n\n    fig = plot_ts_open_hatch(dfi=dfi, fac_id=facility_id)\n    fig.update_layout(title=f'{fac_id}-[{open_hatch_event_seq:01}-{num_of_open_hatch_events:01}], hatch open: {t_hacth_open}, hatch close: {t_hacth_clos}')\n    fig.write_image(f'results/challenge_3/imgs_search_results/{fac_id}-[{open_hatch_event_seq:01}-{num_of_open_hatch_events:01}].png', engine='orca')\n"

In [None]:
print('Good luck!')

Good luck!
