In [3]:
import numpy as np
import pandas as pd

In [4]:
def clean_time(df, ON_STREAM_HRS,  BORE_VOL_OIL):
    """Sanitize the on-stream-hours column of ``df``.

    Two rules are applied, in order:

    1. Values greater than 24 are capped at 24.
    2. Rows whose oil volume is exactly 0 get their on-stream hours set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    ON_STREAM_HRS : column label
        Label of the on-stream-hours column in ``df``.
    BORE_VOL_OIL : column label, pandas.Series or numpy.ndarray
        Label of the oil-volume column in ``df``. A Series/array of volumes
        (same length as ``df``, compared positionally) is also accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the hours column rewritten.
    """
    max_hours = 24

    # Cap the maximum value of ON_STREAM_HRS to 24
    df[ON_STREAM_HRS] = np.where(df[ON_STREAM_HRS] > max_hours, max_hours, df[ON_STREAM_HRS])

    # Compare the oil volume *values*, not the column label itself: comparing
    # the label with 0 is always False and would leave the hours untouched.
    if isinstance(BORE_VOL_OIL, (pd.Series, np.ndarray)):
        oil_volume = BORE_VOL_OIL
    else:
        oil_volume = df[BORE_VOL_OIL]
    df[ON_STREAM_HRS] = np.where(oil_volume == 0, 0, df[ON_STREAM_HRS])
    return df

In [None]:
def clean_rate(df, ON_STREAM_HRS, *rate_vars):
    """Zero out rate columns on rows where the on-stream hours are 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    ON_STREAM_HRS : column label
        Label of the on-stream-hours column in ``df``.
    *rate_vars : column labels
        Labels of the columns to overwrite with 0 wherever the hours are 0.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in rate_vars:
        not_on_stream = df[ON_STREAM_HRS] == 0
        current_values = df[column]
        df[column] = np.where(not_on_stream, 0, current_values)
    return df

In [1]:
def downholePressure_outlierDetection(var, window_size, thd_z_score=2, thd_quantile=0.98, rate_of_change=None, all_same_rate=False):
    """Detect outliers in a series using a global pass and a windowed pass.

    A copy of ``var`` is first screened with ``remove_extreme_outliers``
    (zeros and values beyond ``thd_z_score``). The remaining data is then
    scanned window by window, either with a tolerance derived from the
    ``thd_quantile`` quantile of the window's absolute first differences
    (``rate_of_change is None``) or with caller-supplied tolerances.

    Parameters
    ----------
    var : pandas.Series
        Input data; it is copied, so the caller's series is not modified.
    window_size : int
        Number of consecutive samples per window.
    thd_z_score : float, default 2
        Absolute z-score above which a value is treated as an extreme outlier.
    thd_quantile : float, default 0.98
        Quantile used when ``rate_of_change`` is None.
    rate_of_change : list or numpy.ndarray, optional
        Explicit tolerance(s) around each window mean.
    all_same_rate : bool, default False
        If True, ``rate_of_change`` holds a single value used for every window.

    Returns
    -------
    pandas.Series
        The flagged values, sorted by index.
    """
    series = var.copy()
    # Forward the caller's threshold; otherwise the helper silently falls back
    # to its own default and the thd_z_score argument has no effect.
    series, outliers = remove_extreme_outliers(series, thd_z_score)

    if rate_of_change is None:
        outliers = detect_outliers_with_thd_quantile(series, outliers, window_size, thd_quantile)
    else:
        outliers = detect_outliers_with_rate_of_change(series, outliers, window_size, rate_of_change, all_same_rate)
    return outliers.sort_index()


def remove_extreme_outliers(series, thd_z_score=2):
    """Blank out zeros and high-z-score values, collecting them as outliers.

    Zeros are removed first, so the mean and standard deviation used for the
    z-score are computed without them. ``series`` is modified in place
    (flagged entries become NaN); pass a copy if the original must be kept.

    Parameters
    ----------
    series : pandas.Series
        Data to screen; mutated in place.
    thd_z_score : float, default 2
        Values whose absolute z-score exceeds this threshold are flagged.

    Returns
    -------
    tuple
        ``(series, outliers)`` where ``series`` is the same object with flagged
        entries set to NaN and ``outliers`` holds the flagged original values
        (zeros first, then z-score outliers).
    """
    is_zero = series == 0
    outliers = series[is_zero]
    series[is_zero] = np.nan

    centered = series - series.mean()
    abs_z_score = (centered / series.std()).abs()

    is_extreme = abs_z_score > thd_z_score
    outliers = pd.concat([outliers, series[is_extreme]])
    series[is_extreme] = np.nan
    return series, outliers

def get_window_mean(i, window_size, series):
    """Return the mean and the data of the window that starts at position ``i``.

    If at least two full windows fit between ``i`` and the end of ``series``,
    the window is ``series.iloc[i:i+window_size]``. Otherwise the window is
    everything from ``i`` to the end, so a trailing remainder shorter than
    ``window_size`` is folded into the final window.

    The start of a progress line is printed without a newline (``end=''``),
    which lets the caller append to the same line.

    Parameters
    ----------
    i : int
        Positional start of the window.
    window_size : int
        Nominal number of samples per window.
    series : pandas.Series
        Data to slice.

    Returns
    -------
    tuple
        ``(mean, window)``: the window's mean and the sliced Series.
    """
    if i + 2*window_size <= len(series):
        window = series.iloc[i:i+window_size]
        print(f"The rate of change for segment {[i, i+window_size]}", end='')
    else:
        # If the last window is smaller than window_size, add it to the previous window
        window = series.iloc[i:]
        print(f"The rate of change for segment {[i, len(series)]}", end='')

    return window.mean(), window

def get_window_outliers(window, mean, rate_of_change_window):
    """Select the values of ``window`` outside ``mean ± rate_of_change_window``.

    Values exactly on either bound are kept as inliers (strict comparisons).

    Parameters
    ----------
    window : pandas.Series
        Window data to test.
    mean : float
        Centre of the accepted band.
    rate_of_change_window : float
        Half-width of the accepted band.

    Returns
    -------
    pandas.Series
        The entries of ``window`` lying strictly outside the band.
    """
    below_band = window < (mean - rate_of_change_window)
    above_band = window > (mean + rate_of_change_window)
    return window.loc[below_band | above_band]

def detect_outliers_with_thd_quantile(series, outliers, window_size, thd_quantile):
    """Flag window outliers using a quantile of each window's first differences.

    For every window, the tolerance is the ``thd_quantile`` quantile (NaNs
    ignored) of the absolute differences between consecutive values. Values
    outside ``window mean ± tolerance`` are appended to ``outliers``.

    Parameters
    ----------
    series : pandas.Series
        Data to scan.
    outliers : pandas.Series
        Outliers found so far; newly flagged values are appended after them.
    window_size : int
        Nominal number of samples per window.
    thd_quantile : float
        Quantile in [0, 1] passed to ``numpy.nanquantile``.

    Returns
    -------
    pandas.Series
        ``outliers`` followed by the values flagged in each window.
    """
    n_samples = len(series)
    # Collect the pieces and concatenate once instead of re-copying per window.
    collected = [outliers]
    for i in range(0, n_samples, window_size):
        mean, window = get_window_mean(i, window_size, series)
        diff_window = np.abs(np.diff(window))
        rate_of_change_window = np.nanquantile(diff_window, thd_quantile)
        print(f' is {rate_of_change_window}', end='')
        print(f" with a mean of {mean}")
        collected.append(get_window_outliers(window, mean, rate_of_change_window))
        # If there are less than window_size elements left, they were added to
        # this window (it ran to the end of the series), so stop here.
        # `i` never equals len(series) inside range(), hence the explicit test;
        # without it the remainder would be evaluated a second time on its own.
        if i + 2 * window_size > n_samples:
            break
    return pd.concat(collected)

def validate_rate_of_change(rate_of_change, all_same_rate, num_windows):
    """Validate ``rate_of_change`` and return an iterator with one rate per window.

    Parameters
    ----------
    rate_of_change : list or numpy.ndarray
        Tolerance values supplied by the caller.
    all_same_rate : bool
        If True, ``rate_of_change`` must contain exactly one value, which is
        repeated ``num_windows`` times. If False, it must contain exactly
        ``num_windows`` values.
    num_windows : int
        Number of rates the returned iterator must yield.

    Returns
    -------
    iterator
        Yields the rate to use for each successive window.

    Raises
    ------
    ValueError
        If ``rate_of_change`` is not a list/array or has the wrong length.
    """
    if not isinstance(rate_of_change, (list, np.ndarray)):
        raise ValueError("rate_of_change must be a list or an array.")

    if all_same_rate:
        if len(rate_of_change) != 1:
            raise ValueError("When all_same_rate is True, rate_of_change must be a list or an array with a single value.")

        return iter([rate_of_change[0]] * num_windows)

    if len(rate_of_change) != num_windows:
        # Trailing space keeps the two adjacent f-strings from running together.
        raise ValueError(f"Length of rate_of_change ({len(rate_of_change)}) "
                         f"does not match the number of windows ({num_windows}).")

    return iter(rate_of_change)

def detect_outliers_with_rate_of_change(series, outliers, window_size, rate_of_change, all_same_rate=False):
    """Flag window outliers using caller-supplied tolerances.

    Values outside ``window mean ± rate`` are appended to ``outliers``, where
    ``rate`` is taken from ``rate_of_change`` in window order.

    ``rate_of_change`` is validated against ``ceil(len(series) / window_size)``
    entries (or a single entry when ``all_same_rate`` is True). When the series
    length is not a multiple of ``window_size`` the remainder is merged into
    the final window, so the last supplied rate is not used.

    Parameters
    ----------
    series : pandas.Series
        Data to scan.
    outliers : pandas.Series
        Outliers found so far; newly flagged values are appended after them.
    window_size : int
        Nominal number of samples per window.
    rate_of_change : list or numpy.ndarray
        Tolerance(s) around each window mean.
    all_same_rate : bool, default False
        If True, the single value in ``rate_of_change`` is used for all windows.

    Returns
    -------
    pandas.Series
        ``outliers`` followed by the values flagged in each window.

    Raises
    ------
    ValueError
        Propagated from ``validate_rate_of_change`` for invalid rates.
    """
    n_samples = len(series)
    num_windows = int(np.ceil(n_samples / window_size))
    rate_of_change_iter = validate_rate_of_change(rate_of_change, all_same_rate, num_windows)
    # Collect the pieces and concatenate once instead of re-copying per window.
    collected = [outliers]
    for i in range(0, n_samples, window_size):
        mean, window = get_window_mean(i, window_size, series)
        rate_of_change_window = next(rate_of_change_iter)
        print(f' is {rate_of_change_window}', end='')
        print(f" with a mean of {mean}")
        collected.append(get_window_outliers(window, mean, rate_of_change_window))
        # The window just processed ran to the end of the series (remainder
        # merged in), so stop. `i` never equals len(series) inside range();
        # without this test the remainder would be evaluated again on its own.
        if i + 2 * window_size > n_samples:
            break
    return pd.concat(collected)