In [9]:
import numpy as np
import pandas as pd

In [10]:
class OilDataCleaner:
    """Clean well-production style data and flag outliers in a series.

    The instance keeps a private copy of the input frame in ``self.df`` with
    every numeric value replaced by its absolute value.  The ``clean_*``
    methods modify ``self.df`` in place and also return it.  The outlier
    methods operate on a Series passed in by the caller and never modify it.
    """

    def __init__(self, df):
        """Store an absolute-valued copy of *df*.

        Args:
            df: Input DataFrame.  It is copied, so the caller's frame is
                left untouched.  Non-numeric columns (labels, dates, ...)
                are kept as they are instead of making ``abs`` fail.
        """
        frame = df.copy()
        numeric_cols = frame.select_dtypes(include=["number", "timedelta"]).columns
        if len(numeric_cols) == frame.shape[1]:
            # Everything is numeric: take the absolute value of the frame
            # as a whole.
            frame = frame.abs()
        elif len(numeric_cols) > 0:
            frame[numeric_cols] = frame[numeric_cols].abs()
        self.df = frame

    def __zero_when_off(self, on_stream_var, columns):
        """Set *columns* to 0 on every row where *on_stream_var* equals 0."""
        well_is_off = self.df[on_stream_var] == 0
        self.df.loc[well_is_off, columns] = 0

    def clean_time(self, on_stream_var, rate_var, max_hours=24):
        """Cap the on-stream hours and zero the rate when the well is off.

        Args:
            on_stream_var: Column holding the on-stream hours.
            rate_var: Rate column forced to 0 where the hours are 0.
            max_hours: Upper cap applied to ``on_stream_var`` (24 by default).

        Returns:
            ``self.df`` after the in-place update.
        """
        self.df[on_stream_var] = self.df[on_stream_var].clip(upper=max_hours)
        self.__zero_when_off(on_stream_var, [rate_var])
        return self.df

    def clean_rate(self, on_stream_var, *rate_vars):
        """Zero every column in *rate_vars* on rows where the hours are 0.

        Returns:
            ``self.df`` after the in-place update.
        """
        if rate_vars:
            self.__zero_when_off(on_stream_var, list(rate_vars))
        return self.df

    def clean_choke(self, on_stream_var, avg_choke_var):
        """Zero the average choke size on rows where the hours are 0.

        Returns:
            ``self.df`` after the in-place update.
        """
        # The average choke size should be set to zero when the well is off
        self.__zero_when_off(on_stream_var, avg_choke_var)
        return self.df

    @staticmethod
    def __combine(pieces, template):
        """Concatenate the non-empty Series in *pieces* in a single pass.

        Returns an empty slice of *template* when there is nothing to join,
        so the caller always gets a Series back.
        """
        non_empty = [piece for piece in pieces if len(piece) > 0]
        if not non_empty:
            return template.iloc[0:0]
        return pd.concat(non_empty)

    @staticmethod
    def __window_bounds(n_points, window_size):
        """Return the ``(start, stop)`` positions of consecutive windows.

        A tail shorter than ``window_size`` is merged into the previous
        window, so each position belongs to exactly one window.

        Raises:
            ValueError: If ``window_size`` is not a positive integer.
        """
        if not isinstance(window_size, (int, np.integer)) or window_size < 1:
            raise ValueError("window_size must be a positive integer.")
        bounds = []
        start = 0
        while start < n_points:
            if start + 2 * window_size <= n_points:
                stop = start + window_size
            else:
                # Fewer than two full windows remain: take everything left.
                stop = n_points
            bounds.append((start, stop))
            start = stop
        return bounds

    def __remove_extreme_outliers(self, series, thd_z_score=2):
        """Blank zeros and values whose absolute z-score exceeds the threshold.

        The z-score is computed after the zeros have been blanked.  The
        input Series is not modified.

        Returns:
            Tuple ``(cleaned, outliers)``: a copy of *series* with the
            flagged points set to NaN, and the flagged points themselves.
        """
        is_zero = series == 0
        zero_points = series[is_zero]
        cleaned = series.mask(is_zero)
        z_score = (cleaned - cleaned.mean()) / cleaned.std()
        # Comparisons against NaN are False, so blanked points are skipped.
        is_extreme = z_score.abs() > thd_z_score
        extreme_points = cleaned[is_extreme]
        cleaned = cleaned.mask(is_extreme)
        return cleaned, self.__combine([zero_points, extreme_points], series)

    def __get_window_outliers(self, window, mean, rate_of_change_window):
        """Return the points of *window* outside ``mean +/- rate_of_change_window``."""
        upper_bound = mean + rate_of_change_window
        lower_bound = mean - rate_of_change_window
        return window.loc[(window < lower_bound) | (window > upper_bound)]

    def __scan_windows(self, series, outliers, bounds, rates, verbose):
        """Collect the outliers of every window and append them to *outliers*.

        ``bounds`` and ``rates`` are parallel: one tolerance per window.
        """
        pieces = [outliers]
        for (start, stop), rate in zip(bounds, rates):
            window = series.iloc[start:stop]
            mean = window.mean()
            if verbose:
                print(f"The rate of change for segment {[start, stop]}"
                      f" is {rate} with a mean of {mean}")
            pieces.append(self.__get_window_outliers(window, mean, rate))
        return self.__combine(pieces, series)

    @staticmethod
    def __quantile_rate(window, thd_quantile):
        """Return the *thd_quantile* quantile of the absolute point-to-point steps.

        NaN steps are ignored.  NaN is returned when the window has no
        usable step, which makes the window yield no outliers.
        """
        steps = np.abs(np.diff(window.to_numpy(dtype=float)))
        steps = steps[~np.isnan(steps)]
        if steps.size == 0:
            return np.nan
        return np.nanquantile(steps, thd_quantile)

    def __detect_outliers_with_thd_quantile(self, series, outliers, window_size,
                                            thd_quantile, verbose=True):
        """Flag window outliers using a tolerance derived from each window."""
        bounds = self.__window_bounds(len(series), window_size)
        rates = [self.__quantile_rate(series.iloc[start:stop], thd_quantile)
                 for start, stop in bounds]
        return self.__scan_windows(series, outliers, bounds, rates, verbose)

    def __validate_rate_of_change(self, rate_of_change, all_same_rate, num_windows):
        """Check *rate_of_change* and expand it to one value per window.

        Raises:
            ValueError: If *rate_of_change* is not a list/array, or if its
                length does not fit the chosen mode.
        """
        if not isinstance(rate_of_change, (list, np.ndarray)):
            raise ValueError("rate_of_change must be a list or an array.")

        if all_same_rate:
            if len(rate_of_change) != 1:
                raise ValueError("When all_same_rate is True, rate_of_change must be a list or an array with a single value.")
            return [rate_of_change[0]] * num_windows

        if len(rate_of_change) != num_windows:
            raise ValueError(f"Length of rate_of_change ({len(rate_of_change)}) "
                             f"does not match the number of windows ({num_windows}).")
        return list(rate_of_change)

    def __detect_outliers_with_rate_of_change(self, series, outliers, window_size,
                                              rate_of_change, all_same_rate=False,
                                              verbose=True):
        """Flag window outliers using caller-supplied tolerances."""
        bounds = self.__window_bounds(len(series), window_size)
        rates = self.__validate_rate_of_change(rate_of_change, all_same_rate, len(bounds))
        return self.__scan_windows(series, outliers, bounds, rates, verbose)

    def rate_of_change_outlier_detector(self, series, window_size, thd_z_score=2,
                                        thd_quantile=0.98, rate_of_change=None,
                                        all_same_rate=False, verbose=True):
        """Return the points of *series* that are flagged as outliers.

        Two passes are made.  First, zeros and points whose absolute z-score
        exceeds ``thd_z_score`` are flagged and blanked.  Then the remaining
        data is split into consecutive windows of ``window_size`` points (a
        shorter tail is merged into the last window) and every point outside
        ``window mean +/- tolerance`` is flagged.

        Args:
            series: pandas Series to inspect.  It is not modified.
            window_size: Positive number of points per window.
            thd_z_score: Absolute z-score above which a point is extreme.
            thd_quantile: Quantile of the absolute point-to-point steps used
                as the window tolerance when ``rate_of_change`` is None.
            rate_of_change: Optional list/array of tolerances, one per
                window, or a single value when ``all_same_rate`` is True.
            all_same_rate: Use the single value of ``rate_of_change`` for
                every window.
            verbose: Print one summary line per window.

        Returns:
            Series of the flagged points, sorted by index.

        Raises:
            TypeError: If *series* is not a pandas Series.
            ValueError: If ``window_size`` or ``rate_of_change`` is invalid.
        """
        if not isinstance(series, pd.Series):
            raise TypeError(f"Input 'series' must be a pandas Series object, not {type(series)}")

        cleaned, outliers = self.__remove_extreme_outliers(series, thd_z_score)

        if rate_of_change is None:
            outliers = self.__detect_outliers_with_thd_quantile(
                cleaned, outliers, window_size, thd_quantile, verbose)
        else:
            outliers = self.__detect_outliers_with_rate_of_change(
                cleaned, outliers, window_size, rate_of_change, all_same_rate, verbose)

        return outliers.sort_index()

    def outliers_treatment(self, series, outliers, method='time', mask_duplicates=True):
        """Blank the outliers of *series* and fill the gaps by interpolation.

        Args:
            series: pandas Series to treat.  It is not modified.
            outliers: Either a Series of outliers (its index is used, e.g.
                the result of ``rate_of_change_outlier_detector``) or an
                iterable of index labels.
            method: Passed to ``Series.interpolate``.  The default ``'time'``
                needs a datetime-like index.
            mask_duplicates: When True, every value that repeats an earlier
                value is reset to NaN after interpolation.  NOTE(review):
                presumably meant to drop the flat padding interpolation adds
                after the last valid point; it also blanks genuine repeats.

        Returns:
            The treated Series.

        Raises:
            TypeError: If *series* is not a pandas Series.
        """
        if not isinstance(series, pd.Series):
            raise TypeError(f"Input 'series' must be a pandas Series object, not {type(series)}")

        labels = outliers.index if isinstance(outliers, pd.Series) else outliers
        treated = series.mask(series.index.isin(labels))
        treated = treated.interpolate(method=method)
        if mask_duplicates:
            treated = treated.mask(treated.duplicated())
        return treated
