<a href="https://colab.research.google.com/github/alexandergribenchenko/01_Crypto/blob/main/Object_Oriented_Programming/DS_OOP_OW_03_Sklearn_Transformator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dataset**

In [136]:
import pandas as pd
  
# Toy dataset: three numeric columns with ten observations each.
C1_lst = [-21.57, 1.24, -0.84, 32.25, 0.82, -3.11, 0.46, -18.68, 0.04, 30.87]
C2_lst = [2.49, 2.27, 0.25, -2.33, -2.91, -3.61, 0.58, -2.59, -3.99, 1.54]
C3_lst = [1.14, 4.76, 14.23, -2.65, -3.53, -0.03, 17.50, -0.21, 2.96, 2.31]

# One column per list; the mapping keeps each label next to its data.
df = pd.DataFrame({'C1': C1_lst, 'C2': C2_lst, 'C3': C3_lst})
df

Unnamed: 0,C1,C2,C3
0,-21.57,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,14.23
3,32.25,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,17.5
7,-18.68,-2.59,-0.21
8,0.04,-3.99,2.96
9,30.87,1.54,2.31


# **Function**

In [6]:
def handle_outliers_V_01(df):
  """Cap the outliers of every column of ``df`` using the 1.5*IQR rule.

  For each column the quartiles Q1 and Q3 are computed (pandas' default
  linear interpolation; pass ``interpolation='lower'`` to ``quantile`` if
  the fences should be based on observed values only). Values outside
  ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are outliers. Each outlier is replaced by
  the most extreme value of the column that is still inside the fences
  (not by the fence itself).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns are all numeric. It is not modified.

  Returns
  -------
  pandas.DataFrame
      New frame with the same index and columns as ``df`` and the outliers
      capped. A column without any value inside its fences (empty or
      all-NaN) is returned unchanged instead of raising ``ValueError``.
  """
  df_output = pd.DataFrame(index=df.index)

  for col in df.columns:
    values = df[col]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3-Q1

    outlier_threshold_down = Q1-1.5*IQR
    outlier_threshold_up = Q3+1.5*IQR

    # `between` is inclusive on both ends, i.e. down <= x <= up.
    not_outliers = values[values.between(outlier_threshold_down, outlier_threshold_up)]
    if not_outliers.empty:
      # Nothing to derive the caps from (empty / all-NaN column).
      df_output[col] = values
      continue

    # Vectorised capping; NaN entries are left as NaN by `clip`.
    df_output[col] = values.clip(lower=not_outliers.min(), upper=not_outliers.max())
  return df_output

In [7]:
handle_outliers_V_01(df)

Unnamed: 0,C1,C2,C3
0,-3.11,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,4.76
3,1.24,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,4.76
7,-3.11,-2.59,-0.21
8,0.04,-3.99,2.96
9,1.24,1.54,2.31


In [15]:
def handle_outliers_V_02(df):
  """Compute the per-column 1.5*IQR outlier fences of ``df``.

  Despite its name this version does not modify any value; it only
  returns the thresholds.

  Returns
  -------
  tuple of (pandas.Series, pandas.Series)
      ``(lower, upper)`` fences indexed by column name, where
      ``lower = Q1 - 1.5*IQR`` and ``upper = Q3 + 1.5*IQR``.
  """
  lower_quartile, upper_quartile = (df.quantile(q) for q in (0.25, 0.75))
  whisker = 1.5*(upper_quartile - lower_quartile)
  return lower_quartile - whisker, upper_quartile + whisker

In [16]:
outlier_threshold_down, outlier_threshold_up = handle_outliers_V_02(df)

In [17]:
outlier_threshold_down

C1   -8.05875
C2   -9.02500
C3   -6.87750
dtype: float64

In [24]:
outlier_threshold_down.loc['C1']

-8.05875

In [18]:
outlier_threshold_up

C1     6.65125
C2     7.49500
C3    11.02250
dtype: float64

# **Estimator**

In [25]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

In [138]:
class OutliersEstimator(BaseEstimator, TransformerMixin):
  """Cap outliers of selected DataFrame columns using the 1.5*IQR rule.

  ``fit`` learns, for every selected column, the fences
  ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``. ``transform`` replaces every value
  outside the fitted fences by the most extreme value of the transformed
  column that still lies inside them.

  Parameters
  ----------
  columns : list of str, str or None, default=None
      Columns to process. ``None`` selects every column of the frame passed
      to ``fit``; a single string is treated as a one-element list.

  Attributes
  ----------
  outlier_threshold_down_ : pandas.Series
      Lower fence per fitted column.
  outlier_threshold_up_ : pandas.Series
      Upper fence per fitted column.
  """
  def __init__(self, columns=None):
    self.columns = columns

  def _selected_columns(self, X):
    """Return the columns to fit on as a list, without mutating ``self.columns``."""
    if self.columns is None:
      return list(X.columns)
    if isinstance(self.columns, str):
      return [self.columns]
    return list(self.columns)

  def fit(self, X, y=None):
    """Learn the lower/upper outlier fences of the selected columns of ``X``.

    ``y`` is accepted for pipeline compatibility and ignored.
    Returns ``self``.
    """
    selected = X[self._selected_columns(X)]
    Q1 = selected.quantile(0.25)
    Q3 = selected.quantile(0.75)
    IQR = Q3-Q1
    self.outlier_threshold_down_ = Q1-1.5*IQR
    self.outlier_threshold_up_ = Q3+1.5*IQR
    return self

  def transform(self, X):
    """Return a copy of ``X`` with the outliers of the fitted columns capped.

    Columns that were not fitted are copied through unchanged. If no value
    of a column lies inside its fitted fences, the fences themselves are
    used as caps.
    """
    check_is_fitted(self, ["outlier_threshold_down_", "outlier_threshold_up_"])
    X_output = X.copy()
    # Iterate over the columns seen in `fit` so that the fences used here are
    # always the fitted ones, never notebook-level globals.
    for col in self.outlier_threshold_down_.index:
      threshold_down = self.outlier_threshold_down_.loc[col]
      threshold_up = self.outlier_threshold_up_.loc[col]

      values = X[col]
      # `between` is inclusive on both ends, i.e. down <= x <= up.
      not_outliers = values[values.between(threshold_down, threshold_up)]
      if not_outliers.empty:
        min_not_outlier, max_not_outlier = threshold_down, threshold_up
      else:
        min_not_outlier, max_not_outlier = not_outliers.min(), not_outliers.max()

      # Vectorised capping; NaN entries are left as NaN by `clip`.
      X_output[col] = values.clip(lower=min_not_outlier, upper=max_not_outlier)
    return X_output

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C1      10 non-null     float64
 1   C2      10 non-null     float64
 2   C3      10 non-null     float64
dtypes: float64(3)
memory usage: 368.0 bytes


In [140]:
Object_OutliersEstimator = OutliersEstimator(columns=['C2'])

In [141]:
Object_OutliersEstimator.columns

['C2']

In [142]:
Object_OutliersEstimator.fit_transform(X=df)

Unnamed: 0,C1,C2,C3
0,-21.57,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,14.23
3,32.25,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,17.5
7,-18.68,-2.59,-0.21
8,0.04,-3.99,2.96
9,30.87,1.54,2.31


In [143]:
df

Unnamed: 0,C1,C2,C3
0,-21.57,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,14.23
3,32.25,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,17.5
7,-18.68,-2.59,-0.21
8,0.04,-3.99,2.96
9,30.87,1.54,2.31


In [76]:
Object_OutliersEstimator.fit(X=df)

In [95]:
Object_OutliersEstimator.outlier_threshold_down_

C1   -8.05875
C2   -9.02500
C3   -6.87750
dtype: float64

In [96]:
Object_OutliersEstimator.outlier_threshold_up_

C1     6.65125
C2     7.49500
C3    11.02250
dtype: float64

In [97]:
Object_OutliersEstimator.salida_

['perro', 'gato', 'mico']

In [80]:
Object_OutliersEstimator.transform(X=df)

Unnamed: 0,C1,C2,C3
0,-3.11,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,4.76
3,1.24,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,4.76
7,-3.11,-2.59,-0.21
8,0.04,-3.99,2.96
9,1.24,1.54,2.31


In [99]:
dir(Object_OutliersEstimator)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'columns',
 'fit',
 'fit_transform',
 'get_params',
 'outlier_threshold_down_',
 'outlier_threshold_up_',
 'salida_',
 'set_params',
 'transform']

In [100]:
Object_OutliersEstimator.columns

['perro', 'gato']

In [101]:
Object_OutliersEstimator.salida_

['perro', 'gato', 'mico']

In [None]:
self._indx_outlier_rang_int(X[col])
            self._indx_outlier_3_sig(X[col])

In [None]:
class GaussianNoise(BaseEstimator, TransformerMixin):
    """This estimator ignores its input and returns random Gaussian noise.

    It also does not adhere to all scikit-learn conventions,
    but showcases how to handle randomness.

    Parameters
    ----------
    n_components : int, default=100
        Number of noise columns produced by ``transform``.
    random_state : int, RandomState instance or None, default=None
        Seed or generator handed to ``check_random_state`` in ``fit``.

    Attributes
    ----------
    random_state_ : numpy.random.RandomState
        Generator resolved from ``random_state`` during ``fit``.
    """

    def __init__(self, n_components=100, random_state=None):
        self.random_state = random_state
        self.n_components = n_components

    # the arguments are ignored anyway, so we make them optional
    def fit(self, X=None, y=None):
        """Resolve the random generator and return ``self``.

        Returning ``self`` is what allows ``fit(...).transform(...)`` and the
        inherited ``fit_transform`` to work.
        """
        self.random_state_ = check_random_state(self.random_state)
        return self

    def transform(self, X):
        """Return a ``(X.shape[0], n_components)`` array of standard normal draws.

        Only the number of rows of ``X`` is used; its values are ignored.
        """
        n_samples = X.shape[0]
        return self.random_state_.randn(n_samples, self.n_components)