In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [64]:
cars = pd.read_csv('cars.csv')
df = cars.copy()

In [65]:
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


### IQR_SIMPTransformer

Transformer for detecting outliers and replace them using SimpleImputer

In [91]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.utils.validation import check_array, check_is_fitted

class IQR_SIMPTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=2):
        self.threshold = threshold
        self.simple_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    
    def fit(self, X, y=None):
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = X.columns
        X = check_array(X, force_all_finite='allow-nan', dtype=None) 
        X = pd.DataFrame(X)
        
        for col in X.select_dtypes(include=[np.number]).columns:
            if col == 'Year': continue # we don't do anything with year 
            Q1,Q3 = np.percentile(X[col].dropna(), [25,75])
            IQR = Q3 - Q1
            # Identify outliers
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            
            # Replacing outliers with np.nan, using .loc()
            X[col] = X[col].apply(lambda x: np.nan if x < lower_bound or x > upper_bound else x)
        
        # Fitting the imputer on the DataFrame with outliers replaced with nan
        self.simple_imputer.fit(X)      
        
        return self
    
    def transform(self, X, y=None):
        check_is_fitted(self, 'simple_imputer') 
        X = check_array(X, force_all_finite='allow-nan', dtype=None)
        X = pd.DataFrame(X)        
        assert self.n_features_in_ == X.shape[1], "Number of features does not match"
        
        #Replacing outliers with the new data
        #this part is the same like for fit method
        for col in X.select_dtypes(include=[np.number]).columns:  
            if col == 'Year': continue
            Q1, Q3 = np.percentile(X[col].dropna(), [25, 75])
            IQR = Q3 - Q1
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            
            # Replace outliers with np.nan
            X[col] = X[col].apply(lambda x: np.nan if x < lower_bound or x > upper_bound else x)
        
        # Transform the DataFrame
        X_imputed = self.simple_imputer.transform(X)
        
        # Return as DataFrame
        return pd.DataFrame(X_imputed, columns=self.feature_names_in_)
    
    def get_feature_names_out(self, input_features=None):
        return self.feature_names_in_

In [94]:
# Example usage
data = {
    'A': [1, 2, 3, 4, 100],
    'B': [10, 20, 30, 40, 50],
    'Year': [2001, 2002, 2003, 2004, 2005]
}
df_1 = pd.DataFrame(data)

transformer = IQR_SIMPTransformer(threshold=2)
transformer.fit(df_1)
transformer.get_feature_names_out()
transformed_df = transformer.transform(df_1)
print(df_1)
print('==='*50)
print(transformed_df)

     A   B  Year
0    1  10  2001
1    2  20  2002
2    3  30  2003
3    4  40  2004
4  100  50  2005
     A     B    Year
0  1.0  10.0  2001.0
1  2.0  20.0  2002.0
2  3.0  30.0  2003.0
3  4.0  40.0  2004.0
4  2.5  50.0  2005.0


### LogTransformer

This transformer applies a logarithmic transformation to positively skewed data. It will only apply the logarithmic transformation to numeric columns and will ignore columns with non-positive values, as the logarithmic function is not defined for these values.

In [67]:
from sklearn.utils.validation import check_array, check_is_fitted

class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, base):
        self.base = base

    def fit(self, X, y=None):
        X = check_array(X, dtype=None, force_all_finite=False)
        self.n_features_in_ = X.shape[1]
        return self
    
    def transform(self, X, y=None):
        check_is_fitted(self, 'n_features_in_')
        X = check_array(X, dtype=None, force_all_finite=False)

        assert self.n_features_in_ == X.shape[1], "Number of features does not match"
        X = pd.DataFrame(X)

        for col in X.select_dtypes(include=[np.number]).columns:
            # Only apply the transformation to columns with all positive values
            if (X[col] > 0).all():
                X[col] = np.log(X[col]) / np.log(self.base)
            else:
                print(f"Column {col} contains non-positive values and will not be log-transformed.")

        return X

In [68]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipeline = Pipeline([
    ('iqr_knn', IQR_KNNTransformer(n_neighbors=5, threshold=1.5)),
    ('log', LogTransformer(base=10)),
    ('standarize', MinMaxScaler())
])

In [69]:
from sklearn.compose import ColumnTransformer, make_column_selector

preprocessing = ColumnTransformer(
    [
        ('num_columns', pipeline, make_column_selector(dtype_include=np.number))
    ]
)

transformed_data = preprocessing.fit_transform(df)

print("Original Data:")
print(df)
print("Transformed Data:")
print(transformed_data)

TypeError: Cannot perform 'ror_' with a dtyped [int64] array and scalar of type [bool]