In [1]:
import pandas as pd
import pypelines.data_prep_pipeline as pipe

In [3]:
housing = pd.read_csv("pypelines/datasets/regression/housing.csv")
reg_pypelines_all = pipe.DataPrepPipeline(
    data = housing, 
    target = 'population',
    preprocessing_method = 'MatchVariables',
    outlier_method = 'Winsorizer', 
    numerical_imputation_methods = 'MeanMedianImputer', 
    categorical_imputation_methods = 'CategoricalImputer', 
    encoding_method = 'RareLabelEncoder',
    datetime_method = 'DatetimeFeatures',
    target_date1_column = None,
    target_date2_column = None,
    discretisation_method = 'EqualFrequencyDiscretiser')

In [4]:
reg_pypelines_all.code_to_clipboard()

In [5]:

import numpy as np
import pandas as pd
from feature_engine.preprocessing import MatchVariables
from feature_engine.outliers import Winsorizer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import RareLabelEncoder


encode = RareLabelEncoder(tol=0.05, n_categories=10, max_n_categories=None, replace_with='Rare', variables=None, missing_values='raise', ignore_format=False)

# target dataframe: housing
target = "population"
features = list(housing.columns.drop("population"))
feature_df = housing[features]




# Preprocessing
try:
    
    process = MatchVariables(missing_values='raise', verbose=True)
    housing = process.fit_transform(housing)

    
except Exception as e:
    print("Error in outlier:", str(e))




# Handling Outliers
try:
    
    out = Winsorizer(capping_method='gaussian', tail='right', fold=3, add_indicators=False, variables=None, missing_values='ignore')
    housing = out.fit_transform(housing)
    
except Exception as e:
    print("Error in outlier:", str(e))
 


# Missing Value Imputation

# Identifying the missing columns and selecting only columns with less than 10% missing values
edited_missing_columns = feature_df.columns[feature_df.isnull().mean() <= 0.1].tolist()

if len(edited_missing_columns) != 0:
    int_lst = housing[edited_missing_columns].select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_lst = housing[edited_missing_columns].select_dtypes(include=['category','object']).columns.tolist()
    try:
        if len(int_lst) > 0:
            
            imputer = MeanMedianImputer(imputation_method='median', variables=int_lst)
            housing_num = imputer.fit_transform(housing[int_lst])

            
    except Exception as e:
        print("Error in integer imputation:", str(e))

    try:
        if len(cat_lst) > 0:
            
            imputer = CategoricalImputer(imputation_method='missing',variables=cat_lst)
            housing_cat = imputer.fit_transform(housing[cat_lst])

            
    except Exception as e:
        print("Error in integer imputation:", str(e))
        
feature_df = pd.concat([housing_num,housing_cat],axis=1)
housing =  pd.concat([feature_df,housing["population"]],axis=1)




# Encoding Categorical Variables
try:
    
    encode = RareLabelEncoder(tol=0.05, n_categories=10, max_n_categories=None, replace_with='Rare', variables=None, missing_values='raise', ignore_format=False)
    housing = encode.fit_transform(housing)
    
except Exception as e:
    print("Error in integer encoding:", str(e))



# Datetime Features
try:
    
    dt = DatetimeFeatures(variables=None, features_to_extract=None, drop_original=True, missing_values='raise', dayfirst=False, yearfirst=False, utc=None)
    dtf = dt(features_to_extract = ["year", "month", "day_of_month"])
    x = housing.drop("population", axis=1)
    y = housing["population"]
    x = dtf.fit_transform(x)
    housing = pd.concat([x,y])


    
except Exception as e:
    print("Error in integer datetime:", str(e))



# Discretisation
try:
    
    discret = EqualFrequencyDiscretiser(variables=None, q=10, return_object=False, return_boundaries=False, precision=3)
    x = housing.drop("population", axis=1)
    x = discret.fit_transform(x)

    
except Exception as e:
    print("Error in integer encoding:", str(e))

##### End of Data Processing Pipeline #####



Error in outlier: Some of the variables in the dataset contain NaN. Check and remove those before using this transformer.
Error in integer datetime: 'DatetimeFeatures' object is not callable


