Clone the repository

In [2]:
# Shell escape: clone the pypelines sources from GitHub into the current working directory.
!git clone https://github.com/Zerve-AI/pypelines.git

Installing the pypelines package

In [None]:
import os

# Parent directory that contains the cloned `pypelines` checkout. It must match
# the directory the clone cell was run from. The original hard-coded location
# stays the default; set the PYPELINES_PARENT environment variable to point
# somewhere else without editing the notebook.
folder = os.environ.get('PYPELINES_PARENT', 'C:/zerve')

# Work from the repository root so that `pip install .` and the relative dataset
# path used later resolve correctly. Raises FileNotFoundError if the checkout
# is not there.
os.chdir(os.path.join(folder, 'pypelines'))

In [None]:
!pip install .

LIST OF DATAPREP METHODS

In [None]:
# Helper module of the pypelines package installed above.
from pypelines import utils

# Query pypelines for the data-prep methods available under method_type='transformer'
# (presumably the valid values for `transformer_method` below — TODO confirm).
utils.list_supported_dataprepmethods(method_type='transformer')

Loading Library

In [None]:
import pypelines.data_prep_pipeline as pipe
import pandas as pd

Data Load and Method Selection

In [None]:
# Load the housing dataset; the path is relative to the current working
# directory, which an earlier cell sets to the cloned repository.
housing = pd.read_csv("pypelines/datasets/regression/housing.csv")

# The forecasting and transformation steps are configured with the same columns.
_numeric_columns = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

# Configure every data-prep stage in one pipeline object; the date-related
# options are explicitly disabled with None.
dataprep_pypelines_all = pipe.DataPrepPipeline(
    data=housing,
    target='population',
    preprocessing_method='MatchVariables',
    outlier_method='Winsorizer',
    numerical_imputation_method='MeanMedianImputer',
    categorical_imputation_method='CategoricalImputer',
    encoding_method='RareLabelEncoder',
    datetime_method=None,
    target_date1_column=None,
    target_date2_column=None,
    forecasting_method='LagFeatures',
    forecast_columns=list(_numeric_columns),      # independent copy per argument
    transformer_method='LogTransformer',
    transformer_columns=list(_numeric_columns),
    discretisation_method='EqualFrequencyDiscretiser',
    discretisation_columns=['housing_median_age'],
)

Data Preparation code generation

In [None]:
# Presumably places the generated data-preparation code on the system clipboard
# so it can be pasted into a new cell — TODO confirm against the pypelines docs.
dataprep_pypelines_all.code_to_clipboard()

In [None]:

import numpy as np
import pandas as pd
from feature_engine.preprocessing import MatchVariables
from feature_engine.outliers import Winsorizer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer

from feature_engine.encoding import RareLabelEncoder
from feature_engine.timeseries.forecasting import LagFeatures
from feature_engine.transformation import LogTransformer
# Note: the RareLabelEncoder instance is created in the "Encoding Categorical
# Variables" step, inside its try/except; no encoder is built at import time.

# Working dataframe: housing
# Name the target column and collect every other column as a feature.
target = "population"
features = housing.columns.drop("population").tolist()
# Feature-only view of the data, used below to measure missingness per column.
feature_df = housing[features]



# Preprocessing
# Fit MatchVariables on the frame and apply it. If the step fails, the error is
# reported and `housing` is left unchanged so the remaining steps can still run.
try:
    process = MatchVariables(missing_values='ignore', verbose=True)
    housing = process.fit_transform(housing)
except Exception as e:
    # Report the step by its own name (the message used to say "outlier").
    print("Error in preprocessing:", str(e))




# Missing Value Imputation
#
# Only feature columns with at most 10% missing values are kept. Of those, the
# int64/float64 columns are median-imputed and the category/object columns get
# an explicit "missing" label. Columns above the threshold, or of any other
# dtype, are not carried into the resulting frame.
edited_missing_columns = feature_df.columns[feature_df.isnull().mean() <= 0.1].tolist()

# Default to empty selections so every name used below is always defined, even
# when no column passes the missingness filter.
int_lst = []
cat_lst = []
if len(edited_missing_columns) != 0:
    int_lst = housing[edited_missing_columns].select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_lst = housing[edited_missing_columns].select_dtypes(include=['category', 'object']).columns.tolist()

# Start from the un-imputed columns. If a dtype group is empty or its imputer
# raises, the concat below still has a frame to use instead of failing with a
# NameError on an unassigned variable.
housing_num = housing[int_lst]
housing_cat = housing[cat_lst]

if len(int_lst) > 0:
    try:
        imputer = MeanMedianImputer(imputation_method='median', variables=int_lst)
        housing_num = imputer.fit_transform(housing[int_lst])
    except Exception as e:
        print("Error in integer imputation:", str(e))

if len(cat_lst) > 0:
    try:
        imputer = CategoricalImputer(imputation_method='missing', variables=cat_lst)
        housing_cat = imputer.fit_transform(housing[cat_lst])
    except Exception as e:
        # This handler belongs to the categorical step (it used to say "integer").
        print("Error in categorical imputation:", str(e))

# Reassemble: numeric features, then categorical features, then the target column.
feature_df = pd.concat([housing_num, housing_cat], axis=1)
housing = pd.concat([feature_df, housing["population"]], axis=1)




# Handling Outliers
# Fit a Winsorizer on the whole frame and apply it. On failure the error is
# printed and `housing` keeps its previous value.
try:
    out = Winsorizer(
        capping_method='gaussian',
        tail='right',
        fold=3,
        add_indicators=False,
        variables=None,
        missing_values='ignore',
    )
    housing = out.fit_transform(housing)
except Exception as exc:
    print("Error in outlier:", exc)
 



# Encoding Categorical Variables
# Fit a RareLabelEncoder on the whole frame and apply it. On failure the error
# is printed and `housing` keeps its previous value.
try:
    encode = RareLabelEncoder(
        tol=0.05,
        n_categories=10,
        max_n_categories=None,
        replace_with='Rare',
        variables=None,
        missing_values='raise',
        ignore_format=False,
    )
    housing = encode.fit_transform(housing)
except Exception as exc:
    # NOTE(review): "integer" in this message looks like a copy-paste leftover.
    print("Error in integer encoding:", exc)





# Forecasting Features
# Add lag features (periods 1 and 2) for the listed columns while keeping the
# original columns. On failure the error is printed and `housing` is unchanged.
try:
    lf = LagFeatures(
        variables=[
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income',
            'median_house_value',
        ],
        periods=[1, 2],
        freq=None,
        sort_index=True,
        missing_values='raise',
        drop_original=False,
    )
    housing = lf.fit_transform(housing)
except Exception as exc:
    # NOTE(review): "integer" in this message looks like a copy-paste leftover.
    print("Error in integer forecast:", exc)



# Variable transformer
# Apply a LogTransformer to the listed columns. On failure the error is printed
# and `housing` keeps its previous value.
try:
    lt = LogTransformer(
        variables=[
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income',
            'median_house_value',
        ]
    )
    housing = lt.fit_transform(housing)
except Exception as exc:
    # NOTE(review): "integer" in this message looks like a copy-paste leftover.
    print("Error in integer transformer:", exc)



# Discretisation
# Bin `housing_median_age` with an EqualFrequencyDiscretiser (q=10), fitted on
# the whole frame. On failure the error is printed and `housing` is unchanged.
try:
    discret = EqualFrequencyDiscretiser(
        variables=['housing_median_age'],
        q=10,
        return_object=False,
        return_boundaries=False,
        precision=3,
    )
    housing = discret.fit_transform(housing)
except Exception as exc:
    # NOTE(review): "integer" in this message looks like a copy-paste leftover.
    print("Error in integer discretisation:", exc)



##### End of Data Processing Pipeline #####

