In [17]:
import pandas as pd

In [18]:
# Load environment variables from the .env file
import os
from dotenv import load_dotenv
load_dotenv()

True

In [19]:
# Run configuration, read from the process environment after load_dotenv() has run.
# Each value is a str, or None when the variable is not set, so numeric settings
# such as n_trials and port must be cast before being used as numbers.

# Training data and model settings
train_file = os.environ.get("DATASET")
target_column = os.environ.get("TARGET")
model_name = os.environ.get("MODEL")
n_trials = os.environ.get("TRIALS")

# Deployment settings
depl_type = os.environ.get("DEPLOYMENT_TYPE")
input_folder = os.environ.get("INPUT_FOLDER")
output_folder = os.environ.get("OUTPUT_FOLDER")
port = os.environ.get("PORT")

# Show which dataset this run will use
print(train_file)

data/zoo_cls.parquet


In [20]:
import sys
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import KernelPCA

In [51]:
def preprocess(input_file, target_column, output_file, output_prep, output_pca,
               feature_limits=15, max_categories=20):
    """Fit the preprocessing pipeline on a parquet dataset and persist the results.

    Steps:
      1. Read ``input_file`` and split it into features and target.
      2. Treat every feature with at most ``max_categories`` distinct values as
         categorical and every other feature as numeric.
      3. Parse non-numeric-dtype columns of the numeric group into float32;
         values that cannot be parsed become NaN and are imputed later.
      4. Impute + scale numeric features, impute + one-hot encode categorical ones.
      5. If the encoded matrix has more than ``feature_limits`` columns, reduce it
         to ``feature_limits`` components with an RBF KernelPCA.
      6. Dump the preprocessor and the PCA object with joblib and write the
         transformed features plus the target to ``output_file``.

    Parameters
    ----------
    input_file : path of the parquet file to read.
    target_column : name of the target column in that file.
    output_file : path of the parquet file to write.
    output_prep : destination passed to ``joblib.dump`` for the fitted ColumnTransformer.
    output_pca : destination passed to ``joblib.dump`` for the KernelPCA object.
    feature_limits : maximum number of encoded columns kept without PCA, and the
        number of PCA components otherwise (default 15).
    max_categories : largest number of distinct values for a feature to be
        handled as categorical (default 20).

    Returns
    -------
    pandas.DataFrame
        Transformed features followed by the target column. Column labels are
        positional integers because the final concat uses ``ignore_index=True``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of the input file.
    """
    raw_df = pd.read_parquet(input_file, engine='pyarrow')

    if target_column not in raw_df.columns:
        raise KeyError(f"Target column {target_column!r} not found in {input_file!r}")

    # Split features and target
    features_name = [col for col in raw_df.columns if col != target_column]
    features = raw_df[features_name]
    target = raw_df[[target_column]]

    # Decide each feature's type from its cardinality
    numeric_features = []
    categoric_features = []
    for col in features_name:
        if features[col].nunique() <= max_categories:
            categoric_features.append(col)
        else:
            numeric_features.append(col)

    # Coerce the non-numeric-dtype columns of the numeric group to float32.
    # pd.to_numeric also parses decimals, signs and exponents (str.isnumeric()
    # rejects all of those) and tolerates missing values; anything it cannot
    # parse becomes NaN. The .copy() avoids writing into a slice of `features`.
    numeric_block = features[numeric_features]
    already_numeric = numeric_block.select_dtypes('number')
    coerced = numeric_block.select_dtypes(exclude='number').copy()
    for col in coerced.columns:
        coerced[col] = pd.to_numeric(coerced[col], errors='coerce')
    coerced = coerced.astype('float32')
    features = pd.concat(
        [already_numeric, coerced, features[categoric_features]], axis=1
    )

    # Preprocessing pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categoric_features),
        ]
    )

    transformed = preprocessor.fit_transform(features)

    # Dimensionality reduction is only applied when the encoded matrix is wide
    pca = KernelPCA(n_components=feature_limits, kernel='rbf')
    if transformed.shape[1] > feature_limits:
        transformed = pca.fit_transform(transformed)

    # The ColumnTransformer may return a SciPy sparse matrix (one-hot output);
    # densify it so pd.DataFrame below builds one numeric column per feature.
    if hasattr(transformed, 'toarray'):
        transformed = transformed.toarray()

    # Persist the fitted preprocessor and the PCA object.
    # NOTE: when the PCA step above was skipped, the object saved here is an
    # unfitted KernelPCA; whoever loads it has to handle that case.
    joblib.dump(preprocessor, output_prep)
    joblib.dump(pca, output_pca)

    # concat(axis=1) aligns rows on the index, so the target index is reset to
    # match the positional index of the transformed matrix.
    result = pd.concat(
        [pd.DataFrame(transformed), target.reset_index(drop=True)],
        axis=1,
        ignore_index=True,
    )
    result.to_parquet(output_file, index=False, engine='pyarrow')
    print('Preprocessing stage is done')
    return result

In [52]:
# Input of the preprocessing stage: the dataset path read from the environment.
# target_column is already defined by the configuration cell above, so it is
# used as-is instead of being re-assigned to itself.
input_file = train_file

# Artifacts written by the preprocessing stage
output_file = 'data/data_prep.parquet'
output_prep = 'models/preprocessor.pkl'
output_pca = 'models/pca.pkl'

In [53]:
# Run the preprocessing stage; `a` holds the DataFrame returned by preprocess()
a = preprocess(input_file, target_column, output_file, output_prep, output_pca)

Preprocessing stage is done
