In [11]:
import pandas as pd
import numpy as np

# Load the cleaned sleep-health table; the file is semicolon-delimited.
data = pd.read_csv("data/sleep_health_clean.csv", sep=";")

In [12]:

# Separate the target from the predictors.
# The target is reshaped into a single-column 2-D array before it is handed
# to the target pipeline; the predictors keep every other column of `data`.
# `drop` returns a new frame, so `data` itself is left untouched.
y = data["Sleep Disorder"].values.reshape(-1, 1)
X = data.drop(columns=["Sleep Disorder"])

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
# Build the dense one-hot encoder in a way that works across scikit-learn
# releases: the `sparse` keyword was renamed to `sparse_output` in 1.2 and
# removed in 1.4, so `OneHotEncoder(sparse=False)` raises TypeError on
# current versions.
try:
    dense_one_hot = OneHotEncoder(drop='first', categories='auto', sparse_output=False)
except TypeError:  # older scikit-learn (< 1.2) only understands `sparse`
    dense_one_hot = OneHotEncoder(drop='first', categories='auto', sparse=False)

# Column-wise preprocessing of the predictors. Columns are addressed by
# integer position, so this transformer depends on the column order of X.
X_preprocessor = ColumnTransformer(
    transformers=[
        # Positions 0, 4, 10 and 12 are discarded.
        ('drop_columns', 'drop', [0, 4, 10, 12]),

        # Column 1: ordinal code following the listed order
        # ("Female" -> 0, "Male" -> 1), stored as int8.
        ("gender_encoder",
         OrdinalEncoder(categories=[["Female", "Male"]], dtype=np.int8),
         [1]),

        # Column 3: one-hot encoded, with the first category dropped.
        ("one_hot_encoder", dense_one_hot, [3]),

        # Column 8: ordinal code following the listed order, then min-max scaled.
        # NOTE(review): "Normal" and "Normal Weight" are encoded as two
        # distinct ordinal levels -- confirm that this is intended.
        ("bmi_encoder",
         Pipeline(steps=[
             ('bmi', OrdinalEncoder(
                 categories=[["Normal", "Normal Weight", "Overweight", "Obese"]],
                 dtype=np.int8)),
             ('bmi_scaler', MinMaxScaler()),
         ]),
         [8]),
    ],
    # Every column not named above is standardised.
    remainder=StandardScaler(),
)

In [14]:
# Wrap the column preprocessor in a one-step Pipeline so the whole predictor
# transformation is handled as a single estimator object.
X_pipeline = Pipeline(steps=[('preprocessor', X_preprocessor)])

In [15]:
# Dense one-hot encoder for the target. scikit-learn renamed `sparse` to
# `sparse_output` in 1.2 and removed the old keyword in 1.4, so try the new
# spelling first and fall back for older installations.
try:
    dense_label_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # older scikit-learn (< 1.2) only understands `sparse`
    dense_label_encoder = OneHotEncoder(sparse=False)

# Target preprocessing: missing labels are replaced by the constant 'none',
# then the label column is expanded into a dense indicator matrix with one
# column per distinct label.
y_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='none', strategy='constant')),
    ('label_encoder', dense_label_encoder),
])

In [16]:
# Fit every transformer of the predictor pipeline on the predictor frame X.
X_pipeline.fit(X)



In [17]:
# Fit the target pipeline (imputer, then one-hot encoder) on the label column y.
y_pipeline.fit(y)



In [18]:
# Apply the fitted predictor pipeline to X; the resulting array is shown as
# the cell output.
X_pipeline.transform(X)

array([[ 1.        ,  1.        ,  0.        , ...,  1.47559156,
        -0.45923879,  1.17047394],
       [ 1.        ,  1.        ,  0.        , ...,  1.47559156,
        -0.45923879,  1.17047394],
       [ 1.        ,  1.        ,  0.        , ...,  1.47559156,
        -0.45923879,  0.44410682],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -1.34583606,
         1.47930869, -0.52438269],
       [ 0.        ,  0.        ,  0.        , ..., -1.34583606,
         1.47930869, -0.52438269],
       [ 0.        ,  0.        ,  0.        , ..., -1.34583606,
         1.47930869, -0.52438269]])

In [19]:
# Apply the fitted target pipeline to y; each row of the result is the
# one-hot indicator vector for that sample's label.
y_pipeline.transform(y)

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [20]:
import os

import joblib

# Persist both fitted pipelines so they can be reloaded elsewhere.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a fresh checkout may not have it).
os.makedirs('data-pipelines-bin', exist_ok=True)

# NOTE: these files are pickle-based; only load them from trusted sources.
joblib.dump(X_pipeline,'data-pipelines-bin/predictor_pipeline')
joblib.dump(y_pipeline,'data-pipelines-bin/target_pipeline')

['data-pipelines-bin/target_pipeline']