In [1]:
import warnings
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore")

In [2]:
# Build the CSV-export URL (gviz endpoint) for one tab of a Google Sheets
# workbook and load that tab into a DataFrame.
sheet_id = '1wLXAt2bZEGKIeE_mvg1zYUSAfQCjkrDOREjLKxL55ZE'
sheet_name = 'Employee-Attrition'
# Percent-encode the tab name so that names containing spaces, '&', '#', etc.
# cannot corrupt the query string. The current name contains no such
# characters, so the resulting URL is identical to the un-encoded one.
url = (
    f"https://docs.google.com/spreadsheets/d/{sheet_id}"
    f"/gviz/tq?tqx=out:csv&sheet={quote(sheet_name, safe='')}"
)

# Network read: requires the sheet to be reachable from the running kernel.
df = pd.read_csv(url)

In [3]:
# `Employee_Attrition` comes from the project-local module `Fun`; the instance
# is used below for its `clean` and `selection` methods.
from Fun import Employee_Attrition
EA = Employee_Attrition()

In [4]:
# Run the project helper's cleaning step on the loaded frame.
# `df` is rebound to the result, so re-running this cell passes the
# already-cleaned frame back into clean().
# NOTE(review): presumably normalises labels to lower case (later cells use
# lower-case column names and category values) — confirm in Fun.clean.
df = EA.clean(df)

In [5]:
# selection() returns an indexable result; only its first element is kept.
# NOTE(review): what the remaining elements hold is not visible from this
# notebook — confirm in Fun.selection.
Att_df = EA.selection(df)[0]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

In [9]:
# Explicit category orderings handed to OrdinalEncoder(categories=...).
# The encoder matches these lists to its input columns BY POSITION, so the
# i-th list must describe the i-th column of `cat_col.columns`; each label is
# encoded as its index within its list.
# The per-list column guesses below are inferred from the values only —
# TODO confirm against the actual order of cat_col.columns.
order_list = [
    # presumably gender
    ['female', 'male'],
    # presumably marital status
    ['single', 'married', 'divorced'],
    # presumably education field
    [
        'marketing', 'medical', 'life sciences',
        'technical degree', 'human resources', 'other',
    ],
    # presumably department
    ['sales', 'research & development', 'human resources'],
    # presumably job role
    [
        'sales representative', 'laboratory technician', 'research scientist',
        'human resources', 'sales executive', 'manufacturing director',
        'healthcare representative', 'research director', 'manager',
    ],
    # presumably a yes/no flag such as overtime
    ['no', 'yes'],
    # presumably business travel frequency
    ['non-travel', 'travel_rarely', 'travel_frequently'],
]

In [11]:
# Split the selected features into the three groups consumed by the
# ColumnTransformer below:
#   cat_col      object-dtype columns, minus the 'attrition' column
#   cat_col_num  the two columns named in `_coded_cols`, kept out of scaling
#   num_col      every other non-object column
# NOTE(review): the split relies on text columns having `object` dtype —
# confirm this holds for the pandas version in use.
_coded_cols = ['joblevel', 'performancerating']

_object_part = Att_df.select_dtypes(include=['object'])
_non_object_part = Att_df.select_dtypes(exclude=['object'])

cat_col = _object_part.drop(columns=['attrition'])
cat_col_num = Att_df[_coded_cols]
num_col = _non_object_part.drop(columns=_coded_cols)

In [7]:
# Numeric branch: fill missing values with the column median, then
# standardise with StandardScaler's default settings.
num_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
num_trans  # bare last expression -> rich repr of the pipeline

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [10]:
# Categorical branch: replace missing labels with the column's most frequent
# value, then map every label to its integer position in `order_list`.
# The encoder keeps its default handle_unknown='error', so a label that is
# absent from `order_list` raises instead of being silently encoded.
cat_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=order_list, dtype='int')),
])
cat_trans  # bare last expression -> rich repr of the pipeline

0,1,2
,steps,"[('imputer', ...), ('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['female', 'male'], ['single', 'married', ...], ...]"
,dtype,'int'
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [12]:
# Branch for the already-numeric coded columns: impute with the most frequent
# value only — no encoding and no scaling step.
cat_trans1 = Pipeline([('imputer', SimpleImputer(strategy='most_frequent'))])
cat_trans1  # bare last expression -> rich repr of the pipeline

0,1,2
,steps,"[('imputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [13]:
# Route each column group to its own branch. `remainder` is left at its
# default ('drop'), so any column not named in one of the three groups is
# discarded from the transformed output.
preprocess = ColumnTransformer([
    ('num', num_trans, num_col.columns),
    ('cat', cat_trans, cat_col.columns),
    ('cat1', cat_trans1, cat_col_num.columns),
])
preprocess  # bare last expression -> rich repr of the transformer

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['female', 'male'], ['single', 'married', ...], ...]"
,dtype,'int'
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [14]:
# Top-level pipeline. It currently holds only the preprocessing stage; no
# estimator step is attached in this cell.
pipe = Pipeline([('preprocess', preprocess)])
pipe  # bare last expression -> rich repr of the pipeline

0,1,2
,steps,"[('preprocess', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['female', 'male'], ['single', 'married', ...], ...]"
,dtype,'int'
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False
