# Import Library

In [1]:
import pandas as pd
import numpy as np

from func import zero_std, custom_info

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('csv/employee_attrition_train.csv')

In [3]:
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({'Yes':1,'No':0})

In [4]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [5]:
zero_std(X)

['EmployeeCount', 'StandardHours']

In [6]:
drop_zero_std = FunctionTransformer(X.drop(zero_std(X), inplace=True, axis=1))
drop_unused_cols = FunctionTransformer(X.drop(['Over18', 'EmployeeNumber'], inplace=True, axis=1))

In [7]:
def extraction(df, reference_column, target_column):
    # Pada First Extraction, df dibersihkan dari NaN values (jika ada) di reference column
    first_extraction = df[~df[reference_column].isna()]
    
    # Pada Second Extraction, df dibersihkan dari NaN values di reference column dan target column
    second_extraction = first_extraction[~first_extraction[target_column].isna()]
    second_extraction = second_extraction.sort_values([reference_column, target_column], ascending=[False, False])
    
    # Pada Second Extraction data sudah bersih dari NaN value dari reference column maupun target column
    return first_extraction, second_extraction

first, second = extraction(X, 'TotalWorkingYears', 'Age')

missing_age = first[first.Age.isna()]
missing_age = missing_age['TotalWorkingYears'].unique()
missing_age = np.sort(missing_age).tolist()

imputer_val_arr = []
for i in missing_age:
    source = second[second.TotalWorkingYears==i]['Age']
    if i == 7 or i == 8:
        imputer_val_arr.append(int(source.mean()))
    else:
        imputer_val_arr.append(int(source.median()))

zip_iter = zip(missing_age, imputer_val_arr)
age_val_imputer = dict(zip_iter)

def fill_age(x):
    if np.isnan(x['Age']):
        return age_val_imputer[x['TotalWorkingYears']]
    else:
        return x['Age']

# def age_column_swap(df):
#     df = pd.concat(
#     [
#         df,
#         df.apply(fill_age,axis=1)
#     ],
#     axis=1)
#     df = df.drop(columns='Age')
#     return df
def age_imputer(df):
    df.Age = df.apply(fill_age,axis=1)
    return df

age_handler = FunctionTransformer(age_imputer(X))
# df[age_imputer(X)['Age'].isna()]

In [8]:
dfh_handler = FunctionTransformer(X['DistanceFromHome'].fillna(int(X['DistanceFromHome'].median()), inplace=True))
dr_handler = FunctionTransformer(X['DailyRate'].fillna(int(X['DailyRate'].median()), inplace=True))

In [9]:
pipeline = Pipeline([
    ('age_handler', age_handler),
    ('dfh_handler', dfh_handler),
    ('dr_handler', dr_handler),
    ('drop_zero_std', drop_zero_std),
    ('drop_unused_cols', drop_unused_cols),
])

In [10]:
pipeline.fit_transform(X)

TypeError: 'DataFrame' object is not callable