In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# import warnings
# warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# NOTE(review): the file name suggests missing values were already imputed upstream — confirm.
df = pd.read_csv('csv/imputed.csv')

In [3]:
class SkippablePCA(PCA):
    """PCA step that can be turned off through a ``skip`` hyper-parameter.

    With ``skip=True`` the estimator is a pass-through: ``fit`` does nothing
    and ``transform`` / ``fit_transform`` hand their input back unchanged.
    With ``skip=False`` every call is delegated to the parent ``PCA``.

    Parameters
    ----------
    skip : bool, default=False
        When true, bypass the decomposition entirely.
    n_components : default=5
        Forwarded to ``PCA``.
    svd_solver : default="auto"
        Forwarded to ``PCA``.
    random_state : default=None
        Forwarded to ``PCA``.
    """

    def __init__(self, skip=False, n_components=5, svd_solver="auto", random_state=None):
        self.skip = skip
        # Forward by keyword: PCA's second and third constructor parameters are
        # `copy` and `whiten`, so positional forwarding would bind svd_solver
        # and random_state to the wrong options (and newer scikit-learn
        # releases reject positional use of them outright).
        super().__init__(
            n_components=n_components,
            svd_solver=svd_solver,
            random_state=random_state,
        )

    def fit(self, X, y=None):
        """Fit the PCA on ``X`` unless ``skip`` is set; always return ``self``."""
        if self.skip:
            return self
        return super().fit(X, y)

    def fit_transform(self, X, y=None):
        """Return ``X`` untouched when skipping, otherwise PCA's ``fit_transform``."""
        if self.skip:
            return X
        return super().fit_transform(X, y)

    def transform(self, X):
        """Return ``X`` untouched when skipping, otherwise PCA's ``transform``."""
        if self.skip:
            return X
        return super().transform(X)

In [4]:
def business_travel_encoding(df):
    """Return a copy of ``df`` with 'BusinessTravel' replaced by ordinal codes.

    The labels are mapped as Non-Travel -> 0, Travel_Rarely -> 1 and
    Travel_Frequently -> 2. The input frame is not modified.
    """
    travel_codes = {
        'Non-Travel': 0,
        'Travel_Rarely': 1,
        'Travel_Frequently': 2,
    }
    encoded = df.copy()
    encoded['BusinessTravel'] = encoded['BusinessTravel'].map(travel_codes)
    return encoded

def over_time_encoding(df):
    """Return a copy of ``df`` with 'OverTime' encoded as No -> 0, Yes -> 1.

    The input frame is not modified.
    """
    overtime_codes = {'No': 0, 'Yes': 1}
    encoded = df.copy()
    encoded['OverTime'] = encoded['OverTime'].map(overtime_codes)
    return encoded

def one_hot(df, column):
    """Replace ``column`` with indicator columns and return the new frame.

    The indicators come from ``pd.get_dummies`` with ``drop_first=True`` and
    are named ``<column>_<value>``. They are appended after the existing
    columns, then the original ``column`` is dropped.
    """
    indicators = pd.get_dummies(df[column], prefix=column, drop_first=True)
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=column)

def data_encoders(df):
    """Apply every categorical encoding step to ``df`` and return the result.

    Order of operations: ordinal-encode 'BusinessTravel', one-hot encode each
    nominal column in turn, then binary-encode 'OverTime'.
    """
    nominal_columns = ('Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus')
    encoded = business_travel_encoding(df)
    # NOTE(review): one_hot relies on pd.get_dummies, which builds its columns
    # from the values present in the frame it receives — confirm every input
    # passed through here contains the same categories.
    for name in nominal_columns:
        encoded = one_hot(encoded, name)
    return over_time_encoding(encoded)

In [5]:
# Target: 'Attrition' encoded as No -> 0, Yes -> 1. Features: every other column.
y = df['Attrition'].map({'No': 0, 'Yes': 1})
X = df.drop(columns='Attrition')

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=11111992,
)

In [6]:
# Three-step pipeline: categorical encoding -> optional PCA -> k-NN classifier.
# The PCA step starts from SkippablePCA's own defaults apart from the seed.
pipeline = Pipeline(steps=[
    ('encoder_function', FunctionTransformer(func=data_encoders)),
    ('pca', SkippablePCA(random_state=11111992)),
    ('model', KNeighborsClassifier()),
])



In [7]:
# Hyper-parameter search over the PCA settings and the number of neighbours.
#
# The grid is split into two sub-grids. When the PCA step is skipped,
# SkippablePCA returns its input untouched, so n_components and svd_solver
# have no effect; crossing them with skip=True would only refit identical
# models (90 candidates instead of the 48 distinct ones below).
param_grid = [
    {
        'pca__skip': [False],
        'pca__n_components': [10, 15, 20, 25, 30],
        'pca__svd_solver': ['auto', 'full', 'arpack'],
        'model__n_neighbors': [3, 5, 7],
    },
    {
        'pca__skip': [True],
        'model__n_neighbors': [3, 5, 7],
    },
]

# 5-fold cross-validated exhaustive search, fitted on the training split only.
grid = GridSearchCV(pipeline, cv=5, param_grid=param_grid)
grid.fit(X_train, y_train)



KeyboardInterrupt: 

In [None]:
# Show the pipeline's representation as the cell output.
pipeline

In [None]:
# Fit the pipeline, with whatever parameters it currently holds, on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# NOTE(review): this scores the pipeline on the training split itself, so the
# value is a training-set score; X_test / y_test are the held-out data.
pipeline.score(X_train,y_train)

In [None]:
# Inspect what the encoding function produces for the training features.
data_encoders(X_train)

In [None]:
# Mean cross-validated score of the best candidate found by the grid search.
grid.best_score_

In [None]:
# Parameter combination that gave the best cross-validation score.
grid.best_params_

In [None]:
# cv_results_ is a dict of arrays; wrap it in a DataFrame and order it by rank
# so the per-candidate scores display as a readable table instead of a raw dump.
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')