In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [12]:
df = pd.read_csv("Employee.csv")
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [None]:
plot_missing_value(df)

In [13]:
X = df.drop(columns=["LeaveOrNot"])
y = df.LeaveOrNot

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3722, 8), (931, 8), (3722,), (931,))

In [14]:
X_train.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
4097,Bachelors,2012,New Delhi,3,38,Female,No,1
1694,Bachelors,2018,Bangalore,3,26,Male,No,4
2659,Bachelors,2016,Pune,3,27,Male,No,5
3860,Bachelors,2016,Bangalore,3,36,Male,No,4
1250,Bachelors,2017,Bangalore,3,27,Male,Yes,5


In [15]:
X_train.columns

Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
       'EverBenched', 'ExperienceInCurrentDomain'],
      dtype='object')

In [17]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(scaling="robust"), ['JoiningYear', 'Age', 'ExperienceInCurrentDomain']),
    ('categoric', cat_pipe(encoder='onehot'), ['Education', 'City', 'PaymentTier', 'City', 'PaymentTier', 'EverBenched']),
])

from sklearn.svm import SVC
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVC(max_iter=500))
])


from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp

model = GridSearchCV(pipeline, gsp.svm_params, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 49 candidates, totalling 147 fits




{'algo__C': 1.0, 'algo__gamma': 1.0}
0.7235357334766255 0.7917825548248464 0.7035445757250268
