In [8]:
from pandas import read_csv

data = read_csv('../data/processed/data.csv')

In [11]:
TARGET = 'Attrition_Yes'
TEST_SPLIT = 0.2
TRAIN_SPLIT = 1 - TEST_SPLIT

In [75]:
x, y = data.drop(columns=[TARGET]), data[TARGET]
train_data_len = int(len(x) * TRAIN_SPLIT)
x_train, y_train = x[:train_data_len], y[:train_data_len]
x_test, y_test = x[train_data_len:], y[train_data_len:]

In [76]:
data.isnull().sum()

Unnamed: 0.1                          0
Unnamed: 0                            0
Age                                   0
DistanceFromHome                      0
JobLevel                              0
MonthlyIncome                         0
NumCompaniesWorked                   19
PercentSalaryHike                     0
StockOptionLevel                      0
TotalWorkingYears                     9
TrainingTimesLastYear                 0
YearsAtCompany                        0
YearsSinceLastPromotion               0
YearsWithCurrManager                  0
MeanWorkingTime                       1
MedianWorkingTime                     1
SkewWorkingTime                       1
Attrition_Yes                         0
BusinessTravel_Travel_Frequently      0
BusinessTravel_Travel_Rarely          0
Department_Research & Development     0
Department_Sales                      0
Education_Below College               0
Education_College                     0
Education_Doctor                      0


In [103]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

pipeline = Pipeline(steps=[
    (
        'Fill missing values',
        SimpleImputer(strategy='median')  # can use median as there are no missing categorical features
    ),
    (
        'Scaling',
        StandardScaler()
    ),
    (
        'Logistic regression',
        LogisticRegression(max_iter=10000)
    )
])

In [104]:
pipeline = pipeline.fit(x_train, y_train)

In [105]:
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score

labels = pipeline.predict(x_test)
probas = pipeline.predict_proba(x_test)[:, 1]

accuracy_score(y_test, labels), recall_score(y_test, labels), roc_auc_score(y_test, probas)

(0.8469387755102041, 0.13432835820895522, 0.7489823609226595)

In [106]:
from plotly.express import area
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, probas)

figure = area(x=fpr, y=tpr, title='ROC curve',
              labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
              width=500, height=500)
figure.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)
figure.show()