# Production Notebook

In [24]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import joblib

In [26]:
import warnings

# Install a catch-all "ignore" filter: every warning category, from every
# module, is suppressed from this point on.
warnings.simplefilter("ignore")

In [34]:
from sklearn.metrics import classification_report

def _prepare_features(df_X):
    """Return a copy of ``df_X`` with missing values filled, salaries parsed
    to integers, and the three engineered columns appended.

    The caller's frame is left untouched; new columns are appended in the
    order WorkloadStress, CareerGrowth, RewardDisparity.
    """
    df_X = df_X.copy()

    # Assign the filled column back instead of calling
    # ``df_X[col].fillna(..., inplace=True)``: the in-place form acts on a
    # temporary Series and does not update ``df_X`` under pandas
    # Copy-on-Write.
    # NOTE(review): the mode/median used for imputation are computed from the
    # file being scored -- confirm this matches how the model was trained.
    distance_modes = df_X['Distance'].mode()
    if not distance_modes.empty:
        # An empty mode means there is no non-null value to fill with; skip,
        # mirroring the median fills below which are no-ops in that case.
        df_X['Distance'] = df_X['Distance'].fillna(distance_modes.iloc[0])
    for col in ('SelfReview', 'SupervisorReview'):
        df_X[col] = df_X[col].fillna(df_X[col].median())

    # Salaries arrive as strings with a 'K' suffix (e.g. '85K' -> 85000).
    # 'K' is a plain literal, so no regex engine is needed.
    for col in ('Salary', 'PreviousSalary'):
        df_X[col] = df_X[col].str.replace('K', '000', regex=False).astype(int)

    df_X['WorkloadStress'] = (
        df_X['ProjectComplexity'] * df_X['NumOfProjects']
    ) / df_X['TeamSize']
    df_X['CareerGrowth'] = (
        df_X['Certifications']
        + df_X['SkillDevelopmentCourses']
        - df_X['YearsWorked'] / 5
    )
    # NOTE(review): the salary percentile is ranked within this file only, so
    # a row's RewardDisparity depends on which other rows are in the batch.
    df_X['RewardDisparity'] = (
        df_X['WorkSatisfactionScore'] - df_X['Salary'].rank(pct=True)
    )
    return df_X


def production(X_path, y_path, model_path='final_model.pkl'):
    """Score a feature CSV with the saved model and print a classification report.

    Parameters
    ----------
    X_path : path or URL of the feature CSV, read with ``pd.read_csv``.
    y_path : path or URL of the label CSV; its ``'Left'`` column is used as
        the ground truth.
    model_path : location of the joblib-serialized model. Defaults to
        ``'final_model.pkl'``, the path that was previously hard-coded.

    Returns
    -------
    None. The report is written to stdout.
    """
    # load data
    df_X = _prepare_features(pd.read_csv(X_path))

    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only point model_path at a trusted artifact.
    model = joblib.load(model_path)
    pred = model.predict(df_X)

    df_y = pd.read_csv(y_path)['Left']
    print(classification_report(df_y, pred))
    

# Remote CSVs handed to production() as X_path (features) and y_path (labels).
features_csv_url = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X.csv'
labels_csv_url = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y.csv'

# Run the scoring routine; it prints the classification report shown below.
production(X_path=features_csv_url, y_path=labels_csv_url)
    

              precision    recall  f1-score   support

           0       0.76      0.64      0.70    319539
           1       0.50      0.65      0.57    180461

    accuracy                           0.64    500000
   macro avg       0.63      0.64      0.63    500000
weighted avg       0.67      0.64      0.65    500000

