# PREDICTION DE LA PERFORMANCE DES EMPLOYES 

AUTEURS : MERAOUI Camélia, PERVENCHE Clémence, ROCHER Ludovic

## Import 

In [26]:
import numpy as np
np.set_printoptions(threshold=10000,suppress=True)
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


from sklearn import tree
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.pipeline import Pipeline


## Fonctions utiles

### Exécution des régresseurs avec cross-validate

In [27]:

def custom_scoring_function(y_true, y_pred, error_margin=0.1):
    """Return the fraction of predictions that fall within a tolerance of the truth.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    error_margin : float, default=0.1
        Largest absolute error for which a prediction still counts as a hit
        (the bound is inclusive).

    Returns
    -------
    float
        Share of samples whose absolute error is ``<= error_margin``.
    """
    # Coerce to arrays so plain Python lists are accepted and the subtraction
    # is always positional.
    absolute_errors = np.abs(
        np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    )

    return (absolute_errors <= error_margin).mean()


def run_classifiers(clfs, X, Y):
    """Cross-validate every estimator in ``clfs`` and print its metrics.

    Each estimator is evaluated with the same shuffled 10-fold split
    (``random_state=0``) on four metrics: MAE, RMSE, R2 and the custom
    within-margin score. Mean fit/score times and the mean +/- standard
    deviation of each metric are printed per estimator.

    Parameters
    ----------
    clfs : dict
        Maps a display name to an unfitted scikit-learn estimator.
    X : array-like
        Feature matrix passed to ``cross_validate``.
    Y : array-like
        Target values passed to ``cross_validate``.

    Returns
    -------
    dict
        Maps each name of ``clfs`` to the dictionary returned by
        ``cross_validate`` for that estimator (previously this dictionary was
        returned empty).
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=0)

    # The scorers do not depend on the estimator, so they are built once
    # instead of once per loop iteration.
    scoring_metrics = {'mae': make_scorer(mean_absolute_error),
                       'rmse': make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
                       'r2_score': make_scorer(r2_score),
                       'custom_score': make_scorer(custom_scoring_function)}

    results_dict = {}
    for i, clf in clfs.items():
        cv_results = cross_validate(clf, X, Y, cv=kf, scoring=scoring_metrics)
        # Keep the raw per-fold results so callers can compare models afterwards.
        results_dict[i] = cv_results

        print("Temps d'exécution de l'algorithme pour l'apprentissage de {0} : {1:.2f} secondes".format(i, np.mean(cv_results["fit_time"])))
        print("Temps d'exécution de l'algorithme pour la prédiction de {0} : {1:.2f} secondes".format(i, np.mean(cv_results["score_time"])))
        print("MAE for {0} is: {1:.3f} +/- {2:.3f}".format(i, np.mean(cv_results['test_mae']), np.std(cv_results['test_mae'])))
        print("RMSE for {0} is: {1:.3f} +/- {2:.3f}".format(i, np.mean(cv_results['test_rmse']), np.std(cv_results['test_rmse'])))
        print("R2 score for {0} is: {1:.3f} +/- {2:.3f}".format(i, np.mean(cv_results['test_r2_score']), np.std(cv_results['test_r2_score'])))
        print("Custom score for {0} is: {1:.3f} +/- {2:.3f}".format(i, np.mean(cv_results['test_custom_score']), np.std(cv_results['test_custom_score'])))

        print("\n")

    return results_dict





### Pipeline

In [24]:
def create_pipeline(steps, X_, y_, pipeline_file):
    """Fit a ``Pipeline`` made of ``steps`` and pickle it to disk.

    Parameters
    ----------
    steps : list
        ``(name, transformer_or_estimator)`` pairs handed to ``Pipeline``.
    X_ : array-like
        Training features.
    y_ : array-like
        Training targets.
    pipeline_file : str
        Output path without extension; ``'.pkl'`` is appended.
    """
    fitted_pipeline = Pipeline(steps).fit(X_, y_)
    output_path = pipeline_file + '.pkl'
    with open(output_path, 'wb') as handle:
        pickle.dump(fitted_pipeline, handle)

def load_and_predict(X_test_, pipeline_file):
    """Load a pickled pipeline and return its predictions for ``X_test_``.

    Parameters
    ----------
    X_test_ : array-like
        Features to predict on.
    pipeline_file : str
        Path of the pickled pipeline without extension; ``'.pkl'`` is appended.

    Returns
    -------
    The value returned by the loaded object's ``predict`` method.
    """
    input_path = pipeline_file + '.pkl'
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    with open(input_path, 'rb') as handle:
        model = pickle.load(handle)

    return model.predict(X_test_)

## File paths

In [5]:
# Directory that contains the train/test CSV files (relative path).
file_path = "../data/productivity_employees_blue/"

# Full paths of the two datasets, built from the shared directory prefix.
employees_data_path = file_path + "train_dataset.csv"
employees_data_path_test = file_path + "test_dataset.csv"


## Ouvrir le fichier et visualisation

### Ouvrir le fichier en format dataframe

In [6]:
# Load the training and test CSV files into DataFrames.
df_emp_perf = pd.read_csv(employees_data_path, sep=',')
df_emp_perf_test = pd.read_csv(employees_data_path_test, sep=',')


In [7]:
# Preview the first rows of the training frame.
df_emp_perf.head()

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,...,department_finishing,department_finishing.1,department_sweing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,actual_productivity
0,9,0.75,3.94,,960,0,0.0,0,0,8.0,...,1,0,0,0,0,0,0,0,1,0.755167
1,7,0.65,30.1,909.0,7080,0,0.0,0,1,59.0,...,0,0,1,0,0,0,1,0,0,0.535678
2,3,0.8,4.15,,1440,0,0.0,0,0,7.0,...,0,1,0,0,0,0,0,0,1,0.820833
3,1,0.65,22.53,762.0,5040,0,0.0,0,1,42.0,...,0,0,1,0,0,0,0,0,1,0.581131
4,4,0.7,30.1,767.0,3300,50,0.0,0,1,57.0,...,0,0,1,1,0,0,0,0,0,0.790003


In [8]:
# Preview the first rows of the test frame.
df_emp_perf_test.head()


Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,...,quarter_Quarter5,department_finishing,department_finishing.1,department_sweing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,12,0.75,4.08,,1080,0,0.0,0,0,9.0,...,1,0,1,0,0,1,0,0,0,0
1,4,0.75,4.15,,2400,0,0.0,0,0,20.0,...,0,1,0,0,0,0,1,0,0,0
2,3,0.7,30.1,1057.0,0,40,0.0,0,1,58.0,...,0,0,0,1,0,0,0,0,0,1
3,7,0.7,3.94,,2160,0,0.0,0,0,18.0,...,0,0,1,0,0,0,1,0,0,0
4,5,0.5,4.15,,1440,0,0.0,0,0,8.0,...,0,0,1,0,0,0,1,0,0,0


In [9]:
# Summary statistics of the training frame before imputation.
df_emp_perf.describe()

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,...,department_finishing,department_finishing.1,department_sweing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,actual_productivity
count,1017.0,1017.0,1017.0,594.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,...,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0
mean,6.443461,0.730747,15.150492,1183.183502,4532.94002,40.689282,0.564405,0.39823,0.160275,34.846116,...,0.201573,0.214356,0.584071,0.161259,0.152409,0.164208,0.165192,0.171091,0.185841,0.736509
std,3.472473,0.097384,10.946096,1793.836719,3275.997333,173.240655,10.093731,3.351712,0.440199,22.185292,...,0.401373,0.410577,0.493124,0.36795,0.359594,0.370647,0.371536,0.376774,0.389169,0.174304
min,1.0,0.07,2.9,7.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233705
25%,3.0,0.7,3.94,770.5,1440.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.651515
50%,7.0,0.75,15.26,1039.0,4080.0,0.0,0.0,0.0,0.0,34.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.773333
75%,9.0,0.8,24.26,1254.75,6900.0,50.0,0.0,0.0,0.0,57.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.85017
max,12.0,0.8,54.56,23122.0,15120.0,3600.0,270.0,45.0,2.0,89.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.108125


In [10]:
# Summary statistics of the test frame before imputation.
df_emp_perf_test.describe()

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,...,quarter_Quarter5,department_finishing,department_finishing.1,department_sweing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
count,180.0,180.0,180.0,97.0,180.0,180.0,180.0,180.0,180.0,180.0,...,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,6.333333,0.723333,14.563167,1235.061856,4762.5,24.205556,1.666667,0.205556,0.094444,33.275,...,0.038889,0.244444,0.216667,0.538889,0.194444,0.177778,0.2,0.172222,0.15,0.105556
std,3.423546,0.100751,10.94399,2094.627651,3737.737773,29.280515,22.36068,2.757817,0.34572,22.28236,...,0.193869,0.430956,0.413123,0.499876,0.396876,0.383392,0.401116,0.378627,0.358067,0.308125
min,1.0,0.35,2.9,171.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.7,4.08,808.0,1440.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,0.75,14.99,1067.0,3600.0,0.0,0.0,0.0,0.0,33.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,0.8,23.0575,1227.0,7020.0,46.25,0.0,0.0,0.0,57.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,0.8,48.68,21385.0,25920.0,113.0,300.0,37.0,2.0,59.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### WIP imputer 

In [11]:

# Mean imputer for NaN values.
imp_num = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit on the training "wip" column and fill its missing values in place.
df_emp_perf["wip"] = imp_num.fit_transform(df_emp_perf[["wip"]])

# Reuse the imputer fitted on the training data for the test frame (no refit).
df_emp_perf_test["wip"] = imp_num.transform(df_emp_perf_test[["wip"]])


In [12]:
# Summary statistics of the training frame after the "wip" imputation.
df_emp_perf.describe()

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,...,department_finishing,department_finishing.1,department_sweing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,actual_productivity
count,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,...,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0,1017.0
mean,6.443461,0.730747,15.150492,1183.183502,4532.94002,40.689282,0.564405,0.39823,0.160275,34.846116,...,0.201573,0.214356,0.584071,0.161259,0.152409,0.164208,0.165192,0.171091,0.185841,0.736509
std,3.472473,0.097384,10.946096,1370.450653,3275.997333,173.240655,10.093731,3.351712,0.440199,22.185292,...,0.401373,0.410577,0.493124,0.36795,0.359594,0.370647,0.371536,0.376774,0.389169,0.174304
min,1.0,0.07,2.9,7.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233705
25%,3.0,0.7,3.94,963.0,1440.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.651515
50%,7.0,0.75,15.26,1183.183502,4080.0,0.0,0.0,0.0,0.0,34.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.773333
75%,9.0,0.8,24.26,1183.183502,6900.0,50.0,0.0,0.0,0.0,57.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.85017
max,12.0,0.8,54.56,23122.0,15120.0,3600.0,270.0,45.0,2.0,89.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.108125


In [13]:
# Print how many distinct values each column of the training frame holds.
for col in df_emp_perf.columns:
    print('Column name: ', col,
          '\nuniques val: ', df_emp_perf[col].nunique(),
          '\n------------')

Column name:  team 
uniques val:  12 
------------
Column name:  targeted_productivity 
uniques val:  9 
------------
Column name:  smv 
uniques val:  67 
------------
Column name:  wip 
uniques val:  490 
------------
Column name:  over_time 
uniques val:  137 
------------
Column name:  incentive 
uniques val:  47 
------------
Column name:  idle_time 
uniques val:  11 
------------
Column name:  idle_men 
uniques val:  9 
------------
Column name:  no_of_style_change 
uniques val:  3 
------------
Column name:  no_of_workers 
uniques val:  60 
------------
Column name:  month 
uniques val:  3 
------------
Column name:  quarter_Quarter1 
uniques val:  2 
------------
Column name:  quarter_Quarter2 
uniques val:  2 
------------
Column name:  quarter_Quarter3 
uniques val:  2 
------------
Column name:  quarter_Quarter4 
uniques val:  2 
------------
Column name:  quarter_Quarter5 
uniques val:  2 
------------
Column name:  department_finishing 
uniques val:  2 
------------
Column 

### Affichage du nombre de valeurs uniques pour chaque colonne

In [14]:
# Print how many distinct values each column of the training frame holds.
for col in df_emp_perf.columns:
    print('Column name: ', col,
          '\nuniques val: ', df_emp_perf[col].nunique(),
          '\n------------')

Column name:  team 
uniques val:  12 
------------
Column name:  targeted_productivity 
uniques val:  9 
------------
Column name:  smv 
uniques val:  67 
------------
Column name:  wip 
uniques val:  490 
------------
Column name:  over_time 
uniques val:  137 
------------
Column name:  incentive 
uniques val:  47 
------------
Column name:  idle_time 
uniques val:  11 
------------
Column name:  idle_men 
uniques val:  9 
------------
Column name:  no_of_style_change 
uniques val:  3 
------------
Column name:  no_of_workers 
uniques val:  60 
------------
Column name:  month 
uniques val:  3 
------------
Column name:  quarter_Quarter1 
uniques val:  2 
------------
Column name:  quarter_Quarter2 
uniques val:  2 
------------
Column name:  quarter_Quarter3 
uniques val:  2 
------------
Column name:  quarter_Quarter4 
uniques val:  2 
------------
Column name:  quarter_Quarter5 
uniques val:  2 
------------
Column name:  department_finishing 
uniques val:  2 
------------
Column 

### Get X and y

In [15]:
# Split the training frame into the target column and the remaining features.
y = df_emp_perf["actual_productivity"]
X = df_emp_perf.drop(columns=["actual_productivity"])

print(X.shape)
print(y.shape)

(1017, 25)
(1017,)


In [16]:
# Convertir en matrice NumPy
X_ = X.values

# Sélectionner la colonne cible (y)
y = y.astype(float)

## Apprentissage supervisé

### Régresseurs

In [17]:
# Seed for the estimators whose fitting is randomized (trees, forests,
# boosting, SGD), so that re-running the benchmark gives the same scores.
RANDOM_STATE = 0

# Candidate regressors, keyed by the display name used in the benchmark output.
# Entries suffixed with "2" wrap the estimator in TransformedTargetRegressor
# without passing any transformer/func.
# NOTE(review): 'PolynomialRegressor' is a plain LinearRegression; no polynomial
# feature expansion is applied anywhere in this dictionary.
clfs = {
    'ElasticNetRegressor': ElasticNet(),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=RANDOM_STATE),
    'GradientBoostingRegressor': GradientBoostingRegressor(criterion='squared_error', random_state=RANDOM_STATE),
    'DecisionTreeRegressor': tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    'DecisionTreeRegressor2': TransformedTargetRegressor(
        regressor=tree.DecisionTreeRegressor(random_state=RANDOM_STATE)),
    'RandomForestRegressor2': TransformedTargetRegressor(
        regressor=RandomForestRegressor(random_state=RANDOM_STATE)),
    'GradientBoostingRegressor2': TransformedTargetRegressor(
        regressor=GradientBoostingRegressor(random_state=RANDOM_STATE)),
    'AdaBoostRegressor2': TransformedTargetRegressor(
        regressor=AdaBoostRegressor(
            estimator=tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
            random_state=RANDOM_STATE)),
    'SGDRegressor2': TransformedTargetRegressor(
        regressor=SGDRegressor(random_state=RANDOM_STATE)),
    'RidgeRegressor': Ridge(),
    'LassoRegressor': Lasso(),
    'KNNRegressor': KNeighborsRegressor(),
    'GaussianProcessRegressor': GaussianProcessRegressor(),
    'SVR': SVR(),
    'PolynomialRegressor': TransformedTargetRegressor(regressor=LinearRegression())
}


### Utilisation de cross-validate

In [18]:
# Scale every feature to [0, 1], then benchmark all candidate models.
# NOTE(review): the scaler is fitted on all of X_ before run_classifiers splits
# the folds, so each validation fold has influenced the scaling of its own
# training folds; putting the scaler inside a Pipeline would avoid that.
X_norm = MinMaxScaler().fit_transform(X_)
results_dict = run_classifiers(clfs, X_norm, y)

Temps d'exécution de l'algorithme pour l'apprentissage de ElasticNetRegressor : 0.01 secondes
Temps d'exécution de l'algorithme pour la prédiction de ElasticNetRegressor : 0.01 secondes
MAE for ElasticNetRegressor is: 0.134 +/- 0.015
RMSE for ElasticNetRegressor is: 0.173 +/- 0.019
R2 score for ElasticNetRegressor is: -0.008 +/- 0.010
Custom score for ElasticNetRegressor is: 0.499 +/- 0.058


Temps d'exécution de l'algorithme pour l'apprentissage de HistGradientBoostingRegressor : 0.50 secondes
Temps d'exécution de l'algorithme pour la prédiction de HistGradientBoostingRegressor : 0.01 secondes
MAE for HistGradientBoostingRegressor is: 0.081 +/- 0.010
RMSE for HistGradientBoostingRegressor is: 0.126 +/- 0.014
R2 score for HistGradientBoostingRegressor is: 0.459 +/- 0.077
Custom score for HistGradientBoostingRegressor is: 0.718 +/- 0.051


Temps d'exécution de l'algorithme pour l'apprentissage de GradientBoostingRegressor : 0.32 secondes
Temps d'exécution de l'algorithme pour la prédict

In [19]:
# Inputs retained for hyper-parameter tuning: min-max scaled features, the
# float target, and a random forest wrapped in TransformedTargetRegressor.
best_data = MinMaxScaler().fit_transform(X_)
best_label = y
# The forest is seeded so that tuning it gives the same result on every run.
best_model = TransformedTargetRegressor(regressor=RandomForestRegressor(random_state=0))

## Recherche des meilleurs paramètres

In [22]:
# Candidate hyper-parameters; the "regressor__" prefix addresses the estimator
# held in best_model's `regressor` attribute.
param_grid = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5, 10],
    regressor__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored with the within-margin metric, 10 folds, all cores.
grid_search = GridSearchCV(
    estimator=best_model,
    param_grid=param_grid,
    scoring=make_scorer(custom_scoring_function),
    cv=10,
    n_jobs=-1,
)
grid_search.fit(best_data, best_label)

# Keep the estimator refitted with the best parameter combination.
best_model = grid_search.best_estimator_
print("Meilleurs paramètres:", grid_search.best_params_)
print("Meilleur score de validation croisée:", grid_search.best_score_)

Meilleurs paramètres: {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Meilleur score de validation croisée: 0.7413609007959618


## Pipeline

In [28]:
steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ('normalisation', MinMaxScaler()),  
    ('classifier', best_model)  
]


df_emp_perf_test = pd.read_csv(employees_data_path, sep=',')
X_train = df_emp_perf_test.iloc[:,:-1]
y_train = y.astype(float)
create_pipeline(steps, X_train, y_train, "../artifacts/predict_blue_performance_pipeline_nb")

In [30]:
X_test = df_emp_perf_test.iloc[:,:-1]
y_test = y

predictions = load_and_predict(X_test, "../artifacts/predict_blue_performance_pipeline_nb")
custom_score = custom_scoring_function(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"Score : {custom_score}")
print(f"Mae : {mae}")

Score : 0.9419862340216323
Mae : 0.027755801994581292
