# Packages

### Downloading packages 
#### Select and run just either one of them

In [None]:
!pip3 install --upgrade pip
!pip3 install mlflow
# !pip3 install uuid
!pip3 install pandas
!pip3 install sklearn

### Importing libraries

In [35]:
import mlflow
import warnings
import numpy as np
import pandas as pd
import os
import binascii
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

### Reading files and setting variables

In [3]:
# Directory and file name of the input CSV (the relative path is resolved by the OS
# against the current working directory)
path = '../../data/'
file = 'dataset.csv'

# Loading the comma-separated dataset into a DataFrame
df = pd.read_csv(os.path.join(path, file), sep=',')

# Experiment Tracking

## Function definitions

### Disclaimer

In [4]:
# Attribution note, written as a bare string literal so that the cell echoes it as its output.
'''
Most of these functions have been taken from the 'EDA_Attrition_Dataset.ipynb' file, so all coding credits go to the respective author.
'''

"\nMost of these functions have been taken from the 'EDA_Attrition_Dataset.ipynb' file, so all coding credits go to the respective author.\n"

### Log MLFlow experiment

In [5]:
def log_mlflow_experiment(exp_id, name, tags, metrics, nested=False):
    """
    Logs an MLflow run that includes:
        1. Name
        2. Tags
            - Project name
            - Model
        3. Metrics
            - Accuracy score on train and test sets
            - F1 score on train and test sets

    This function allows to create child runs: when "nested" is set to True the resulting run
    will be a child of the currently active run, otherwise a new parent run is created.

    No return is expected as the result is the logged run.

    Parameters:
        exp_id (str): Experiment ID.
        name (str): Run name.
        tags (dict): Dictionary with the run tags.
        metrics (dict): Dictionary with the performance metrics of the model.
        nested (bool, defaults to False): Flag to indicate if the run to create is a child run.
    """
    # Using the run as a context manager guarantees that it is closed even when
    # logging the tags or metrics raises; a bare start_run()/end_run() pair would
    # leave the run active in that case.
    with mlflow.start_run(experiment_id=exp_id, run_name=name, nested=nested):
        # Logging run tags and metrics
        mlflow.set_tags(tags)
        mlflow.log_metrics(metrics)

### Calculate performance

In [6]:
def calculate_performance(y_train, y_test, y_train_preds, y_test_preds):
    """
    Scores the model predictions against the true targets on both splits.

    Two metrics are computed for each split:
        - Accuracy score
        - F1 score

    Parameters:
        y_train (np.ndarray): True values of the target variable on the train set.
        y_test (np.ndarray): True values of the target variable on the test set.
        y_train_preds (np.ndarray): Predicted values of the target variable on the train set.
        y_test_preds (np.ndarray): Predicted values of the target variable on the test set.

    Returns:
        train_acc (float): Accuracy score on the train set
        test_acc (float): Accuracy score on the test set
        train_f1 (float): F1 score on the train set
        test_f1 (float): F1 score on the test set
    """
    # Pairing true values with predictions, train split first and test split second
    splits = ((y_train, y_train_preds), (y_test, y_test_preds))

    # Accuracy for both splits, then F1 score for both splits
    train_acc, test_acc = [accuracy_score(truth, preds) for truth, preds in splits]
    train_f1, test_f1 = [f1_score(truth, preds) for truth, preds in splits]

    return train_acc, test_acc, train_f1, test_f1

### Preparing dataset

In [7]:
def preparing_dataset(df, id=None, feature=None, test_size=0.2, random_state=None):
    """
    Cleans the dataset and splits it into stratified train and test sets.

    Note that "df" is modified in place: the identification column is dropped and the
    null values are replaced by 0 on the DataFrame that was passed in.

    Parameters:
        df (pd.DataFrame): Dataset holding the identification column, the target column and the input features.
        id (str, defaults to the first column of df): Name of the identification column to remove.
        feature (str, defaults to the second column of df): Name of the target column.
        test_size (float, defaults to 0.2): Proportion of the dataset assigned to the test set.
        random_state (int, defaults to None): Seed forwarded to train_test_split; set it to get a reproducible split.

    Returns:
        X_train (pd.DataFrame): Input features of the train set.
        X_test (pd.DataFrame): Input features of the test set.
        y_train (np.ndarray): Label-encoded target values of the train set.
        y_test (np.ndarray): Label-encoded target values of the test set.
    """
    # Resolving the default columns from the DataFrame actually received. Both are read
    # before anything is dropped, so "second column" refers to the original layout.
    id_column = df.columns[0] if id is None else id
    target_column = df.columns[1] if feature is None else feature

    # Removing employee identification features
    df.drop(columns=id_column, inplace=True)

    # Filling null values
    df.fillna(0, inplace=True)

    # Separating input features and target variable
    y = df[target_column]
    X = df.drop(columns=target_column)

    # Encoding target variable
    y = LabelEncoder().fit_transform(y)

    # Splitting dataset into train and test sets, keeping the class proportions in both
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

## Model definitions

### Experiment N°1-1: The naive model (always predicts 0)

In [8]:
def experiment_1(df, y_train, y_test):
    """
    Logs the naive baseline model, which predicts 0 for every sample.

    The experiment is looked up by name and only created when it does not exist yet,
    so the function can be executed more than once.

    Parameters:
        df (pd.DataFrame): Not used by this function.
        y_train (np.ndarray): True values of the target variable on the train set.
        y_test (np.ndarray): True values of the target variable on the test set.
    """
    # Generating naive predictions
    y_train_preds = np.zeros(len(y_train))
    y_test_preds = np.zeros(len(y_test))

    # Calculating performance metrics of the model
    train_acc, test_acc, train_f1, test_f1 = calculate_performance(y_train, y_test, y_train_preds, y_test_preds)

    # Reusing the experiment when it already exists: create_experiment raises
    # when the name is already taken, which would break any re-run.
    mlflow_exp = r"Exp N°1: Baseline Models"
    existing_exp = mlflow.get_experiment_by_name(mlflow_exp)
    if existing_exp is None:
        exp_id = mlflow.create_experiment(mlflow_exp)
    else:
        exp_id = existing_exp.experiment_id

    # Logging the experiment
    log_mlflow_experiment(
        exp_id,
        "Naive Model",
        {"project": "Caylent - Hiring process", "modelo": "naive"},
        {"train_acc": train_acc, "test_acc": test_acc, "train_f1": train_f1, "test_f1": test_f1}
    )

### Experiment N°1-2: The delusional model (always predicts 1)

In [9]:
def experiment_2(df, y_train, y_test):
    """
    Logs the delusional baseline model, which predicts 1 for every sample.

    The experiment is looked up by name and only created when it does not exist yet,
    so the function can be executed more than once.

    Parameters:
        df (pd.DataFrame): Not used by this function.
        y_train (np.ndarray): True values of the target variable on the train set.
        y_test (np.ndarray): True values of the target variable on the test set.
    """
    # Generating delusional predictions (every sample is predicted as 1)
    y_train_preds = np.ones(len(y_train))
    y_test_preds = np.ones(len(y_test))

    # Calculating performance metrics of the model
    train_acc, test_acc, train_f1, test_f1 = calculate_performance(y_train, y_test, y_train_preds, y_test_preds)

    # Reusing the experiment when it already exists: create_experiment raises
    # when the name is already taken, which would break any re-run.
    mlflow_exp = r"Exp N°2: Baseline Models"
    existing_exp = mlflow.get_experiment_by_name(mlflow_exp)
    if existing_exp is None:
        exp_id = mlflow.create_experiment(mlflow_exp)
    else:
        exp_id = existing_exp.experiment_id

    # Logging the experiment
    log_mlflow_experiment(
        exp_id,
        "Delusional Model",
        {"project": "Caylent - Hiring process", "modelo": "delusional"},
        {"train_acc": train_acc, "test_acc": test_acc, "train_f1": train_f1, "test_f1": test_f1}
    )

### Experiment N°3: KNN model with only numerical features

In [36]:
def experiment_3(df, X_train, X_test, y_train=None, y_test=None):
    """
    Trains one KNN classifier per value of K on the numerical features and logs every
    model as a child run of a single parent run.

    A new experiment with a random suffix in its name is created on every call.

    Parameters:
        df (pd.DataFrame): Not used by this function.
        X_train (pd.DataFrame): Input features of the train set.
        X_test (pd.DataFrame): Input features of the test set.
        y_train (np.ndarray, defaults to None): True target values of the train set. When omitted,
            the module-level "y_train" variable is used.
        y_test (np.ndarray, defaults to None): True target values of the test set. When omitted,
            the module-level "y_test" variable is used.
    """
    # Falling back to the module-level targets keeps the existing three-argument calls
    # working, while callers can now pass the targets explicitly.
    if y_train is None:
        y_train = globals()["y_train"]
    if y_test is None:
        y_test = globals()["y_test"]

    # Filtering numerical features
    new_X_train = X_train.select_dtypes(include="number")
    new_X_test = X_test.select_dtypes(include="number")

    # Defining list of values for the K hyperparameter
    Ks = [1, 2, 3, 4, 5]

    # Creating the parent experiment; the random suffix gives every call a fresh name
    run_suffix = binascii.hexlify(os.urandom(3)).decode("utf-8")
    mlflow_exp = "Exp N°3: KNN Models - ID {}".format(run_suffix)
    exp_id = mlflow.create_experiment(mlflow_exp)

    # Using the parent run as a context manager guarantees that it is closed even when
    # training or logging one of the models raises.
    with mlflow.start_run(experiment_id=exp_id, run_name="KNN Model"):

        # Looping over the different values of K
        for k in Ks:

            # Defining and training KNN model
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(new_X_train, y_train)

            # Generating predictions for train and test sets
            y_train_preds = knn.predict(new_X_train)
            y_test_preds = knn.predict(new_X_test)

            # Calculating performance metrics of the model
            train_acc, test_acc, train_f1, test_f1 = calculate_performance(y_train, y_test, y_train_preds, y_test_preds)

            # Logging the child experiment
            log_mlflow_experiment(
                exp_id,
                "KNN, K={0}".format(k),
                {"project": "Caylent - Hiring process", "modelo": "knn", "k": k},
                {"train_acc": train_acc, "test_acc": test_acc, "train_f1": train_f1, "test_f1": test_f1},
                nested=True
            )

## Function executions

### Preparing dataset

In [11]:
# Building the train/test split consumed by the experiments below
X_train, X_test, y_train, y_test = preparing_dataset(df)

### Experiment 1

In [12]:
# Logging the naive baseline (always predicts 0)
experiment_1(df, y_train, y_test)

In [13]:
# Logging the delusional baseline (always predicts 1)
experiment_2(df, y_train, y_test)

In [14]:
# Previewing the first five rows of the dataset
df.head(n=5)

Unnamed: 0,[Candidate] Hired,[Candidate] Overall,[AWS] EC2,[AWS] S3: Arch,[AWS] S3: Access,[AWS] S3: Classes,[AWS] S3: DR,[AWS] IAM,[AWS] Networking,[AWS] Net. App. Reliability,...,[TF] Secrets and States,[TF] Sensitive information,[TF] Best practices,[TF] Managing values,[K8] Architecture: Auto Scaling,[K8] Architecture: Control Plane,[K8] Core Concepts,[K8] Services & Networking: Ingress,[K8] Services & Networking: Service,[K8] Workload Management
0,Yes,64.3444,87.5,100.0,50.0,50.0,50.0,0.0,100.0,100.0,...,33.33,75.0,75.0,100.0,50.0,50.0,50.0,50.0,100.0,100.0
1,Yes,78.056,100.0,100.0,50.0,100.0,100.0,40.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Yes,69.3444,87.5,50.0,50.0,0.0,50.0,80.0,100.0,100.0,...,0.0,75.0,50.0,80.0,25.0,25.0,100.0,100.0,100.0,100.0
3,Yes,55.278,87.5,0.0,100.0,100.0,50.0,40.0,87.5,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Yes,83.42,100.0,100.0,100.0,50.0,100.0,80.0,100.0,87.5,...,0.0,0.0,0.0,0.0,75.0,75.0,100.0,75.0,100.0,100.0


In [22]:
# Replacing the verbose column headers with short codes: the single letters 'A'..'T'
# and 'W'..'Z', followed by 'AB' (25 names in total).
# NOTE(review): 'U' and 'V' are skipped and the last code is 'AB' - confirm this is intended.
single_letter_codes = list('ABCDEFGHIJKLMNOPQRST') + list('WXYZ')
df.columns = single_letter_codes + ['AB']

In [37]:
# Logging the KNN models trained on the numerical features
experiment_3(df, X_train, X_test)

