# IT Educational Institute

In [1]:
# ---- Notebook configuration ----

# Path of the pickled DataFrame loaded with pd.read_pickle further down.
DF_PATH = "../data/processed/2_cleaned_data.pkl"

# Role and technology column groups
# (not referenced by the cells visible in this notebook).
ROLE_COLS = ['DevType']
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
]

# MLflow tracking location and experiment name
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# Directory and file names used for the pickled run artifacts
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
# Importing libraries
import os
from pathlib import Path
import logging
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

## Function

In [28]:
# Per-column scoring helper used by every evaluation cell
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """Score each predicted column against the matching ground-truth column.

    Parameters
    ----------
    ground_truth : DataFrame-like
        Indexed with every column label found in ``predictions``.
    predictions : DataFrame-like
        Its ``columns`` decide which columns are scored, and in which order.
    metric_function : callable
        Called as ``metric_function(truth_column, predicted_column)``; the
        result is multiplied by 100 and rounded to two decimals.
    sort_values : bool, default False
        When true, the scores are returned in ascending order.

    Returns
    -------
    pd.Series
        One score per column of ``predictions``, indexed by column label.
    """
    # The metric receives copies of both columns, never the callers' objects.
    score_by_column = {
        label: round(
            metric_function(ground_truth[label].copy(), predictions[label].copy()) * 100,
            2,
        )
        for label in predictions.columns
    }

    scores = pd.Series(list(score_by_column.values()),
                       index=list(score_by_column.keys()))
    return scores.sort_values() if sort_values else scores

## Read & Preprocess Data

### Read Data

In [4]:
# Reading data
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code;
# only point DF_PATH at a file from a trusted source.
df = pd.read_pickle(DF_PATH)

In [5]:
# Displaying first 5 observations
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications",...,skills_group_11,skills_group_12,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,2,4,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,3,1,3,4,0,2
9,0,0,0,0,0,0,0,0,0,1,...,0,2,1,0,0,2,0,1,2,0
10,0,0,0,0,0,0,0,0,1,0,...,0,0,3,0,10,0,1,0,0,0


### Imbalanced Classes

In [7]:
# Checking total number of roles
# Work on a copy of the 'DevType' column group so `df` itself is not modified here.
roles_df = df['DevType'].copy()
# Column-wise sums, shown from the largest to the smallest.
roles_df.sum().sort_values(ascending=False)

Developer, full-stack                            17510
Developer, back-end                              15692
Developer, front-end                              7927
Developer, desktop or enterprise applications     4328
Developer, mobile                                 3703
DevOps specialist                                 2382
Cloud infrastructure engineer                     1994
Data scientist or machine learning specialist     1889
Developer, embedded applications or devices       1803
Engineer, data                                    1627
Academic researcher                               1535
System administrator                              1404
Data or business analyst                          1313
Database administrator                            1037
Developer, QA or test                             1024
Scientist                                          911
Developer, game or graphics                        810
Security professional                              585
Blockchain

In [14]:
# Resample roles: draw exactly `samples_per_class` rows for every role column.
# NOTE(review): rows drawn with replacement are duplicated, and the train/test
# split further down runs on the resampled frame, so copies of one row can end
# up in both sets. Consider splitting first and resampling only the training
# part -- confirm whether that is intended.
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    role_rows = roles_df.loc[roles_df[role_col] == 1]

    # Sample with replacement (upsample) only when the role has fewer rows than
    # requested; otherwise draw without replacement (downsample).
    needs_upsampling = len(role_rows) < samples_per_class
    resampled_roles.append(
        role_rows.sample(samples_per_class,
                         replace=needs_upsampling,
                         random_state=0)
    )

In [15]:
# Concatenating resampled_roles to create balanced DataFrame
roles_df = pd.concat(resampled_roles)
# Re-select the rows of df by the resampled index labels; labels drawn more than
# once by the resampling step therefore appear more than once in df as well.
# NOTE: both roles_df and df are overwritten here, so re-running the resampling
# cells without reloading the data resamples the already-resampled frame.
df = df.loc[roles_df.index].copy()

In [17]:
# Checking the total number of roles after balancing
# Column-wise sums of the resampled frame, largest first. Totals can exceed
# samples_per_class, presumably because rows drawn for one role may also be
# flagged for other roles -- verify against the data.
roles_df.sum().sort_values(ascending=False)

Developer, back-end                              6763
Developer, full-stack                            6338
Developer, desktop or enterprise applications    2931
Developer, front-end                             2749
DevOps specialist                                2438
Data scientist or machine learning specialist    2436
Academic researcher                              2390
Cloud infrastructure engineer                    2263
System administrator                             2123
Developer, mobile                                2083
Engineer, data                                   2077
Data or business analyst                         2011
Scientist                                        1925
Developer, embedded applications or devices      1853
Database administrator                           1845
Developer, QA or test                            1542
Security professional                            1527
Developer, game or graphics                      1468
Blockchain                  

## Train Test Split

In [18]:
# Importing train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Separate the 'DevType' target columns from every other column group
features = df.drop('DevType', axis=1)
targets = df['DevType']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.25,
    random_state=0,
)

# Checking the shapes of train & test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (17100, 174)
Shape of X_test: (5700, 174)
Shape of y_train: (17100, 19)
Shape of y_test: (5700, 19)


  X_train, X_test, y_train, y_test = train_test_split(df.drop('DevType', axis=1),


## Train Models

### Initialize MLFlow

In [20]:
# Importing mlflow and MlflowClient
import mlflow
from mlflow import MlflowClient

# Initializing client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

# get_experiment_by_name returns None for an unknown experiment, which would make
# the later `exp.experiment_id` accesses fail; create the experiment on first use.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if exp is None:
    client.create_experiment(MLFLOW_EXPERIMENT_NAME)
    exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

### 1. Random Forest with PCA

In [75]:
# Importing necessary libraries
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

# Baseline pipeline: robust scaling -> PCA -> random forest
scaler = RobustScaler()
reducer = PCA(n_components=0.95)
forest = RandomForestClassifier(n_jobs=8, verbose=1, random_state=0)
rf_clf = make_pipeline(scaler, reducer, forest)

# Fitting pipeline to train sets (plain arrays are passed, not DataFrames)
rf_clf.fit(X_train.values, y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.5s finished


#### Evaluation on Training Set

In [76]:
# importing metrics from sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline predictions on the training rows, labelled with the target columns
predictions = pd.DataFrame(rf_clf.predict(X_train.values),
                           columns=y_train.columns)

# One column of per-target percentages per metric, named after the metric function
train_scores = pd.concat(
    {metric.__name__: calculate_quality(y_train, predictions, metric)
     for metric in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1,
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished


In [77]:
# Displaying training scores of the baseline pipeline
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,99.96,99.89,99.77,99.83
Blockchain,99.96,99.52,99.81,99.67
Cloud infrastructure engineer,100.0,100.0,100.0,100.0
Data or business analyst,99.96,99.87,99.67,99.77
Data scientist or machine learning specialist,99.96,99.84,99.84,99.84
Database administrator,99.94,100.0,99.2,99.6
DevOps specialist,99.99,100.0,99.95,99.97
"Developer, QA or test",99.98,99.91,99.83,99.87
"Developer, back-end",99.91,99.94,99.75,99.84
"Developer, desktop or enterprise applications",99.95,99.77,99.82,99.79


#### Evaluation on Testing Set

In [78]:
# Evaluating on test set: baseline predictions on the held-out rows
predictions = pd.DataFrame(rf_clf.predict(X_test.values),
                           columns=y_train.columns)

# One column of per-target percentages per metric, named after the metric function
test_scores = pd.concat(
    {metric.__name__: calculate_quality(y_test, predictions, metric)
     for metric in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1,
)

# Average of every metric over all targets
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished


In [79]:
# Printing the mean test score of different evaluation metrics
print(mean_test_scores)

# Displaying test_scores, ordered by ascending precision
test_scores.sort_values('precision_score')

accuracy_score     94.907895
precision_score    97.000000
recall_score       60.004211
f1_score           73.289474
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",85.47,87.92,55.22,67.83
"Developer, back-end",83.67,91.58,48.06,63.04
"Developer, front-end",92.02,92.34,36.83,52.65
"Developer, desktop or enterprise applications",92.21,94.48,44.65,60.64
Data scientist or machine learning specialist,95.65,95.09,62.93,75.73
"Developer, mobile",95.81,95.5,56.9,71.31
"Developer, embedded applications or devices",95.4,96.39,42.31,58.81
Academic researcher,95.65,97.31,62.68,76.25
Scientist,98.09,98.71,78.69,87.57
"Engineer, data",95.53,98.8,49.4,65.86


### Logging

In [80]:
# Data details: where the data came from and which rows/columns were used
data_details = {'data_path':DF_PATH,
               'training_indices':X_train.index.tolist(),
               'test_indices':X_test.index.tolist(),
               # droplevel() removes the first level of the feature column labels
               'features_names':X_train.columns.droplevel().tolist(),
               'targets_names': y_train.columns.tolist()}

# Create the log directory if needed; open() below does not create it
# and would fail on a fresh checkout.
os.makedirs(LOG_PATH, exist_ok=True)

# Writing data details into a pickle file
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as outputfile:
    pickle.dump(data_details, outputfile)

In [81]:
# Model: description, printable details and the fitted pipeline itself
model = {
    'model_description': "Random Forest: with PCA - Basic",
    "model_details": str(rf_clf),
    'model_object': rf_clf,
}

# Writing the model record into a pickle file
model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [82]:
# Metrics: per-target score tables for the training and the test set
classes_metrics = {
    'train_scores': train_scores,
    "test_scores": test_scores,
}

# Writing the metrics into a pickle file
metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, 'wb') as metrics_file:
    pickle.dump(classes_metrics, metrics_file)

In [83]:
# Starting a new run and tracking
# The run is named after the description stored in the `model` dict.
with mlflow.start_run(experiment_id=exp.experiment_id,
                     run_name=model['model_description']):
    # Log pickles: uploads the contents of the LOG_PATH directory as run artifacts
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics: one MLflow metric per entry of the baseline's mean test scores
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

### 2. HyperParameter Tuning

In [84]:
# Creating a new instance of RandomForest with PCA to hyper tune.
# The step names are the prefixes used by the parameter grid
# ('pca__...', 'randomforestclassifier__...').
hpt_steps = [
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=0.95)),
    ('randomforestclassifier', RandomForestClassifier(n_jobs=8,
                                                      verbose=1,
                                                      random_state=0)),
]
hpt_rf_clf = Pipeline(steps=hpt_steps)

In [85]:
# Listing the tunable parameter names exposed by the pipeline
list(hpt_rf_clf.get_params())

['memory',
 'steps',
 'verbose',
 'scaler',
 'pca',
 'randomforestclassifier',
 'scaler__copy',
 'scaler__quantile_range',
 'scaler__unit_variance',
 'scaler__with_centering',
 'scaler__with_scaling',
 'pca__copy',
 'pca__iterated_power',
 'pca__n_components',
 'pca__n_oversamples',
 'pca__power_iteration_normalizer',
 'pca__random_state',
 'pca__svd_solver',
 'pca__tol',
 'pca__whiten',
 'randomforestclassifier__bootstrap',
 'randomforestclassifier__ccp_alpha',
 'randomforestclassifier__class_weight',
 'randomforestclassifier__criterion',
 'randomforestclassifier__max_depth',
 'randomforestclassifier__max_features',
 'randomforestclassifier__max_leaf_nodes',
 'randomforestclassifier__max_samples',
 'randomforestclassifier__min_impurity_decrease',
 'randomforestclassifier__min_samples_leaf',
 'randomforestclassifier__min_samples_split',
 'randomforestclassifier__min_weight_fraction_leaf',
 'randomforestclassifier__n_estimators',
 'randomforestclassifier__n_jobs',
 'randomforestclassifi

In [86]:
# Grid of candidate values; keys follow the '<step name>__<parameter>' pattern
rf_pca_grid = {
    'pca__n_components': [0.7, 0.85, 0.95],
    'randomforestclassifier__n_estimators': [250, 500],
    'randomforestclassifier__max_depth': [3, 10, None],
}

# The search below receives a list holding this single grid
tuned_hyperparameters = [rf_pca_grid]

In [87]:
# Importing GridSearchCV from sklearn
from sklearn.model_selection import GridSearchCV

# Applying GridSearchCV on training set
# No `scoring` or `cv` argument is passed, so GridSearchCV's defaults apply.
# NOTE: `hpt_rf_clf` is rebound from the Pipeline to the GridSearchCV object;
# re-running this cell on its own would wrap the search in another search.
hpt_rf_clf = GridSearchCV(hpt_rf_clf, tuned_hyperparameters)
hpt_rf_clf.fit(X_train.values, y_train.values)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 235 out of 250 | elapsed:    1.6s remaining:    0.1s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 250 out

[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    3.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    3.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Paral

[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    6.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    8.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.4s finish

[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    6.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    8.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    6.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    8.5s finished
[P

[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    5.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   12.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   13.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    0.6s
[Parallel(n

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 235 out of 250 | elapsed:    5.1s remaining:    0.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    5.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   10.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tas

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed: 15.5min
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed: 15.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed: 33.8min
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed: 33.8min finished
[Parallel(n

In [88]:
# Checking best parameters
# i.e. the combination from `tuned_hyperparameters` that the grid search selected
hpt_rf_clf.best_params_

{'pca__n_components': 0.7,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 500}

#### Evaluating on Training Set

In [89]:
# Evaluating on training set: predictions of the fitted grid search on the training rows
predictions = pd.DataFrame(hpt_rf_clf.predict(X_train.values),
                           columns=y_train.columns)

# One column of per-target percentages per metric, named after the metric function
train_scores = pd.concat(
    {metric.__name__: calculate_quality(y_train, predictions, metric)
     for metric in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1,
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    2.9s finished


In [90]:
# Displaying training scores
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,99.97,99.77,99.94,99.86
Blockchain,99.96,99.52,99.81,99.67
Cloud infrastructure engineer,100.0,100.0,100.0,100.0
Data or business analyst,99.96,99.87,99.67,99.77
Data scientist or machine learning specialist,99.96,99.89,99.78,99.84
Database administrator,99.94,99.93,99.27,99.6
DevOps specialist,99.99,100.0,99.95,99.97
"Developer, QA or test",99.98,99.91,99.83,99.87
"Developer, back-end",99.91,99.92,99.77,99.84
"Developer, desktop or enterprise applications",99.95,99.77,99.82,99.79


#### Evaluating on Testing Set

In [91]:
# Evaluating on testing set: predictions of the fitted grid search on the held-out rows
predictions = pd.DataFrame(hpt_rf_clf.predict(X_test.values),
                           columns=y_train.columns)

# One column of per-target percentages per metric, named after the metric function
test_scores = pd.concat(
    {metric.__name__: calculate_quality(y_test, predictions, metric)
     for metric in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1,
)

# Average of every metric over all targets
test_scores_mean = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    1.3s finished


In [92]:
# Displaying the mean of each test score column (tuned model)
print(test_scores_mean)

# Displaying the test scores, one row per target
test_scores

accuracy_score     95.033158
precision_score    94.840526
recall_score       62.605789
f1_score           74.680526
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,95.74,95.58,64.72,77.18
Blockchain,98.96,99.69,84.53,91.49
Cloud infrastructure engineer,95.11,95.67,53.83,68.9
Data or business analyst,96.28,97.95,58.22,73.03
Data scientist or machine learning specialist,95.79,89.98,68.62,77.86
Database administrator,97.88,99.44,74.73,85.33
DevOps specialist,94.37,97.76,49.27,65.52
"Developer, QA or test",97.88,99.2,67.57,80.39
"Developer, back-end",83.79,85.14,53.39,65.63
"Developer, desktop or enterprise applications",92.16,92.76,45.17,60.76


### Logging

#### 1. Prepare

In [93]:
# Data details: where the data came from and which rows/columns were used
# NOTE(review): this record uses the key 'testing_indices', while the data details
# written for the baseline run use 'test_indices' -- confirm which key downstream
# readers expect before unifying them.
data_details = {'data_path': DF_PATH,
               'training_indices': X_train.index.tolist(),
               'testing_indices': X_test.index.tolist(),
               # droplevel() removes the first level of the feature column labels
               'features_names': X_train.columns.droplevel().tolist(),
               'targets_names': y_train.columns.tolist()}

# Create the log directory if needed; open() below does not create it,
# so this cell also works when run before any other logging cell.
os.makedirs(LOG_PATH, exist_ok=True)

# Writing data in to a pickle file
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as outputfile:
    pickle.dump(data_details, outputfile)

In [94]:
# Model: description, printable details and the fitted grid-search object
model = {
    'model_description': "Random Forest: with PCA + Hyperparameter Tuning",
    'model_details': str(hpt_rf_clf),
    'model_object': hpt_rf_clf,
}

# Writing model data in a pickle file
model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [95]:
# Metrics: per-target score tables for the training and the test set
classes_metrics = {
    'train_scores': train_scores,
    'test_scores': test_scores,
}

# Writing classes metrics in a pickle file
metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, 'wb') as metrics_file:
    pickle.dump(classes_metrics, metrics_file)

#### 2. Log Run

In [96]:
# Start a new run and track
# The run is named after the description stored in the `model` dict.
with mlflow.start_run(experiment_id=exp.experiment_id,
                     run_name=model['model_description']):
    # Log pickles: uploads the contents of the LOG_PATH directory as run artifacts
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics of the tuned model. `test_scores_mean` holds the means computed
    # from the grid-search predictions; `mean_test_scores` belongs to the baseline
    # run and would record the wrong numbers here.
    for metric, score in test_scores_mean.items():
        mlflow.log_metric(metric, score)