In [29]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from matplotlib import pyplot as plt
import xgboost as xgb


In [30]:
PATH = '../data/processed/data_engineered_df.pkl'
TRACKING_URI = "../mlflow"
LOG_DATA_PKL    =  "data.pkl"
EXPERIMENT_NAME = "job-profile-prediction"



In [31]:
def calculate_quality(actual, predictions):
    metrics = [accuracy_score, precision_score, recall_score, f1_score]
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = actual[col].copy()
        quality_scores[col] = [round(metric_function(role_truth, role_pred) * 100, 2) for metric_function in metrics]
    
    indecies = [score.__name__ for score in metrics]
    quality_scores = pd.DataFrame(quality_scores, index=indecies).T
    
    return quality_scores

In [32]:
DATA_PAth = '../data/processed/data_engineered_df.pkl'

In [33]:
df = pd.read_pickle(DATA_PAth)

In [34]:
skills = df.drop(['DevType', 'skills_clusters'], axis=1).copy()
jobs =  df['DevType'].copy()



  skills = df.drop(['DevType', 'skills_clusters'], axis=1).copy()


In [35]:
# Resample roles
samples_per_class = 1500
resampled_roles = []

for role_col in jobs.columns:
    sub_df = jobs.loc[jobs[role_col] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class+1000, replace=True, random_state=0) 
    
    resampled_roles.append(sub_df)

In [36]:
roles_df  = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [37]:
skills = df.drop(['DevType', 'skills_clusters'], axis=1).copy()
jobs =  df['DevType'].copy() 
X_train, X_test, Y_train, Y_test = train_test_split(skills, 
                                                    jobs, 
                                                    random_state=0)

  skills = df.drop(['DevType', 'skills_clusters'], axis=1).copy()


In [38]:
xgb_model = make_pipeline(
                       PCA(n_components=0.7),
                       xgb.XGBClassifier(),
                                            )

fit = xgb_model.fit(X_train, Y_train)


In [39]:
y_pred = pd.DataFrame(xgb_model.predict(X_train),
                            columns=Y_train.columns)
xgb_train_scores = calculate_quality(Y_train, y_pred)
                
print(xgb_train_scores.mean())
xgb_train_scores


accuracy_score     99.975745
precision_score    99.820000
recall_score       99.228511
f1_score           99.512979
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Blockchain,99.99,99.48,100.0,99.74
Cloud infrastructure engineer,99.99,99.91,99.64,99.77
Data or business analyst,99.97,99.46,98.92,99.19
Data scientist or machine learning specialist,99.98,99.83,99.23,99.53
Database administrator,99.98,99.12,99.82,99.47
DevOps specialist,100.0,100.0,99.91,99.96
"Developer, QA or test",99.98,99.73,99.38,99.55
"Developer, back-end",100.0,99.91,100.0,99.96
"Developer, desktop or enterprise applications",99.57,99.58,86.66,92.67
"Developer, embedded applications or devices",99.96,99.67,99.14,99.4


In [40]:
y_pred = pd.DataFrame(xgb_model.predict(X_test),
                            columns=Y_test.columns)
xgb_test_scores = calculate_quality(Y_test, y_pred)
                
print(xgb_test_scores.mean())
xgb_test_scores


accuracy_score     99.512340
precision_score    95.408936
recall_score       83.479149
f1_score           88.401489
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Blockchain,99.97,98.33,99.72,99.02
Cloud infrastructure engineer,99.5,98.03,75.83,85.51
Data or business analyst,99.55,94.83,80.83,87.27
Data scientist or machine learning specialist,99.35,89.29,68.18,77.32
Database administrator,99.94,98.11,98.38,98.25
DevOps specialist,99.34,97.6,65.59,78.46
"Developer, QA or test",99.85,98.87,92.84,95.76
"Developer, back-end",100.0,100.0,100.0,100.0
"Developer, desktop or enterprise applications",98.19,88.93,40.66,55.81
"Developer, embedded applications or devices",99.07,93.03,76.35,83.87


In [41]:
random_forest = make_pipeline(
                       PCA(n_components=0.7),
                       RandomForestClassifier(n_jobs=8,
                                              verbose=1,
                                              random_state=0,
                                              n_estimators=500,
                                              ))

random_forest.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   11.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   55.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:  2.5min finished


In [42]:
y_pred = pd.DataFrame(random_forest.predict(X_train),
                            columns=Y_train.columns)
rf_train_scores = calculate_quality(Y_train, y_pred)
                
print(rf_train_scores.mean())
rf_train_scores

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   14.7s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   39.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   44.4s finished


accuracy_score     99.992553
precision_score    99.837872
recall_score       99.768511
f1_score           99.803191
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Blockchain,99.99,99.48,100.0,99.74
Cloud infrastructure engineer,100.0,99.91,99.82,99.86
Data or business analyst,99.97,99.28,99.1,99.19
Data scientist or machine learning specialist,99.99,99.83,99.49,99.66
Database administrator,99.98,99.12,99.82,99.47
DevOps specialist,100.0,100.0,100.0,100.0
"Developer, QA or test",99.99,99.73,99.64,99.69
"Developer, back-end",100.0,99.91,100.0,99.96
"Developer, desktop or enterprise applications",99.96,99.79,98.96,99.37
"Developer, embedded applications or devices",99.98,99.73,99.51,99.62


In [43]:
y_pred = pd.DataFrame(random_forest.predict(X_test),
                            columns=Y_test.columns)
rf_test_scores = calculate_quality(Y_test, y_pred)
                
print(rf_test_scores.mean())
rf_test_scores

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   10.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   11.8s finished


accuracy_score     99.507021
precision_score    98.731489
recall_score       80.648085
f1_score           87.584468
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Blockchain,99.97,98.6,99.72,99.16
Cloud infrastructure engineer,99.53,100.0,75.83,86.25
Data or business analyst,99.59,97.49,80.57,88.23
Data scientist or machine learning specialist,99.33,97.09,60.61,74.63
Database administrator,99.95,98.64,98.38,98.51
DevOps specialist,99.38,100.0,65.86,79.42
"Developer, QA or test",99.87,99.44,93.37,96.31
"Developer, back-end",100.0,100.0,100.0,100.0
"Developer, desktop or enterprise applications",98.4,94.24,45.72,61.57
"Developer, embedded applications or devices",99.15,97.39,75.12,84.82


In [44]:
hpt_rf_clf = make_pipeline(RobustScaler(),
                           PCA(),
                           RandomForestClassifier(n_jobs=8,
                                                  verbose=1,
                                                  random_state=0))

In [45]:
# list(hpt_rf_clf.get_params().keys())
tuned_parameters = [{
    'pca__n_components': [0.7, 0.85, 0.95],
    'randomforestclassifier__n_estimators': [250, 500],
    'randomforestclassifier__max_depth':    [16, 26, 50],
}]

In [46]:
hpt_rf_clf = GridSearchCV(hpt_rf_clf, tuned_parameters)
hpt_rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   10.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   45.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   59.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    3.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   41.4s


KeyboardInterrupt: 

In [None]:
hpt_rf_clf.best_params_

In [47]:
data_details = {
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist()
}

In [48]:
os.environ["MLFLOW_TRACKING_URI"] = TRACKING_URI

exp_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id


In [49]:


# Start a new run and track 
with mlflow.start_run(experiment_id=exp_id, run_name="xgboost"):
    # Log pickles 
    mlflow.sklearn.log_model(xgb_model, "model", registered_model_name="xgboost") 
    [mlflow.log_metric(score, value) for score, value in rf_test_scores.mean().items()]
    mlflow.log_dict(data_details, "data/data_details")

    

Successfully registered model 'xgboost'.
Created version '1' of model 'xgboost'.
