In [19]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

In [20]:
LOG_PATH = "../models/"
LOG_DATA_PKL =  "../data/data_details_lr.pkl"
PATH = '../data/processed/data_engineered_df.pkl'
EXPERIMENT_NAME = "job-profile-prediction"
TRACKING_URI = "../mlflow"

os.environ["MLFLOW_TRACKING_URI"] = TRACKING_URI


In [21]:
def calculate_quality(actual, predictions):
    metrics = [accuracy_score, precision_score, recall_score, f1_score]
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = actual[col].copy()
        quality_scores[col] = [round(metric_function(role_truth, role_pred) * 100, 2) for metric_function in metrics]
    
    indecies = [score.__name__ for score in metrics]
    quality_scores = pd.DataFrame(quality_scores, index=indecies).T
    
    return quality_scores

In [22]:
df = pd.read_pickle(PATH)

In [23]:
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,skill_cluster_6,skill_cluster_7,skill_cluster_8,skill_cluster_9,skill_cluster_10,skill_cluster_11,skill_cluster_12,skill_cluster_13,skill_cluster_14,skill_cluster_15
0,0,0,0,0,0,0,0,0,0,0,...,0,2,1,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,2,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,3,0,0,1,1,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,3,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2
48994,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,1
48995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48996,0,0,0,0,0,0,0,0,0,0,...,2,0,2,2,0,0,0,0,0,0


In [24]:
jobs =  df['DevType'].copy() 

jobs.sum().sort_values(ascending=False)

.NET-full-stack                                  4149
React[NAtive]-full-stack                         4099
Developer, front-end                             4084
Developer, desktop or enterprise applications    3192
PHP-full-stack                                   2849
Python-back-end                                  2556
Python-full-stack                                2235
Developer, mobile                                2145
Java-back-end                                    2006
.NET-back-end                                    1821
Developer, embedded applications or devices      1558
Java-full-stack                                  1490
Scientest/Researcher                             1300
Data scientist or machine learning specialist    1221
PHP-back-end                                     1220
Research & Development role                      1016
Engineer, data                                    945
React[NAtive]-back-end                            918
DevOps specialist           

In [25]:
# Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in jobs.columns:
    sub_df = jobs.loc[jobs[role_col] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class+1000, replace=True, random_state=0) 
    
    resampled_roles.append(sub_df)

In [26]:
# Construct dfs
roles_df  = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [27]:
jobs =  df['DevType'].copy() 

jobs.sum().sort_values(ascending=False)

Scientest/Researcher                             2200
Developer, front-end                             2200
.NET-back-end                                    2200
Python-full-stack                                2200
Python-back-end                                  2200
Java-back-end                                    2200
Java-full-stack                                  2200
Developer, mobile                                2200
PHP-full-stack                                   2200
PHP-back-end                                     2200
Developer, embedded applications or devices      2200
React[NAtive]-full-stack                         2200
Data scientist or machine learning specialist    2200
.NET-full-stack                                  2200
Developer, desktop or enterprise applications    2200
Ruby-back-end                                    1200
Swift-full-stack                                 1200
Swift-back-end                                   1200
Rust-full-stack             

In [28]:

# Define the features and target variables
skills = df.drop(['DevType', 'skills_clusters'], axis=1).copy()
jobs =  df['DevType'].copy() 

X_train, X_test, Y_train, Y_test = train_test_split(skills, 
                                                    jobs, 
                                                    random_state=0)


  skills = df.drop(['DevType', 'skills_clusters'], axis=1).copy()


In [29]:
clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression(max_iter=1000)))




clf.fit(X_train.values, Y_train.values)
predictions =  pd.DataFrame(clf.predict(X_train.values),
                            columns=Y_train.columns)

In [30]:
# Evaluate on training set
predictions =  pd.DataFrame(clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = calculate_quality(Y_train, predictions)
                
train_scores
mean_train_scores = train_scores.mean()
print(mean_train_scores)
train_scores.sort_values('precision_score', ascending=False)

accuracy_score     98.352766
precision_score    67.622128
recall_score       39.125532
f1_score           46.395532
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",99.84,92.87,97.65,95.2
"Developer, full-stack",99.73,90.65,93.36,91.98
"Developer, QA or test",98.32,87.5,0.77,1.54
Elixir-full-stack,99.26,79.44,75.66,77.51
Scala/Big data-full-stack,99.2,79.34,71.76,75.36
Blockchain,98.77,78.81,36.74,50.11
Swift-back-end,99.1,78.8,63.93,70.59
"Developer, mobile",98.26,78.1,62.13,69.21
Oracle-full-stack,99.06,78.09,61.37,68.73
Scala/Big data-back-end,99.12,78.04,66.74,71.95


In [31]:
# Evaluate on test set
predictions =  pd.DataFrame(clf.predict(X_test.values),
                            columns=Y_test.columns)
test_scores = calculate_quality(Y_test, predictions)
                
test_scores
mean_test_scores = test_scores.mean()
print(mean_test_scores)
test_scores.sort_values('precision_score', ascending=False)

  _warn_prf(average, modifier, msg_start, len(result))


accuracy_score     98.244255
precision_score    62.398511
recall_score       36.281277
f1_score           43.097660
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",98.36,100.0,1.01,2.01
"Developer, full-stack",99.74,91.95,92.26,92.1
"Developer, back-end",99.81,91.21,98.37,94.65
Oracle-full-stack,98.92,76.44,49.66,60.21
Scala/Big data-full-stack,99.15,75.84,70.34,72.99
Elixir-back-end,99.0,75.0,64.76,69.51
Blockchain,98.74,73.72,38.46,50.55
Swift-back-end,98.93,73.68,56.19,63.76
Swift-full-stack,98.88,72.69,56.17,63.37
Elixir-full-stack,99.0,72.41,63.85,67.86


In [32]:
import pickle


# Data details
data_details = {
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist()
}

# Save the dictionary as a pickle file
with open(LOG_DATA_PKL, "wb") as file:
    pickle.dump(data_details, file)


In [33]:

mlflow.create_experiment(EXPERIMENT_NAME)
exp_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id


In [34]:


# Start a new run and track 
with mlflow.start_run(experiment_id=exp_id, run_name="logistic_regression"):

    # Set the desired directory path

    # Set the MLFLOW_TRACKING_URI environment variable
    mlflow.sklearn.log_model(clf, "model", registered_model_name="logistic_regression") 
    [mlflow.log_metric(score, value) for score, value in mean_test_scores.items()]
    scores = mean_train_scores.index.tolist()
    mlflow.log_dict(data_details, "data_details")

    

Successfully registered model 'logistic_regression'.
Created version '1' of model 'logistic_regression'.
