In [44]:
# Load packages
import pandas as pd 
import numpy as np

import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

In [45]:
# Input dataset and MLflow experiment settings
PATH = '../data/processed/data_engineered_df.pkl'
EXPERIMENT_NAME = "job-profile-prediction"
TRACKING_URI = "../mlflow"

# Filesystem locations for saved artefacts
LOG_PATH = "../models/"
LOG_DATA_PKL = "../data/data_details_lr.pkl"

# Expose the tracking location to MLflow through its environment variable
os.environ["MLFLOW_TRACKING_URI"] = TRACKING_URI


In [46]:
# quality metrics for model evaluation
def calculate_quality(actual, predictions):
    """Score every predicted column against the matching ground-truth column.

    For each column of ``predictions`` the same-named column of ``actual`` is
    looked up and accuracy, precision, recall and F1 are computed. Each score
    is expressed as a percentage rounded to two decimals.

    Parameters
    ----------
    actual : DataFrame-like
        Ground-truth labels; must contain every column name found in
        ``predictions``.
    predictions : DataFrame-like
        Predicted labels, one column per target.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``predictions`` and one column per metric,
        named after the scoring function (``accuracy_score``, ...).
    """
    # Named `scorers` rather than `metrics` so the `sklearn.metrics` module
    # imported at the top of the notebook is not shadowed inside the function.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scorer_names = [scorer.__name__ for scorer in scorers]

    per_role = {}
    for role in predictions.columns:
        y_true = actual[role]
        y_pred = predictions[role]
        per_role[role] = [round(scorer(y_true, y_pred) * 100, 2) for scorer in scorers]

    # The dict is keyed by role, so transpose to get roles as rows.
    return pd.DataFrame(per_role, index=scorer_names).T

In [47]:
# Load the engineered dataset from the pickle file at PATH.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(PATH)

In [48]:
# Display the loaded frame (the rendering is truncated for large frames)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,Swift full-stack,PHP back-end,PHP full-stack,Scala/Big data back-end,Scala/Big data full-stack,React[NAtive] back-end,React[NAtive] full-stack,AI back-end,AI full-stack,Scientest/Researcher
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64331,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
64335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Number of rows flagged for each role, most frequent first
jobs = df['DevType'].copy()

role_counts = jobs.sum()
role_counts.sort_values(ascending=False)

.NET full-stack                                  4149
React[NAtive] full-stack                         4099
Developer, front-end                             4084
Developer, desktop or enterprise applications    3192
PHP full-stack                                   2849
Python back-end                                  2556
Python full-stack                                2235
Developer, mobile                                2145
Java back-end                                    2006
.NET back-end                                    1821
Scientest/Researcher                             1772
Developer, embedded applications or devices      1558
Java full-stack                                  1490
Data scientist or machine learning specialist    1221
PHP back-end                                     1220
Research & Development role                      1016
Engineer, data                                    945
React[NAtive] back-end                            918
DevOps specialist           

## Upsampling and Downsampling

In [50]:
# Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in jobs.columns:
    # Rows where this role is flagged
    role_rows = jobs.loc[jobs[role_col] == 1].copy()

    # Roles below the threshold are drawn up to `samples_per_class` rows;
    # all other roles are drawn to `samples_per_class + 1000` rows.
    # Sampling is always with replacement, so the second case also produces
    # duplicates and oversamples any role with fewer than that many rows.
    is_rare_role = len(role_rows) < samples_per_class
    target_size = samples_per_class if is_rare_role else samples_per_class + 1000

    resampled_roles.append(role_rows.sample(target_size, replace=True, random_state=0))

In [51]:
# Construct dfs
roles_df = pd.concat(resampled_roles)

# Pull the full rows (features + targets) for every sampled index label;
# labels drawn more than once yield repeated rows.
# NOTE(review): this overwrites `df`, so re-running the resampling cells
# without reloading the pickle resamples the already-resampled frame.
resampled_index = roles_df.index
df = df.loc[resampled_index].copy()

In [52]:
# Number of rows flagged for each role after resampling, most frequent first
jobs = df['DevType'].copy()

role_counts = jobs.sum()
role_counts.sort_values(ascending=False)

Scientest/Researcher                             5600
Developer, embedded applications or devices      4138
Developer, front-end                             2200
.NET back-end                                    2200
Python full-stack                                2200
Python back-end                                  2200
Java back-end                                    2200
Java full-stack                                  2200
Developer, mobile                                2200
PHP full-stack                                   2200
PHP back-end                                     2200
.NET full-stack                                  2200
React[NAtive] full-stack                         2200
Data scientist or machine learning specialist    2200
Developer, desktop or enterprise applications    2200
Hardware Engineer                                1462
AI full-stack                                    1200
AI back-end                                      1200
Elixir back-end             

## Train/test split

In [53]:

# Define the features and target variables:
# everything outside the 'DevType' column group is a feature, 'DevType' holds the targets
skills = df.drop(columns=['DevType']).copy()
jobs = df['DevType'].copy()

# NOTE(review): `df` was resampled with replacement before this split, so
# identical rows may end up in both the train and the test partition —
# confirm this is intended when reading the test scores.
X_train, X_test, Y_train, Y_test = train_test_split(skills, jobs, random_state=0)


  skills = df.drop(['DevType'], axis=1).copy()


## Train model

In [54]:
# Standardise the features, then wrap a logistic regression in
# MultiOutputClassifier so that each target column gets its own estimator.
classifier = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression(max_iter=1000)))

# Fit on plain arrays; predictions are computed in the evaluation cells,
# so no prediction pass is needed here.
classifier.fit(X_train.values, Y_train.values)

## Evaluate model

In [55]:
# Evaluate on training set
train_pred_array = classifier.predict(X_train.values)
predictions = pd.DataFrame(train_pred_array, columns=Y_train.columns)
train_scores = calculate_quality(Y_train, predictions)

# Average of each metric over all roles, then the per-role table ranked by precision
mean_train_scores = train_scores.mean()
print(mean_train_scores)
train_scores.sort_values(by='precision_score', ascending=False)

accuracy_score     98.249149
precision_score    67.931915
recall_score       40.277447
f1_score           47.424043
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",99.84,92.87,97.65,95.2
"Developer, full-stack",99.73,90.74,93.36,92.03
"Developer, QA or test",98.32,81.82,1.0,1.97
Scala/Big data full-stack,99.21,79.42,72.09,75.58
Blockchain,98.8,79.1,39.07,52.3
Elixir full-stack,99.25,78.78,76.0,77.36
Swift back-end,99.11,78.67,64.26,70.74
Oracle full-stack,99.05,78.01,60.71,68.28
"Developer, mobile",98.25,78.0,61.77,68.94
Scala/Big data back-end,99.12,77.99,66.96,72.06


In [56]:
# Evaluate on test set
test_pred_array = classifier.predict(X_test.values)
predictions = pd.DataFrame(test_pred_array, columns=Y_test.columns)
test_scores = calculate_quality(Y_test, predictions)

# Average of each metric over all roles, then the per-role table ranked by precision
mean_test_scores = test_scores.mean()
print(mean_test_scores)
test_scores.sort_values(by='precision_score', ascending=False)

accuracy_score     98.134894
precision_score    62.157234
recall_score       37.320000
f1_score           44.015957
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",99.74,91.95,92.26,92.1
"Developer, back-end",99.81,91.21,98.37,94.65
"Developer, QA or test",98.36,80.0,1.35,2.66
Oracle full-stack,98.91,75.65,49.66,59.96
Scala/Big data full-stack,99.13,75.47,68.97,72.07
Elixir back-end,98.99,74.73,64.76,69.39
Swift back-end,98.97,74.68,58.19,65.41
Blockchain,98.75,74.19,38.46,50.66
Swift full-stack,98.87,73.36,54.55,62.57
"Developer, game or graphics",98.8,72.73,43.39,54.35


## Save model

In [57]:

# Data details: the feature and target column names the model was fitted on.
# The outer level of the feature column index is dropped so only the inner names are kept.
features_names = X_train.columns.droplevel(0).tolist()
targets_names = Y_train.columns.tolist()
data_details = dict(features_names=features_names, targets_names=targets_names)

# Save the dictionary as a pickle file
with open(LOG_DATA_PKL, "wb") as pkl_file:
    pickle.dump(data_details, pkl_file)


In [60]:

# Reuse the experiment when it already exists: `create_experiment` raises on a
# duplicate name, which made this cell fail on every re-run of the notebook.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    # `create_experiment` returns the id of the newly created experiment
    exp_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    exp_id = experiment.experiment_id


In [61]:


# Start a new run and track the model, its test metrics and the data details
with mlflow.start_run(experiment_id=exp_id, run_name="logistic_regression"):

    # Log and register the pipeline fitted in this notebook. The previous code
    # passed `clf`, a name that is never defined here, so it either raised a
    # NameError or registered a stale object left over in the kernel.
    mlflow.sklearn.log_model(classifier, "model", registered_model_name="logistic_regression")

    # One metric per averaged test score (accuracy, precision, recall, f1)
    for score, value in mean_test_scores.items():
        mlflow.log_metric(score, value)

    # Store the feature/target names alongside the model
    mlflow.log_dict(data_details, "data_details")

    

Successfully registered model 'logistic_regression'.
Created version '1' of model 'logistic_regression'.
