In [10]:
# MLflow tracking store and the run whose artifacts are read below
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "7a24a21a88fe4329a986ba8dd6c942cb"

# Pickle file names, resolved against the run's artifacts directory
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML mapping of cluster name -> skills belonging to that cluster
CLUSTERS_YAML_PATH = "../data/processed/fe_cluster_skills_description.yaml"

In [45]:
import os
import pickle
import re
from urllib.parse import urlparse
from urllib.request import url2pathname

import mlflow
import pandas as pd
import sklearn
import yaml
from mlflow.tracking import MlflowClient

## Initialization

### MLflow

In [65]:
# Point MLflow at the local tracking store and fetch the run of interest.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

run = mlflow.get_run(MLFLOW_RUN_ID)

# Turn the run's artifact URI into a local filesystem path.
# The previous regex only stripped a backslash-normalised "file:\" prefix, so
# it worked on Windows but left "file:" in the path on POSIX systems. Parsing
# the URI and converting its path component with url2pathname handles both.
artifact_uri = run.info.artifact_uri
parsed_artifact_uri = urlparse(artifact_uri)
if parsed_artifact_uri.scheme == "file":
    artificats_path = os.path.normpath(url2pathname(parsed_artifact_uri.path))
else:
    # Not a file:// URI: keep the value as-is (normalised), as before.
    artificats_path = os.path.normpath(artifact_uri)

In [66]:
# Inspect the local artifacts directory derived from the run's artifact URI
artificats_path

'D:\\projects\\Machine Learning\\Tech jobs analysis\\models\\mlruns\\953534236177104710\\7a24a21a88fe4329a986ba8dd6c942cb\\artifacts'

'D:\\projects\\Machine Learning\\Tech jobs analysis\\models\\mlruns\\953534236177104710\\7a24a21a88fe4329a986ba8dd6c942cb\\artifacts'

### Loading Model and Data

In [67]:
# Unpickle the model bundle stored in the run's artifacts directory
model_path = os.path.join(artificats_path, LOG_MODEL_PKL)
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

# Echo the loaded object as the cell output
model

{'model_description': 'Random Forest: with PCA + Hyperparamter tuning',
 'model_details': "GridSearchCV(estimator=Pipeline(steps=[('robustscaler', RobustScaler()),\n                                       ('pca', PCA()),\n                                       ('randomforestclassifier',\n                                        RandomForestClassifier(n_jobs=8,\n                                                               random_state=0,\n                                                               verbose=1))]),\n             param_grid=[{'pca__n_components': [0.6, 0.65, 0.7, 0.8, 0.95],\n                          'randomforestclassifier__max_depth': [3, 10, None],\n                          'randomforestclassifier__n_estimators': [250, 500,\n                                                                   600]}])",
 'model_object': GridSearchCV(estimator=Pipeline(steps=[('robustscaler', RobustScaler()),
                                        ('pca', PCA()),
                      

In [68]:
# Unpickle the data description stored in the same artifacts directory
data_path = os.path.join(artificats_path, LOG_DATA_PKL)
with open(data_path, "rb") as data_file:
    data = pickle.load(data_file)

# List the entries available in the loaded dict
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [69]:
# Pull out the estimator and the feature / target names used below
classifier = model["model_object"]
features_names = pd.Series(data["features_names"])
targets_names = pd.Series(data["targets_names"])

In [70]:
# Display the estimator taken from model['model_object']
classifier

## Loading skill clusters

In [72]:
# Read the skill-cluster definitions.
# The encoding is pinned to UTF-8 instead of relying on the platform's locale
# encoding (e.g. cp1252 on Windows), which could mis-decode non-ASCII text.
with open(CLUSTERS_YAML_PATH, "r", encoding="utf-8") as stream:
    clusters_config = yaml.safe_load(stream)

In [75]:
# Flatten the cluster -> skills mapping into one (cluster_name, skill) pair per skill
clusters_list = []
for cluster_name, cluster_skills in clusters_config.items():
    clusters_list.extend((cluster_name, skill) for skill in cluster_skills)

clusters_df = pd.DataFrame(clusters_list, columns=["cluster_name", "skill"])

### Prediction

In [78]:
sample_skills = ["Scala", "Hadoop", "Python"]

# Verification: every sample skill must be one of the known feature names
sample_skill_is_known = pd.Series(sample_skills).isin(features_names)
sample_skill_is_known.all()

True

### Creating cluster features

In [83]:
# Flag the (cluster, skill) rows whose skill is in the sample, on a fresh copy
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)

# Per cluster: how many of its skills appear in the sample
cluster_features = sample_clusters.groupby("cluster_name")["sample_skills"].sum()
cluster_features

cluster_name
skill_group_0     0
skill_group_1     0
skill_group_10    0
skill_group_11    0
skill_group_12    0
skill_group_13    0
skill_group_14    0
skill_group_15    2
skill_group_16    0
skill_group_17    0
skill_group_18    0
skill_group_19    0
skill_group_2     0
skill_group_20    0
skill_group_3     0
skill_group_4     0
skill_group_5     0
skill_group_6     0
skill_group_7     0
skill_group_8     0
skill_group_9     1
Name: sample_skills, dtype: int64

### Creating one-hot encoded skills

In [84]:
# Keep only the feature names that are not cluster-count features
is_cluster_feature = features_names.isin(cluster_features.index)
skills_names = features_names[~is_cluster_feature]
skills_names

0                          APL
1                          Ada
2                         Apex
3                     Assembly
4      Bash/Shell (all shells)
                ...           
203              liblittletest
204                        npm
205                       pnpm
206                     snitch
207                      tunit
Length: 208, dtype: object

In [94]:
# 1 where the skill is among the sample skills, 0 otherwise; indexed by skill name
skill_flags = skills_names.isin(sample_skills).astype(int)
ohe_skills = pd.Series(skill_flags.tolist(), index=skills_names)

In [98]:
# Combine both feature groups, then reorder them to follow features_names
combined_features = pd.concat([ohe_skills, cluster_features])
features = combined_features.loc[features_names]

#### Predict

In [99]:
# Pass the single sample to the classifier as a one-row 2-D array
feature_row = features.to_numpy().reshape(1, -1)
predictions = classifier.predict_proba(feature_row)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.1s finished


In [102]:
# For each target take row 0, column 1 of its probability array
# (presumably the positive class -- confirm against the estimator's classes_)
positive_probs = [target_probs[0][1] for target_probs in predictions]

# Rank targets by that probability and show the five highest
ranked_probs = pd.Series(positive_probs, index=targets_names).sort_values(ascending=False)
ranked_probs.head(5)

Developer                                        0.772
 QA or test                                      0.700
Data scientist or machine learning specialist    0.088
Engineer                                         0.068
 data                                            0.064
dtype: float64