In [3]:
# Root of the local project checkout. Both absolute paths below are built from
# it, so relocating the project only requires editing this one line.
PROJECT_ROOT = "C:/Users/M/Documents/data_science_project"
# Name of the run directory under notebooks/mlruns/1 whose artifacts are read.
MLFLOW_RUN_ID = "67403f4417c34d82b102947d9e1aaa9d"

# YAML file read by JobPrediction.load_clusters_config.
CLUSTERS_YAML_PATH = PROJECT_ROOT + "/data/processed/features_skills_clusters_description.yaml"
# File names of the pickles looked up inside ARTIFACTS_PATH.
LOG_DATA_PKL = 'data.pkl'
LOG_MODEL_PKL = 'model.pkl'
# Trailing slash kept on purpose so the value matches the original exactly.
ARTIFACTS_PATH = PROJECT_ROOT + "/notebooks/mlruns/1/" + MLFLOW_RUN_ID + "/artifacts/"

In [2]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

In [4]:
class JobPrediction:
    """Production class for predicting job probabilities from a list of skills.

    On construction the instance loads a pickled model together with the
    feature and target names stored next to it, and a YAML file that maps
    cluster names to the skills belonging to each cluster.
    """

    # Class-level defaults; every one is overwritten per instance in __init__.
    path_clusters_config = None
    skills_clusters_df = None

    artifacts_path = None

    model = None
    all_features = None
    all_jobs = None

    def __init__(self, artifacts_path, clusters_yaml_path):
        """Load the model artifacts and the skills-cluster configuration.

        Parameters
        ----------
        artifacts_path : str
            Directory containing the LOG_DATA_PKL and LOG_MODEL_PKL files.
        clusters_yaml_path : str
            Path of the YAML file mapping cluster names to lists of skills.
        """
        self.artifacts_path = artifacts_path
        # Retrieve model, feature names and target (job) names
        self.model, self.all_features, self.all_jobs = self.load_mlflow_objs(artifacts_path)
        # Load cluster config
        self.path_clusters_config = clusters_yaml_path
        self.skills_clusters_df = self.load_clusters_config(clusters_yaml_path)

    # ---------------------------------------------------------------------------------

    # Helper Functions

    @staticmethod
    def _load_pickle(path):
        """Return the object unpickled from the file at ``path``."""
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # artifact files produced by a trusted training run.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def load_mlflow_objs(self, artifacts_path):
        """Load the logged pickles found in ``artifacts_path``.

        Returns
        -------
        tuple
            ``(model_object, features_names, targets_names)`` taken from the
            model pickle and the data pickle respectively.
        """
        data_pkl = self._load_pickle(os.path.join(artifacts_path, LOG_DATA_PKL))
        model_pkl = self._load_pickle(os.path.join(artifacts_path, LOG_MODEL_PKL))
        return model_pkl["model_object"], data_pkl["features_names"], data_pkl["targets_names"]

    # ------------------------------------------------------------------------------------

    def load_clusters_config(self, path_clusters_config):
        """Read the clusters YAML and flatten it into a two-column DataFrame.

        The YAML content is iterated as ``{cluster_name: [skill, ...]}``; the
        result has one row per (cluster, skill) pair with the columns
        ``cluster_name`` and ``skill``.
        """
        with open(path_clusters_config, 'r') as stream:
            cluster_config = yaml.safe_load(stream)

        cluster_skill_pairs = [(cluster_name, cluster_skill)
                               for cluster_name, cluster_skills in cluster_config.items()
                               for cluster_skill in cluster_skills]
        return pd.DataFrame(cluster_skill_pairs, columns=['cluster_name', 'skill'])

    # ========================================================
    # **************    Prediction Functions    **************
    # ========================================================

    def _create_clusters_features(self, available_skills):
        """Count, per cluster, how many of its skills are in ``available_skills``."""
        sample_clusters = self.skills_clusters_df.copy()
        sample_clusters["available_skills"] = sample_clusters["skill"].isin(available_skills)
        return sample_clusters.groupby("cluster_name")["available_skills"].sum()

    def _create_skills_features(self, available_skills, exclude_features):
        """One-hot encode every feature name that is not in ``exclude_features``."""
        all_features = pd.Series(list(self.all_features))
        skills_names = all_features[~all_features.isin(exclude_features)]
        return pd.Series(skills_names.isin(available_skills).astype(int).tolist(),
                         index=skills_names)

    def create_features_array(self, available_skills):
        """Create the features array from the available skills.

        Parameters
        ----------
        available_skills : iterable of str
            Skill names. Any iterable is accepted, including one-shot
            iterators and generators. Names that match no known skill are
            ignored.

        Returns
        -------
        numpy.ndarray
            Feature values ordered like ``self.all_features``: a 0/1 flag for
            each skill feature and a matched-skill count for each cluster
            feature.

        Raises
        ------
        TypeError
            If ``available_skills`` is a bare ``str``/``bytes`` or is not
            iterable.
        """
        if isinstance(available_skills, (str, bytes)):
            # A bare string would otherwise be split into characters by list().
            raise TypeError(
                "available_skills must be an iterable of skill names, "
                "not a single string"
            )
        # Materialise once: the skills are consulted twice below, and a
        # generator/iterator would be exhausted after the first pass, silently
        # leaving every one-hot skill feature at zero.
        skills = list(available_skills)

        clusters_features = self._create_clusters_features(skills)
        skills_features = self._create_skills_features(
            skills, exclude_features=clusters_features.index)

        # Combine both feature groups, then put them in the stored feature order
        features = pd.concat([skills_features, clusters_features])
        features = features[self.all_features]
        return features.values

    def predict_jobs_probabilities(self, available_skills):
        """Return the probabilities of the different jobs according to the skills.

        The model's ``predict_proba`` output is indexed as ``prob[0][1]`` for
        each element, i.e. one entry per job, holding the single sample's
        second-column probability. Presumably the model is a multi-output
        classifier; verify against the training code.

        Returns
        -------
        pandas.Series
            Probabilities indexed by ``self.all_jobs``.
        """
        # Create features array
        features_array = self.create_features_array(available_skills)

        # Predict and format
        predictions = self.model.predict_proba([features_array])
        predictions = [prob[0][1] for prob in predictions]  # Keep positive probs
        return pd.Series(predictions, index=self.all_jobs)

In [5]:
# Build the predictor from the configured artifacts directory and clusters YAML.
job_pred = JobPrediction(
    artifacts_path=ARTIFACTS_PATH,
    clusters_yaml_path=CLUSTERS_YAML_PATH,
)

In [6]:
# Feature vector for a sample skill set, ordered like the stored feature names.
job_pred.create_features_array(
    ['Pandas', 'TensorFlow', 'Torch/PyTorch', 'Julia', 'Python']
)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       3, 0, 0], dtype=int64)

In [7]:
# Per-job probabilities for the same sample skill set.
job_pred.predict_jobs_probabilities(
    ['Pandas', 'TensorFlow', 'Torch/PyTorch', 'Julia', 'Python']
)

Academic researcher                              0.63
Data or business analyst                         0.02
Data scientist or machine learning specialist    0.88
Database administrator                           0.00
DevOps specialist                                0.01
Developer, QA or test                            0.01
Developer, back-end                              0.03
Developer, desktop or enterprise applications    0.01
Developer, embedded applications or devices      0.00
Developer, front-end                             0.00
Developer, full-stack                            0.01
Developer, game or graphics                      0.01
Developer, mobile                                0.02
Engineer, data                                   0.03
Scientist                                        0.26
System administrator                             0.01
dtype: float64