# IT Educational Institute

In [12]:
# Notebook configuration

# MLflow tracking store and the run whose artifacts are loaded below
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "dbd98ae7e3614aee9f92c3149f7226c7"

# File names of the pickled artifacts logged with the run
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML file mapping each skills cluster to the skills it contains
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [2]:
# Importing libraries
import os
import pickle
import yaml
import numpy as np
import pandas as pd
import sklearn
import mlflow
from mlflow.tracking import MlflowClient

## Initialize

### MLflow

In [5]:
# Point MLflow at the tracking store and create a client for it
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

# Look up the configured run and remember the root URI of its artifacts
run = client.get_run(MLFLOW_RUN_ID)
artifacts_path = run.info.artifact_uri

### Load Model

In [7]:
# Loading model
# Resolve the artifact through the MLflow client rather than joining onto
# `run.info.artifact_uri`: that value is a URI (it may carry a `file://` scheme
# or point at a remote store), so it is not guaranteed to be a path `open()` can read.
model_path = client.download_artifacts(MLFLOW_RUN_ID, LOG_MODEL_PKL)

# NOTE: unpickling can execute arbitrary code -- only load artifacts of a trusted run.
with open(model_path, 'rb') as f:
    model = pickle.load(f)

# Displaying model
model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [8]:
# Loading data
# Resolve the artifact through the MLflow client rather than joining onto
# `run.info.artifact_uri`: that value is a URI (it may carry a `file://` scheme
# or point at a remote store), so it is not guaranteed to be a path `open()` can read.
data_path = client.download_artifacts(MLFLOW_RUN_ID, LOG_DATA_PKL)

# NOTE: unpickling can execute arbitrary code -- only load artifacts of a trusted run.
with open(data_path, 'rb') as f:
    data = pickle.load(f)

# Displaying data keys
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [9]:
# Wrap the logged feature/target names in Series and pull the pipeline object
# out of the loaded model dictionary
features_names, targets_names = (
    pd.Series(data[names_key]) for names_key in ('features_names', 'targets_names')
)
classifier = model['model_object']

In [10]:
# Displaying classifier (the object stored under 'model_object' in the loaded model dict)
classifier

### Load Skills Clusters

In [13]:
# Loading skills clusters
# Decode explicitly as UTF-8: without `encoding`, `open()` falls back to the
# platform's locale encoding, so the same file could be read differently
# (or fail to decode) on another machine.
with open(CLUSTERS_YAML_PATH, 'r', encoding='utf-8') as f:
    clusters_config = yaml.safe_load(f)

# Displaying clusters configurations (cluster name -> list of skills)
clusters_config

{'skills_group_0': ['APL',
  'COBOL',
  'Clojure',
  'Crystal',
  'F#',
  'Fortran',
  'Haskell',
  'Julia',
  'LISP',
  'OCaml',
  'Perl',
  'R',
  'SAS',
  'CouchDB',
  'Couchbase',
  'IBM DB2',
  'Colocation',
  'IBM Cloud or Watson',
  'OpenStack',
  'Tidyverse',
  'Flow',
  'Emacs',
  'RStudio'],
 'skills_group_1': ['HTML/CSS',
  'JavaScript',
  'Solidity',
  'TypeScript',
  'MongoDB',
  'Heroku',
  'Angular',
  'Angular.js',
  'Deno',
  'Express',
  'Fastify',
  'Gatsby',
  'Next.js',
  'Node.js',
  'Nuxt.js',
  'React.js',
  'Svelte',
  'Vue.js',
  'Electron',
  'React Native',
  'Yarn',
  'npm',
  'Visual Studio Code'],
 'skills_group_10': ['Elixir', 'Erlang', 'Phoenix'],
 'skills_group_11': ['Capacitor', 'Cordova', 'Ionic'],
 'skills_group_12': ['Objective-C',
  'Ruby',
  'Swift',
  'Ruby on Rails',
  'Homebrew',
  'RubyMine',
  'TextMate',
  'Xcode'],
 'skills_group_2': ['Go',
  'Scala',
  'Cassandra',
  'DynamoDB',
  'Elasticsearch',
  'Neo4j',
  'PostgreSQL',
  'Redis',
  '

In [14]:
# Flatten the {cluster: [skills]} mapping into one (cluster, skill) pair per skill
molten_clusters = [
    (group_name, skill_name)
    for group_name, group_skills in clusters_config.items()
    for skill_name in group_skills
]

# One row per pair, so skills can be looked up together with their cluster
clusters_df = pd.DataFrame(data=molten_clusters,
                           columns=['cluster_name', 'skill'])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,APL
1,skills_group_0,COBOL
2,skills_group_0,Clojure
3,skills_group_0,Crystal
4,skills_group_0,F#
...,...,...
156,skills_group_9,Laravel
157,skills_group_9,Symfony
158,skills_group_9,jQuery
159,skills_group_9,PhpStorm


### Predict Sample Entry

In [16]:
# Skills of the sample entry that the following cells turn into model features
sample_skills = [
    'Scala',
    'Hadoop',
    'Python',
]

In [17]:
# Verify
# A skill that is not a model feature would be silently ignored by the `isin`
# look-ups in the cells below, so fail loudly instead of only displaying the check.
is_known_skill = pd.Series(sample_skills).isin(features_names)
unknown_skills = [skill for skill, known in zip(sample_skills, is_known_skill) if not known]
assert not unknown_skills, f"Skills missing from the model features: {unknown_skills}"
is_known_skill

0    True
1    True
2    True
dtype: bool

#### 1. Recreate Cluster Features

In [18]:
# Flag every (cluster, skill) row whose skill is one of the sample's skills;
# `assign` returns a new frame, so `clusters_df` itself is left untouched
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df['skill'].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,APL,False
1,skills_group_0,COBOL,False
2,skills_group_0,Clojure,False
3,skills_group_0,Crystal,False
4,skills_group_0,F#,False
...,...,...,...
156,skills_group_9,Laravel,False
157,skills_group_9,Symfony,False
158,skills_group_9,jQuery,False
159,skills_group_9,PhpStorm,False


In [19]:
# Count, per cluster, how many of the sample's skills fall into it
cluster_features = (
    sample_clusters['sample_skills']
    .groupby(sample_clusters['cluster_name'])
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_2     2
skills_group_3     1
skills_group_4     0
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

#### 2. Create One-Hot Encoded Skills

In [20]:
# Keep only the feature names that are individual skills, i.e. drop the
# names that are cluster features
skills_names = features_names.loc[
    ~features_names.isin(cluster_features.index)
]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
156                   Vim
157         Visual Studio
158    Visual Studio Code
159              Webstorm
160                 Xcode
Length: 161, dtype: object

In [21]:
# 1 for every skill present in the sample, 0 otherwise, labelled by skill name
ohe_skills = pd.Series(
    [int(in_sample) for in_sample in skills_names.isin(sample_skills)],
    index=skills_names,
)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 161, dtype: int64

#### 3. Combine Features

In [22]:
# Stack the one-hot skills on top of the cluster counts, then put the entries
# in the order in which `features_names` lists them
features = pd.concat([ohe_skills, cluster_features]).loc[features_names]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 174, dtype: int64

#### 4. Predict

In [23]:
# Present the single feature vector as a one-row 2-D array for the pipeline
predictions = classifier.predict_proba(features.values.reshape(1, -1))
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


[array([[0.81857143, 0.18142857]]),
 array([[0.93, 0.07]]),
 array([[0.9, 0.1]]),
 array([[0.87, 0.13]]),
 array([[0.92, 0.08]]),
 array([[0.98, 0.02]]),
 array([[0.91, 0.09]]),
 array([[0.94, 0.06]]),
 array([[0.66, 0.34]]),
 array([[0.93, 0.07]]),
 array([[0.93, 0.07]]),
 array([[0.96, 0.04]]),
 array([[0.9, 0.1]]),
 array([[0.98, 0.02]]),
 array([[0.97, 0.03]]),
 array([[0.79, 0.21]]),
 array([[0.89, 0.11]]),
 array([[0.96, 0.04]]),
 array([[0.94, 0.06]])]

In [24]:
# For each target take the second probability column of the single sample row,
# then list the targets from most to least probable
positive_probs = [target_probs[0, 1] for target_probs in predictions]
pd.Series(positive_probs, index=targets_names).sort_values(ascending=False)

Developer, back-end                              0.340000
Engineer, data                                   0.210000
Academic researcher                              0.181429
Data or business analyst                         0.130000
Scientist                                        0.110000
Cloud infrastructure engineer                    0.100000
Developer, full-stack                            0.100000
DevOps specialist                                0.090000
Data scientist or machine learning specialist    0.080000
Developer, embedded applications or devices      0.070000
Developer, desktop or enterprise applications    0.070000
Blockchain                                       0.070000
Developer, QA or test                            0.060000
System administrator                             0.060000
Developer, front-end                             0.040000
Security professional                            0.040000
Developer, mobile                                0.030000
Developer, gam