In [17]:
# File names of the pickled artifacts that are read from the run directory.
LOG_DATA_PKL, LOG_MODEL_PKL = "data.pkl", "model.pkl"

# Directory holding the artifacts of the run to load.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
artifacts_path = (
    "C:/Users/M/Documents/data_science_project/notebooks/"
    "mlruns/1/67403f4417c34d82b102947d9e1aaa9d/artifacts/"
)

In [2]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

# Load Model

In [16]:
# Read the pickled data artifact from the run's artifacts directory.
# NOTE: unpickling executes code; only load artifacts from a trusted run.
data_path = os.path.join(artifacts_path, LOG_DATA_PKL)
with open(data_path, mode='rb') as data_file:
    data_pkl = pickle.load(data_file)

In [19]:
# Read the pickled model artifact from the run's artifacts directory.
# NOTE: unpickling executes code; only load artifacts from a trusted run.
model_path = os.path.join(artifacts_path, LOG_MODEL_PKL)
with open(model_path, mode='rb') as model_file:
    model_pkl = pickle.load(model_file)

# The fitted estimator is stored under the "model_object" key of the artifact.
model = model_pkl["model_object"]

In [20]:
# Display the loaded estimator so its pipeline steps can be inspected.
model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('featureunion',
                 FeatureUnion(transformer_list=[('linear_pca',
                                                 PCA(n_components=40)),
                                                ('kernel_pca',
                                                 KernelPCA(kernel='rbf',
                                                           n_components=40))])),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=0))])

# Predict Sample Entry 

In [21]:
# YAML file describing which skills belong to each skills cluster.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
CLUSTERS_YAML_PATH = (
    "C:/Users/M/Documents/data_science_project/data/processed/"
    "features_skills_clusters_description.yaml"
)

In [22]:
# Echo the configured path as a quick sanity check.
CLUSTERS_YAML_PATH

'C:/Users/M/Documents/data_science_project/data/processed/features_skills_clusters_description.yaml'

In [23]:
# Parse the skills-cluster description file.
# The encoding is pinned to UTF-8 (the YAML default) so the result does not
# depend on the platform's locale code page; safe_load only builds plain
# Python objects, never arbitrary ones.
with open(CLUSTERS_YAML_PATH, 'r', encoding='utf-8') as stream:
    cluster_config = yaml.safe_load(stream)

In [26]:
# Display the parsed configuration: each cluster name maps to its list of skills.
cluster_config

{'skills_group_0': ['C#',
  'VBA',
  'Microsoft SQL Server',
  'ASP.NET',
  'ASP.NET Core',
  '.NET',
  '.NET Core',
  'Xamarin'],
 'skills_group_1': ['Go',
  'DynamoDB',
  'Elasticsearch',
  'PostgreSQL',
  'Redis',
  'Ansible',
  'Teraform'],
 'skills_group_10': ['HTML/CSS', 'JavaScript', 'SQL', 'jQuery'],
 'skills_group_11': ['Ruby', 'Ruby on Rails'],
 'skills_group_12': ['Assembly', 'C', 'C++'],
 'skills_group_13': ['Unity 3D', 'Unreal Engine'],
 'skills_group_14': ['Dart', 'Firebase', 'Flutter'],
 'skills_group_15': ['Chef', 'Puppet'],
 'skills_group_16': ['Objective-C', 'Swift'],
 'skills_group_2': ['PHP',
  'MariaDB',
  'MySQL',
  'SQLite',
  'Drupal',
  'Laravel',
  'Symfony',
  'Vue.js'],
 'skills_group_3': ['Scala',
  'Cassandra',
  'Couchbase',
  'Apache Spark',
  'Hadoop'],
 'skills_group_4': ['MongoDB',
  'Express',
  'Gatsby',
  'React.js',
  'Node.js',
  'React Native'],
 'skills_group_5': ['Bash/Shell/PowerShell',
  'Perl',
  'Python',
  'Django',
  'Flask'],
 'skills_g

In [28]:
# Flatten the {cluster: [skills...]} mapping into (cluster, skill) pairs,
# one pair per skill, preserving the mapping's iteration order.
molten_clusters = []
for cluster_name, cluster_skills in cluster_config.items():
    for cluster_skill in cluster_skills:
        molten_clusters.append((cluster_name, cluster_skill))

In [30]:
# Long-format lookup table: one row per (cluster, skill) pair.
clusters_df = pd.DataFrame(
    data=molten_clusters,
    columns=["cluster_name", "skill"],
)
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,C#
1,skills_group_0,VBA
2,skills_group_0,Microsoft SQL Server
3,skills_group_0,ASP.NET
4,skills_group_0,ASP.NET Core
...,...,...
69,skills_group_9,Java
70,skills_group_9,Kotlin
71,skills_group_9,IBM DB2
72,skills_group_9,Oracle


# Recreate cluster features

In [31]:
# Skills of the sample entry to score.
sample_skills = [
    'Pandas',
    'TensorFlow',
    'Torch/PyTorch',
    'Python',
    'Keras',
]

In [32]:
# Work on an independent copy so the cluster lookup table stays untouched.
sample_clusters = clusters_df.copy(deep=True)

In [33]:
# Flag every row whose skill appears in the sample entry.
sample_clusters["sample_skills"] = sample_clusters.loc[:, "skill"].isin(sample_skills)

In [35]:
# Per cluster, count how many of its skills the sample entry has
# (summing the boolean flags yields the number of matches).
cluster_features = (
    sample_clusters
    .groupby('cluster_name')['sample_skills']
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_2     0
skills_group_3     0
skills_group_4     0
skills_group_5     1
skills_group_6     0
skills_group_7     4
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

# Create OneHotEncoded Skills

In [38]:
# Feature names stored in the data artifact, wrapped in a Series for filtering.
features_names = pd.Series(data=data_pkl['features_names'])
features_names

0                  Assembly
1     Bash/Shell/PowerShell
2                         C
3                        C#
4                       C++
              ...          
86           skills_group_5
87           skills_group_6
88           skills_group_7
89           skills_group_8
90           skills_group_9
Length: 91, dtype: object

In [39]:
# Keep only the feature names that are not cluster features,
# i.e. the individual skill columns.
skills_names = features_names.loc[~features_names.isin(cluster_features.index)]
skills_names

0                  Assembly
1     Bash/Shell/PowerShell
2                         C
3                        C#
4                       C++
              ...          
69                 Teraform
70            Torch/PyTorch
71                 Unity 3D
72            Unreal Engine
73                  Xamarin
Length: 74, dtype: object

In [42]:
# One-hot vector over the individual skills: 1 when the sample entry has
# the skill, 0 otherwise, labelled by skill name.
ohe_skills = pd.Series(
    data=[int(has_skill) for has_skill in skills_names.isin(sample_skills)],
    index=skills_names,
)
ohe_skills

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
Teraform                 0
Torch/PyTorch            1
Unity 3D                 0
Unreal Engine            0
Xamarin                  0
Length: 74, dtype: int64

In [43]:
# Combine the one-hot skill flags with the per-cluster counts into a single
# feature vector, then select it by label in the order of the feature names
# stored in the data artifact (presumably the order used at training time —
# TODO confirm). The vector is later passed to the model as a bare array, so
# relying on the incidental order of the two concatenated Series could
# silently misalign inputs. Label-based selection raises KeyError if an
# expected feature is missing.
features = pd.concat([ohe_skills, cluster_features]).loc[list(data_pkl['features_names'])]

In [44]:
# Display the assembled feature vector (skill flags followed by cluster counts).
features

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
skills_group_5           1
skills_group_6           0
skills_group_7           4
skills_group_8           0
skills_group_9           0
Length: 91, dtype: int64

# Predict

In [46]:
# Score the single sample: reshape the 1-D feature vector into a
# one-row 2-D array, the layout predict_proba expects.
predictions = model.predict_proba(features.values.reshape(1, -1))

In [55]:
# predictions holds one probability array per target; take the value at
# row 0 (the only sample), column 1 (the second class) from each.
positive_pred = [target_proba[0][1] for target_proba in predictions]

# Label the probabilities with the target names and rank them, highest first.
preds = pd.Series(data=positive_pred, index=data_pkl['targets_names'])
preds = preds.sort_values(ascending=False)
preds

Data scientist or machine learning specialist    0.900000
Scientist                                        0.143333
Data or business analyst                         0.070000
Academic researcher                              0.036667
Developer, game or graphics                      0.030000
Developer, back-end                              0.013333
Developer, embedded applications or devices      0.010000
Engineer, data                                   0.010000
DevOps specialist                                0.006667
Database administrator                           0.000000
Developer, QA or test                            0.000000
Developer, desktop or enterprise applications    0.000000
Developer, front-end                             0.000000
Developer, full-stack                            0.000000
Developer, mobile                                0.000000
System administrator                             0.000000
dtype: float64