In [1]:
DF_PATH       = '../data/processed/merged_cleaned_data.pkl'

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'PlatformHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith']

MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [11]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
import warnings
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")

## Functions

In [3]:
def calculate_quality(targets, predictions, metric_funciton, sort_values=False):
    """
    given target, predictions, and metric function it applies the metric
    to each column of the data and return the result as a data series
    Params:
        targets(pd.DataFrame): targets
        predictions(pd.DataFrame): predicions
        metric_function(sklearn.metrics): metric function
        sort_values(bool): if True the returned values will be sorted - default=False
    Returns:
        (pd.Series):metric score for each column
    """
    quality_scores={}
    for col in targets.columns:
        col_target = targets[col].copy()
        col_pred = predictions[col].copy()
        quality_scores[col] = round(metric_funciton(col_target, col_pred) *100, 2)
    
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

In [4]:
def balance_sample(targets_df, samples_per_class=1200):
    """
    given a targets data frame returns a mask for balanced sample from the target classes
    Params:
        targets_df(pd.DataFrame): a one-hot encoded data frame with each column represent class
        samples_per_class(int): number of sampler per class - default=1200
    
    Returns:
        (pd.Series): sampling mask
    """
    resampled_targets = []
    for col in targets_df.columns:
        sub_df = targets_df.loc[targets_df[col]==1].copy()
        
        #upsampling
        if len(sub_df) < samples_per_class:
            sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
        #downsampling
        else:
            sub_df = sub_df.sample(samples_per_class, random_state=0)
        resampled_targets.append(sub_df)
        
    resampled_targets = pd.concat(resampled_targets)
    
    return resampled_targets

In [14]:
def evaluate_model(model, X_train, X_test, Y_train, Y_test, metrics=[accuracy_score, precision_score, recall_score, f1_score]):
    """
    evaluates the model on the train and test data using the given metrics and report the results
    Params:
        model(sklearn.base)
        X_trian(np.ndarray)
        X_test(np.ndarray)
        Y_train(np.ndarray)
        Y_test(np.ndarray)
        metrics(list[sklearn.metrics]): default = [accuracy_score, precision_score, recall_score, f1_score]
    Returns:
        train_scores(pd.DataFrame)
        test_scores(pd.DataFrame)
    """
    
    # evaluate on training data
    predictions = pd.DataFrame( model.predict(X_train.values), columns=Y_train.columns)
    train_scores = {score.__name__: calculate_quality(Y_train, predictions, score) 
                    for score in metrics}
    train_scores = pd.concat(train_scores, axis=1)
    
    # evaluate on test data
    predictions = pd.DataFrame( model.predict(X_test.values), columns=Y_test.columns)
    test_scores = {score.__name__: calculate_quality(Y_test, predictions, score) 
                    for score in metrics}
    test_scores = pd.concat(test_scores, axis=1)
    
    return train_scores, test_scores

## Model

In [5]:
df = pd.read_pickle(DF_PATH)
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,QA or test,VP,back-end,data,desktop or enterprise applications,embedded applications or devices,front-end,full-stack,game or graphics,mobile,...,skill_group_19,skill_group_2,skill_group_20,skill_group_3,skill_group_4,skill_group_5,skill_group_6,skill_group_7,skill_group_8,skill_group_9
1,0,1,0,0,0,0,0,0,0,0,...,1,4,0,0,0,2,0,0,0,2
2,0,0,1,0,0,0,0,0,0,0,...,2,0,0,2,0,3,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,5,0,1,1,3,0,2,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,8,0,0,0,4,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,7,1,1,1,7,0,0,0,0


### Handling imbalanced data

In [6]:
roles_df = df['DevType'].copy()
roles_df = balance_sample(roles_df)
df = df.loc[roles_df.index].copy()

### Splitting data

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1), df["DevType"], random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1), df["DevType"], random_state=0)


### Training

#### initializing MLflow

In [9]:
# create directories
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

#initialize
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if not exp:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
    exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

#### Random Forests

In [18]:
rf_clf = make_pipeline(RobustScaler(),
                       PCA(n_components=0.95),
                       RandomForestClassifier(n_jobs=8, verbose=1, random_state=0))
rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   31.2s finished


In [25]:
rf_train_scores, rf_test_scores = evaluate_model(rf_clf, X_train, X_test, Y_train, Y_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    5.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.6s finished


In [26]:
rf_mean_test_scores = rf_test_scores.mean(axis=0)
rf_mean_test_scores

accuracy_score     97.653214
precision_score    85.542857
recall_score       54.727143
f1_score           61.363571
dtype: float64

#### AdaBoosting

In [24]:
ada_clf = make_pipeline(RobustScaler(),
                       PCA(n_components=0.95),
                       MultiOutputClassifier(AdaBoostClassifier(random_state=0)))
ada_clf.fit(X_train.values, Y_train.values)

In [27]:
ada_train_scores, ada_test_scores = evaluate_model(ada_clf, X_train, X_test, Y_train, Y_test)

In [28]:
ada_mean_test_scores = ada_test_scores.mean(axis=0)
ada_mean_test_scores

accuracy_score     95.095000
precision_score    38.269286
recall_score       15.444286
f1_score           20.718571
dtype: float64

#### Gradient Boosting

In [30]:
grad_clf = make_pipeline(RobustScaler(),
                       PCA(n_components=0.95),
                      MultiOutputClassifier(GradientBoostingClassifier(random_state=0)))
grad_clf.fit(X_train.values, Y_train.values)

In [31]:
grad_train_scores, grad_test_scores = evaluate_model(ada_clf, X_train, X_test, Y_train, Y_test)

In [32]:
grad_mean_test_scores = ada_test_scores.mean(axis=0)
grad_mean_test_scores

accuracy_score     95.095000
precision_score    38.269286
recall_score       15.444286
f1_score           20.718571
dtype: float64

### Log

In [33]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "features_names": X_train.columns.droplevel(0).tolist(),
                "targets_names": Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [37]:
# Model
model = {"model_description": "Random Forest: with PCA - Basic",
         "model_details": str(rf_clf),
         "model_object": rf_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [38]:
# Performance details
classes_metrics = {"train_scores": rf_train_scores,
                   "test_scores": rf_test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [39]:
# Start a new run and track
with mlflow.start_run(experiment_id=exp.experiment_id,
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics
    for metric, score in rf_mean_test_scores.items():
        mlflow.log_metric(metric, score)

### Hyperparameter tuning

In [None]:
ada_model = {"model_description": "AdaBoosting: with PCA - Basic",
         "model_details": str(ada_clf),
         "model_object": ada_clf}
with open(os.path.join(LOG_PATH, 'ada_'+LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(ada_model, output_file)

gb_model = {"model_description": "Gradient Boosting: with PCA - Basic",
         "model_details": str(grad_clf),
         "model_object": grad_clf}

with open(os.path.join(LOG_PATH, 'gb_'+LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(gb_model, output_file) 

In [40]:
hpt_rf_clf = make_pipeline(RobustScaler(),
                           PCA(),
                           RandomForestClassifier(n_jobs=8,
                                                  verbose=1,
                                                  random_state=0))

In [42]:
tuned_parameters = [{
    'pca__n_components': [.6,.65,0.7, 0.8, .95],
    'randomforestclassifier__n_estimators': [250, 500, 600],
    'randomforestclassifier__max_depth':    [3, 10, None],
}]

In [43]:
hpt_rf_clf = GridSearchCV(hpt_rf_clf, tuned_parameters)
hpt_rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   12.4s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   14.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    3.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    8.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed: 

In [44]:
hpt_rf_clf.best_params_

{'pca__n_components': 0.7,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 250}

In [45]:
hpt_rf_train_scores, hpt_rf_test_scores = evaluate_model(hpt_rf_clf, X_train, X_test, Y_train, Y_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   10.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   13.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    3.8s finished


In [46]:
hpt_rf_mean_test_scores = hpt_rf_test_scores.mean(axis=0)
hpt_rf_mean_test_scores

accuracy_score     97.690000
precision_score    83.990714
recall_score       56.286429
f1_score           62.840000
dtype: float64

### Log

In [48]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [49]:
# Model
model = {"model_description": "Random Forest: with PCA + Hyperparamter tuning",
         "model_details": str(hpt_rf_clf),
         "model_object": hpt_rf_clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [51]:
# Preformance details
classes_metrics = {"train_scores": hpt_rf_train_scores, 
                   "test_scores":  hpt_rf_test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [52]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, 
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)  
    
    # Track metrics 
    for metric, score in hpt_rf_mean_test_scores.items():
        mlflow.log_metric(metric, score) 

## Compare results

In [54]:
runs = mlflow.search_runs([exp.experiment_id])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.f1_score,metrics.precision_score,metrics.accuracy_score,metrics.recall_score,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.source.name
0,7a24a21a88fe4329a986ba8dd6c942cb,953534236177104710,FINISHED,file:///D:/projects/Machine Learning/Tech jobs...,2023-11-15 21:24:24.070000+00:00,2023-11-15 21:25:57.137000+00:00,62.84,83.990714,97.69,56.286429,ashraf hesham,Random Forest: with PCA + Hyperparamter tuning,LOCAL,C:\Users\ashraf hesham\anaconda3\lib\site-pack...
1,ce63e87749b74b2f909cd034d4a2c4bf,953534236177104710,FINISHED,file:///D:/projects/Machine Learning/Tech jobs...,2023-11-15 17:48:20.861000+00:00,2023-11-15 17:48:54.411000+00:00,61.363571,85.542857,97.653214,54.727143,ashraf hesham,Random Forest: with PCA - Basic,LOCAL,C:\Users\ashraf hesham\anaconda3\lib\site-pack...
2,170a904ebe024e2f8913159c422b42f0,953534236177104710,FINISHED,file:///D:/projects/Machine Learning/Tech jobs...,2023-11-15 14:25:53.032000+00:00,2023-11-15 14:25:53.183000+00:00,21.896429,46.950714,95.418571,16.2475,ashraf hesham,Baseline model: Logistic Regression,LOCAL,C:\Users\ashraf hesham\anaconda3\lib\site-pack...
