In [3]:
# --- Input data ---------------------------------------------------------------
# Pickle file holding the merged, cleaned data frame loaded by this notebook.
DF_PATH = "../data/processed/merged_cleaned_data.pkl"

# --- Column groups ------------------------------------------------------------
# presumably top-level column groups of the data frame (roles vs. technologies)
# — TODO confirm against the processed data set
ROLE_COLS = ["DevType"]
TECH_COLS = [
    "LanguageHaveWorkedWith",
    "DatabaseHaveWorkedWith",
    "PlatformHaveWorkedWith",
    "WebframeHaveWorkedWith",
    "MiscTechHaveWorkedWith",
    "ToolsTechHaveWorkedWith",
    "NEWCollabToolsHaveWorkedWith",
]

# --- MLflow -------------------------------------------------------------------
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# --- Local staging folder and file names for the artifacts logged per run -----
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [49]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import os
import warnings
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

## Functions

In [34]:
def calculate_quality(targets, predictions, metric_funciton, sort_values=False):
    """
    Score every target column separately with a single metric.

    Params:
        targets(pd.DataFrame): ground-truth values, one column per label
        predictions(pd.DataFrame): predicted values; must expose the same column names as `targets`
        metric_funciton(callable): called as metric_funciton(column_targets, column_predictions);
            its result is multiplied by 100 and rounded to two decimals
        sort_values(bool): when True the scores are returned in ascending order - default=False
    Returns:
        (pd.Series): one score per column of `targets`, indexed by column name
    """
    # Each column gets its own copy so the metric cannot touch the caller's frames.
    per_column = {
        name: round(metric_funciton(targets[name].copy(), predictions[name].copy()) * 100, 2)
        for name in targets.columns
    }

    scores = pd.Series(list(per_column.values()), index=list(per_column.keys()))

    return scores.sort_values() if sort_values else scores

In [19]:
def balance_sample(targets_df, samples_per_class=1200):
    """
    Build a class-balanced resample of a one-hot encoded targets data frame.

    For every column (class) the rows where that column equals 1 are drawn
    `samples_per_class` times: with replacement (upsampling) when the class has
    fewer rows than requested, without replacement (downsampling) otherwise.
    The per-class draws are concatenated in column order, so the same row (and
    therefore the same index label) can appear several times in the result.

    Classes without any positive row cannot be sampled; they are skipped with a
    warning instead of failing inside `DataFrame.sample`.

    Params:
        targets_df(pd.DataFrame): a one-hot encoded data frame with each column representing a class
        samples_per_class(int): number of rows drawn per class - default=1200

    Returns:
        (pd.DataFrame): the resampled rows of `targets_df` (same columns). Use its
        index to select the matching rows of the full data set. The frame is
        empty when no class has a positive row.
    """
    resampled_targets = []
    for col in targets_df.columns:
        sub_df = targets_df.loc[targets_df[col] == 1]

        if sub_df.empty:
            warnings.warn(
                "balance_sample: class {!r} has no positive rows and was skipped".format(col)
            )
            continue

        # Draw with replacement only when the class is smaller than requested
        # (upsampling); larger classes are downsampled without replacement.
        needs_upsampling = len(sub_df) < samples_per_class
        resampled_targets.append(
            sub_df.sample(samples_per_class, replace=needs_upsampling, random_state=0)
        )

    if not resampled_targets:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return targets_df.iloc[0:0].copy()

    return pd.concat(resampled_targets)

In [50]:
def _score_split(model, X, Y, metrics):
    """Predict on X and score every column of Y with every metric (one result column per metric)."""
    # Accept both data frames and plain arrays as features.
    features = X.values if hasattr(X, "values") else X
    # Reuse Y's index so targets and predictions carry identical labels; metrics
    # that compare pandas objects directly would otherwise see mismatched Series.
    predictions = pd.DataFrame(model.predict(features), columns=Y.columns, index=Y.index)
    scores = {score.__name__: calculate_quality(Y, predictions, score) for score in metrics}
    return pd.concat(scores, axis=1)


def evaluate_model(model, X_train, X_test, Y_train, Y_test, metrics=None):
    """
    Evaluate a fitted model on the train and test data with the given metrics.

    Every metric is applied per target column through `calculate_quality`, so
    each returned frame has one row per target column and one column per metric
    (named after the metric function's `__name__`).

    Params:
        model: fitted estimator exposing `predict`
        X_train(pd.DataFrame or np.ndarray): training features
        X_test(pd.DataFrame or np.ndarray): test features
        Y_train(pd.DataFrame): training targets; its columns name the outputs
        Y_test(pd.DataFrame): test targets; its columns name the outputs
        metrics(iterable of callables): metric functions called as metric(y_true, y_pred);
            default (None) = [accuracy_score, precision_score, recall_score, f1_score]
    Returns:
        train_scores(pd.DataFrame)
        test_scores(pd.DataFrame)
    """
    # Resolve the default here instead of using a mutable list as default argument.
    if metrics is None:
        metrics = (accuracy_score, precision_score, recall_score, f1_score)

    train_scores = _score_split(model, X_train, Y_train, metrics)
    test_scores = _score_split(model, X_test, Y_test, metrics)

    return train_scores, test_scores

## Baseline Model

In [6]:
# Load the processed data frame from DF_PATH. The file is a pickle, so only
# point DF_PATH at files from a trusted source (unpickling can run arbitrary code).
df = pd.read_pickle(DF_PATH)
# Preview the first rows.
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,QA or test,VP,back-end,data,desktop or enterprise applications,embedded applications or devices,front-end,full-stack,game or graphics,mobile,...,skill_group_19,skill_group_2,skill_group_20,skill_group_3,skill_group_4,skill_group_5,skill_group_6,skill_group_7,skill_group_8,skill_group_9
1,0,1,0,0,0,0,0,0,0,0,...,1,4,0,0,0,2,0,0,0,2
2,0,0,1,0,0,0,0,0,0,0,...,2,0,0,2,0,3,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,5,0,1,1,3,0,2,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,8,0,0,0,4,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,7,1,1,1,7,0,0,0,0


### Handling imbalanced classes

In [20]:
# Resample the DevType targets so every role contributes the same number of
# rows, then pull the matching (possibly repeated) rows out of the full frame.
# NOTE(review): this cell overwrites `df`, so running it twice resamples the
# already-resampled frame; reload the data before re-running.
# NOTE(review): balance_sample upsamples with replacement and the train/test
# split happens afterwards, so copies of one row can end up in both splits —
# consider splitting first and balancing only the training part.
roles_df = balance_sample(df['DevType'].copy())
df = df.loc[roles_df.index].copy()

### Split the data

In [22]:
# The DevType column group (level 0 of the column MultiIndex) holds the targets;
# every other column group is a feature. Naming the level explicitly drops the
# whole group directly instead of going through the label lookup that pandas
# warns about on non-lexsorted MultiIndex columns.
# NOTE(review): presumably that is the warning echoed under this cell — confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns="DevType", level=0),
    df["DevType"],
    random_state=0,
)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1), df["DevType"], random_state=0)


## Training

#### Initializing MLflow

In [47]:
# Make sure the MLflow tracking store and the local staging folder exist.
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

# Point MLflow at the local store, then look the experiment up by name and
# create it when the lookup comes back empty.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if not exp:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
    exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

#### Logistic Regression

In [32]:
# Baseline: standardize the features, then fit one logistic regression per
# target column (MultiOutputClassifier wraps the single-output estimator).
lr_clf = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(LogisticRegression()),
)
lr_clf.fit(X_train.values, Y_train.values)

In [51]:
# Per-class scores of the fitted pipeline on both splits (default metrics).
train_scores, test_scores = evaluate_model(lr_clf, X_train, X_test, Y_train, Y_test)

In [52]:
# Average every metric over the classes (rows of test_scores); these
# class-averaged values are what gets logged to MLflow further down.
mean_test_scores = test_scores.mean(axis=0)
mean_test_scores

accuracy_score     95.418571
precision_score    46.950714
recall_score       16.247500
f1_score           21.896429
dtype: float64

In [53]:
# Per-class breakdown of the test metrics (values are percentages).
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
QA or test,96.63,43.75,2.49,4.71
VP,96.2,22.22,0.64,1.24
back-end,95.61,28.57,2.82,5.14
data,94.88,70.16,32.83,44.73
desktop or enterprise applications,96.13,22.22,1.94,3.56
embedded applications or devices,95.94,40.0,13.79,20.51
front-end,96.7,69.73,36.86,48.22
full-stack,94.93,37.84,3.36,6.17
game or graphics,97.52,75.73,49.68,60.0
mobile,97.61,75.21,56.35,64.42


## Log run

In [39]:
# Data details: source file, the row labels that went into each split, and the
# feature / target column names (features keep only the lower column level).
data_details = {
    "data_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "test_indices": X_test.index.tolist(),
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist(),
}

# Stage the details in the local log folder; the folder is uploaded as run artifacts later.
with Path(LOG_PATH, LOG_DATA_PKL).open("wb") as output_file:
    pickle.dump(data_details, output_file)

In [41]:
# Model: human-readable description, the pipeline's repr, and the fitted object itself.
model = {
    "model_description": "Baseline model: Logistic Regression ",
    "model_details": str(lr_clf),
    "model_object": lr_clf,
}

# Stage the model bundle in the local log folder.
with Path(LOG_PATH, LOG_MODEL_PKL).open("wb") as output_file:
    pickle.dump(model, output_file)

In [42]:
# Performance details: the per-class score tables for both splits.
classes_metrics = {
    "train_scores": train_scores,
    "test_scores": test_scores,
}

# Stage the score tables in the local log folder.
with Path(LOG_PATH, LOG_METRICS_PKL).open("wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [48]:
# Open one MLflow run named after the model description, upload everything
# staged in LOG_PATH as artifacts, and record the class-averaged test metrics.
run_name = model['model_description']
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=run_name):
    mlflow.log_artifacts(LOG_PATH)
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)