# IT Educational Institute

In [51]:
# Constants
# Path of the pickled DataFrame that this notebook reads
DF_PATH = "../data/processed/2_cleaned_data.pkl"

# Specific Role and Tech columns from Data
ROLE_COLS = ['DevType']
TECH_COLS = ['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
            'PlatformHaveWorkedWith', 'WebframeHaveWorkedWith',
            'MiscTechHaveWorkedWith', 'ToolsTechHaveWorkedWith',
            'NEWCollabToolsHaveWorkedWith']

# MLflow tracking location and the experiment name used when logging runs
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# Directory the pickles below are written to; the whole directory is logged as run artifacts
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [52]:
# Importing libraries
import os
from pathlib import Path
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import pickle

## Function

In [53]:
# Helper that scores predictions column by column with a given metric
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """Score every column of ``predictions`` against the same column of ``ground_truth``.

    Parameters
    ----------
    ground_truth : DataFrame holding the true values; it is indexed with the
        column names of ``predictions``.
    predictions : DataFrame holding the predicted values.
    metric_function : callable invoked as ``metric_function(truth, pred)`` for
        each column; its result is multiplied by 100 and rounded to 2 decimals.
    sort_values : when True, the scores are returned in ascending order.

    Returns
    -------
    pandas.Series of scores indexed by column name.
    """
    per_column = {
        name: round(metric_function(ground_truth[name].copy(),
                                    predictions[name].copy()) * 100, 2)
        for name in predictions.columns
    }

    scores = pd.Series(per_column.values(), index=per_column.keys())
    return scores.sort_values() if sort_values else scores

## Initialize

### Create Directories

In [54]:
# Make sure the MLflow tracking directory and the temporary log directory exist
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

### Read Data

In [55]:
# Reading the pickled DataFrame from DF_PATH
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
df = pd.read_pickle(DF_PATH)

# Displaying the first 5 observations
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications",...,skills_group_11,skills_group_12,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,2,4,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,3,1,3,4,0,2
9,0,0,0,0,0,0,0,0,0,1,...,0,2,1,0,0,2,0,1,2,0
10,0,0,0,0,0,0,0,0,1,0,...,0,0,3,0,10,0,1,0,0,0


### Imbalanced Classes

In [56]:
# Checking the total samples of roles
# Work on a copy of the 'DevType' group of columns so df itself is left untouched
roles_df = df['DevType'].copy()
# Column sums (one total per role), largest first
roles_df.sum().sort_values(ascending=False)

Developer, full-stack                            17510
Developer, back-end                              15692
Developer, front-end                              7927
Developer, desktop or enterprise applications     4328
Developer, mobile                                 3703
DevOps specialist                                 2382
Cloud infrastructure engineer                     1994
Data scientist or machine learning specialist     1889
Developer, embedded applications or devices       1803
Engineer, data                                    1627
Academic researcher                               1535
System administrator                              1404
Data or business analyst                          1313
Database administrator                            1037
Developer, QA or test                             1024
Scientist                                          911
Developer, game or graphics                        810
Security professional                              585
Blockchain

In [57]:
# Resample roles: draw the same number of rows for every role column
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    # Rows whose indicator for this role equals 1
    role_rows = roles_df.loc[roles_df[role_col] == 1].copy()

    # Too few rows -> upsample (draw with replacement);
    # otherwise -> downsample (draw without replacement)
    needs_upsampling = len(role_rows) < samples_per_class
    resampled_roles.append(
        role_rows.sample(samples_per_class, replace=needs_upsampling, random_state=0)
    )

In [58]:
# Concatenating the per-role samples into a single DataFrame
roles_df = pd.concat(resampled_roles)
# Re-select the rows of df by the resampled index labels (labels can repeat after resampling)
# NOTE(review): df is overwritten here, so re-running this cell on the already
# resampled df is presumably not idempotent — prefer Restart & Run All
df = df.loc[roles_df.index].copy()

In [59]:
# Checking the total number of samples per role after balancing
# Totals can exceed samples_per_class — presumably because one row can be flagged with several roles
roles_df.sum().sort_values(ascending=False)

Developer, back-end                              6763
Developer, full-stack                            6338
Developer, desktop or enterprise applications    2931
Developer, front-end                             2749
DevOps specialist                                2438
Data scientist or machine learning specialist    2436
Academic researcher                              2390
Cloud infrastructure engineer                    2263
System administrator                             2123
Developer, mobile                                2083
Engineer, data                                   2077
Data or business analyst                         2011
Scientist                                        1925
Developer, embedded applications or devices      1853
Database administrator                           1845
Developer, QA or test                            1542
Security professional                            1527
Developer, game or graphics                      1468
Blockchain                  

### Train Test Split

In [60]:
# importing train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Creating train and test datasets: every column outside the 'DevType' group is a
# feature, the 'DevType' columns are the targets.
# The columns are a two-level MultiIndex, so level=0 states explicitly that
# 'DevType' is a top-level label; dropping without a level on such an index makes
# pandas emit a warning and take a slower path.
# NOTE(review): df was resampled with replacement above, so identical rows may
# end up in both the train and the test split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(df.drop('DevType', axis=1, level=0),
                                                    df['DevType'],
                                                    random_state=0)

# Checking the shapes of both datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (17100, 174)
Shape of X_test: (5700, 174)
Shape of y_train: (17100, 19)
Shape of y_test: (5700, 19)


  X_train, X_test, y_train, y_test = train_test_split(df.drop('DevType',axis=1),


## Train Models

### Initialize MLflow

In [79]:
# Importing mlflow and MlflowClient
import mlflow
from mlflow.tracking import MlflowClient

# Initialize client and experiment
# The tracking URI is set before the client is created
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
# Select the experiment by name (the recorded output shows it being created when it does not exist)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
# Look the experiment up by name; exp.experiment_id is used when starting the run
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

2023/01/18 12:00:42 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


### 1. Logistic Regression

In [62]:
# Importing necessary libraries
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Creating a pipeline: scale the features, then wrap LogisticRegression in
# MultiOutputClassifier because y_train has several target columns
clf = make_pipeline(StandardScaler(),
                   MultiOutputClassifier(LogisticRegression()))

# Fitting pipeline to train data (.values passes plain arrays, without column labels)
clf.fit(X_train.values, y_train.values)

In [63]:
# importing metrics from sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the training features and label the output columns with the target names
predictions = pd.DataFrame(clf.predict(X_train.values),
                           columns=y_train.columns)

# One column of per-target scores for each metric, keyed by the metric function's name
train_scores = pd.concat(
    {score.__name__: calculate_quality(y_train, predictions, score)
     for score in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1,
)

In [64]:
# Evaluating on testing set: predict, then score each target column with every metric
predictions = pd.DataFrame(clf.predict(X_test.values), columns=y_train.columns)
test_scores = pd.concat(
    {score.__name__: calculate_quality(y_test, predictions, score)
     for score in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1,
)

# Average of each metric over all target columns
mean_test_scores = test_scores.mean()

In [65]:
# Displaying the mean of the test scores (one value per metric)
print(mean_test_scores)

# Displaying the test_scores DataFrame (one row per target, one column per metric)
test_scores

accuracy_score     90.352632
precision_score    59.217895
recall_score       27.523684
f1_score           36.256842
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,90.14,62.54,28.66,39.31
Blockchain,94.46,72.52,25.33,37.55
Cloud infrastructure engineer,91.11,61.67,30.84,41.11
Data or business analyst,91.37,50.26,19.88,28.49
Data scientist or machine learning specialist,92.14,69.28,48.78,57.25
Database administrator,91.49,37.93,4.67,8.32
DevOps specialist,90.95,70.04,29.08,41.1
"Developer, QA or test",93.47,22.22,0.54,1.06
"Developer, back-end",74.12,59.21,34.44,43.55
"Developer, desktop or enterprise applications",88.11,69.47,20.5,31.65


## Log Run
### 1. Prepare

In [66]:
# Data details: source path, the row labels of each split, and feature / target names
data_details = dict(
    data_path=DF_PATH,
    training_indices=X_train.index.tolist(),
    testing_indices=X_test.index.tolist(),
    features_names=X_train.columns.droplevel().tolist(),
    targets_names=y_train.columns.tolist(),
)

# Writing the data details to a pickle file inside LOG_PATH
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as output_file:
    pickle.dump(data_details, output_file)

In [67]:
# Model: a short description, the pipeline's string form, and the fitted object itself
model = dict(
    model_description='Baseline model: Logistic Regression',
    model_details=str(clf),
    model_object=clf,
)

# Writing the model record to a pickle file inside LOG_PATH
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), 'wb') as output_file:
    pickle.dump(model, output_file)

In [68]:
# Metrics: the per-target score tables for the training and the testing set
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

# Writing the metrics to a pickle file inside LOG_PATH
with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), 'wb') as output_file:
    pickle.dump(classes_metrics, output_file)

### 2. Log

In [81]:
# Start a new run and track; the run is named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model['model_description']):
    # Log pickles: everything inside LOG_PATH is attached to the run as artifacts
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics: one MLflow metric per averaged test score
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)