In [1]:
# Location of the pickled DataFrame that the notebook loads as `df`.
DF_PATH = 'C:/Users/M/Documents/data_science_project/data/processed/2_cleaned_data.pkl'

# Top-level column groups of the frame; not referenced by the cells shown here.
ROLE_COLS = ['DevType']
TECH_COLS = [
    'LanguageWorkedWith',
    'DatabaseWorkedWith',
    'WebframeWorkedWith',
    'MiscTechWorkedWith',
]

# MLflow experiment name and the directory whose contents are logged as run artifacts.
EXPERIMENT_NAME = 'stackoverflow_single_model'
LOG_PATH = 'C:/Users/M/Documents/data_science_project/models/temp/baseline/'

# File names of the pickles written inside LOG_PATH.
LOG_DATA_PKL = 'data.pkl'
LOG_MODEL_PKL = 'model.pkl'
LOG_METRICS_PKL = 'metrics.pkl'

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from matplotlib import pyplot as plt

In [11]:
# Load the DataFrame pickled at DF_PATH and display it.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Assembly,Bash/Shell/PowerShell,C,C#,C++,Dart,Go,HTML/CSS,Haskell,Java,...,skills_group_15,skills_group_16,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64405,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,1
64416,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
64422,0,0,0,0,0,0,0,1,0,0,...,0,0,2,0,0,3,0,0,0,0
64428,0,1,0,0,0,0,0,1,0,0,...,0,0,4,0,0,1,0,0,0,0


In [32]:
def calculate_quality(ground_truth, prediction, metric_function, sort_values=False):
    """Score every predicted column against the matching ground-truth column.

    Parameters
    ----------
    ground_truth : pandas.DataFrame
        True labels; must contain every column present in ``prediction``.
    prediction : pandas.DataFrame
        Predicted labels, one column per target.
    metric_function : callable
        Called as ``metric_function(truth_column, predicted_column)`` and
        expected to return a number (e.g. a ``sklearn.metrics`` score).
    sort_values : bool, default False
        If true, return the scores sorted in ascending order.

    Returns
    -------
    pandas.Series
        Metric value per column, multiplied by 100 and rounded to two
        decimals, indexed by column name.
    """
    quality_scores = {}
    # Iterate over the columns of the `prediction` argument (not a
    # notebook-level global) so the result depends only on what is passed in.
    for col in prediction.columns:
        role_prediction = prediction[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_prediction) * 100, 2)

    quality_scores = pd.Series(list(quality_scores.values()), index=list(quality_scores.keys()))
    if sort_values:
        quality_scores = quality_scores.sort_values()
    return quality_scores

# Balance Classes

In [7]:
# Work on a copy of the 'DevType' columns so later edits do not touch `df`.
roles_df = df['DevType'].copy()
# Column-wise totals: one count per role column.
role_sum = roles_df.sum(axis='index')
role_sum

Academic researcher                               581
Data or business analyst                          669
Data scientist or machine learning specialist     799
Database administrator                            296
DevOps specialist                                 677
Developer, QA or test                             493
Developer, back-end                              5503
Developer, desktop or enterprise applications    1671
Developer, embedded applications or devices       795
Developer, front-end                             2890
Developer, full-stack                            5578
Developer, game or graphics                       342
Developer, mobile                                1859
Engineer, data                                    483
Scientist                                         292
System administrator                              440
dtype: int64

In [8]:
# Resample every role column to the same number of rows.
samples_per_class = 500
resampled_roles = []
for role_col in roles_df.columns:
    # Rows flagged with this role.
    sub_df = roles_df.loc[roles_df[role_col].eq(1)].copy()
    # Draw with replacement only when the role has fewer rows than requested;
    # otherwise draw without replacement. The seed is fixed for repeatability.
    sub_df = sub_df.sample(
        samples_per_class,
        replace=len(sub_df) < samples_per_class,
        random_state=0,
    )
    resampled_roles.append(sub_df)

In [14]:
# Stack the per-role samples and pull the matching rows out of the full frame.
# NOTE(review): rows drawn more than once (sampling with replacement, or drawn
# for more than one role) appear repeatedly in `df`, so the later train/test
# split can place copies of the same row on both sides.
# NOTE(review): this cell overwrites `df`; reload the data before re-running it.
roles_df = pd.concat(resampled_roles)
df = df.loc[roles_df.index]
df

Unnamed: 0_level_0,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Assembly,Bash/Shell/PowerShell,C,C#,C++,Dart,Go,HTML/CSS,Haskell,Java,...,skills_group_15,skills_group_16,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
54451,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,4,0,0,0,2,0
37550,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
49646,0,1,1,0,1,0,0,0,0,0,...,0,0,2,0,0,3,0,1,0,1
47422,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
5139,0,0,1,0,1,0,0,1,0,1,...,0,0,2,0,0,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53453,0,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0,1,0,0,0,0
44375,0,1,1,0,0,0,0,0,0,1,...,0,0,2,0,0,2,0,0,0,1
15982,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
63800,0,0,0,0,0,0,0,1,0,1,...,0,0,2,0,0,2,0,0,0,1


# Train Models

In [21]:
# Split the features (every column group except 'DevType') from the role targets.
# `level=0` tells pandas that 'DevType' is a top-level column label, which avoids
# the PerformanceWarning raised when dropping from a non-lexsorted MultiIndex
# without a level.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='DevType', level=0),
    df['DevType'],
    random_state=0,
)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [26]:
# Sanity check: (rows, feature columns) of the training split.
X_train.shape

(6000, 91)

# Initialize MLflow

In [27]:
# Initialize client and experiment
client = MlflowClient()
# Make EXPERIMENT_NAME the active experiment before looking it up below.
mlflow.set_experiment(EXPERIMENT_NAME)
# Experiment object; its experiment_id is used when the run is started.
exp = client.get_experiment_by_name(EXPERIMENT_NAME)

INFO: 'stackoverflow_single_model' does not exist. Creating a new experiment


### 1. Logistic Regression

In [29]:
# Baseline: standardise the features, then fit one logistic regression per role.
clf = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(LogisticRegression()),
).fit(X_train, y_train)

# Predictions on the training data, labelled with the role column names.
predictions = pd.DataFrame(data=clf.predict(X_train), columns=y_train.columns)

In [33]:
# Evaluate on the training set: one column per metric, one row per role.
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_train, predictions, metric)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,90.97,64.81,17.8,27.93
Data or business analyst,91.88,70.08,31.2,43.17
Data scientist or machine learning specialist,92.48,72.77,51.43,60.26
Database administrator,92.55,48.68,8.31,14.2
DevOps specialist,93.17,73.98,34.47,47.03
"Developer, QA or test",92.88,55.56,1.17,2.29
"Developer, back-end",81.4,66.2,32.1,43.23
"Developer, desktop or enterprise applications",88.7,48.72,11.28,18.31
"Developer, embedded applications or devices",92.72,62.45,32.47,42.73
"Developer, front-end",92.17,73.85,48.91,58.84


In [34]:
# Evaluate on the test set: predict, then build one column per metric and one row per role.
predictions = pd.DataFrame(data=clf.predict(X_test), columns=y_test.columns)
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_test, predictions, metric)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,90.25,62.0,14.98,24.12
Data or business analyst,92.95,69.41,33.91,45.56
Data scientist or machine learning specialist,92.0,63.86,51.46,56.99
Database administrator,92.6,39.13,6.29,10.84
DevOps specialist,93.0,72.29,33.9,46.15
"Developer, QA or test",92.1,100.0,2.47,4.82
"Developer, back-end",80.55,61.45,34.3,44.03
"Developer, desktop or enterprise applications",89.35,55.56,11.47,19.01
"Developer, embedded applications or devices",92.45,57.47,30.49,39.84
"Developer, front-end",91.7,63.51,45.63,53.11


In [37]:
# Average each metric over all roles; these means are the values logged to MLflow below.
mean_test_scores = test_scores.mean()
print(mean_test_scores)

accuracy_score     91.190625
precision_score    65.601250
recall_score       28.439375
f1_score           37.190625
dtype: float64


In [38]:
# Rank roles by test F1 score, lowest first.
test_scores.sort_values('f1_score')

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",92.1,100.0,2.47,4.82
Database administrator,92.6,39.13,6.29,10.84
"Developer, desktop or enterprise applications",89.35,55.56,11.47,19.01
Scientist,92.65,52.78,12.75,20.54
Academic researcher,90.25,62.0,14.98,24.12
System administrator,91.85,56.0,16.57,25.57
"Engineer, data",92.85,69.39,20.99,32.23
"Developer, embedded applications or devices",92.45,57.47,30.49,39.84
"Developer, full-stack",84.5,63.13,31.65,42.16
"Developer, back-end",80.55,61.45,34.3,44.03


# Log

In [43]:
# Track data information: source path, the exact train/test row labels, and
# the feature/target column names.
# NOTE(review): the key spelling 'indicies' is left unchanged because anything
# reading data.pkl would depend on it.
data_details = {
    'data_path': DF_PATH,
    'training_indicies': X_train.index.tolist(),
    'testing_indicies': X_test.index.tolist(),
    'features_names': X_train.columns.droplevel(0).tolist(),
    'target_names': y_train.columns.tolist(),
}

# Create the log directory on first use; open() would otherwise fail with
# FileNotFoundError where the directory does not exist yet.
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as output_file:
    pickle.dump(data_details, output_file)

In [45]:
# Track the model: bundle the fitted pipeline with a readable description.
model = dict(
    model_description='Baseline Model : Logistic Regression',
    model_details=str(clf),
    model=clf,
)
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), mode='wb') as output_file:
    pickle.dump(model, output_file)

In [46]:
# Track metrics: the per-role train and test score tables, stored together in one pickle.
# NOTE(review): the keys are spelled inconsistently ('train_scores' with an
# underscore, 'test-scores' with a hyphen) - confirm what readers of
# metrics.pkl expect before normalising.
classes_metrics = {'train_scores' : train_scores ,
                   'test-scores' : test_scores}
with open(os.path.join(LOG_PATH ,LOG_METRICS_PKL) ,'wb') as output_file :
    pickle.dump(classes_metrics ,output_file)

In [47]:
# Start a new run and track it; the run is named after the model description
# and attached to the experiment looked up earlier.
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles: the contents of the LOG_PATH directory become run artifacts.
    # NOTE(review): files left in LOG_PATH by earlier runs would be logged too - confirm the directory is clean.
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics: one MLflow metric per averaged test score.
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)