In [31]:
import warnings
import pathlib, os
from jack.tooling import io as io_tool
from jack.ranking import weak
from typing import List, Tuple
import plotly.figure_factory as ff
import numpy as np
import copy
warnings.filterwarnings('ignore')

> Let's load the test dataset first

In [2]:
# Directory and base name of the test split; the file extension is supplied
# separately when the file is loaded.
data_dir = pathlib.Path.home().joinpath("Dataset")
filename = "test_df"

# Source column name -> column name used in the rest of this notebook.
rename_columns = dict(Title="text", Code="label")

In [3]:
# io_tool.load(...) is consumed as an iterator; only its first item is kept.
# NOTE(review): presumably each item is a DataFrame with columns already
# renamed per `rename_columns` — confirm in jack.tooling.io.
df = next(io_tool.load(data_dir, filename, ext=".csv", rename_columns=rename_columns))

In [4]:
df.head()

Unnamed: 0,text,label
0,sr business director usa,11-2021.00
1,retail team leader glebe road,41-2031.00
2,deployment technician i,15-1151.00
3,ecm project manager oconus position,15-1142.00
4,business analyst lvl,13-1111.00


> In `explore.ipynb` we have already replaced the `label` codes with their textual descriptions. Let's reuse that mapping here. For simplicity, we copy/paste the result.

In [5]:
# Lookup table: raw value of the `label` column -> human-readable description.
# It is used to rewrite the `label` column and, via its values, to build the
# list of candidate labels.
# NOTE(review): the keys look like O*NET-SOC occupation codes — confirm.
mapping = {
    '11-1021.00': 'General and Operations Managers',
    '11-2021.00': 'Marketing Managers',
    '11-2022.00': 'Sales Managers',
    '11-3031.02': 'Financial Managers, Branch or Department',
    '13-1111.00': 'Management Analysts',
    '13-2051.00': 'Financial Analysts',
    '15-1121.00': 'Computer Systems Analysts',
    '15-1122.00': 'Information Security Analysts',
    '15-1132.00': 'Software Developers, Applications',
    '15-1133.00': 'Software Developers, Systems Software',
    '15-1134.00': 'Web Developers',
    '15-1142.00': 'Network and Computer Systems Administrators',
    '15-1151.00': 'Computer User Support Specialists',
    '29-1141.00': 'Registered Nurses',
    '31-1014.00': 'Nursing Assistants',
    '33-3021.06': 'Intelligence Analysts',
    '41-2031.00': 'Retail Salespersons',
    '43-4051.00': 'Customer Service Representatives',
    '49-3023.02': 'Automotive Specialty Technicians',
    '49-9071.00': 'Maintenance and Repair Workers, General',
    '53-3032.00': 'Heavy and Tractor-Trailer Truck Drivers'
 }

In [6]:
# Swap the raw codes in the `label` column for their textual descriptions;
# `replace` receives its own shallow copy of the lookup table.
df = df.replace(to_replace={"label": {**mapping}})

In [7]:
df.head()

Unnamed: 0,text,label
0,sr business director usa,Marketing Managers
1,retail team leader glebe road,Retail Salespersons
2,deployment technician i,Computer User Support Specialists
3,ecm project manager oconus position,Network and Computer Systems Administrators
4,business analyst lvl,Management Analysts


Now let's initialize the `ranker`. It requires the list of candidate labels (`label_list`) as a parameter.

In [8]:
# Candidate labels: the textual descriptions, in the insertion order of `mapping`.
label_list = [*mapping.values()]

In [9]:
# Build the ranker from jack.ranking.weak with the candidate labels.
# NOTE(review): presumably `rank()` only ever answers with one of these labels
# (or None) — confirm in WeakRanker.
ranker = weak.WeakRanker(label_list=label_list)

> Now let's iterate over the test set and harvest the fruits :-)

In [13]:
# Collects one (query, predicted label, score, true label) tuple per test example.
response = []

In [14]:
# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every result to `response`.
response = []
for ex in df.to_dict(orient="records"):
    query = ex["text"]
    label = ex["label"]
    # `rank` returns the predicted label (None when no answer is given)
    # together with its score.
    ans, score = ranker.rank(query)
    res = (query, ans, score, label)
    response.append(res)

In [15]:
# NOTE(review): this displays the entire result list (one tuple per test
# example); consider showing a slice such as `response[:5]` instead.
response

[('sr business director usa',
  'Software Developers, Applications',
  1.5555330598502624,
  'Marketing Managers'),
 ('retail team leader glebe road',
  'General and Operations Managers',
  3.3990241647287007,
  'Retail Salespersons'),
 ('deployment technician i',
  None,
  1.4696588113054703,
  'Computer User Support Specialists'),
 ('ecm project manager oconus position',
  'Software Developers, Systems Software',
  2.397671562828017,
  'Network and Computer Systems Administrators'),
 ('business analyst lvl', None, 0.745062938695187, 'Management Analysts'),
 ('licensed social worker social service director',
  'Registered Nurses',
  1.8316976450671931,
  'Customer Service Representatives'),
 ('ground systems engineering management',
  'Software Developers, Systems Software',
  2.3030528630893343,
  'Management Analysts'),
 ('cre relationship manager',
  'Sales Managers',
  2.313351911856767,
  'Sales Managers'),
 ('infrastructure operations engineer',
  'Network and Computer Systems A

> Let's evaluate the ranker: how many examples did not pass the threshold (i.e. got no prediction), and what the per-label recall and precision are.

In [24]:
import copy

def evaluate(dataset: List[Tuple], label_list) -> Tuple[dict, dict]:
    """Compute per-label recall and precision for a list of predictions.

    Args:
        dataset: ``(query, pred, score, label)`` tuples. A falsy ``pred``
            (e.g. ``None``) means no answer was given: the example still
            counts towards the support of ``label`` (lowering its recall) but
            is not a false positive for any label.
        label_list: Every label that may occur as ``pred`` or ``label``.

    Returns:
        A ``(rec, prec)`` tuple of dicts keyed by label, in ``label_list``
        order. ``rec[l]`` is correct predictions of ``l`` divided by the
        number of examples whose true label is ``l``; ``prec[l]`` is correct
        predictions of ``l`` divided by all predictions of ``l``. When the
        denominator is zero (label never seen / never predicted) the value
        is ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If a ``label`` or a truthy ``pred`` is not in ``label_list``.
    """
    support = {_label: 0 for _label in label_list}
    monitor = {_label: {"pos": 0, "neg": 0} for _label in label_list}
    for _query, pred, _score, label in dataset:
        support[label] += 1
        if pred:
            if pred == label:
                monitor[label]["pos"] += 1
            else:
                # A wrong answer is a false positive for the predicted label.
                monitor[pred]["neg"] += 1

    rec = {}
    prec = {}
    for _label, counts in monitor.items():
        n_true = support[_label]
        n_predicted = counts["pos"] + counts["neg"]
        # Guard both ratios: small or skewed test sets can easily contain a
        # label with no examples or one that is never predicted.
        rec[_label] = counts["pos"] / n_true if n_true else 0.0
        prec[_label] = counts["pos"] / n_predicted if n_predicted else 0.0
    return rec, prec

In [25]:
eval_results = evaluate(response, label_list)

In [27]:
rec, prec = eval_results

In [28]:
rec

{'General and Operations Managers': 0.14965986394557823,
 'Marketing Managers': 0.3435897435897436,
 'Sales Managers': 0.2857142857142857,
 'Financial Managers, Branch or Department': 0.42592592592592593,
 'Management Analysts': 0.27167630057803466,
 'Financial Analysts': 0.25,
 'Computer Systems Analysts': 0.14545454545454545,
 'Information Security Analysts': 0.5133079847908745,
 'Software Developers, Applications': 0.45925925925925926,
 'Software Developers, Systems Software': 0.24796747967479674,
 'Web Developers': 0.16326530612244897,
 'Network and Computer Systems Administrators': 0.5,
 'Computer User Support Specialists': 0.19101123595505617,
 'Registered Nurses': 0.7777777777777778,
 'Nursing Assistants': 0.2,
 'Intelligence Analysts': 0.5067567567567568,
 'Retail Salespersons': 0.43119266055045874,
 'Customer Service Representatives': 0.33695652173913043,
 'Automotive Specialty Technicians': 0.41379310344827586,
 'Maintenance and Repair Workers, General': 0.2564102564102564,
 

In [43]:
def visualize(recall: List[float], precision: List[float]):
    """Plot the distributions of the recall and precision values.

    Args:
        recall: Recall values (one per label).
        precision: Precision values (one per label).

    Returns:
        The figure produced by ``ff.create_distplot`` with one group per
        metric and the histogram bars disabled, with a title and axis labels
        set so the plot can be read on its own.
    """
    data = [np.array(recall), np.array(precision)]
    group_labels = ["recall", "precision"]
    fig = ff.create_distplot(data, group_labels, show_hist=False)

    # Label the figure so it stands alone when the notebook is skimmed.
    fig.update_layout(
        title_text="Distribution of per-label recall and precision",
        xaxis_title="score",
        yaxis_title="density",
    )

    return fig

In [44]:
fig = visualize(list(rec.values()), list(prec.values()))

In [45]:
fig.show()