In [1]:
import warnings
import pathlib, os
from jack.tooling import io as io_tool
from jack.ranking import weak
from typing import List, Tuple
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import copy
warnings.filterwarnings('ignore')

In [2]:
# Folder that holds the evaluation data, and the base name of the file to read
# (the extension is passed separately when loading).
data_dir = pathlib.Path.home().joinpath("Dataset")
filename = "test_df"

# Source column name -> column name used by the rest of the notebook.
rename_columns = dict(Title="text", Code="label")

In [3]:
# Take the first item produced by io_tool.load for the configured directory,
# file name and ".csv" extension, passing the column-rename mapping through.
# NOTE(review): presumably load() yields one DataFrame per matching file — confirm.
df = next(io_tool.load(data_dir, filename, ext=".csv", rename_columns=rename_columns))

In [4]:
# Preview the first rows to check that the "text" and "label" columns are present.
df.head()

Unnamed: 0,text,label
0,sr business director usa,11-2021.00
1,retail team leader glebe road,41-2031.00
2,deployment technician i,15-1151.00
3,ecm project manager oconus position,15-1142.00
4,business analyst lvl,13-1111.00


In [5]:
# Occupation code -> human-readable occupation title. The keys have the same
# format as the raw values of the "label" column; the titles become the label
# set used for ranking and evaluation.
# NOTE(review): the codes look like O*NET-SOC identifiers — confirm the source.
mapping = {
    '11-1021.00': 'General and Operations Managers',
    '11-2021.00': 'Marketing Managers',
    '11-2022.00': 'Sales Managers',
    '11-3031.02': 'Financial Managers, Branch or Department',
    '13-1111.00': 'Management Analysts',
    '13-2051.00': 'Financial Analysts',
    '15-1121.00': 'Computer Systems Analysts',
    '15-1122.00': 'Information Security Analysts',
    '15-1132.00': 'Software Developers, Applications',
    '15-1133.00': 'Software Developers, Systems Software',
    '15-1134.00': 'Web Developers',
    '15-1142.00': 'Network and Computer Systems Administrators',
    '15-1151.00': 'Computer User Support Specialists',
    '29-1141.00': 'Registered Nurses',
    '31-1014.00': 'Nursing Assistants',
    '33-3021.06': 'Intelligence Analysts',
    '41-2031.00': 'Retail Salespersons',
    '43-4051.00': 'Customer Service Representatives',
    '49-3023.02': 'Automotive Specialty Technicians',
    '49-9071.00': 'Maintenance and Repair Workers, General',
    '53-3032.00': 'Heavy and Tractor-Trailer Truck Drivers'
 }

In [6]:
# Swap the occupation codes in the "label" column for their titles. Only the
# "label" column is targeted; values without an entry in `mapping` are left
# unchanged, so an unmapped code would survive as-is.
df = df.replace({"label": dict(mapping)})

In [7]:
# Preview again: "label" should now hold occupation titles instead of codes.
df.head()

Unnamed: 0,text,label
0,sr business director usa,Marketing Managers
1,retail team leader glebe road,Retail Salespersons
2,deployment technician i,Computer User Support Specialists
3,ecm project manager oconus position,Network and Computer Systems Administrators
4,business analyst lvl,Management Analysts


In [8]:
# Occupation titles in mapping order; handed to the ranker as its candidate
# labels and used by evaluate() as the set of per-label counters.
label_list = list(mapping.values())

In [9]:
# Thresholds to sweep: 0.25 to 2.5 in steps of 0.25 (multiples of 0.25 are exact floats).
theta_list = [0.25 * step for step in range(1, 11)]

In [10]:
# Collects one (theta, (recall, precision)) entry per threshold in the sweep below.
eval_report = []

In [11]:
def rankme(dataset: pd.DataFrame, ranker):
    """Run ``ranker`` over every row of ``dataset`` and collect the outcomes.

    Args:
        dataset: frame with a ``text`` column (the query) and a ``label``
            column (the gold label).
        ranker: object exposing ``rank(query)`` that returns an
            ``(answer, score)`` pair.

    Returns:
        A list of ``(query, answer, score, label)`` tuples, one per row and in
        row order.
    """
    def _rank_one(row):
        # Read both fields before ranking so a malformed row fails up front.
        text, gold = row["text"], row["label"]
        predicted, confidence = ranker.rank(text)
        return (text, predicted, confidence, gold)

    return list(map(_rank_one, dataset.to_dict(orient="records")))

In [12]:
def evaluate(dataset: List[Tuple], label_list) -> Tuple[dict, dict]:
    """Compute per-label recall and precision for a set of ranked predictions.

    Args:
        dataset: ``(query, pred, score, label)`` tuples. A falsy ``pred`` is
            treated as "no prediction": it counts towards the gold label's
            support (lowering recall) but towards no label's precision.
        label_list: every label that can occur as a gold label or a prediction.

    Returns:
        ``(recall, precision)``: two dicts keyed by label. A metric whose
        denominator is zero (label never seen as gold, or never predicted)
        is reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: if a gold label or a non-empty prediction is not in
            ``label_list``.
    """
    # Number of gold examples per label (recall denominator).
    support = {_label: 0 for _label in label_list}
    # Per predicted label: correct ("pos") and incorrect ("neg") predictions.
    monitor = {_label: {"pos": 0, "neg": 0} for _label in label_list}

    for _query, pred, _score, label in dataset:
        support[label] += 1
        if pred:
            if pred == label:
                monitor[label]["pos"] += 1
            else:
                # A wrong answer is charged to the label that was predicted.
                monitor[pred]["neg"] += 1

    def _ratio(hits, total):
        # Guard the empty-denominator case so sparse labels do not abort the run.
        return hits * 1.0 / total if total else 0.0

    rec = {l: _ratio(counts["pos"], support[l]) for l, counts in monitor.items()}
    prec = {
        l: _ratio(counts["pos"], counts["pos"] + counts["neg"])
        for l, counts in monitor.items()
    }
    return rec, prec

In [13]:
# Start from an empty report every time this cell runs: appending to a list
# created in another cell would add a second copy of every threshold on a
# re-run and shift the positions that later cells rely on.
eval_report = []
for theta in theta_list:
    # One ranker per threshold, evaluated on the full labelled frame.
    ranker = weak.WeakRanker(label_list=label_list, theta=theta)
    result = rankme(df, ranker)
    report = evaluate(result, label_list)

    eval_report.append((theta, report))

> Let's see visually which threshold aligns best with our expectations.

In [14]:
def visualize(recall: List[float], precision: List[float]):
    """Build a distribution plot comparing recall and precision values.

    Args:
        recall: per-label recall values.
        precision: per-label precision values.

    Returns:
        The figure created by ``ff.create_distplot`` with two groups,
        "recall" and "precision", drawn without histogram bars.
    """
    series = {
        "recall": np.array(recall),
        "precision": np.array(precision),
    }
    return ff.create_distplot(
        list(series.values()),
        list(series.keys()),
        show_hist=False,
    )

In [15]:
# Holds one distribution figure per eval_report entry, in the same order.
figs = []

In [16]:
# Rebuild the list on every run so that re-executing this cell cannot append
# duplicate figures and break the positional lookups (figs[0], figs[5], ...).
figs = []
for th, (_rec, _prec) in eval_report:
    fig = visualize(list(_rec.values()), list(_prec.values()))
    # Label each figure with its threshold so it can be read on its own.
    fig.update_layout(title_text=f"theta = {th}")
    figs.append(fig)

In [17]:
# First figure; corresponds to theta_list[0] when the notebook is run once, top to bottom.
figs[0].show() # theta == 0.25

In [19]:
# Sixth figure; corresponds to theta_list[5] when the notebook is run once, top to bottom.
figs[5].show() # theta == 1.5

In [18]:
# Last figure; corresponds to theta_list[-1] when the notebook is run once, top to bottom.
figs[-1].show() # theta == 2.5

In [24]:
# Unpack the first eval_report entry: its threshold plus the per-label recall
# (r) and precision (p) dicts. On a single top-to-bottom run this is the entry
# for theta_list[0] == 0.25.
# NOTE(review): the reason for reporting on this particular threshold is not stated — confirm.
th, (r, p) = eval_report[0]

In [29]:
# Per-label F1 (harmonic mean of recall and precision) for the selected report.
# Values are looked up by label rather than paired by position, and F1 is 0.0
# when recall + precision is zero so that a label with no hits does not raise
# ZeroDivisionError.
f1 = {
    label: (
        2.0 * r[label] * p[label] / (r[label] + p[label])
        if (r[label] + p[label])
        else 0.0
    )
    for label in label_list
}

In [32]:
# One row per metric, one column per label. The leading "metric" column names
# each row so the table stays readable when it is written without its index.
final_report = pd.DataFrame(
    [r, p, f1],
    columns=label_list,
    index=pd.Index(["recall", "precision", "f1"], name="metric"),
).reset_index()

In [33]:
# Write the report next to the input data; reuse data_dir instead of spelling
# the dataset folder out a second time so the two cannot drift apart.
where = data_dir / "report.csv"

In [35]:
# Persist the report as comma-separated UTF-8 text, omitting the row index.
final_report.to_csv(where, sep=",", encoding="utf-8", index=False)