In [17]:
import pathlib
import numpy as np
import pandas as pd
from jack.tooling import io as io_tool
import collections

In [2]:
# Route every pandas `.plot` call in this notebook through plotly.
pd.options.plotting.backend = "plotly"

In [3]:
# Folder holding the dataset files, located under the current user's home.
data_dir = pathlib.Path.home().joinpath("Dataset")
# Base names of the dataset files; the extension is passed separately on load.
filename_train, filename_test, filename_info = "train_df", "test_df", "occup_df"

In [4]:
# Load the training CSV. `io_tool.load` is consumed with next(), so only the
# first item it yields is kept.
# NOTE(review): presumably that item is a pandas DataFrame — confirm against jack.tooling.io.
df = next(io_tool.load(data_dir, filename_train, ext=".csv"))

In [5]:
# Preview the first rows of the training data (labels are still raw codes here).
df.head()

Unnamed: 0,Title,Code
0,senior program analyst navy strike fighter sen...,15-1121.00
1,senior intelligence analyst iii job,33-3021.06
2,retail wireless sales consultant part,41-2031.00
3,test automation engineer w selenium and gerkin...,15-1121.00
4,public sector portfolio management senior asso...,11-1021.00


> `occup_df` maps each label code to a human-readable occupation description. Let's use it to decode the labels and then forget about that file :-)

In [15]:
# Load the occupation lookup file; as with the training data, only the first
# item yielded by `io_tool.load` is kept.
# NOTE(review): presumably a DataFrame with `Occupation` and `Code` columns — confirm.
df_info = next(io_tool.load(data_dir, filename_info, ext=".csv"))

In [20]:
# Column names used throughout: class label, input text, and label description.
label_col, text_col, description_col = "Code", "Title", "Occupation"

In [16]:
# Display the occupation lookup table (one occupation description per code).
df_info

Unnamed: 0,Occupation,Code
0,General and Operations Managers,11-1021.00
1,Marketing Managers,11-2021.00
2,Sales Managers,11-2022.00
3,"Financial Managers, Branch or Department",11-3031.02
4,Management Analysts,13-1111.00
5,Financial Analysts,13-2051.00
6,Computer Systems Analysts,15-1121.00
7,Information Security Analysts,15-1122.00
8,"Software Developers, Applications",15-1132.00
9,"Software Developers, Systems Software",15-1133.00


In [18]:
def get_mapping(dataset: pd.DataFrame, key_col: str, val_col: str) -> dict:
    """Build a lookup dict from one column of ``dataset`` to another.

    Args:
        dataset: Frame containing both columns.
        key_col: Name of the column whose values become the dict keys.
        val_col: Name of the column whose values become the dict values.

    Returns:
        A plain ``dict`` mapping each ``key_col`` value to the ``val_col``
        value found in the same row. If a key appears in several rows, the
        value from the last such row wins.

    Raises:
        KeyError: If ``key_col`` or ``val_col`` is not a column of ``dataset``.
    """
    # A plain dict is returned on purpose: a defaultdict created without a
    # default_factory never supplies defaults, so it only obscured the type.
    # Zipping the two columns also avoids building one intermediate dict per
    # row, which to_dict(orient="records") would do.
    keys = dataset[key_col].tolist()
    values = dataset[val_col].tolist()
    return dict(zip(keys, values))

In [21]:
# Code -> Occupation lookup, used below to decode the training labels.
mapping = get_mapping(dataset=df_info, val_col=description_col, key_col=label_col)

In [35]:
# Show the lookup as a plain dict so every code/description pair is listed.
dict(mapping)

{'11-1021.00': 'General and Operations Managers',
 '11-2021.00': 'Marketing Managers',
 '11-2022.00': 'Sales Managers',
 '11-3031.02': 'Financial Managers, Branch or Department',
 '13-1111.00': 'Management Analysts',
 '13-2051.00': 'Financial Analysts',
 '15-1121.00': 'Computer Systems Analysts',
 '15-1122.00': 'Information Security Analysts',
 '15-1132.00': 'Software Developers, Applications',
 '15-1133.00': 'Software Developers, Systems Software',
 '15-1134.00': 'Web Developers',
 '15-1142.00': 'Network and Computer Systems Administrators',
 '15-1151.00': 'Computer User Support Specialists',
 '29-1141.00': 'Registered Nurses',
 '31-1014.00': 'Nursing Assistants',
 '33-3021.06': 'Intelligence Analysts',
 '41-2031.00': 'Retail Salespersons',
 '43-4051.00': 'Customer Service Representatives',
 '49-3023.02': 'Automotive Specialty Technicians',
 '49-9071.00': 'Maintenance and Repair Workers, General',
 '53-3032.00': 'Heavy and Tractor-Trailer Truck Drivers'}

In [36]:
# Swap each label code for its occupation name. The nested dict restricts the
# replacement to the label column.
df = df.replace(to_replace={label_col: dict(mapping)})

In [37]:
# Preview again to check that the label column now shows occupation names.
df.head()

Unnamed: 0,Title,Code
0,senior program analyst navy strike fighter sen...,Computer Systems Analysts
1,senior intelligence analyst iii job,Intelligence Analysts
2,retail wireless sales consultant part,Retail Salespersons
3,test automation engineer w selenium and gerkin...,Computer Systems Analysts
4,public sector portfolio management senior asso...,General and Operations Managers


In [38]:
# Number of training rows per class, as a {class name: count} dict.
klasses = (
    df.loc[:, label_col]
    .value_counts()
    .to_dict()
)

In [39]:
# Inspect the per-class counts to see how imbalanced the labels are.
klasses

{'Registered Nurses': 1186,
 'Network and Computer Systems Administrators': 1082,
 'Software Developers, Applications': 1080,
 'Information Security Analysts': 1051,
 'Software Developers, Systems Software': 985,
 'Marketing Managers': 778,
 'Management Analysts': 690,
 'Intelligence Analysts': 590,
 'General and Operations Managers': 589,
 'Sales Managers': 477,
 'Computer Systems Analysts': 438,
 'Retail Salespersons': 434,
 'Financial Managers, Branch or Department': 432,
 'Customer Service Representatives': 371,
 'Computer User Support Specialists': 355,
 'Maintenance and Repair Workers, General': 314,
 'Financial Analysts': 287,
 'Nursing Assistants': 283,
 'Heavy and Tractor-Trailer Truck Drivers': 248,
 'Automotive Specialty Technicians': 232,
 'Web Developers': 196}

In [40]:
# Plot the label distribution. The "plotting.backend" option was set to plotly
# earlier in the notebook, so this call is rendered by plotly.
df.plot.hist(x=label_col)