In [9]:
import severability
import numpy as np
import pandas as pd

In [None]:
# Load the skill taxonomy; its index values are used below as row/column
# positions into the co-occurrence matrix.
skill_df = pd.read_csv("Lightcast_taxonomy_M2R.csv", index_col=0)

inds = skill_df.index
skills = skill_df['Lightcast_skills'].tolist()
# skill name -> position of that skill in the `skills` list
skill_to_ind = dict(zip(skills, range(len(skills))))

# Read the co-occurrence matrix, treat missing entries as 0, and keep only the
# rows/columns selected by the taxonomy index (positional selection).
matrix_df = (
    pd.read_csv("Lightcast_co_mat.csv", index_col=0)
    .fillna(0)
    .iloc[inds, inds]
)

matrix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3906 entries, Dc.js to Dyeing
Columns: 3906 entries, Dc.js to Dyeing
dtypes: float64(3906)
memory usage: 116.4+ MB


In [11]:
def keep_top_k_per_row(A: np.ndarray, k: int = 50):
    """
    Keep the k largest values of every row of A and zero out the rest.

    Parameters
    ----------
    A : np.ndarray
        2-D array. It is never modified; a new array is returned.
    k : int, default 50
        Number of entries to keep per row. If k is greater than or equal to
        the number of columns, every entry is kept.

    Returns
    -------
    np.ndarray
        Array with the same shape and dtype as A in which each row holds at
        most k non-zeroed entries copied from A. Which entries survive among
        values tied at the cut-off is decided by np.argpartition.

    Raises
    ------
    ValueError
        If k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    n_cols = A.shape[1]
    if k >= n_cols:
        # np.argpartition requires kth < n_cols; keeping "the top k" of a row
        # with at most k entries simply means keeping the whole row.
        return A.copy()

    A_topk = np.zeros_like(A)
    if k == 0:
        return A_topk

    # Partition on the negated values so the k largest entries of each row
    # land in the first k positions (in no particular order).
    topk_idx = np.argpartition(-A, k, axis=1)[:, :k]
    rows = np.arange(A.shape[0])[:, None]
    A_topk[rows, topk_idx] = A[rows, topk_idx]
    return A_topk

# Build the adjacency matrix: the co-occurrence values as float32, sparsified
# to the top entries of each row (default k of keep_top_k_per_row).
A = keep_top_k_per_row(matrix_df.to_numpy(dtype=np.float32))

# Transition matrix as computed by the severability package from that adjacency.
P = severability.transition_matrix(np.matrix(A))

In [12]:
class Skill:
    """
    A skill together with its position in the matrix labels and the
    components computed for it.
    """

    def __init__(self, name: str, index: int):
        """
        Parameters
        ----------
        name : str
            Skill name.
        index : int
            Node index passed to severability.node_component as ``i``.
        """
        self.name = name
        self.index = index
        # List of (t, component) tuples filled in by calculate_components().
        self.sev_series = []

    def __repr__(self):
        return f"Skill(name={self.name}, index={self.index})"

    def calculate_components(self, transition, max_size=10, times=range(1, 4)):
        """
        Compute this skill's component for each time in ``times`` and store
        the results in ``self.sev_series``.

        Parameters
        ----------
        transition
            Transition matrix forwarded to severability.node_component as ``P``.
        max_size : int, default 10
            Forwarded to severability.node_component as ``max_size``.
        times : iterable of int, default range(1, 4)
            Values forwarded one by one as ``t``.

        Notes
        -----
        ``self.sev_series`` is replaced rather than extended, so calling this
        method again does not duplicate entries. It is only assigned once all
        times have been computed, so an exception raised part-way leaves the
        previous value untouched.
        """
        series = [
            (
                t,
                severability.node_component(
                    P=transition, i=self.index, t=t, max_size=max_size
                ),
            )
            for t in times
        ]
        self.sev_series = series

In [13]:
# Reload the taxonomy indexed by its second column and discard the leftover
# 'Unnamed: 0' column when the file has one.
skills = pd.read_csv("Lightcast_taxonomy_M2R.csv", index_col=1).drop(
    columns='Unnamed: 0', errors='ignore'
)

# Build Skill objects for one taxonomy category, restricted to the given labels
def get_skills_from_category(category, labels):
    """
    Return a list of Skill objects for the entries of ``category`` that also
    appear in ``labels``.

    Parameters
    ----------
    category
        Value compared against the 'Category (corrected)' column of the
        module-level ``skills`` frame.
    labels
        Index-like object providing ``intersection`` and ``get_loc``
        (the notebook passes ``matrix_df.columns``).

    Returns
    -------
    list
        One ``Skill(name, labels.get_loc(name))`` per name shared between the
        category's index entries and ``labels``.
    """
    in_category = skills['Category (corrected)'] == category
    category_names = skills.loc[in_category].index

    # Only names present in both the category and the labels are kept.
    shared_names = labels.intersection(category_names)

    selected = []
    for name in shared_names:
        selected.append(Skill(name, labels.get_loc(name)))
    return selected

# Tech skills are the 'Information Technology' entries followed by the
# 'Analysis' entries, both looked up against the matrix columns.
tech_skills = [
    skill_obj
    for category_name in ('Information Technology', 'Analysis')
    for skill_obj in get_skills_from_category(category_name, matrix_df.columns)
]
hospitality_skills = get_skills_from_category(
    'Hospitality and Food Services', matrix_df.columns
)

len(tech_skills), len(hospitality_skills)

(834, 48)

In [14]:
# NOTE(review): the recorded output of this cell is
# "LinAlgError: Last 2 dimensions of the array must be square", i.e. this call
# raised for tech_skills[1] — the root cause still needs to be investigated.
tech_skills[1].calculate_components(P)

LinAlgError: Last 2 dimensions of the array must be square

In [None]:
# Compute components for every tech skill, collecting the skills for which the
# computation fails instead of stopping at the first error.
bad_skills = []
for tech_skill in tech_skills:
    try:
        tech_skill.calculate_components(P)
    except Exception:
        # Best effort: report the failing skill and move on. `Exception`
        # (rather than a bare `except`) keeps KeyboardInterrupt / SystemExit
        # working, so the loop can still be interrupted.
        print(f"Error processing skill: {tech_skill.name}")
        bad_skills.append(tech_skill)

len(bad_skills)

Error processing skill: Linux Kernel
Error processing skill: Fiber Optics
Error processing skill: Technical Services
Error processing skill: OmniPage
Error processing skill: Network Routing
Error processing skill: Information Assurance
Error processing skill: IIS Manager
Error processing skill: Media RSS
Error processing skill: Bluetooth
Error processing skill: Systems Engineering
Error processing skill: SolarWinds
Error processing skill: Bit.ly
Error processing skill: HP 3000
Error processing skill: HTML Application
Error processing skill: Remote Server Management
Error processing skill: ODB++
Error processing skill: Deployment Project
Error processing skill: System Design and Implementation
Error processing skill: System Testing
Error processing skill: Penetration Testing
Error processing skill: AES3
Error processing skill: Remote Data Services
Error processing skill: Oracle Databases
Error processing skill: Prolog (Programming Language)
Error processing skill: Voicemail
Error proces

149