# Skill Gap Analysis - Graph Exploration

Este notebook contiene análisis exploratorio de grafos de habilidades y experimentos con clustering.


In [None]:
import sys
from pathlib import Path

# Add parent directory to path
sys.path.append(str(Path().resolve().parent))

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from collections import Counter

from core.graph_analysis import (
    build_bipartite_graph,
    build_skill_cooccurrence_graph,
    compute_centralities,
    detect_communities
)
from core.analysis import cluster_jobs


## 1. Cargar Datos


In [None]:
# Load the processed job postings produced by the main app.
data_files = [
    "../data/processed_jobs_data_analyst_madrid.csv",
    "../data/processed_jobs_data_analyst_barcelona.csv",
    "../data/processed_jobs_data_scientist_barcelona.csv",
]


def parse_skills(raw):
    """Normalize one ``skills_detected`` cell into a list of skill names.

    Handles the two textual shapes a CSV round-trip can produce:
    comma-separated text (``"python, sql"``) and a stringified list
    (``"['python', 'sql']"``). Values that are already a list/tuple are
    cleaned and returned as a list. Missing or empty values give ``[]``.
    """
    if isinstance(raw, (list, tuple)):
        cleaned = [str(item).strip() for item in raw]
        return [item for item in cleaned if item]
    if pd.isna(raw) or not raw:
        return []

    text = str(raw).strip()
    # A list written with to_csv comes back as its repr; drop the brackets
    # and the per-item quotes so we do not end up with skills like "['python'".
    is_list_literal = text.startswith("[") and text.endswith("]")
    if is_list_literal:
        text = text[1:-1]

    skills = []
    for part in text.split(","):
        name = part.strip()
        if is_list_literal:
            name = name.strip("'\"").strip()
        if name:
            skills.append(name)
    return skills


dfs = []
for file in data_files:
    # Only the read itself can raise FileNotFoundError; keep the try minimal.
    try:
        df = pd.read_csv(file)
    except FileNotFoundError:
        print(f"File not found: {file}")
        continue
    # Convert skills_detected from string to list if needed
    if "skills_detected" in df.columns:
        df["skills_detected"] = df["skills_detected"].apply(parse_skills)
    dfs.append(df)

if dfs:
    jobs_df = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(jobs_df)} jobs")
    print(f"Columns: {jobs_df.columns.tolist()}")
else:
    # Always bind jobs_df so a re-run cannot silently reuse a stale frame
    # left over from an earlier kernel run, and later cells do not hit a
    # NameError unrelated to the real problem (missing data files).
    jobs_df = pd.DataFrame()
    print("No data files found. Please run the main app first to generate data.")


## 2. Análisis de Grafos


In [None]:
# Build the skill co-occurrence graph from the loaded job postings and
# report its basic size statistics.
skill_graph = build_skill_cooccurrence_graph(jobs_df)

node_count = skill_graph.number_of_nodes()
edge_count = skill_graph.number_of_edges()
graph_density = nx.density(skill_graph)

print(f"Number of nodes (skills): {node_count}")
print(f"Number of edges: {edge_count}")
print(f"Graph density: {graph_density:.4f}")


In [None]:
# Compute centrality measures for every node of the skill graph.
centralities_df = compute_centralities(skill_graph)

# NOTE(review): the heading below presumes compute_centralities returns rows
# already ordered by degree centrality - confirm against core.graph_analysis.
top_centralities = centralities_df.head(10)
print("Top 10 skills by degree centrality:")
print(top_centralities)


In [None]:
from collections import defaultdict

# Detect communities; the result is consumed below as a mapping of
# skill -> community id.
communities = detect_communities(skill_graph)
community_ids = set(communities.values())
print(f"Number of communities: {len(community_ids)}")

# Invert the mapping: community id -> list of member skills.
comm_dict = defaultdict(list)
for skill, comm_id in communities.items():
    comm_dict[comm_id].append(skill)

# Print each community in id order, showing at most the first 10 skills
# alphabetically.
for comm_id in sorted(comm_dict):
    skills = comm_dict[comm_id]
    preview = sorted(skills)[:10]
    print(f"\nCommunity {comm_id} ({len(skills)} skills):")
    print(", ".join(preview))


## 3. Clustering de Ofertas


In [None]:
# Cluster the job postings.
# When the data carries no match_ratio column, a constant placeholder (0.5)
# is written into jobs_df before clustering.
has_match_ratio = "match_ratio" in jobs_df.columns
if not has_match_ratio:
    jobs_df["match_ratio"] = 0.5

clustered_df = cluster_jobs(jobs_df, n_clusters=4)

# Number of postings per cluster label, ordered by label.
cluster_sizes = clustered_df["cluster"].value_counts().sort_index()
print("Cluster distribution:")
print(cluster_sizes)


In [None]:
# Summarize the clusters produced by the clustering cell.
from core.analysis import interpret_clusters

cluster_summary = interpret_clusters(clustered_df)

print("Cluster interpretations:")
print(cluster_summary)
