The goal of this notebook is to compute synergy metrics between the stores/retailers/categories based on the cross visit data. We will use graph-based methods to analyze the relationships and interactions between different entities in the dataset.

# Imports

In [None]:
import networkx as nx
import numpy as np
import pandas as pd

# Data Loading

In [None]:
import constants.constants as cst
import constants.paths as pth

# Dimension tables. `dim_blocks` supplies the per-store attributes (mall id,
# retailer code, bl1/bl2/bl3 labels) used for the enrichment further down.
dim_blocks = pd.read_csv(pth.DIM_BLOCKS, **cst.CSV_PARAMS)
dim_malls = pd.read_csv(pth.DIM_MALLS, **cst.CSV_PARAMS)

# Fact tables.
# NOTE(review): these are not referenced in the visible part of this notebook;
# confirm they are needed (e.g. for the planned edge-weight normalisation).
fact_stores = pd.read_csv(pth.FACT_STORES, **cst.CSV_PARAMS)
fact_malls = pd.read_csv(pth.FACT_MALLS, **cst.CSV_PARAMS)
fact_sri_scores = pd.read_csv(pth.FACT_SRI_SCORES, **cst.CSV_PARAMS)

# Store financials table
store_financials = pd.read_csv(pth.STORE_FINANCIALS, **cst.CSV_PARAMS)

# Cross visit table: one row per store pair (`store_code_1`, `store_code_2`)
# with the `total_cross_visits` column later used as the graph edge weight.
cross_visit = pd.read_csv(pth.CROSS_VISITS, **cst.CSV_PARAMS)

## Data enriching

In [None]:
# Inspect store codes that appear more than once in dim_blocks.
# keep=False flags *every* occurrence (the default only flags the repeats and
# hides the first row), so conflicting attributes can be compared side by side
# before `drop_duplicates("store_code")` silently keeps the first one below.
dim_blocks[dim_blocks["store_code"].duplicated(keep=False)].sort_values("store_code")

We want to compute synergy metrics at the store level, retailer level and category level. For that, we need to enrich the cross visit data with retailer and category information. Additionally, to build graphs per mall, we add the mall id to the enriching data.

In [None]:
# Store attributes attached to each side of a cross-visit pair.
enrich_columns = [
    "store_code",
    "mall_id",
    "retailer_code",
    "bl1_label",
    "bl2_label",
    "bl3_label",
]

# One row per store_code (first occurrence wins), shared by both sides.
store_enrich_base = dim_blocks[enrich_columns].drop_duplicates("store_code")

# Same lookup table, suffixed so its columns line up with store 1 / store 2.
store_1_enrich = store_enrich_base.add_suffix("_1")
store_2_enrich = store_enrich_base.add_suffix("_2")

# Left joins keep every cross-visit row; validate="m:1" guarantees that the
# lookup side is unique per store code, so no row is duplicated by the merge.
cross_visit_enriched = cross_visit.merge(
    store_1_enrich,
    on="store_code_1",
    how="left",
    validate="m:1",
).merge(
    store_2_enrich,
    on="store_code_2",
    how="left",
    validate="m:1",
)

We check that the two `mall_id` columns never disagree: the cell below lists the rows where both stores have a `mall_id` and the two values differ. The only remaining differences occur when one store has a `mall_id` and the other does not, so we can combine the two columns into a single, complete `mall_id` column.

In [None]:
# Rows where both stores carry a mall id and the two ids disagree.
cross_visit_enriched.loc[
    lambda pairs: pairs["mall_id_1"].notna()
    & pairs["mall_id_2"].notna()
    & pairs["mall_id_1"].ne(pairs["mall_id_2"])
]

In [None]:
cross_visit_enriched = (
    cross_visit_enriched
    # Take store 1's mall id, falling back to store 2's when it is missing.
    .assign(
        mall_id=lambda pairs: pairs["mall_id_1"].combine_first(pairs["mall_id_2"])
    )
    # The per-side columns are redundant once they have been combined.
    .drop(columns=["mall_id_1", "mall_id_2"])
    # Drop rows where mall_id is missing altogether
    .dropna(axis=0, subset=["mall_id"])
)

In [None]:
# Display the enriched cross-visit table to inspect the result of the steps above.
cross_visit_enriched

We still need to normalize the edge weights so that they are comparable across store pairs. One option is:
$$
edge\_weight_{ij} = \frac{total\_cross\_visits_{ij}}{\sqrt{total\_visits_i \times total\_visits_j}}
$$

At this point, the issue is that for some stores the number of cross visits in `cross_visit` exceeds the total number of visits in `fact_stores`. **TODO:** ask about this discrepancy; in the meantime, the sum of a store's cross visits could be used as a proxy for its total visits.

# Graph Construction

In [None]:
def construct_graph(
    data: pd.DataFrame,
    mall_id: int,
    granularity: str,
    drop_self_loops: bool = False,
) -> nx.Graph:
    """Construct an undirected cross-visit graph for one mall.

    Nodes are the entities at the requested granularity and each edge carries
    a ``total_cross_visits`` attribute. All rows that map to the same
    unordered node pair are *summed*: at retailer/category granularity many
    store pairs collapse onto the same edge, and letting later rows overwrite
    earlier ones would keep only an arbitrary store pair's value.

    Note: if ``data`` lists a pair in both directions (A, B) and (B, A), both
    rows contribute to the same undirected edge.

    Parameters:
    - data: DataFrame containing cross visit data. Must contain ``mall_id``,
            ``total_cross_visits`` and the two node columns of the chosen
            granularity.
    - mall_id: The mall ID to filter the data.
    - granularity: The granularity level. Must be one of
                   ('store', 'retailer', 'cat_high', 'cat_mid', 'cat_low').
    - drop_self_loops: If True, pairs whose two endpoints are the same node
                       (e.g. two stores of the same category) are skipped.
                       Defaults to False, which keeps them as self-loops.

    Returns:
    - A NetworkX graph object (empty if the mall has no usable rows).

    Raises:
    - ValueError: if ``granularity`` is not one of the supported values.
    """
    node_columns = {
        "store": ("store_code_1", "store_code_2"),
        "retailer": ("retailer_code_1", "retailer_code_2"),
        "cat_high": ("bl1_label_1", "bl1_label_2"),
        "cat_mid": ("bl2_label_1", "bl2_label_2"),
        "cat_low": ("bl3_label_1", "bl3_label_2"),
    }
    if granularity not in node_columns:
        raise ValueError(
            "Granularity must be one of 'store', 'retailer', 'cat_high', 'cat_mid' "
            "or 'cat_low'."
        )
    node_col_1, node_col_2 = node_columns[granularity]

    mall_data = data[data["mall_id"] == mall_id]

    # Pre-aggregate ordered pairs in pandas. Rows with a missing node label
    # (e.g. a store absent from the enrichment table) cannot be attributed to
    # a node and are dropped instead of becoming NaN nodes.
    # observed=True avoids spurious all-zero pairs for categorical columns.
    pair_weights = (
        mall_data.dropna(subset=[node_col_1, node_col_2])
        .groupby([node_col_1, node_col_2], sort=False, as_index=False, observed=True)[
            "total_cross_visits"
        ]
        .sum()
    )

    graph = nx.Graph()
    for source, target, weight in pair_weights.itertuples(index=False, name=None):
        if drop_self_loops and source == target:
            continue
        if graph.has_edge(source, target):
            # Reverse-ordered pair (target, source) seen earlier: accumulate
            # on the same undirected edge rather than overwriting it.
            graph[source][target]["total_cross_visits"] += weight
        else:
            graph.add_edge(source, target, total_cross_visits=weight)

    return graph

In [None]:
def compute_node_metrics(graph: nx.Graph) -> pd.DataFrame:
    """Compute graph metrics for each node in the graph.

    Parameters:
    - graph: A NetworkX graph object (typically from construct_graph) whose
             edges carry a ``total_cross_visits`` attribute.

    Returns:
    - DataFrame with node metrics indexed by node ID (index name ``node_id``).
      An empty graph yields an empty DataFrame with the same columns.
    """
    weight = "total_cross_visits"
    columns = [
        "degree",
        "weighted_degree",
        "degree_centrality",
        "betweenness_centrality",
        "pagerank",
        "clustering_coefficient",
        "eigenvector_centrality",
    ]

    # Several centralities are undefined (and raise) on the null graph, e.g.
    # when the requested mall has no cross-visit rows.
    if graph.number_of_nodes() == 0:
        empty_metrics = pd.DataFrame(columns=columns, dtype=float)
        empty_metrics.index.name = "node_id"
        return empty_metrics

    # Weighted degree (strength) - total cross-visit volume
    weighted_degree = dict(graph.degree(weight=weight))

    # Unweighted degree - number of connections
    degree = dict(graph.degree())

    # Degree centrality (normalized)
    degree_centrality = nx.degree_centrality(graph)

    # Betweenness centrality - bridge nodes connecting clusters.
    # Shortest-path algorithms read the edge weight as a *distance*, so the
    # cross-visit counts are inverted: high cross-visits = short distance.
    # The inverted weights live on a scratch graph so the input is untouched.
    # Edges with a non-positive or missing (NaN) weight are left out: they
    # would correspond to an infinite distance.
    distance_graph = graph.__class__()
    distance_graph.add_nodes_from(graph.nodes())
    distance_graph.add_edges_from(
        (u, v, {"distance": 1.0 / w})
        for u, v, w in graph.edges(data=weight, default=1)
        if w > 0
    )
    betweenness = nx.betweenness_centrality(
        distance_graph, weight="distance", normalized=True
    )

    # PageRank - importance under a random walk that follows edges in
    # proportion to their cross-visit volume
    pagerank = nx.pagerank(graph, weight=weight)

    # Clustering coefficient - local clustering tightness
    clustering = nx.clustering(graph, weight=weight)

    # Eigenvector centrality - connected to well-connected nodes
    try:
        eigenvector = nx.eigenvector_centrality(graph, weight=weight, max_iter=1000)
    except nx.PowerIterationFailedConvergence:
        # Best effort: keep the other metrics and mark this one as unavailable.
        eigenvector = {node: float("nan") for node in graph.nodes()}

    # Combine into DataFrame (each dict is keyed by node, which becomes the index)
    metrics_df = pd.DataFrame(
        {
            "degree": degree,
            "weighted_degree": weighted_degree,
            "degree_centrality": degree_centrality,
            "betweenness_centrality": betweenness,
            "pagerank": pagerank,
            "clustering_coefficient": clustering,
            "eigenvector_centrality": eigenvector,
        }
    )
    metrics_df.index.name = "node_id"

    return metrics_df

In [None]:
def compute_graph_metrics(graph: nx.Graph, top_k: int = 5) -> dict:
    """Compute mall-level (graph-level) synergy metrics.

    Parameters:
    - graph: A NetworkX graph object (typically from construct_graph) whose
             edges carry a ``total_cross_visits`` attribute.
    - top_k: Number of strongest nodes used for the concentration share. The
             result is stored under ``f"top{top_k}_degree_share"``, i.e.
             ``"top5_degree_share"`` with the default.

    Returns:
    - Dictionary with graph-level metrics. Metrics that are undefined for the
      given graph (e.g. clustering on an empty graph, modularity without
      edges, path length on a disconnected graph) are NaN.

    Raises:
    - ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer.")

    weight = "total_cross_visits"
    nan = float("nan")
    metrics = {}

    # Basic graph properties
    n_nodes = graph.number_of_nodes()
    n_edges = graph.number_of_edges()
    metrics["n_nodes"] = n_nodes
    metrics["n_edges"] = n_edges
    metrics["density"] = nx.density(graph)

    # Degree statistics
    degrees = [d for _, d in graph.degree()]
    weighted_degrees = [d for _, d in graph.degree(weight=weight)]

    metrics["avg_degree"] = np.mean(degrees) if degrees else 0
    metrics["avg_weighted_degree"] = (
        np.mean(weighted_degrees) if weighted_degrees else 0
    )
    metrics["std_weighted_degree"] = np.std(weighted_degrees) if weighted_degrees else 0

    # Concentration of weighted degree: Gini coefficient and the share of the
    # total held by the top_k nodes. Both are 0 when there is no volume.
    share_key = f"top{top_k}_degree_share"
    total_strength = sum(weighted_degrees)
    if weighted_degrees and total_strength > 0:
        ascending = np.sort(weighted_degrees)
        cumulative = np.cumsum(ascending)
        metrics["gini_weighted_degree"] = (
            n_nodes + 1 - 2 * np.sum(cumulative) / cumulative[-1]
        ) / n_nodes
        metrics[share_key] = ascending[-top_k:].sum() / total_strength
    else:
        metrics["gini_weighted_degree"] = 0
        metrics[share_key] = 0

    # Clustering (the average is undefined on the null graph)
    metrics["avg_clustering"] = (
        nx.average_clustering(graph, weight=weight) if n_nodes > 0 else nan
    )

    # Transitivity (global clustering coefficient)
    metrics["transitivity"] = nx.transitivity(graph)

    # Assortativity - do high-degree nodes connect to high-degree nodes?
    # Needs at least one edge to build the degree mixing matrix.
    if n_edges > 0:
        try:
            metrics["degree_assortativity"] = nx.degree_assortativity_coefficient(
                graph
            )
        except ValueError:
            metrics["degree_assortativity"] = nan
    else:
        metrics["degree_assortativity"] = nan

    # Community detection and modularity
    if n_nodes == 0:
        metrics["n_communities"] = 0
        metrics["modularity"] = nan
    else:
        try:
            communities = nx.community.louvain_communities(
                graph, weight=weight, seed=42
            )
            metrics["n_communities"] = len(communities)
            metrics["modularity"] = nx.community.modularity(
                graph, communities, weight=weight
            )
        except (nx.NetworkXException, ZeroDivisionError):
            # Modularity divides by the total edge weight, so it is undefined
            # for a graph without edge weight; report both as unavailable.
            metrics["n_communities"] = nan
            metrics["modularity"] = nan

    # Connected components (connectivity is undefined on the null graph, so
    # an empty graph is reported as not connected instead of raising)
    metrics["n_connected_components"] = nx.number_connected_components(graph)
    metrics["is_connected"] = n_nodes > 0 and nx.is_connected(graph)

    # Average shortest path length (only if connected)
    if metrics["is_connected"] and n_nodes > 1:
        try:
            metrics["avg_path_length"] = nx.average_shortest_path_length(graph)
        except nx.NetworkXException:
            metrics["avg_path_length"] = nan
    else:
        metrics["avg_path_length"] = nan

    return metrics

In [None]:
# Mall used as the worked example in the cells below.
example_mall_id = 22

# Store-level cross-visit graph for that mall.
store_graph = construct_graph(
    data=cross_visit_enriched,
    mall_id=example_mall_id,
    granularity="store",
)

## Node-level metrics (per store)

In [None]:
# Per-store metrics for the example mall's graph.
store_node_metrics = compute_node_metrics(graph=store_graph)

# Ten stores with the highest total cross-visit volume.
store_node_metrics.sort_values(by="weighted_degree", ascending=False).iloc[:10]

## Graph-level metrics (per mall)

In [None]:
# Graph-level metrics for the example mall, shown as a Series so that each
# metric is rendered on its own row.
mall_graph_metrics = compute_graph_metrics(graph=store_graph)
pd.Series(data=mall_graph_metrics)