
# Graph-Based Recommendation with NetworkX (3.x-safe)

This notebook shows how to use graphs as simple recommendation engines:
- Build a **user-item bipartite graph**
- Create an **item-item projection** (co-occurrence strengths)
- Score recommendations via **co-occurrence**, **Jaccard/Adamic-Adar** link-prediction scores, and **Personalized PageRank**
- Run a tiny **offline evaluation** (Hit-Rate@K and MRR@K)
- Visualize a few pieces with Matplotlib

All code uses modern, non-deprecated NetworkX 3.x APIs.


## Setup

In [None]:

# If you need packages, uncomment:
# !pip install networkx matplotlib pandas numpy

import itertools
import math
import random
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

print("NetworkX:", nx.__version__)
print("Pandas:", pd.__version__)


## Create a tiny synthetic user-item dataset (🔁 Customize)

In [None]:

# Label universe: users U1..U12 and items I1..I16
users = ["U" + str(n) for n in range(1, 13)]
items = ["I" + str(n) for n in range(1, 17)]

# Hand-written (hence deterministic) toy data; tastes overlap between users
interactions = {
    "U1": ["I1", "I2", "I3", "I7"],
    "U2": ["I2", "I3", "I4", "I8"],
    "U3": ["I1", "I3", "I5", "I9"],
    "U4": ["I4", "I5", "I6", "I10"],
    "U5": ["I2", "I6", "I7", "I11"],
    "U6": ["I1", "I5", "I7", "I12"],
    "U7": ["I8", "I9", "I10", "I13"],
    "U8": ["I3", "I9", "I11", "I14"],
    "U9": ["I6", "I10", "I12", "I15"],
    "U10": ["I2", "I11", "I13", "I16"],
    "U11": ["I4", "I8", "I12", "I14"],
    "U12": ["I5", "I9", "I15", "I16"],
}

# Long-format table, one row per (user, item) pair; implicit feedback so weight is always 1
df = pd.DataFrame(
    [
        {"user": user_id, "item": item_id, "weight": 1}
        for user_id, liked in interactions.items()
        for item_id in liked
    ],
    columns=["user", "item", "weight"],
)
df.head()


## Build a user-item bipartite graph

In [None]:

def build_bipartite_graph(df, user_col="user", item_col="item", weight_col="weight"):
    '''Build an undirected user-item graph from an interactions table.

    Each distinct value of ``df[user_col]`` becomes a node tagged
    ``bipartite="user"`` and each distinct value of ``df[item_col]`` a node
    tagged ``bipartite="item"``. Every row contributes one user-item edge
    whose ``weight`` attribute is the row's ``weight_col`` value cast to
    ``float``. If the frame has no ``weight_col`` column, every edge gets
    weight 1.0. When the same (user, item) pair appears in several rows the
    last row's weight is the one kept.

    NOTE: item nodes are added after user nodes, so a label present in both
    columns ends up as a single node tagged "item"; keep the id spaces disjoint.

    Returns the populated ``nx.Graph``.
    '''
    G = nx.Graph(name="User-Item Graph")
    # Add nodes with bipartite attribute
    G.add_nodes_from(df[user_col].unique().tolist(), bipartite="user")
    G.add_nodes_from(df[item_col].unique().tolist(), bipartite="item")

    # Walk the columns directly instead of df.iterrows(): iterrows builds a
    # Series per row (slow) and coerces a row's values to one common dtype,
    # which can turn integer ids into floats when the weights are floats.
    if weight_col in df.columns:
        weights = df[weight_col]
    else:
        weights = [1.0] * len(df)
    for user, item, w in zip(df[user_col], df[item_col], weights):
        G.add_edge(user, item, weight=float(w))
    return G

G = build_bipartite_graph(df)
print(G)  # concise graph summary
# Size of each side of the bipartite graph, read from the node attribute
n_users = len([node for node, attrs in G.nodes(data=True) if attrs.get("bipartite") == "user"])
n_items = len([node for node, attrs in G.nodes(data=True) if attrs.get("bipartite") == "item"])
print("Users:", n_users, "Items:", n_items, "Edges:", G.number_of_edges())


## Item-item projection (co-occurrence)

In [None]:

from networkx.algorithms import bipartite

# Item side of the bipartite graph, in node insertion order
item_nodes = [node for node in G if G.nodes[node].get("bipartite") == "item"]
# Project onto the items: an edge's weight is the number of users the two items share
G_item = bipartite.weighted_projected_graph(G, item_nodes)
print(G_item)


## Item-item similarity via Jaccard & Adamic-Adar

In [None]:

from networkx.algorithms.link_prediction import jaccard_coefficient, adamic_adar_index

# Precompute Jaccard and Adamic-Adar for item pairs only
def pair_key(a, b):
    '''Return the two labels as an ascending tuple, so (a, b) and (b, a) map to one dict key.'''
    if b < a:
        return (b, a)
    return (a, b)

# Every unordered pair of items
item_pairs = list(itertools.combinations(item_nodes, 2))

# Both indices are evaluated on the bipartite graph G and stored under an
# order-independent key
jacc = {pair_key(a, b): score for a, b, score in jaccard_coefficient(G, item_pairs)}

aa = {pair_key(a, b): score for a, b, score in adamic_adar_index(G, item_pairs)}

# Peek a few scores
sample = list(itertools.islice(jacc.items(), 10))
sample


## Recommender functions

In [None]:

def user_items(G, user):
    '''Collect the neighbours of ``user`` whose ``bipartite`` node attribute is "item".'''
    owned = set()
    for neighbor in G.neighbors(user):
        if G.nodes[neighbor].get("bipartite") == "item":
            owned.add(neighbor)
    return owned

def candidate_items(G, user):
    '''Items the user has NOT interacted with yet.

    Returns a list of every node tagged ``bipartite="item"`` that is not
    among ``user_items(G, user)``, in the graph's node iteration order.

    The order matters: the recommenders rank candidates with a stable sort,
    so tied scores keep the order produced here. Building the list from a set
    difference made that order depend on string hash randomization, so
    tie-breaks (and the offline metrics) could change from run to run.
    '''
    have = user_items(G, user)
    return [
        n
        for n, d in G.nodes(data=True)
        if d.get("bipartite") == "item" and n not in have
    ]

def recommend_cooccurrence(G, G_item, user, topk=10):
    '''Rank unseen items by their total co-occurrence with the user's items.

    For each candidate, the ``weight`` of the ``G_item`` edge to every owned
    item is summed; a missing edge, or an endpoint absent from ``G_item``,
    contributes 0.0. Returns a DataFrame with columns ``item`` and ``score``
    holding the ``topk`` highest totals in descending order.
    '''
    owned = user_items(G, user)
    no_edge = {"weight": 0.0}

    def total_weight(candidate):
        # Accumulate in a plain loop, one owned item at a time
        total = 0.0
        for have in owned:
            if G_item.has_node(candidate) and G_item.has_node(have):
                w = G_item.get_edge_data(candidate, have, default=no_edge)["weight"]
            else:
                w = 0.0
            total += float(w)
        return total

    ranked = [(candidate, total_weight(candidate)) for candidate in candidate_items(G, user)]
    # Stable sort: tied candidates keep their candidate_items order
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked[:topk], columns=["item", "score"])

def recommend_jaccard(G, user, jacc, agg="mean", topk=10):
    '''Score candidates by aggregating Jaccard scores to user's items (mean or max).

    ``jacc`` maps a sorted ``(item_a, item_b)`` tuple to a similarity; pairs
    missing from it count as 0.0. With ``agg="max"`` a candidate's score is
    its best similarity to any owned item; any other ``agg`` value gives the
    mean over the owned items. A user with no owned items scores 0.0 for
    every candidate under both aggregations.

    Returns a DataFrame with columns ``item`` and ``score`` holding the
    ``topk`` best candidates in descending score order.
    '''
    owned = list(user_items(G, user))
    cand = candidate_items(G, user)
    scores = {}
    for c in cand:
        vals = [jacc.get(tuple(sorted((c, o))), 0.0) for o in owned]
        if not vals:
            # Nothing to compare against; max([]) would raise ValueError,
            # so mirror the guard the mean branch has always had.
            val = 0.0
        elif agg == "max":
            val = max(vals)
        else:
            val = sum(vals) / len(vals)
        scores[c] = float(val)
    recs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topk]
    return pd.DataFrame(recs, columns=["item", "score"])

def recommend_adamic_adar(G, user, aa, agg="sum", topk=10):
    '''Rank unseen items by their aggregated Adamic-Adar score to the user's items.

    ``aa`` maps a sorted ``(item_a, item_b)`` tuple to a score; absent pairs
    count as 0.0. ``agg="mean"`` averages over the owned items (0.0 when the
    user owns none); every other value sums. Returns a DataFrame with columns
    ``item`` and ``score`` holding the ``topk`` best candidates, best first.
    '''
    owned = list(user_items(G, user))
    use_mean = agg == "mean"
    ranked = []
    for candidate in candidate_items(G, user):
        total = sum([aa.get(tuple(sorted((candidate, have))), 0.0) for have in owned])
        if use_mean:
            score = total / len(owned) if owned else 0.0
        else:
            score = total
        ranked.append((candidate, float(score)))
    # Stable sort: tied candidates keep their candidate_items order
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked[:topk], columns=["item", "score"])

def recommend_ppr(G, user, topk=10, alpha=0.85):
    '''Rank unseen items by Personalized PageRank restarted at the user node.

    Runs ``nx.pagerank`` with all personalization mass on ``user`` and the
    given ``alpha``, then keeps the item nodes the user is not connected to.
    Returns a DataFrame with columns ``item`` and ``score`` holding the
    ``topk`` highest-ranked items. Raises ``ValueError`` if ``user`` is not a
    node of ``G``.
    '''
    if user not in G:
        raise ValueError(f"Unknown user: {user}")
    rank = nx.pagerank(G, alpha=alpha, personalization={user: 1.0})
    seen = user_items(G, user)
    unseen = [
        (node, rank.get(node, 0.0))
        for node, attrs in G.nodes(data=True)
        if attrs.get("bipartite") == "item" and node not in seen
    ]
    # Stable sort: tied items keep graph node order
    unseen.sort(key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(unseen[:topk], columns=["item","score"])


## Demo: recommendations for a single user

In [None]:

# Target user for the side-by-side comparison
u = "U3"
print("User", u, "owns:", sorted(user_items(G, u)))

# Top-10 lists from each of the four scorers
co_df = recommend_cooccurrence(G, G_item, u, topk=10)
jac_df = recommend_jaccard(G, u, jacc, agg="mean", topk=10)
aa_df = recommend_adamic_adar(G, u, aa, agg="sum", topk=10)
ppr_df = recommend_ppr(G, u, topk=10, alpha=0.85)

print("\nCo-occurrence:")
display(co_df)
print("\nJaccard (mean to owned):")
display(jac_df)
print("\nAdamic-Adar (sum to owned):")
display(aa_df)
print("\nPersonalized PageRank:")
display(ppr_df)


## Quick visualizations

In [None]:

# Degree distribution for items
item_degs = [G.degree(i) for i in item_nodes]
plt.figure(figsize=(6,4))
# One unit-wide bin per integer degree. N bins need N+1 edges, so the edges
# must run to max+1; stopping at max lumps the two highest degrees into the
# histogram's closed last bin. align="left" centres each bar on its degree.
plt.hist(item_degs, bins=range(1, max(item_degs) + 2), align="left")
plt.title("Item degree distribution")
plt.xlabel("Degree")
plt.ylabel("Count")
plt.show()


In [None]:

# Draw the 20 heaviest edges of the item-item projection; all items are kept
# as nodes, so items outside those edges appear isolated
edges_sorted = sorted(
    G_item.edges(data=True),
    key=lambda edge: edge[2].get("weight", 0.0),
    reverse=True,
)[:20]
H = nx.Graph()
H.add_nodes_from(item_nodes)
H.add_edges_from((a, b, {"weight": attrs.get("weight", 0.0)}) for a, b, attrs in edges_sorted)

pos = nx.spring_layout(H, seed=42)  # fixed seed keeps the layout reproducible
plt.figure(figsize=(7,5))
nx.draw_networkx_nodes(H, pos, node_size=400)
# Line width encodes the co-occurrence weight of each edge
nx.draw_networkx_edges(H, pos, width=[weight for _, _, weight in H.edges(data="weight")])
nx.draw_networkx_labels(H, pos, font_size=9)
plt.title("Top item-item co-occurrences (projection)")
plt.axis("off")
plt.show()


## Tiny offline evaluation (Hit-Rate@K and MRR@K)

In [None]:

def _rank_of(item, recs):
    '''Return the 1-based position of ``item`` in ``recs["item"]``, or None if absent.'''
    ranked = recs["item"].tolist()
    return (ranked.index(item) + 1) if item in ranked else None


def evaluate_holdout(interactions, K=5):
    '''Leave-last-item-out evaluation of the four recommenders.

    For every user with at least two interactions, the last item in their
    list is hidden, a training graph is rebuilt from everything else, and
    each method produces a top-``K`` list for that user. Users left with no
    training item are skipped.

    Parameters:
        interactions: mapping of user -> list of items; the last list entry
            is the one held out.
        K: length of each recommendation list.

    Returns:
        ``(results, summary)``. ``results`` has one row per (user, method)
        with columns ``user``, ``held_out``, ``method``, ``rank`` (1-based,
        missing when the item was not recommended), ``hit@K`` (1/0) and
        ``mrr@K`` (1/rank, or 0.0 on a miss). ``summary`` holds the
        per-method means of ``hit@K`` and ``mrr@K``, sorted by ``mrr@K``
        descending. Both frames are empty, but carry their columns, when no
        user can be evaluated.
    '''
    result_cols = ["user", "held_out", "method", "rank", "hit@K", "mrr@K"]
    rows = []
    for u, its in interactions.items():
        if len(its) < 2:
            continue
        hidden = its[-1]
        # Remove every copy of the held-out item: a duplicate left in training
        # makes the item "owned", so no method could ever recommend it.
        train_its = [it for it in its[:-1] if it != hidden]
        if not train_its:
            # The user would not appear in the training graph at all.
            continue

        # Build training DF (replace the user's interactions with train-only)
        rec_df = pd.DataFrame(
            [(uu, it, 1)
             for uu, lst in interactions.items()
             for it in (train_its if uu == u else lst)],
            columns=["user", "item", "weight"],
        )
        Gtr = build_bipartite_graph(rec_df)
        item_nodes_tr = [n for n, d in Gtr.nodes(data=True) if d.get("bipartite") == "item"]
        Gtr_item = nx.algorithms.bipartite.weighted_projected_graph(Gtr, item_nodes_tr)

        # Precompute sims on training graph
        pairs_tr = list(itertools.combinations(item_nodes_tr, 2))
        jacc_tr = {tuple(sorted((a, b))): s for (a, b, s) in nx.jaccard_coefficient(Gtr, pairs_tr)}
        aa_tr = {tuple(sorted((a, b))): s for (a, b, s) in nx.adamic_adar_index(Gtr, pairs_tr)}

        # Top-K list per method, keyed by the name reported in the results
        rankings = {
            "cooccurrence": recommend_cooccurrence(Gtr, Gtr_item, u, topk=K),
            "jaccard": recommend_jaccard(Gtr, u, jacc_tr, agg="mean", topk=K),
            "adamic_adar": recommend_adamic_adar(Gtr, u, aa_tr, agg="sum", topk=K),
            "ppr": recommend_ppr(Gtr, u, topk=K, alpha=0.85),
        }

        for name, df_rec in rankings.items():
            r = _rank_of(hidden, df_rec)
            hit = 1 if (r is not None and r <= K) else 0
            mrr = (1.0 / r) if r is not None else 0.0
            rows.append({"user": u, "held_out": hidden, "method": name, "rank": r, "hit@K": hit, "mrr@K": mrr})

    # Explicit columns keep the frame well-formed even when rows is empty;
    # without them the groupby below fails on the missing "method" column.
    results = pd.DataFrame(rows, columns=result_cols)
    if results.empty:
        return results, pd.DataFrame(columns=["method", "hit@K", "mrr@K"])
    summary = (
        results.groupby("method")[["hit@K", "mrr@K"]]
        .mean()
        .reset_index()
        .sort_values("mrr@K", ascending=False)
    )
    return results, summary

# Hold out each user's last item and score all four methods at K=5
results, summary = evaluate_holdout(interactions, K=5)

print("Per-user results (first 10 rows):")
display(results.iloc[:10])

print("\nSummary (averaged over users):")
display(summary)


In [None]:

# Bar chart of the per-method hit-rate from the evaluation summary.
# The "@5" in the labels is hard-coded; it matches the K=5 passed to
# evaluate_holdout, so update both together if K changes.
plt.figure(figsize=(6,4))
plt.bar(summary["method"], summary["hit@K"])
plt.title("Hit-Rate@5 by method")
plt.xlabel("Method")
plt.ylabel("Hit-Rate@5")
# Tilt the method names so the longer labels do not overlap
plt.xticks(rotation=20)
plt.show()



## Next steps
- Swap in your own interactions data (CSV of `user,item[,weight]`).
- Try weighting edges by rating/recency, or filter by minimum interactions.
- Add Random Walk with Restart or node2vec embeddings for items.
- For larger datasets, precompute projections/similarities offline and cache them.
- Evaluate more formally (AUC, Recall@K, NDCG@K) using stronger splits.
