In [1]:
from collections import defaultdict, deque
import pandas as pd

In [2]:
# Load the KEGG edge table. Later cells read its 'from_gene_entrez',
# 'to_gene_entrez' and 'pathway_name' columns.
pathways = pd.read_csv("KEGG_data.csv")

In [3]:
IL23R_GENE_ID = 149233

# Keep every edge that has IL23R at either endpoint: a row qualifies when
# at least one of its two gene columns equals the IL23R identifier.
il23r_rows = pathways.loc[
    pathways[['from_gene_entrez', 'to_gene_entrez']].eq(IL23R_GENE_ID).any(axis=1)
].copy()

In [6]:
# Narrow the IL23R edges to those whose pathway name mentions the Th17
# differentiation pathway; rows with a missing pathway name are excluded.
th17_df = il23r_rows.loc[
    il23r_rows["pathway_name"].str.contains("Th17 cell differentiation", na=False)
].copy()

In [8]:
# Undirected adjacency map: every edge is recorded under both of its endpoints.
adj = defaultdict(set)
for u, v in zip(th17_df['from_gene_entrez'], th17_df['to_gene_entrez']):
    adj[u].add(v)
    adj[v].add(u)

In [14]:
start = IL23R_GENE_ID  # BFS seed: IL23R (Entrez 149233); reuse the constant instead of repeating the literal
pth = "Th17 cell differentiation"

# 1) Filter to the full pathway subgraph (not just rows touching IL23R)
th17_df = pathways[pathways["pathway_name"] == pth].copy()

# Sanity: how many edges?
print("Edges in pathway:", len(th17_df))

# 2) Build UNDIRECTED adjacency (for direction-agnostic spillovers).
# Rows with a missing endpoint are dropped BEFORE the int() conversion:
# int(NaN) raises ValueError, so a notna check performed after the
# conversion can never skip anything.
th17_edges = th17_df[["from_gene_entrez", "to_gene_entrez"]].dropna()
adj = defaultdict(set)
for u, v in th17_edges.itertuples(index=False, name=None):
    u, v = int(u), int(v)   # normalise to plain Python ints (columns may load as float)
    adj[u].add(v)
    adj[v].add(u)   # <= key line: makes it undirected

# Sanity: show the start node's degree and neighbors
print("Start in graph?", start in adj)
print("Degree(start):", len(adj[start]) if start in adj else 0)
print("Neighbors(start):", sorted(adj[start]) if start in adj else [])

def k_hop_layers(adj, start, k, verbose=True):
    """Return the breadth-first layers around ``start`` up to ``k`` hops.

    Parameters
    ----------
    adj : mapping
        Node -> iterable of neighbouring nodes. Nodes that appear only as a
        neighbour (no key of their own) are treated as having no neighbours.
        The mapping is never modified, even when it is a ``defaultdict``.
    start : hashable
        Node the expansion starts from.
    k : int
        Number of hop levels to compute.
    verbose : bool, default True
        When true, print the size of each level as it is discovered.

    Returns
    -------
    dict[int, set]
        ``layers[d]`` holds the nodes first reached at exactly ``d`` hops,
        for every ``d`` in ``1..k``. Levels beyond the reachable part of the
        graph are empty sets. If ``start`` is not a key of ``adj``, every
        level is empty.
    """
    if start not in adj:
        return {d: set() for d in range(1, k + 1)}
    visited = {start}
    frontier = {start}
    layers = {}
    for d in range(1, k + 1):
        nxt = set()
        for u in frontier:
            # .get() instead of adj[u]: avoids KeyError on plain dicts and
            # avoids inserting empty entries into a defaultdict.
            nxt.update(adj.get(u, ()))
        nxt -= visited
        layers[d] = nxt
        # Diagnostics
        if verbose:
            print(f"Level {d}: count={len(nxt)}")
        if not nxt:
            # No more expansion possible; fill remaining levels with empties
            for dd in range(d + 1, k + 1):
                layers[dd] = set()
            break
        visited |= nxt
        frontier = nxt
    return layers

# Expand three hops out from the start gene over the pathway adjacency.
layers = k_hop_layers(adj, start, 3)

# Sorted gene lists per hop distance, for stable display and later reuse.
level_1_genes, level_2_genes, level_3_genes = (
    sorted(layers[hop]) for hop in (1, 2, 3)
)
print("L1:", level_1_genes)
print("L2:", level_2_genes)
print("L3:", level_3_genes)



Edges in pathway: 147
Start in graph? True
Degree(start): 4
Neighbors(start): [4087, 4088, 4089, 6774]
Level 1: count=4
Level 2: count=15
Level 3: count=14
L1: [4087, 4088, 4089, 6774]
L2: [3091, 3561, 3570, 3572, 3594, 3605, 6095, 6097, 7046, 7048, 50615, 50616, 50943, 59067, 112744]
L3: [861, 1432, 2475, 2625, 4772, 4773, 4775, 5600, 5603, 6300, 6776, 6777, 6778, 7040]


In [15]:
# Load the IBD panel; the filtering step further down reads its 'gene_id' column.
ibd_df = pd.read_parquet("ibd_panel.parquet")

In [18]:
# Gene whitelist: IL23R itself plus everything reached within three hops.
allowed_genes = {IL23R_GENE_ID, *level_1_genes, *level_2_genes, *level_3_genes}


# Cast the panel's gene ids to int so they compare equal to the Entrez ids above.
ibd_df['gene_id'] = ibd_df['gene_id'].astype(int)

# Keep only the panel rows whose gene is in the whitelist.
ibd_df_filtered = ibd_df.loc[ibd_df['gene_id'].isin(allowed_genes)].copy()

In [21]:
# Write the gene-filtered panel to CSV, omitting the DataFrame index column.
ibd_df_filtered.to_csv("th17_il23r_spillovers.csv", index=False)