In [1]:
import pandas as pd
import networkx as nx
from collections import defaultdict


#pd.set_option('display.max_rows', None)  # Display all rows
#pd.set_option('display.max_columns', None)  
#

In [2]:
# Load the data
ppi_data = pd.read_csv(
    r"/work/haarscheid/cancer_baseline2/cancer_baseline/data/PPI_homo_sapiens.txt",
    delimiter=' ',  # Adjust to ',' or ' ' if needed
    names=['protein1', 'protein2', 'combined_score']  # Use this if the file doesn't have headers
)

ppi_data['protein1'] = ppi_data['protein1'].str.replace(r'^\d+\.', '', regex=True)
ppi_data['protein2'] = ppi_data['protein2'].str.replace(r'^\d+\.', '', regex=True)
ppi_data['combined_score'] = pd.to_numeric(ppi_data['combined_score'], errors='coerce')

mapping_data = pd.read_csv(r'/work/haarscheid/cancer_baseline2/cancer_baseline/data/ensembl_to_protein_mapping.csv')





  ppi_data = pd.read_csv(


                 protein1         protein2  combined_score
0                protein1         protein2             NaN
1         ENSP00000000233  ENSP00000356607           173.0
2         ENSP00000000233  ENSP00000427567           154.0
3         ENSP00000000233  ENSP00000253413           151.0
4         ENSP00000000233  ENSP00000493357           471.0
...                   ...              ...             ...
13715400  ENSP00000501317  ENSP00000475489           195.0
13715401  ENSP00000501317  ENSP00000370447           158.0
13715402  ENSP00000501317  ENSP00000312272           226.0
13715403  ENSP00000501317  ENSP00000402092           169.0
13715404  ENSP00000501317  ENSP00000404074           251.0

[13715405 rows x 3 columns]


In [None]:
valid_proteins = set(mapping_data['Protein stable ID'])

print(valid_proteins)
ppi_data_filtered = ppi_data[
    (ppi_data['protein1'].isin(valid_proteins)) & 
    (ppi_data['protein2'].isin(valid_proteins))
]
print(ppi_data_filtered)

In [4]:
protein_to_gene = dict(zip(mapping_data['Protein stable ID'], mapping_data['Gene stable ID']))
# Initialize gene-gene and self-loop interaction storage
gene_interactions = defaultdict(list)
self_loops = defaultdict(list)


In [6]:

# Process the filtered PPI data
for _, row in ppi_data_filtered.iterrows():
    protein_a, protein_b, strength = row['protein1'], row['protein2'], row['combined_score']
    
    # Map proteins to genes
    gene_a = protein_to_gene.get(protein_a)
    gene_b = protein_to_gene.get(protein_b)
    
    if gene_a and gene_b:
        if gene_a == gene_b:
            # Handle self-loop: same gene for both proteins
            self_loops[gene_a].append(strength)
        else:
            # Ensure consistent ordering of gene pairs (to avoid duplicate edges)
            if gene_a > gene_b:
                gene_a, gene_b = gene_b, gene_a
            # Add interaction strength to the gene pair
            gene_interactions[(gene_a, gene_b)].append(strength)

# Construct the gene network
gene_network = nx.Graph()

# Add edges for inter-gene interactions
for (gene_a, gene_b), strengths in gene_interactions.items():
    # Compute average strength for overlapping edges
    average_strength = sum(strengths) / len(strengths)
    gene_network.add_edge(gene_a, gene_b, weight=average_strength)

# Add self-loops
for gene, strengths in self_loops.items():
    # Compute average strength for self-loops
    average_strength = sum(strengths) / len(strengths)
    gene_network.add_edge(gene, gene, weight=average_strength)

# Save the graph
nx.write_edgelist(
    gene_network,
    r"/work/haarscheid/cancer_baseline2/cancer_baseline/data/gene_network_with_self_loops.edgelist",
    data=["weight"]
)

# Optional: visualize the graph
# import matplotlib.pyplot as plt
# pos = nx.spring_layout(gene_network)
# weights = nx.get_edge_attributes(gene_network, 'weight')
# nx.draw(gene_network, pos, with_labels=True, node_size=500, font_size=10)
# nx.draw_networkx_edge_labels(gene_network, pos, edge_labels=weights)
# plt.show()


In [2]:
edge_list_path = r"/work/haarscheid/cancer_baseline2/cancer_baseline/data/gene_network_with_self_loops.edgelist"
gene_network = nx.read_edgelist(edge_list_path, nodetype=str)

In [7]:
print(list(gene_network.edges)[:5])

[('ENSG00000003056', 'ENSG00000257335'), ('ENSG00000003056', 'ENSG00000029725'), ('ENSG00000003056', 'ENSG00000131238'), ('ENSG00000003056', 'ENSG00000104112'), ('ENSG00000003056', 'ENSG00000143457')]


In [16]:
# Compute the mean edge weight
edge_weights = [attr['weight'] for _, _, attr in gene_network.edges(data=True)]
mean_weight = sum(edge_weights) / len(edge_weights)

print(f"Mean edge weight: {mean_weight}")

# Remove edges below the mean weight
edges_to_remove = [(u, v) for u, v, attr in gene_network.edges(data=True) if attr['weight'] < mean_weight]
gene_network.remove_edges_from(edges_to_remove)

# Print the updated graph info
print(f"Number of edges after filtering: {gene_network.number_of_edges()}")

# Optionally save the filtered graph
nx.write_edgelist(gene_network, r"/work/haarscheid/cancer_baseline2/cancer_baseline/data/filtered_gene_network.edgelist")
















Mean edge weight: 269.20197769767367
Number of edges after filtering: 416057


In [None]:
import matplotlib.pyplot as plt
pos = nx.spring_layout(edge_list)
weights = nx.get_edge_attributes(edge_list, 'weight')
nx.draw(gene_network, pos, with_labels=True, node_size=500, font_size=10)
nx.draw_networkx_edge_labels(edge_list, pos, edge_labels=weights)
plt.show()