In [1]:
import pandas as pd
df = pd.read_csv("filtered_primekg.csv")
df.head()

  df = pd.read_csv("filtered_primekg.csv")


Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1803896 entries, 0 to 1803895
Data columns (total 12 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   relation          object
 1   display_relation  object
 2   x_index           int64 
 3   x_id              object
 4   x_type            object
 5   x_name            object
 6   x_source          object
 7   y_index           int64 
 8   y_id              object
 9   y_type            object
 10  y_name            object
 11  y_source          object
dtypes: int64(2), object(10)
memory usage: 165.2+ MB


In [3]:
import networkx as nx
import matplotlib.pyplot as plt
import random
from pyvis.network import Network

# Count unique relations
unique_relations = df["relation"].value_counts()
print("\nUnique Relations:\n", unique_relations)

# Check for missing values
print("\nMissing Values:\n", df.isnull().sum())



Unique Relations:
 relation
protein_protein               642150
disease_phenotype_positive    300634
bioprocess_protein            289610
disease_protein               160822
drug_effect                   129568
pathway_protein                85292
disease_disease                64388
contraindication               61350
drug_protein                   51306
indication                     18776
Name: count, dtype: int64

Missing Values:
 relation            0
display_relation    0
x_index             0
x_id                0
x_type              0
x_name              0
x_source            0
y_index             0
y_id                0
y_type              0
y_name              0
y_source            0
dtype: int64


In [None]:
# Degree distribution analysis (node connectivity)
node_counts = pd.concat([df["x_name"], df["y_name"]]).value_counts()
plt.figure(figsize=(10, 5))
plt.hist(node_counts, bins=100, log=True, alpha=0.7, color='blue')
plt.xlabel("Node Degree")
plt.ylabel("Frequency (Log Scale)")
plt.title("Node Degree Distribution")
plt.show()


# Small World Graph
A small-world graph is a type of network characterized by high clustering and short average path lengths, meaning that most nodes are only a few connections apart despite being part of a large network.

Key properties of small-world networks:

- High Clustering Coefficient – Nodes tend to form tightly connected groups.
- Short Average Path Length – Any node can be reached from any other in just a few hops.
- Presence of Hubs – Certain highly connected nodes help link distant parts of the network.



The script below analyzes whether a protein-protein interaction network exhibits small-world properties by computing its clustering coefficient and average shortest path length, then comparing it with a random network of similar size.


In [7]:
# Extract specific relationship types (modify as needed)
from joblib import Parallel, delayed
import networkx as nx
import matplotlib.pyplot as plt
import random
from pyvis.network import Network
import numpy as np



# Function to compute shortest paths in parallel
def compute_avg_shortest_path(G, sample_size=5000):
    """
    Computes the average shortest path length using a random sample of nodes 
    for efficiency (since calculating for the entire graph is too slow).
    """
    nodes = list(G.nodes())
    sampled_nodes = random.sample(nodes, min(sample_size, len(nodes)))
    
    path_lengths = []
    for node in sampled_nodes:
        sp = nx.single_source_shortest_path_length(G, node)
        path_lengths.extend(sp.values())
    
    return np.mean(path_lengths)

# Function to check small-world properties
def check_small_world(G, sample_size=5000):
    """
    Check if a given graph satisfies small-world properties:
    1. High clustering coefficient
    2. Short average shortest path length
    """
    print("\nComputing Clustering Coefficient...")
    clustering_coeff = nx.average_clustering(G)  # Efficient for large graphs

    print("\nComputing Approximate Shortest Path Length...")
    avg_shortest_path_length = compute_avg_shortest_path(G, sample_size)

    print("\nGenerating Random Graph for Comparison...")
    random_G = nx.gnm_random_graph(n=G.number_of_nodes(), m=G.number_of_edges())

    random_clustering_coeff = nx.average_clustering(random_G)
    random_avg_path_length = compute_avg_shortest_path(random_G, sample_size)

    is_small_world = (clustering_coeff > random_clustering_coeff) and (
        avg_shortest_path_length < random_avg_path_length)

    # Print results
    print("\n--- Graph Properties ---")
    print(f"Clustering Coefficient: {clustering_coeff:.4f}")
    print(f"Average Shortest Path Length (Approx): {avg_shortest_path_length:.4f}")

    print("\n--- Random Graph Properties ---")
    print(f"Random Clustering Coefficient: {random_clustering_coeff:.4f}")
    print(f"Random Average Shortest Path Length (Approx): {random_avg_path_length:.4f}")

    print("\nSmall-World Network? ", "✅ YES" if is_small_world else "❌ NO")
    
    return is_small_world

# Function to visualize graph using PyVis
def visualize_graph(G, filename="optimized_small_world_graph.html"):
    print("\nGenerating Visualization...")
    net = Network(height="750px", width="100%", notebook=True, cdn_resources="remote")
    net.from_nx(G)

    net.set_options("""
    var options = {
      "nodes": {
        "shape": "dot",
        "size": 20
      },
      "physics": {
        "solver": "forceAtlas2Based"
      }
    }
    """)

    net.show(filename)
    print(f"Graph visualization saved as {filename}")

# Load or create a graph from the dataset
print("\nLoading Dataset and Constructing Graph...")

# Modify this section to load your real dataset
selected_relation = "protein_protein"
filtered_df = df[df['relation'] == selected_relation]  # Filter PPI relations

# Sample 100K edges to optimize performance
sample_df = filtered_df.sample(n=min(10000, len(filtered_df)), random_state=42)

# Create Graph
G_filtered = nx.Graph()
for _, row in sample_df.iterrows():
    G_filtered.add_node(row['x_id'], name=row['x_name'], node_type=row['x_type'], source=row['x_source'])
    G_filtered.add_node(row['y_id'], name=row['y_name'], node_type=row['y_type'], source=row['y_source'])
    G_filtered.add_edge(row['x_id'], row['y_id'], relation=row['relation'], display_relation=row['display_relation'])

# Run small-world check
is_small_world = check_small_world(G_filtered)

# Visualize Graph
visualize_graph(G_filtered, filename=f"optimized_small_world_{selected_relation}_v4.html")


Loading Dataset and Constructing Graph...

Computing Clustering Coefficient...

Computing Approximate Shortest Path Length...

Generating Random Graph for Comparison...

--- Graph Properties ---
Clustering Coefficient: 0.0009
Average Shortest Path Length (Approx): 8.5356

--- Random Graph Properties ---
Random Clustering Coefficient: 0.0006
Random Average Shortest Path Length (Approx): 11.4347

Small-World Network?  ✅ YES

Generating Visualization...
optimized_small_world_protein_protein_v4.html
Graph visualization saved as optimized_small_world_protein_protein_v4.html


In [None]:
# Create a small subgraph for visualization (1000 random edges)
sample_edges = df.sample(n=1000, random_state=42)
G = nx.Graph()
for _, row in sample_edges.iterrows():
    G.add_edge(row["x_name"], row["y_name"], relation=row["relation"])

# Interactive Graph Visualization with PyVis
nt = Network(height="600px", width="100%", notebook=True, cdn_resources="remote")
nt.from_nx(G)
nt.show("knowledge_graph.html")  # Opens in browser

# Save the subgraph for future use using GraphML (widely supported format)
nx.write_graphml(G, "small_knowledge_graph.graphml")

print("Visualization saved as 'knowledge_graph.html'. Open it in your browser!")
print("Graph saved as 'small_knowledge_graph.graphml'.")


In [None]:
G = nx.read_graphml("small_knowledge_graph.graphml")

In [None]:
from IPython.display import IFrame

nt.show("knowledge_graph.html")
display(IFrame("knowledge_graph.html", width=900, height=600))

In [None]:
pip install igraph

# Visualizing Degree Centrality, Betweeness Centrality, Closeness Centrality

In [None]:
 # Create a graph-tool Graph
G_gt = Graph(directed=False)

# Create a mapping of node names to graph-tool vertices
node_map = {}  
for node in pd.concat([df["x_name"], df["y_name"]]).unique():
    node_map[node] = G_gt.add_vertex()

# Add edges to the graph
for _, row in df.iterrows():
    G_gt.add_edge(node_map[row["x_name"]], node_map[row["y_name"]])

# Compute Degree Centrality
degree_centrality = G_gt.degree_property_map("total")

# Compute Betweenness Centrality
betweenness_centrality, _ = betweenness(G_gt)

# Compute Closeness Centrality
closeness_centrality = closeness(G_gt)

# Convert results into a DataFrame
centrality_df = pd.DataFrame({
    "node": list(node_map.keys()),
    "degree_centrality": [degree_centrality[v] for v in G_gt.vertices()],
    "betweenness_centrality": [betweenness_centrality[v] for v in G_gt.vertices()],
    "closeness_centrality": [closeness_centrality[v] for v in G_gt.vertices()]
})

# Save centrality results
centrality_df.to_csv("graph_tool_centrality.csv", index=False)

In [21]:
def visualize_centrality(df, centrality_df, centrality_type, filename):
    """
    Function to visualize a graph using PyVis based on the selected centrality measure.

    Parameters:
    - df: DataFrame containing edge list (filtered_primekg.csv)
    - centrality_df: DataFrame containing centrality scores (graph_tool_centrality.csv)
    - centrality_type: String, one of ["degree_centrality", "betweenness_centrality", "closeness_centrality"]
    - filename: String, name of the output HTML file for visualization

    Returns:
    - Saves an interactive HTML graph colored based on the selected centrality measure.
    """

    # Sample only 100 relationships for visualization
    sample_df = df.sample(n=1000, random_state=42)

    # Initialize PyVis Network
    net = Network(height="600px", width="100%", notebook=True, cdn_resources="remote")

    # Ensure only nodes present in centrality_df are included
    centrality_nodes = set(centrality_df["node"])

    # Define distinct color schemes for each centrality measure
    if centrality_type == "degree_centrality":
        high_color, mid_color, low_color = "#008080", "#90EE90", "#000080"  # Teal, Light Green, Navy
    elif centrality_type == "betweenness_centrality":
        high_color, mid_color, low_color = "#800080", "#E6E6FA", "#ADD8E6"  # Purple, Lavender, Light Blue
    elif centrality_type == "closeness_centrality":
        high_color, mid_color, low_color = "#4682B4", "#87CEEB", "#D3D3D3"  # Steel Blue, Sky Blue, Light Gray
    else:
        raise ValueError("Invalid centrality type. Choose from: 'degree_centrality', 'betweenness_centrality', 'closeness_centrality'.")

    # Add nodes with color based on the selected centrality type
    for _, row in sample_df.iterrows():
        node1, node2 = row["x_name"], row["y_name"]

        # Default colors (gray if missing in centrality_df)
        color1, color2 = "gray", "gray"

        if node1 in centrality_nodes:
            centrality_value1 = centrality_df.loc[centrality_df["node"] == node1, centrality_type].values[0]
            color1 = high_color if centrality_value1 > 0.02 else mid_color if centrality_value1 > 0.005 else low_color

        if node2 in centrality_nodes:
            centrality_value2 = centrality_df.loc[centrality_df["node"] == node2, centrality_type].values[0]
            color2 = high_color if centrality_value2 > 0.02 else mid_color if centrality_value2 > 0.005 else low_color

        net.add_node(node1, title=f"{centrality_type}: {centrality_value1:.4f}", color=color1)
        net.add_node(node2, title=f"{centrality_type}: {centrality_value2:.4f}", color=color2)

        net.add_edge(node1, node2)

    # Add force-directed physics solver for better visualization
    net.set_options("""
    var options = {
      "nodes": {
        "shape": "dot",
        "size": 20
      },
      "physics": {
        "solver": "forceAtlas2Based"
      }
    }
    """)

    # Save visualization
    net.show(filename)
    print(f"Graph visualization saved as {filename}")

In [22]:
visualize_centrality(df, centrality_df, "degree_centrality", "degree_centrality_network.html")
net.show("degree_centrality_network.html")

degree_centrality_network.html
Graph visualization saved as degree_centrality_network.html
degree_centrality_network.html


In [23]:
visualize_centrality(df, centrality_df, "betweenness_centrality", "betweenness_centrality_network.html")
print(f"Graph visualization saved as betweeness_centrality.html")
# Save visualization
net.show("betweeness_centrality_network.html")

betweenness_centrality_network.html
Graph visualization saved as betweenness_centrality_network.html
Graph visualization saved as betweeness_centrality.html
betweeness_centrality_network.html


In [24]:
visualize_centrality(df, centrality_df, "closeness_centrality", "closeness_centrality_network.html")
print(f"Graph visualization saved as closeness_centrality.html")
# Save visualization
net.show("closeness_centrality_network.html")

closeness_centrality_network.html
Graph visualization saved as closeness_centrality_network.html
Graph visualization saved as closeness_centrality.html
closeness_centrality_network.html
