In [32]:
import pandas as pd

# Load the pedigree table (tab-separated). The code below reads its
# 'LineName', 'MaleParent' and 'FemaleParent' columns.
df = pd.read_csv('PedInf.txt', delimiter='\t')

# Create edge list: one (parent, child) tuple for every non-missing parent.
# The three columns are zipped instead of looping with DataFrame.iterrows(),
# which builds a Series per row (slow, and values may be coerced to a common
# dtype). Edge order matches a row-by-row loop: for each line, the male
# parent edge comes first, then the female parent edge.
edges = [
    (parent, child)
    for male, female, child in zip(df['MaleParent'], df['FemaleParent'], df['LineName'])
    for parent in (male, female)
    if pd.notna(parent)  # a missing parent contributes no edge
]

edges_df = pd.DataFrame(edges, columns=["Parent", "Child"])

# Save edges to CSV for loading into cuGraph
edges_df.to_csv("edges.csv", index=False)


In [19]:
#Undirected Louvain Community Detection
import cudf
import cugraph

# Load the edges into cuDF
# (the graph below is built from the 'Parent' and 'Child' columns of edges.csv)
edges_df = cudf.read_csv("edges.csv")

# Create an undirected graph directly
# Parent -> Child direction is ignored here because directed=False.
G = cugraph.Graph(directed=False)
G.from_cudf_edgelist(edges_df, source='Parent', destination='Child')

# Perform Louvain community detection
# The result is unpacked into a per-vertex partition table and a modularity
# value; `modularity` is not used further in this cell.
louvain_parts, modularity = cugraph.louvain(G)
# Convert to pandas so the table can be printed below.
louvain_parts = louvain_parts.to_pandas()

# Display the results
# NOTE(review): head(10) shows the first ten rows as returned, not the ten
# largest communities.
print("Louvain Community Detection")
print(louvain_parts.head(10))




Louvain Community Detection
   partition      vertex
0         27   CP95-1987
1          4   CP06-2707
2          3   CP69-0391
3          6   CP13-1137
4          6   CP10-1246
5         25   CP17-1729
6         26   CP07-1183
7         17   CP91-1034
8          3  LCP85-0322
9          9   CP12-1176


In [28]:
import cudf
import cugraph

# Load the edges into cuDF
# (the graph below is built from the 'Parent' and 'Child' columns of edges.csv)
edges_df = cudf.read_csv("edges.csv")

# Create a directed graph
# Every edge points from Parent (source) to Child (destination).
G = cugraph.Graph(directed=True)
G.from_cudf_edgelist(edges_df, source='Parent', destination='Child')

# Compute PageRank
# Called with library defaults; the result is converted to pandas for
# printing and for the CSV export in the following cell.
pagerank_scores = cugraph.pagerank(G)
pagerank_scores = pagerank_scores.to_pandas()

# Compute Betweenness Centrality
# Called with library defaults.
betweenness_centrality = cugraph.betweenness_centrality(G)
betweenness_centrality = betweenness_centrality.to_pandas()

# Compute Katz Centrality
# Called with library defaults.
katz_centrality = cugraph.katz_centrality(G)
katz_centrality = katz_centrality.to_pandas()

# Compute Strongly Connected Components (SCC)
# NOTE(review): if the pedigree has no directed cycles, every vertex is
# presumably its own component - confirm this metric is informative here.
scc_df = cugraph.strongly_connected_components(G)
scc_df = scc_df.to_pandas()

# Display the results
# NOTE(review): each head(10) prints the first ten rows in whatever order the
# table was returned, not the ten highest-scoring vertices; sort by the score
# column first if a "top 10" is intended.
print("PageRank Scores")
print(pagerank_scores.head(10))

print("Betweenness Centrality")
print(betweenness_centrality.head(10))

print("Katz Centrality")
print(katz_centrality.head(10))

print("Strongly Connected Components (SCC)")
print(scc_df.head(10))




PageRank Scores
   pagerank     vertex
0   0.00002  CP97-1183
1   0.00002  CP14-1305
2   0.00002  CP02-2443
3   0.00002  CP19-4053
4   0.00002  CP12-1041
5   0.00002  CP03-1662
6   0.00002  CP14-2504
7   0.00002  CP12-2407
8   0.00002  CP04-1623
9   0.00002  CP13-1558
Betweenness Centrality
   betweenness_centrality     vertex
0                     0.0  CP07-2572
1                     0.0  CP01-2617
2                     0.0  CP97-1173
3                     0.0  CP95-1786
4                     0.0  CP04-1481
5                     0.0  CP95-1984
6                     0.0  CP00-1161
7                     0.0  CP01-2669
8                     0.0  CP14-4600
9                     0.0  CP17-2065
Katz Centrality
   katz_centrality       vertex
0         0.004622    CP02-2112
1         0.004622    CP15-2144
2         0.004622       US1639
3         0.004622    CP92-1375
4         0.004622    CP98-1052
5         0.004622  CPCL14-4056
6         0.004622   US66-116-1
7         0.004622    CP14-16

In [29]:
# Write every metric table to its own CSV file, without the index column.
# Order: PageRank, betweenness centrality, Katz centrality, SCC.
_exports = (
    ("pagerank_scores.csv", pagerank_scores),
    ("betweenness_centrality.csv", betweenness_centrality),
    ("katz_centrality.csv", katz_centrality),
    ("strongly_connected_components.csv", scc_df),
)
for _csv_name, _table in _exports:
    _table.to_csv(_csv_name, index=False)


In [47]:
import networkx as nx
import pandas as pd

# Load the edges into a pandas DataFrame
# (the 'Parent' and 'Child' columns are used as edge endpoints below)
edges_df = pd.read_csv("edges.csv")

# Create a directed NetworkX graph from the edge list
# Each row becomes one edge pointing from Parent to Child.
G_nx = nx.from_pandas_edgelist(edges_df, source='Parent', target='Child', create_using=nx.DiGraph())

def count_paths_and_steps(G, start_node, end_node):
    """Count the directed simple paths from start_node to end_node and their mean length.

    The graph is explored depth-first through ``G.successors``. A node that is
    already on the path currently being explored is never entered again, so
    cycles cannot loop forever and only simple paths are counted. The search
    keeps its own stack instead of recursing, so very long chains do not hit
    Python's recursion limit.

    Every simple path leaving ``start_node`` is walked individually, so the
    running time grows with the number of such paths.

    Args:
        G: Directed graph object exposing ``successors(node)``
            (for example a ``networkx.DiGraph``).
        start_node: Node the paths start from.
        end_node: Node the paths must end at.

    Returns:
        A ``(path_count, average_steps)`` tuple. ``average_steps`` is the mean
        number of edges over all counted paths, or ``float('inf')`` when no
        path exists. When ``start_node == end_node`` the result is ``(1, 0.0)``.

    Raises:
        Whatever ``G.successors`` raises for a node it does not know.
    """
    # A node trivially reaches itself: one path with zero steps.
    if start_node == end_node:
        return 1, 0.0

    exhausted = object()  # sentinel: the node on top has no successors left
    path_count = 0
    total_steps = 0
    on_path = {start_node}  # nodes on the path currently being explored
    stack = [(start_node, iter(G.successors(start_node)))]

    while stack:
        node, pending = stack[-1]
        neighbor = next(pending, exhausted)
        if neighbor is exhausted:
            # Backtrack: this node may be visited again via another path.
            stack.pop()
            on_path.remove(node)
        elif neighbor == end_node:
            # len(stack) is the number of edges from start_node to neighbor.
            path_count += 1
            total_steps += len(stack)
        elif neighbor not in on_path:
            on_path.add(neighbor)
            stack.append((neighbor, iter(G.successors(neighbor))))

    average_steps = total_steps / path_count if path_count > 0 else float('inf')
    return path_count, average_steps

# Example usage: Count the number of paths and average steps between two nodes
# Paths are counted in the Parent -> Child direction of G_nx, from start_node
# to end_node. start_node is passed to G_nx.successors, so it should be a
# node of the graph.
start_node = 'BlackCheribon'  # Replace with actual start node
end_node = 'LCP85-0384'    # Replace with actual end node
path_count, average_steps = count_paths_and_steps(G_nx, start_node, end_node)
# average_steps is float('inf') when path_count is 0.
print(f"Number of directed paths from {start_node} to {end_node}: {path_count}")
print(f"Average number of steps from {start_node} to {end_node}: {average_steps}")


Number of directed paths from BlackCheribon to LCP85-0384: 25
Average number of steps from BlackCheribon to LCP85-0384: 8.68


In [None]:
import networkx as nx
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Load the edges into a pandas DataFrame
# (the 'Parent' and 'Child' columns are used as edge endpoints below)
edges_df = pd.read_csv("edges.csv")

# Create a directed NetworkX graph from the edge list
# Each row becomes one edge pointing from Parent to Child.
G_nx = nx.from_pandas_edgelist(edges_df, source='Parent', target='Child', create_using=nx.DiGraph())

def count_paths_and_steps(G, start_node, end_node):
    """Count the directed simple paths from start_node to end_node and their mean length.

    The graph is explored depth-first through ``G.successors``. A node that is
    already on the path currently being explored is never entered again, so
    cycles cannot loop forever and only simple paths are counted. The search
    keeps its own stack instead of recursing, so very long chains do not hit
    Python's recursion limit.

    Every simple path leaving ``start_node`` is walked individually, so the
    running time grows with the number of such paths.

    Args:
        G: Directed graph object exposing ``successors(node)``
            (for example a ``networkx.DiGraph``).
        start_node: Node the paths start from.
        end_node: Node the paths must end at.

    Returns:
        A ``(path_count, average_steps)`` tuple. ``average_steps`` is the mean
        number of edges over all counted paths, or ``float('inf')`` when no
        path exists. When ``start_node == end_node`` the result is ``(1, 0.0)``.

    Raises:
        Whatever ``G.successors`` raises for a node it does not know.
    """
    # A node trivially reaches itself: one path with zero steps.
    if start_node == end_node:
        return 1, 0.0

    exhausted = object()  # sentinel: the node on top has no successors left
    path_count = 0
    total_steps = 0
    on_path = {start_node}  # nodes on the path currently being explored
    stack = [(start_node, iter(G.successors(start_node)))]

    while stack:
        node, pending = stack[-1]
        neighbor = next(pending, exhausted)
        if neighbor is exhausted:
            # Backtrack: this node may be visited again via another path.
            stack.pop()
            on_path.remove(node)
        elif neighbor == end_node:
            # len(stack) is the number of edges from start_node to neighbor.
            path_count += 1
            total_steps += len(stack)
        elif neighbor not in on_path:
            on_path.add(neighbor)
            stack.append((neighbor, iter(G.successors(neighbor))))

    average_steps = total_steps / path_count if path_count > 0 else float('inf')
    return path_count, average_steps

# Collect every vertex of the graph; the same labels index both axes below.
nodes = [*G_nx.nodes]

# Square float matrix with one row and one column per vertex, pre-filled
# with +inf. Cells that are never overwritten later keep that value.
avg_steps_matrix = pd.DataFrame(
    np.full((len(nodes), len(nodes)), np.inf),
    index=nodes,
    columns=nodes,
)

# Function to compute average steps for a pair of nodes
def compute_avg_steps(pair):
    """Return ``(start_node, end_node, avg_steps)`` for one ordered node pair.

    Args:
        pair: A ``(start_node, end_node)`` tuple of nodes in ``G_nx``.

    Returns:
        The two nodes followed by the average number of steps reported by
        ``count_paths_and_steps``. If that call raises, the failure is logged
        and ``float('inf')`` is returned as the average, so one bad pair does
        not abort the whole run.
    """
    import logging

    start_node, end_node = pair
    try:
        _, avg_steps = count_paths_and_steps(G_nx, start_node, end_node)
    except Exception as exc:
        # Best-effort: keep going, but make the failure visible instead of
        # silently recording it as "no path".
        logging.getLogger(__name__).warning(
            "count_paths_and_steps failed for %r -> %r: %r; recording inf",
            start_node, end_node, exc,
        )
        avg_steps = float('inf')
    return (start_node, end_node, avg_steps)

# Create all ordered pairs of distinct nodes.
# NOTE(review): this list holds len(nodes) * (len(nodes) - 1) tuples, which
# grows quadratically with the number of nodes.
node_pairs = [(start_node, end_node) for start_node in nodes for end_node in nodes if start_node != end_node]

# Use a smaller batch size to reduce memory and CPU load
batch_size = 100
# Ceiling division: no extra empty batch when len(node_pairs) is an exact
# multiple of batch_size (or when there are no pairs at all).
num_batches = (len(node_pairs) + batch_size - 1) // batch_size

# Process in smaller batches. One thread pool is reused for every batch
# instead of starting and tearing down a new pool per batch.
with ThreadPoolExecutor(max_workers=5) as executor:
    for i in tqdm(range(num_batches), desc="Processing batches"):
        batch_pairs = node_pairs[i*batch_size:(i+1)*batch_size]

        futures = [executor.submit(compute_avg_steps, pair) for pair in batch_pairs]
        # Drain the whole batch before submitting the next one; results are
        # written from this thread only.
        for future in as_completed(futures):
            start_node, end_node, avg_steps = future.result()
            avg_steps_matrix.at[start_node, end_node] = avg_steps

# Display the resulting matrix
# ace_tools may not be installed in every Jupyter environment; when the
# import fails, fall back to printing the frame instead of crashing.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Average Steps Matrix", dataframe=avg_steps_matrix)
else:
    print("Average Steps Matrix")
    print(avg_steps_matrix)
