In [5]:
import pandas as pd

# Read the dataset
data = pd.read_csv('p2p-Gnutella04.txt', sep='\t', comment='#', header=None, names=['FromNodeId', 'ToNodeId'])

# Find all nodes with outgoing edges
outgoing_nodes = set(data['FromNodeId'])

# Find all nodes 
all_nodes = set(data['FromNodeId']).union(set(data['ToNodeId']))

# Find sinks
sinks = sorted(list(all_nodes - outgoing_nodes))

# Write the result to sinks.csv
sinks_df = pd.DataFrame(sinks, columns=['SinkNodeId'])
sinks_df.to_csv('sinks.csv', index=False)

In [2]:
import pandas as pd

# Parameters
damping_factor = 0.85
num_iterations = 10

# Read the dataset
data = pd.read_csv('p2p-Gnutella04.txt', sep='\t', comment='#', header=None, names=['FromNodeId', 'ToNodeId'])

# Identify all nodes
all_nodes = set(data['FromNodeId']).union(set(data['ToNodeId']))
num_nodes = len(all_nodes)

# Initialize PageRank values
pagerank = {node: 1 / num_nodes for node in all_nodes}

# Create dictionaries for outgoing and incoming links for node
outgoing_links = {node: [] for node in all_nodes}
incoming_links = {node: [] for node in all_nodes}

for _, row in data.iterrows():
    outgoing_links[row['FromNodeId']].append(row['ToNodeId'])

    incoming_links[row['ToNodeId']].append(row['FromNodeId'])

# Function to remove sinks and update the edge list
def remove_sinks(edges, all_nodes):
    
    sinks = [node for node in all_nodes if not outgoing_links[node]]
    
    for sink in sinks:
        if sink in all_nodes:
            all_nodes.remove(sink)
    
        if sink in outgoing_links:
            outgoing_links.pop(sink)
        
        if sink in incoming_links:
            incoming_links.pop(sink)
 

    edges_no_sinks = [edge for edge in edges if edge['FromNodeId'] in all_nodes and edge['ToNodeId'] in all_nodes]
    
    return edges_no_sinks

# Remove sinks and update edges
edges_no_sinks = remove_sinks(data.to_dict('records'), all_nodes)

# Assgining scores
def page_rank(edges, all_nodes):
    scores = {node: 1.0 / len(all_nodes) for node in all_nodes}
    
    incoming_links = {node: [] for node in all_nodes}
    
    for edge in edges:
        source = edge['FromNodeId']
        destination = edge['ToNodeId']
        incoming_links[destination].append(source)
    
    for _ in range(num_iterations):
        new_scores = {}
    
        for node in all_nodes:
            incoming_sum = sum(scores[incoming] / len(outgoing_links[incoming]) for incoming in incoming_links[node])
            new_scores[node] = (1 - damping_factor) / len(all_nodes) + damping_factor * incoming_sum
        scores = new_scores
    
    return scores

# Compute PageRank scores
pagerank_scores = page_rank(edges_no_sinks, all_nodes)

# Sort the scores by PageRank value in descending order
sorted_pagerank = sorted(pagerank_scores.items(), key=lambda item: item[1], reverse=True)

# Convert to DataFrame
pagerank_df = pd.DataFrame(sorted_pagerank, columns=['NodeId', 'PageRank'])

# Write the result to PR_results.csv
pagerank_df.to_csv('PR_results.csv', index=False)

# Output the first few rows for verification
print(pagerank_df.head())


   NodeId  PageRank
0    1054  0.000367
1    1536  0.000304
2     171  0.000301
3     453  0.000290
4     407  0.000282
