In [1]:
import json
import networkx as nx
import re
import os

In [5]:
# Input files, given as paths relative to the current working directory.
# DATA_FILE is loaded into `node_data`, which is iterated as node ID -> text content.
DATA_FILE = 'Data/node_text_data.json'
# MAPPING_FILE is expected to contain a top-level 'url_to_id' object (URL -> node ID).
MAPPING_FILE = 'Data/url_id_mapping.json'

In [3]:
def load_json_file(filename):
    """Read ``filename`` as UTF-8 and return its parsed JSON content.

    Failures are reported on stdout rather than raised.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the path does not exist,
        the content is not valid JSON, or any other error occurs while
        reading.
    """
    if os.path.exists(filename):
        try:
            with open(filename, 'r', encoding='utf-8') as handle:
                parsed = json.load(handle)
        except json.JSONDecodeError as decode_err:
            print(f"Error decoding JSON from '{filename}': {decode_err}")
        except Exception as read_err:
            # Deliberately broad: any read problem is reported, not raised.
            print(f"An error occurred while reading '{filename}': {read_err}")
        else:
            return parsed
        return None

    print(f"Error: File '{filename}' not found.")
    return None

In [4]:
# Matches a Stack Overflow question URL: the numeric question ID, optionally
# followed by a title slug, query string or fragment. The tail stops at
# whitespace, quotes and closing brackets so surrounding markup is not captured.
_SO_QUESTION_URL_RE = re.compile(
    r'https?://stackoverflow\.com/questions/\d+(?:[/?#][^\s\'\"\)\]\}]*)?'
)


def extract_stackoverflow_urls(text):
    """Extract Stack Overflow question URLs from free text.

    Recognises ``http(s)://stackoverflow.com/questions/<id>`` with or without
    a trailing ``/<slug>``, query string or fragment. Trailing sentence
    punctuation (``. , ; : ! ?``) is stripped from each match so that a link
    at the end of a sentence can still be compared against known URLs.

    Args:
        text: The text to scan. Anything that is not a non-empty string
            yields an empty result instead of raising.

    Returns:
        list[str]: The matched URLs in order of appearance (duplicates kept).
    """
    if not isinstance(text, str) or not text:
        return []
    return [match.rstrip('.,;:!?') for match in _SO_QUESTION_URL_RE.findall(text)]

In [6]:
print("Loading data...")
# Load the node text data (node ID -> text) and the URL/ID mapping file.
node_data = load_json_file(DATA_FILE)
url_to_id_mapping = load_json_file(MAPPING_FILE)

# load_json_file has already printed the specific cause and returned None.
# Stop here so later cells do not fail with an unrelated TypeError on None.
if node_data is None or url_to_id_mapping is None:
    raise RuntimeError("Failed to load necessary data files. Exiting.")

print("Building reverse mapping (ID to URL)...")
# The mapping file must expose its URL -> ID table under the 'url_to_id' key;
# fail with an explicit message instead of a bare KeyError on the next line.
if 'url_to_id' not in url_to_id_mapping:
    raise KeyError(f"Error: Key 'url_to_id' not found in {MAPPING_FILE}")

# Reverse lookup (ID -> URL). If several URLs share an ID, the last one wins.
id_to_url_mapping = {v: k for k, v in url_to_id_mapping['url_to_id'].items()}

Loading data...
Building reverse mapping (ID to URL)...


In [7]:
print("Creating graph...")
# Directed graph: an edge A -> B means the text of node A contains a URL that
# maps to node B.
G = nx.DiGraph()

print("Adding nodes...")
# Graph nodes are keyed by the node_data keys (strings, since JSON object keys
# always decode to str). The reverse mapping is looked up with the integer form
# of the key; a key that cannot be converted is reported and skipped.
for node_id, text_content in node_data.items():
    try:
        node_url = id_to_url_mapping.get(int(node_id))
    except (TypeError, ValueError):
        node_url = None
    if node_url:
        G.add_node(node_id, url=node_url, text=text_content)
    else:
        print(f"Warning: No URL found for node ID {node_id}. Skipping node.")


print("Attempting to add edges based on explicit URLs...")
url_to_id = url_to_id_mapping['url_to_id']
# Counts distinct edges actually inserted, not the number of links seen.
edges_added = 0
# Snapshot the node view so the graph is never mutated while being iterated.
for source_id, data in list(G.nodes(data=True)):
    text = data.get('text')
    if not isinstance(text, str):
        continue
    for url in extract_stackoverflow_urls(text):
        target_id = url_to_id.get(url)
        if target_id is None:
            # URL is not one of the known pages.
            continue
        # Mapping values are looked up as ints elsewhere, while graph nodes are
        # keyed by strings: normalise to str or has_node() can never match.
        target_id_str = str(target_id)
        if not G.has_node(target_id_str):
            print(f"Warning: Found URL {url} mapping to ID {target_id_str}, but node not in graph.")
            continue
        # Skip self-loops and links already recorded for this pair.
        if source_id != target_id_str and not G.has_edge(source_id, target_id_str):
            G.add_edge(source_id, target_id_str)
            edges_added += 1

Creating graph...
Adding nodes...
Attempting to add edges based on explicit URLs...


In [8]:
print("\n--- Graph Inspection ---")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges found (based on simple URL extraction): {G.number_of_edges()}")

# Preview at most three nodes: ID, URL and the first 100 characters of text.
print("\nSample Node Data:")
node_list = list(G.nodes())
for node_id in node_list[:3]:
    attrs = G.nodes[node_id]
    print(f"\nNode ID: {node_id}")
    print(f"  URL: {attrs.get('url', 'N/A')}")
    text_preview = attrs.get('text', '')[:100]
    print(f"  Text Preview: {text_preview}...")

# Preview at most three edges as "source -> target".
print("\nSample Edges:")
edge_list = list(G.edges())
for src, dst in edge_list[:3]:
    print(f"  {src} -> {dst}")

print("\n--- Further Analysis Suggestions ---")
for suggestion in (
    "- Improve edge detection using NLP (e.g., linking based on mentioned question titles/IDs).",
    "- Analyze node centrality (degree, betweenness, etc.).",
    "- Visualize the graph (requires matplotlib or other plotting libraries).",
    "- Perform community detection.",
):
    print(suggestion)


--- Graph Inspection ---
Number of nodes: 4168
Number of edges found (based on simple URL extraction): 0

Sample Node Data:

Node ID: 3558
  URL: https://stackoverflow.com/documentation/ios/topics
  Text Preview: We have shut down Stack Overflow Documentation. Documentation was our attempt at improving existing ...

Node ID: 1245
  URL: https://stackoverflow.com/?ref=blog.krugazor.eu
  Text Preview: Every tab open to Stack Overflow. has aFor over 15 years we’ve been the Q&A platform of choice that ...

Node ID: 866
  URL: https://stackoverflow.com/help/badges/680
  Text Preview: Tags Awarded Mar 15, 2019 at 4:03 to Awarded May 22, 2017 at 4:03 to Awarded Mar 22, 2016 at 4:03 to...

Sample Edges:

--- Further Analysis Suggestions ---
- Improve edge detection using NLP (e.g., linking based on mentioned question titles/IDs).
- Analyze node centrality (degree, betweenness, etc.).
- Visualize the graph (requires matplotlib or other plotting libraries).
- Perform community detection.
