## Week 2 Practical Assignment: Exploring Real-World Networks
# Analysis of the SNAP YouTube Social Network
## Course: Model Based Decisions (2025)

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import seaborn as sns
import numpy as np
import gzip
import os
from collections import defaultdict
import seaborn as sns
from matplotlib.patches import Patch
import warnings
import math
import collections
import random



# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides potentially useful diagnostics from the
# numerical libraries imported above -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set matplotlib to display plots inline
%matplotlib inline

# Plot appearance: matplotlib's built-in 'ggplot' style sheet plus
# seaborn's "husl" colour palette.
plt.style.use('ggplot')
sns.set_palette("husl")

# One seed is shared by NumPy's global RNG, Python's `random` module and the
# `seed=` arguments passed to the NetworkX calls further down in this file.
seed = 42
np.random.seed(seed)
random.seed(seed) # Random seed for reproducible model generation

In [12]:

# #################################################################################################### #
# Network Visualization Suite Class (Adapted for YouTube Analysis from network_visualisation_suite.py) #
# #################################################################################################### #
class NetworkVisualizationSuite:
    """
    Analysis suite comparing the SNAP YouTube graph with ER, WS and BA models.

    The suite is meant for graphs with more than a million nodes, so every
    path-dependent metric (average path length, betweenness, closeness) is
    estimated from a random sample of nodes instead of being computed exactly.

    Typical call order: ``load_youtube_network`` ->
    ``generate_theoretical_networks`` -> ``analyze_network_properties`` ->
    ``run_centrality_analysis`` -> ``print_summary_statistics``.

    The seeded NetworkX calls read the module-level ``seed`` variable.
    """

    def __init__(self):
        """Initialize empty result stores and the sampling configuration."""
        self.networks = {}                # network name -> networkx graph
        self.network_stats = {}           # network name -> dict of computed metrics
        self.youtube_url = "https://snap.stanford.edu/data/com-youtube.ungraph.txt.gz"
        self.real_network_key = 'YouTube'
        self.sampling_k = 1000            # Sample size for path-dependent metrics on the real network
        self.model_sampling_k = 250       # Sample size for path length on the theoretical models

    # -------------------------------------------------------------------------------- #
    # Internal helpers
    # -------------------------------------------------------------------------------- #
    @staticmethod
    def _sample_nodes(nodes, k):
        """
        Return up to ``k`` distinct nodes drawn uniformly without replacement.

        Indices are sampled rather than the nodes themselves so that the
        returned nodes keep their original Python type (NumPy would otherwise
        coerce them into array scalars). ``k`` is clamped to the number of
        available nodes, and an empty input yields an empty list.
        """
        nodes = list(nodes)
        size = min(len(nodes), k)
        if size <= 0:
            return []
        picked = np.random.choice(len(nodes), size, replace=False)
        return [nodes[i] for i in picked]

    @staticmethod
    def _print_top_nodes(centrality_dict, name, top_n):
        """
        Print the ``top_n`` highest-scoring nodes and return the full ranking.

        Returns a list of ``(node, score)`` pairs sorted by descending score.
        Fewer than ``top_n`` rows are printed when the dict is smaller.
        """
        ranked = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
        print(f"\n--- Top {top_n} Nodes by {name} ---")
        for rank, (node, score) in enumerate(ranked[:top_n], start=1):
            print(f"  {rank}. Node {node}: {score:.4f}")
        return ranked

    @staticmethod
    def _format_stat(stats, key, spec, nan_as_na=False):
        """
        Format ``stats[key]`` with format spec ``spec`` for the summary table.

        A missing key yields ``"N/A"``. When ``nan_as_na`` is true a NaN value
        also yields ``"N/A"``; otherwise NaN is formatted as-is (``"nan"``).
        """
        value = stats.get(key)
        if value is None:
            return "N/A"
        if nan_as_na and np.isnan(value):
            return "N/A"
        return format(value, spec)

    # -------------------------------------------------------------------------------- #
    # 1. Load Real YouTube Network
    # -------------------------------------------------------------------------------- #
    def load_youtube_network(self):
        """
        Download the SNAP YouTube edge list and return its largest connected component.

        The graph is also stored in ``self.networks`` under
        ``self.real_network_key``.
        """
        print("--- 1. Data Loading ---")

        # Tab-separated edge list; lines starting with '#' are header comments.
        youtube_df = pd.read_csv(
            self.youtube_url,
            compression="gzip",
            sep="\t",
            comment="#",
            names=["start_node", "end_node"],
        )

        G = nx.from_pandas_edgelist(youtube_df, "start_node", "end_node")

        if nx.is_connected(G):
            # G is local to this method, so it can be used directly; copying
            # a graph of this size would only double peak memory.
            G_lcc = G
            print("Graph is fully connected. Proceeding with the full graph.")
        else:
            # Fall back to the largest connected component.
            largest_cc_nodes = max(nx.connected_components(G), key=len)
            G_lcc = G.subgraph(largest_cc_nodes).copy()

        print(f"Loaded YouTube network: {G_lcc.number_of_nodes():,} nodes, {G_lcc.number_of_edges():,} edges\n")
        self.networks[self.real_network_key] = G_lcc
        return G_lcc

    # -------------------------------------------------------------------------------- #
    # 2. Generate Theoretical Network Models (ER, WS, BA)
    # -------------------------------------------------------------------------------- #
    def generate_theoretical_networks(self, G_lcc):
        """
        Generate ER, WS and BA graphs matched to ``G_lcc`` and store them.

        All three models use the node count of ``G_lcc``; their density
        parameters are derived from its average degree.

        Raises:
            ValueError: if ``G_lcc`` has fewer than two nodes, because the
                average degree and edge probability are then undefined.
        """
        n = G_lcc.number_of_nodes()
        e = G_lcc.number_of_edges()
        if n < 2:
            raise ValueError(
                f"Need at least 2 nodes to fit reference models, got {n}."
            )
        avg_deg = (2 * e) / n

        print("--- 2. Model Generation ---")

        # 1. Erdős-Rényi (ER) model. fast_gnp_random_graph samples the same
        #    G(n, p) distribution as erdos_renyi_graph but in O(n + m) expected
        #    time instead of O(n^2), which matters for sparse million-node graphs.
        p_er = avg_deg / (n - 1)
        er_graph = nx.fast_gnp_random_graph(n, p_er, seed=seed)

        # 2. Watts-Strogatz (small-world). NetworkX joins each node to k - 1
        #    neighbours when k is odd, so round k up to an even value.
        k_ws = max(4, int(round(avg_deg)))
        if k_ws % 2:
            k_ws += 1
        p_ws = 0.1
        ws_graph = nx.watts_strogatz_graph(n, k_ws, p_ws, seed=seed)

        # 3. Barabási-Albert (scale-free): each new node adds m edges, so m ≈ <k> / 2.
        m_ba = max(2, int(round(avg_deg / 2)))
        ba_graph = nx.barabasi_albert_graph(n, m_ba, seed=seed)

        self.networks.update({
            "Erdős-Rényi": er_graph,
            "Watts-Strogatz": ws_graph,
            "Barabási-Albert": ba_graph
        })

        print(f"Theoretical networks generated successfully (ER p={p_er:.8f}, WS k={k_ws}, BA m={m_ba})!\n")

    # -------------------------------------------------------------------------------- #
    # 3. Approximate Average Path Length (Sampling)
    # -------------------------------------------------------------------------------- #
    def _approximate_avg_path_length(self, G, k):
        """
        Estimate the average shortest path length from ``k`` sampled sources.

        For each sampled source, the distances to every node reachable from it
        are averaged; unreachable nodes are simply absent from the result.
        Returns NaN when no source/target pair could be measured.
        """
        total_path_length = 0
        num_paths = 0

        for source in self._sample_nodes(G.nodes(), k):
            # Single-source query: returns {target: distance} for reachable
            # targets only, so there is no "no path" error to handle here.
            lengths = nx.shortest_path_length(G, source=source)
            # The source itself is included with distance 0: it adds nothing
            # to the sum and is excluded from the pair count.
            total_path_length += sum(lengths.values())
            num_paths += len(lengths) - 1

        if num_paths > 0:
            return total_path_length / num_paths
        return float('nan')

    # -------------------------------------------------------------------------------- #
    # 4. Analyze Network Properties (Structural Metrics)
    # -------------------------------------------------------------------------------- #
    def analyze_network_properties(self):
        """
        Compute structural metrics for every stored network.

        Results are written to ``self.network_stats[name]``. Clustering and
        assortativity are exact; average path length is sampled, using
        ``self.sampling_k`` sources for the real network and
        ``self.model_sampling_k`` for the theoretical models.
        """
        print("--- 3. Analyzing Network Properties ---")

        for name, G in self.networks.items():
            print(f"Analyzing {name} network...")

            stats = {
                'nodes': G.number_of_nodes(),
                'edges': G.number_of_edges(),
                'avg_degree': np.mean([d for _, d in G.degree()]),
                'avg_clustering': nx.average_clustering(G),
                'assortativity': nx.degree_assortativity_coefficient(G),
            }

            if name == self.real_network_key:
                k_sample = self.sampling_k
            else:
                k_sample = self.model_sampling_k

            # Best effort: a failed path-length estimate must not discard the
            # metrics already computed for this network.
            try:
                stats['avg_path_length'] = self._approximate_avg_path_length(G, k_sample)
            except Exception as e:
                print(f"  Warning: Path length calculation failed for {name}: {e}")
                stats['avg_path_length'] = np.nan

            self.network_stats[name] = stats

        print("\nAnalysis of network properties complete.\n")

    # -------------------------------------------------------------------------------- #
    # 5. Centrality Analysis (with sampling)
    # -------------------------------------------------------------------------------- #
    def run_centrality_analysis(self, top_n=5):
        """
        Compute degree, eigenvector, betweenness and closeness centrality.

        Runs on the real network only. Degree and eigenvector centrality are
        exact; betweenness uses sampled sources and closeness is evaluated for
        a sample of nodes. The average and maximum of each measure are merged
        into ``self.network_stats`` for the real network; a measure that fails
        is recorded as NaN.

        Args:
            top_n: number of top-ranked nodes to print per measure.
        """
        G_lcc = self.networks[self.real_network_key]
        print(f"\n--- 4. Centrality Analysis (Top {top_n} Nodes) ---")

        # Never ask for more samples than there are nodes.
        k_sample = min(self.sampling_k, G_lcc.number_of_nodes())
        centrality_summary = {}

        def record(prefix, centrality, label):
            """Print the ranking for one measure and store its avg/max."""
            ranked = self._print_top_nodes(centrality, label, top_n)
            centrality_summary[f"{prefix}_avg"] = np.mean(list(centrality.values()))
            centrality_summary[f"{prefix}_max"] = ranked[0][1] if ranked else np.nan

        def mark_failed(prefix):
            """Record a measure that could not be computed."""
            centrality_summary[f"{prefix}_avg"] = np.nan
            centrality_summary[f"{prefix}_max"] = np.nan

        # 1. Degree Centrality (exact)
        record("deg", nx.degree_centrality(G_lcc), "Degree Centrality")

        # 2. Eigenvector Centrality (exact). The power iteration is
        #    deterministic and the function takes no ``seed`` argument.
        try:
            eig_cent = nx.eigenvector_centrality(G_lcc, max_iter=1000)
            record("eig", eig_cent, "Eigenvector Centrality")
        except nx.NetworkXException as e:
            print(f"\nCould not compute Eigenvector Centrality: {e}")
            mark_failed("eig")

        # 3. Betweenness Centrality (approximate, using k sampled sources)
        print(f"\n--- Betweenness Centrality (Computed via Sampling, k={k_sample}) ---")
        try:
            bet_cent = nx.betweenness_centrality(G_lcc, k=k_sample, seed=seed)
            record("bet", bet_cent, "Betweenness Centrality")
        except Exception as e:
            print(f"  Calculation failed: {e}")
            mark_failed("bet")

        # 4. Closeness Centrality (exact values, but only for k sampled nodes)
        print(f"\n--- Closeness Centrality (Computed via Sampling, k={k_sample}) ---")
        try:
            sampled = self._sample_nodes(G_lcc.nodes(), self.sampling_k)
            clo_cent = {node: nx.closeness_centrality(G_lcc, u=node) for node in sampled}
            record("clo", clo_cent, "Closeness Centrality")
        except Exception as e:
            print(f"  Calculation failed: {e}")
            mark_failed("clo")

        # setdefault: works even if analyze_network_properties() has not run yet.
        self.network_stats.setdefault(self.real_network_key, {}).update(centrality_summary)
        print("\n--- Centrality Analysis Complete ---\n")

    # -------------------------------------------------------------------------------- #
    # 6. Print Summary Table (Includes Centrality Measures)
    # -------------------------------------------------------------------------------- #
    def print_summary_statistics(self):
        """
        Print one table row per analysed network.

        Centrality columns show "N/A" for networks where the measure is
        missing or NaN (centralities are only computed for the real network).
        """
        centrality_columns = [
            ("Avg Degree Cent.", "deg_avg"),
            ("Max Degree Cent.", "deg_max"),
            ("Avg Eigenvector Cent.", "eig_avg"),
            ("Max Eigenvector Cent.", "eig_max"),
            ("Avg Betweenness Cent.", "bet_avg"),
            ("Max Betweenness Cent.", "bet_max"),
            ("Avg Closeness Cent.", "clo_avg"),
            ("Max Closeness Cent.", "clo_max"),
        ]
        fmt = self._format_stat

        stats_data = []
        for name, stats in self.network_stats.items():
            row = {
                "Network": name,
                "Nodes": fmt(stats, 'nodes', ","),
                "Edges": fmt(stats, 'edges', ","),
                "Avg. Clustering": fmt(stats, 'avg_clustering', ".4f"),
                "Avg. Path Length": fmt(stats, 'avg_path_length', ".2f"),
                "Assortativity": fmt(stats, 'assortativity', ".4f"),
            }
            for column, key in centrality_columns:
                row[column] = fmt(stats, key, ".4f", nan_as_na=True)
            stats_data.append(row)

        stats_df = pd.DataFrame(stats_data)
        print("\n--- Summary Statistics Table ---")
        print(stats_df.to_string(index=False))


# -------------------------------------------------------------------------------- #
# MAIN FUNCTION
# -------------------------------------------------------------------------------- #
def main_analysis():
    """Run the complete pipeline: load, model, analyse, rank, report."""
    banner_rule = "=" * 60
    print("Network Visualization Suite for Week Two - Networks")
    print(banner_rule)

    suite = NetworkVisualizationSuite()

    # Step 1: fetch the real YouTube graph (largest connected component).
    youtube_lcc = suite.load_youtube_network()

    # Step 2: build the ER / WS / BA reference graphs sized to match it.
    suite.generate_theoretical_networks(youtube_lcc)

    # Step 3: structural metrics for the real graph and every model.
    suite.analyze_network_properties()

    # Step 4: centrality measures on the real graph (sampled where needed).
    suite.run_centrality_analysis()

    # Step 5: comparative summary table.
    suite.print_summary_statistics()


# -------------------------------------------------------------------------------- #
# RUN SCRIPT
# -------------------------------------------------------------------------------- #
# Entry point: run the full pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main_analysis()

SyntaxError: invalid syntax (3780400585.py, line 1)