In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Load clean dataset
df = pd.read_csv('cleaned_data.csv')

# Just in case: remove self-links
df = df[df['person_A'] != df['person_B']]

# Count frequency of contact (proxy for edge weight).
# Note: this count is per *ordered* pair, so (A, B) and (B, A) are separate rows here.
edge_df = df.groupby(['person_A', 'person_B']).size().reset_index(name='weight')

# Build weighted, undirected graph.
# G is undirected, so (A, B) and (B, A) refer to the same edge. Calling add_edge
# twice would overwrite the first weight with the second; accumulate instead so
# the edge weight is the total number of contacts between the two people.
G = nx.Graph()
for person_a, person_b, contact_count in edge_df.itertuples(index=False, name=None):
    if G.has_edge(person_a, person_b):
        G[person_a][person_b]['weight'] += int(contact_count)
    else:
        G.add_edge(person_a, person_b, weight=int(contact_count))

print(f"Nodes: {G.number_of_nodes()}\nEdges: {G.number_of_edges()}")

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_data.csv'

In [None]:
# Compute centrality metrics
#
# Edge 'weight' is a contact count (higher = stronger tie). Betweenness is based
# on weighted shortest paths, where NetworkX interprets the weight argument as a
# *distance*. Passing the raw count would make frequent contacts look far apart,
# so derive an inverse 'distance' attribute and use that for betweenness.
# Eigenvector centrality treats weight as connection strength, so it keeps the
# raw count. Closeness is left unweighted (hop count), as before.
nx.set_edge_attributes(
    G,
    {(u, v): 1.0 / w for u, v, w in G.edges(data='weight', default=1)},
    'distance',
)

degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G, weight='distance')
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=500, weight='weight')

# Combine into a single DataFrame (one row per node, one column per metric)
centrality_df = pd.DataFrame({
    'Degree': degree_centrality,
    'Betweenness': betweenness_centrality,
    'Closeness': closeness_centrality,
    'Eigenvector': eigenvector_centrality
}).reset_index().rename(columns={'index': 'Person'})

# Sort by degree centrality (highest first) and preview the top rows.
# Row order no longer matches G.nodes() after this sort.
centrality_df = centrality_df.sort_values(by='Degree', ascending=False)
print(centrality_df.head(10))

In [None]:
# Bar chart of the ten highest-scoring nodes, one figure per centrality metric
metrics = ['Degree', 'Betweenness', 'Closeness', 'Eigenvector']

for metric in metrics:
    top_nodes = centrality_df.nlargest(10, metric)
    person_labels = top_nodes['Person'].astype(str)
    scores = top_nodes[metric]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(person_labels, scores, color='skyblue')
    ax.set_title(f"Top 10 Nodes by {metric} Centrality")
    plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
    ax.set_ylabel(metric)
    ax.grid(alpha=0.3)
    plt.show()


NameError: name 'centrality_df' is not defined

In [None]:
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)

# Node color represents betweenness, size represents degree.
# draw_networkx applies node_size / node_color positionally, in nodelist order.
# centrality_df may be in any row order (e.g. sorted by a metric), so align it
# to an explicit node list instead of relying on its current row order.
node_order = list(G.nodes())
centrality_by_node = centrality_df.set_index('Person').reindex(node_order)
node_sizes = (centrality_by_node['Degree'] * 2000).tolist()
node_colors = centrality_by_node['Betweenness'].tolist()

nx.draw_networkx(
    G, pos,
    nodelist=node_order,
    node_color=node_colors,
    node_size=node_sizes,
    cmap='coolwarm',
    with_labels=False,
    edge_color='gray',
    alpha=0.7
)

plt.title("Network Visualization — Node Size = Degree, Color = Betweenness", fontsize=12)
plt.show()

In [None]:
# Restrict to the largest connected component (by node count); the average
# shortest path length below is computed on this subgraph only.
components = nx.connected_components(G)
largest_component = max(components, key=len)
G_sub = G.subgraph(largest_component)

# Whole-network summary statistics on that component
clustering_coef = nx.average_clustering(G_sub)
avg_path_length = nx.average_shortest_path_length(G_sub)

print(
    f"Average Clustering Coefficient: {clustering_coef:.3f}",
    f"Average Path Length: {avg_path_length:.3f}",
    sep="\n",
)


In [None]:
# Normalize metrics and compute average influence
# This is to identify consistent super-spreaders
#
# The four centralities live on different numeric scales, so a plain mean would
# be dominated by whichever metric has the largest raw values. Min-max scale
# each metric to [0, 1] first so that every metric contributes equally.
metric_cols = ['Degree', 'Betweenness', 'Closeness', 'Eigenvector']
metric_values = centrality_df[metric_cols]
metric_min = metric_values.min()
# A constant column has zero range; divide by 1 instead so it scales to 0, not NaN.
metric_range = (metric_values.max() - metric_min).replace(0, 1)
normalized_metrics = (metric_values - metric_min) / metric_range

centrality_df['Mean Influence'] = normalized_metrics.mean(axis=1)

top_influencers = centrality_df.nlargest(10, 'Mean Influence')
print(top_influencers)
