## Social Network Analysis Project 12

Mapping Covid-19 Vaccine Discussions in a Blog Forum

In [None]:
import matplotlib.pyplot as plt
import numpy as np

To use this notebook, you need to dump the forum thread and scrape the post data using the following:

```
$ python3 html_dump.py
$ python3 scrape.py
```

In [None]:
import pickle

# Load the scraped post data. The rest of the notebook iterates over post_data
# and reads the keys 'location', 'text', 'quotes', 'username' and 'reputation'
# from each entry.
# NOTE(review): presumably dump.pickle is produced by the scrape step above — confirm.
# Security: pickle.load can execute arbitrary code, so only open a dump you
# created yourself.
with open('dump.pickle', 'rb') as f:
    post_data = pickle.load(f)

### Analyze the relationship between location and number of posts

In [None]:
# Tally how many posts were written from each location
location_count = {}

for entry in post_data:
    place = entry['location']
    location_count[place] = location_count.get(place, 0) + 1

# Locations ordered from most to fewest posts (ties keep first-seen order)
sorted_locations = sorted(location_count, key=location_count.get, reverse=True)

# Rank (x) against post count (y) on log-log axes
ranks = list(range(1, len(sorted_locations) + 1))
x = np.array(ranks)
y = np.array([location_count[loc] for loc in sorted_locations])
plt.plot(x, y)
plt.xscale('log')
plt.yscale('log')
plt.title('logarithmic distribution of post locations and counts')

Taking into account the small size of the dataset, the log-log plot looks roughly linear. This suggests that the number of posts per location follows a power-law distribution.

### Analyze post lengths

In [None]:
# Word count of every post, in the same order as post_data
post_lengths = []

for entry in post_data:
    body = entry['text']
    n_chars, n_words = len(body), len(body.split())
    # Report both the character length and the whitespace-separated word count
    print(f'length: {n_chars}, words: {n_words}')
    post_lengths.append(n_words)

shortest, longest = min(post_lengths), max(post_lengths)
print(f'min: {shortest}, max: {longest}')

### Display a histogram of post lengths

In [None]:
def display_length_histogram(post_lengths, bins=30, value_range=(0, 1300)):
    """Plot a histogram of per-post word counts and print their mean.

    Parameters
    ----------
    post_lengths : sequence of numbers
        Word count of each post.
    bins : int, optional
        Number of histogram bins. Defaults to 30.
    value_range : tuple of (lower, upper) or None, optional
        Lower and upper edge of the histogram. Defaults to (0, 1300).
        Values outside this range are left out of the plot, so widen it
        (or pass None to span the data) when posts can be longer.

    Notes
    -----
    The printed mean is computed over all of ``post_lengths``, including
    any values that fall outside ``value_range``.
    """
    plt.hist(post_lengths, bins=bins, range=value_range)
    plt.title('histogram of word counts')

    print(f'mean length: {np.mean(post_lengths)}')


# Plot the word-count distribution over every post in post_data
display_length_histogram(post_lengths)

The histogram is positively skewed, with the bulk of posts being under 200 words in length, and only some outliers above 400 words, with some in between.

#### Within the top 5 regions

In [None]:
# The five locations with the most posts
top_regions = sorted_locations[:5]

# Word counts of only those posts written from one of the top locations
top_region_post_lengths = [
    len(entry['text'].split())
    for entry in post_data
    if entry['location'] in top_regions
]

display_length_histogram(top_region_post_lengths)

In [None]:
# Side by side: all posts (left) versus posts from the top locations (right),
# drawn with identical binning so the two shapes can be compared directly
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
all_posts_ax, top_regions_ax = ax
shared_hist_kwargs = dict(bins=30, range=(0, 1300))

all_posts_ax.hist(post_lengths, **shared_hist_kwargs)
all_posts_ax.set_title('histogram of word counts')
top_regions_ax.hist(top_region_post_lengths, **shared_hist_kwargs)
top_regions_ax.set_title('histogram of word counts in top regions')

Both distributions look very similar. Therefore, it seems that a post's location cannot be inferred from its length.

### Quote graph

In [None]:
import networkx as nx

# Undirected graph with one node per username and an edge between a poster
# and every other user they quoted
G = nx.Graph()

for post in post_data:
    author = post['username']
    # Ignore self-quotes
    quoted_others = [name for name in post['quotes'] if name != author]
    for quoted in quoted_others:
        G.add_edge(author, quoted)

In [None]:
# Compute the node positions first, then draw the labelled graph on a square canvas
layout = nx.kamada_kawai_layout(G)
plt.figure(figsize=(10, 10))
nx.draw(G, pos=layout, with_labels=True)

In [None]:
# Number of nodes and edges
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Connected components
num_components = nx.number_connected_components(G)

# Diameter
# nx.diameter fails on a graph with multiple components (the diameter is
# infinite there), so only compute it when the graph is a single component
# instead of hard-coding the answer.
diameter = nx.diameter(G) if num_components == 1 else float('inf')

# Avg clustering coefficient
avg_clustering = nx.average_clustering(G)

# Degree centrality
degree_centrality = nx.degree_centrality(G)
avg_degree_centrality = sum(degree_centrality.values()) / num_nodes

# Closeness centrality
closeness_centrality = nx.closeness_centrality(G)
avg_closeness_centrality = sum(closeness_centrality.values()) / num_nodes

print(f'Nodes: {num_nodes} Edges: {num_edges}')
print(f'Connected components: {num_components}')
print(f"Diameter: {'infinite' if diameter == float('inf') else diameter}")
print(f'Average clustering: {avg_clustering}')
print(f'Avg. degree centrality: {avg_degree_centrality}')
print(f'Avg. closeness centrality: {avg_closeness_centrality}')

In [None]:
# Per-node measures whose distributions are plotted below
degree_centrality = nx.degree_centrality(G)
local_clustering = nx.clustering(G)

fig, ax = plt.subplots(2, 2, figsize=(15, 10))

# Top row: plain histograms. The return values of hist() are kept because
# the bottom row re-plots the same counts and bin edges on log-log axes.
top_left, top_right = ax[0, 0], ax[0, 1]

centrality_hist = top_left.hist(list(degree_centrality.values()), bins=10)
top_left.set_xlabel('degree centrality')
top_left.set_ylabel('nodes')
top_left.set_title('histogram of degree centrality')

clustering_hist = top_right.hist(list(local_clustering.values()), bins=10)
top_right.set_xlabel('clustering coefficient')
top_right.set_ylabel('nodes')
top_right.set_title('histogram of local clustering')


def bin_centers(bins):
    """Return the midpoint of every pair of consecutive bin edges.

    ``bins`` is a sequence of edges; the result is a list with one entry
    fewer than ``bins`` (empty when fewer than two edges are given).
    """
    return [(left + right) / 2 for left, right in zip(bins[:-1], bins[1:])]


# Bottom row: the same histogram counts, plotted against the bin midpoints
# on log-log axes
loglog_panels = (
    (ax[1, 0], centrality_hist, 'degree centrality', 'degree centrality log-log'),
    (ax[1, 1], clustering_hist, 'clustering coefficient', 'local clustering log-log'),
)

for panel, (counts, edges, _), quantity, panel_title in loglog_panels:
    panel.plot(bin_centers(edges), counts)
    panel.set_xlabel(quantity)
    panel.set_ylabel('nodes')
    panel.set_title(panel_title)
    panel.set_xscale('log')
    panel.set_yscale('log')

Again, accounting for the small sample size, the log-log plots suggest that the degree centrality distribution follows a power law, whereas the result for local clustering is inconclusive.

### Using the Girvan-Newman algorithm to find communities

In [None]:
# girvan_newman yields one partition of the nodes per step of the algorithm
communities = nx.community.girvan_newman(G)

# List the community sizes in each step
for level in communities:
    print([len(group) for group in level])

There is clearly a certain set of users who are engaging with each other the most, as the first community remains the largest even 10 steps into the algorithm tearing it apart. This might suggest that there was, for example, a prominent conversation that many users took part in.

#### Weighing users by reputation

In [None]:
from networkx import edge_betweenness_centrality

# Reputation per username; when a user has several posts, the value from
# their last post in post_data is the one that is kept
user_reputation = {post['username']: post['reputation'] for post in post_data}

# Add weights to edges based on sum of reputation
# NOTE(review): networkx betweenness treats edge weights as path lengths, so a
# higher reputation sum makes an edge "longer" — confirm this is intended.
for u, v in G.edges:
    # There is at least one user who is mentioned but has no posts currently
    # in the thread; such users contribute a reputation of 0
    G[u][v]['weight'] = user_reputation.get(u, 0) + user_reputation.get(v, 0)


def most_central_edge(G):
    """Return the edge of ``G`` with the highest edge betweenness centrality.

    The centrality is computed using the ``"weight"`` edge attribute. When
    several edges share the highest score, the first one in iteration order
    is returned.
    """
    scores = edge_betweenness_centrality(G, weight="weight")
    best_edge, _ = max(scores.items(), key=lambda item: item[1])
    return best_edge


# Same algorithm as before, but the edge removed at each step is chosen by
# most_central_edge instead of the default
weighted_communities = nx.community.girvan_newman(G, most_valuable_edge=most_central_edge)

# List the community sizes in each step
for level in weighted_communities:
    print([len(group) for group in level])

This time the algorithm breaks down the first community into large chunks faster.

### Final comments

Most users in the thread are not against vaccines, even though they openly lament their side effects.