# Set up Session

In [None]:
!python --version

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, scaled from bytes by 1e9.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 is the cut-off this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  for hint in (
      'To enable a high-RAM runtime, select the Runtime > "Change runtime type"',
      'menu, and then select High-RAM in the Runtime shape dropdown. Then, ',
      're-execute this cell.'):
    print(hint)

## Install and Import Libraries

In [None]:
!pip install cdlib

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import cdlib
from cdlib import viz
from cdlib import evaluation
import community
from scipy.sparse import csr_matrix
from scipy.stats import spearmanr
import sklearn.metrics
import scipy.spatial.distance
from sklearn.metrics.pairwise import chi2_kernel

from operator import itemgetter
import copy
import itertools

import matplotlib.pyplot as plt

# most_valuable_edge functions for Girvan-Newman
from networkx.algorithms.centrality import edge_betweenness_centrality

# Clustering Algorithms
from networkx.algorithms.community.asyn_fluid import asyn_fluidc
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community.kernighan_lin import kernighan_lin_bisection
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from networkx.algorithms.community.quality import modularity
from networkx.algorithms.community.quality import performance


## Load Data

In [None]:
discourse_df = pd.read_pickle('NST03_extracted_features.pickle')

discourse_df.head()

In [None]:
discourse_df.columns

In [None]:
feats_cols = ['id', 'tweet category', 'retweet_count', 'favorite_count', 
             'is_quote_status', 'is_retweet', 'is_reply', 'been_retweeted', 
             'incl_affil_score', 'incl_assoc_score', 'excl_affil_score', 
             'excl_assoc_score', 'abs_terms_score'
             ]

discourse_feats_df = discourse_df[feats_cols]

discourse_feats_df.head()

# Matrix Construction

## Helper Functions

Used for testing different similarity and adjacency methods.

In [None]:
# Probability threshold function


def probThreshold(data, threshold: float = 0.01):
  '''
  Zero out every entry of `data` that is strictly below `threshold`.

  Entries greater than or equal to `threshold` are returned unchanged. The
  result is produced by `np.where`, so `data` itself is not modified.
  '''
  below_cutoff = data < threshold
  return np.where(below_cutoff, 0, data)

# Similarity measures

def simchiSq(data):
  '''
  Chi-squared similarity: pass `data` to scikit-learn's `chi2_kernel` and
  return the resulting kernel matrix unchanged.
  '''
  return chi2_kernel(data)


def simAbsCorr(data):
  '''
  Unsigned correlation similarity: the element-wise absolute value of
  `np.corrcoef(data)`, so strongly negative and strongly positive
  correlations both map close to 1.
  '''
  corr = np.corrcoef(data)
  return np.absolute(corr)


def simSignedCorr(data):
  '''
  Signed correlation similarity: rescale `np.corrcoef(data)` from [-1, 1]
  to [0, 1] via (1 + r) / 2, so r = -1 maps to 0 and r = +1 maps to 1.

  NOTE(review): `np.corrcoef` yields NaN for a zero-variance row; confirm
  that callers never pass constant rows.
  '''
  corr = np.corrcoef(data)
  return (1 + corr) / 2

# Adjacency functions


def powerAdj(SimMat, Beta: int = 6):
  '''
  Soft-threshold adjacency: raise every similarity to the power `Beta`
  and clear the diagonal so that no node is adjacent to itself.

  NOTE(review): assumes `SimMat` is a numpy array, for which `**` is
  element-wise and returns a new array.
  '''
  adjacency = SimMat ** Beta
  # In-place on the freshly created array, not on SimMat.
  np.fill_diagonal(adjacency, 0)
  return adjacency


def signumAdj(SimMat, tau: float = 0.0):
  '''
  Hard-threshold adjacency: 0 where the similarity is strictly below `tau`,
  1 everywhere else, with the diagonal then forced to 0.
  '''
  below_tau = SimMat < tau
  adjacency = np.where(below_tau, 0, 1)
  np.fill_diagonal(adjacency, 0)
  return adjacency

# Topological Overlap Matrix function


def TOMadjacency(AdjMat, threshold_quantile: float = 0.8):
  '''
  Build a thresholded Topological Overlap Matrix (TOM) from an adjacency
  matrix of a weighted, undirected graph.

  For every pair (i, j) the overlap is

      (L[i, j] + A[i, j]) / (min(K_row[i], K_col[j]) + 1 - A[i, j])

  where L = A . A^T and K_row / K_col are the row / column sums of A.
  Entries below the `threshold_quantile` quantile of the full overlap
  matrix (diagonal included) are set to 0, the diagonal is cleared, and only
  the lower triangle is kept.

  AdjMat: adjacency matrix; NOTE(review): assumed to be a square 2-D numpy
    array - confirm against callers.
  threshold_quantile: quantile in [0, 1] passed to `np.quantile`.

  Returns the filtered lower triangle as a `scipy.sparse.csr_matrix`.
  '''
  # Shared-neighbour term
  shared = AdjMat.dot(AdjMat.T)

  # Node connectivity, and the pairwise minimum min(K_row[i], K_col[j])
  row_strength = AdjMat.sum(axis=1)
  col_strength = AdjMat.sum(axis=0)
  min_strength = np.minimum(row_strength[:, np.newaxis], col_strength)

  # Topological overlap
  overlap = (shared + AdjMat) / (min_strength + 1 - AdjMat)

  # Keep only the entries at or above the requested quantile
  cutoff = np.quantile(overlap, threshold_quantile)
  kept = np.where(overlap >= cutoff, overlap, 0)
  np.fill_diagonal(kept, 0)

  # The lower triangle alone is stored in the sparse result
  return csr_matrix(np.tril(kept))

## Discourse Network

Create `numpy` array of discourse features: 

In [None]:
discourse_feats_df.info()

In [None]:
disc_cols = ['incl_affil_score', 'incl_assoc_score', 'excl_affil_score',
             'excl_assoc_score', 'abs_terms_score']

tweet_ids = discourse_feats_df['id'].values

disc_arr = discourse_feats_df[disc_cols].values


#### Find and filter tweets with no discourse scores

When tweets have no scores at all across all features, their similarity and adjacency cannot be calculated, so they need to be filtered from the analysis. These "no feature" tweets constitute a separate block that can be considered its own cluster in *addition* to the network communities identified below. These should be analyzed by other
methods.


In [None]:
# Find tweets with no values for any feature
# Find tweets with no values for any feature: rows where every score is 0.
# Each entry is tested (rather than the row sum) so that a row whose non-zero
# scores happen to cancel out is not dropped by mistake.
no_score_rows = (disc_arr == 0).all(axis=1)
non_disc_tweets = np.where(no_score_rows)[0].tolist()

# List of tweets to keep (same row positions as the filtered array below)
tweet_keep = np.delete(tweet_ids, non_disc_tweets)

# Filter analysis array
disc_arr_filt = np.delete(disc_arr, non_disc_tweets, axis=0)

disc_arr_filt.shape

### Calculate similarity matrix

In [None]:
disc_sim = simSignedCorr(disc_arr_filt)

disc_sim[:5,:5]

### Calculate adjacency matrix and TOM

In [None]:
disc_adj = powerAdj(disc_sim, Beta=6)
disc_TOM = TOMadjacency(disc_adj, threshold_quantile=0.85)

### Build discourse graph network 

In [None]:
# Create graph from TOMatrix (scipy sparse matrix)
# Create graph from TOMatrix (scipy sparse matrix).
# `from_scipy_sparse_matrix` was removed in NetworkX 3.0 in favour of
# `from_scipy_sparse_array`; use whichever one this installation provides.
if hasattr(nx, 'from_scipy_sparse_array'):
    disc_graph = nx.from_scipy_sparse_array(disc_TOM)
else:
    disc_graph = nx.from_scipy_sparse_matrix(disc_TOM)

# Add tweet id's as node labels: nodes are paired with `tweet_keep` in
# iteration order.
tweet_labels = dict(zip(disc_graph, tweet_keep))
disc_graph = nx.relabel_nodes(disc_graph, tweet_labels)

Basic graph description.

In [None]:
%%time
print("Nodes: \t\t", disc_graph.number_of_nodes())
print("Edges: \t\t", disc_graph.number_of_edges())
print("Isolates: \t", nx.number_of_isolates(disc_graph))

Write the graph to a `.csv` file for later use

In [None]:
# Write edgelist to .csv
nx.write_weighted_edgelist(disc_graph, path='NST04_discourse_graph.csv', delimiter=',')


In [None]:
%%time

# remove isolates
disc_graph.remove_nodes_from(list(nx.isolates(disc_graph)))
# nx.draw(disc_graph, node_size=4)
# nx.draw_networkx_nodes(disc_graph, pos=nx.spring_layout(disc_graph), node_size=4)
# nx.draw_networkx_nodes(disc_graph, pos=nx.spectral_layout(disc_graph), node_size=4)
# nx.draw_networkx_nodes(disc_graph, pos=nx.kamada_kawai_layout(disc_graph), node_size=4)

# Some exploration of features

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

corrmat = discourse_feats_df[disc_cols].corr(method='spearman')
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corrmat, ax=ax, cmap="YlGnBu", linewidths=0.1)

In [None]:
discourse_feats_df[disc_cols].describe().T

In [None]:
# Histogram of the per-tweet sum of discourse scores.
# (The frame is `discourse_feats_df`; the previous `discourse_feat_df`
# spelling raised NameError.)
discourse_feats_df[disc_cols].sum(axis=1).plot(kind='hist')

# Community Detection

The library `cdlib` requires `python >= 3.8.0` to allow all modules to work. Unfortunately, Google's colab currently uses `python == 3.7.10`.

In [None]:
# cdlib requires Python version at least 3.8.x

from cdlib import algorithms, viz
from community import community_louvain

The following methods are appropriate for community detection in *weighted* and *undirected* acyclic graphs. Each one searches for a local optimum in the number of clusters using a variety of heuristics.

This code block optimizes parameters and selects the best clustering solution for each of four methods:
* The "Chinese Whispers" fuzzy clustering algorithm ([Biemann 2006](https://dl.acm.org/doi/abs/10.5555/1654758.1654774))
* The InfoMap methods of random walks ([Rosvall 2008](https://www.pnas.org/content/105/4/1118/))
* The Louvain method of community modularity ([Blondel et al. 2008](https://iopscience.iop.org/article/10.1088/1742-5468/2008/10/P10008/meta/))
* The Leiden method improvement on Louvain ([Traag 2018](https://arxiv.org/abs/1810.08473/))

In [None]:
# load graph

# load graph
# NOTE: G is an alias of disc_graph, not a copy; removing nodes from G also
# removes them from disc_graph.

G = disc_graph

# remove isolates
G.remove_nodes_from(list(nx.isolates(G)))

# Community detection methods for weighted acyclic graphs

from cdlib import ensemble

methods = [algorithms.chinesewhispers, algorithms.infomap, algorithms.louvain, algorithms.leiden]

# Chinese whisper parameters
iterations = ensemble.Parameter(name="iterations", start=15, end=25, step=5)
chinese_conf = [iterations]

# Louvain parameters to search
resolution = ensemble.Parameter(name="resolution", start=0.5, end=1, step=0.1)
randomize = ensemble.BoolParameter(name="randomize")
louvain_conf = [resolution, randomize]

# Leiden parameters
leiden_conf = [ensemble.BoolParameter(name='weights', value='weight')]

# One configuration list per entry of `methods`, in the same order
# (infomap is run without tunable parameters).
configurations = [chinese_conf, [], louvain_conf, leiden_conf]

# Loop through grid search and store best community in a list

comms = []

for coms, scoring in ensemble.pool_grid_filter(
        G,
        methods,
        configurations,
        quality_score=evaluation.erdos_renyi_modularity,
        aggregate=max):
    # Keep the clustering yielded for each method; previously the list was
    # created but never filled.
    comms.append(coms)
    print("%s\nCommunities:\t %s \nConfiguration: %s \nScoring: %s\n" %(coms.method_name, len(coms.communities), coms.method_parameters, scoring))

The method `algorithms.chinesewhispers` performs best with a modularity score of $0.7855965109087125$, returning $9$ communities.

In [None]:
comms_chinesewhispers = algorithms.chinesewhispers(G, weighting='top', iterations=15)
comms_infomap = algorithms.infomap(G)
comms_louvain = algorithms.louvain(G, resolution=0.6, randomize=True)
comms_leiden = algorithms.leiden(G, weights='weight')

Calculate `performance` for each

In [None]:
import networkx.algorithms.community as nx_comm

# Candidate clusterings; scores in `comm_perf` follow this order.
comms = [comms_louvain, comms_leiden, comms_infomap, comms_chinesewhispers]

# NetworkX `performance` score of each partition on G
comm_perf = [
    nx_comm.performance(G, clustering.communities)
    for clustering in comms
]


In [None]:
print("Performance:\n", comm_perf)


The method `algorithms.chinesewhispers` performs best with a performance score of $0.9792224330305016$.

It appears that the communities detected by `algorithms.chinesewhispers` are the optimal solution.

### Calculate PageRank for each node

In [None]:

G_pr = nx.pagerank(G)


### Assign communities to Nodes and internal Edges

Adapted from [Community detection using NetworkX](https://orbifold.net/default/community-detection-using-networkx/)

In [None]:
def set_node_community(G, communities):
    '''Store on each node the 1-based index of the community containing it.

    `communities` is an iterable of node collections. Labels start at 1 so
    that 0 stays free to mark external edges.
    '''
    for label, members in enumerate(communities, start=1):
        for node in members:
            G.nodes[node]['community'] = label

def set_edge_community(G):
    '''Tag every edge with a 'community' attribute.

    An edge whose two endpoints carry the same node 'community' gets that
    value (internal edge); any other edge gets 0 (external edge). Every node
    must already have a 'community' attribute.
    '''
    for u, v in G.edges:
        label_u = G.nodes[u]['community']
        is_internal = label_u == G.nodes[v]['community']
        G.edges[u, v]['community'] = label_u if is_internal else 0

# Set node and edge communities
set_node_community(G, comms_chinesewhispers.communities)
set_edge_community(G)

Create `pandas` dataframe of node attributes including assigned community and page rank.

Adapted from [stackoverflow: Converting Networkx graph to data frame with its attributes](https://stackoverflow.com/a/62386579)

In [None]:
def make_node_df(G):
    '''Tabulate the nodes of `G` and their attributes as a DataFrame.

    One row is built per node: column 'node' holds the node id and every
    attribute key becomes its own column. Because rows are assembled per
    node, a node that lacks an attribute gets NaN in that column; the earlier
    column-wise accumulation produced lists of unequal length in that case.

    G: object whose `nodes(data=True)` yields `(node, attribute_dict)` pairs,
      e.g. a networkx graph.

    Returns a `pandas.DataFrame` with 'node' as the first column, followed by
    the attribute columns in order of first appearance.
    '''
    rows = [
        {'node': node, **attributes}
        for node, attributes in G.nodes(data=True)
    ]
    return pd.DataFrame(rows)



In [None]:

tweet_community_df = make_node_df(G)

tweet_community_df.head()


In [None]:
tweet_community_df['community'].value_counts()

In [None]:

tweet_pr_df = pd.DataFrame(list(G_pr.items()), columns=['id','page_rank'])

tweet_pr_df.head()

In [None]:
discourse_df.shape

In [None]:

output_df = discourse_df.join(tweet_community_df[['node','community']].set_index('node'), on='id')

output_df.shape

In [None]:
output_df = output_df.join(tweet_pr_df.set_index('id'), on='id')

output_df.info()

In [None]:
# Rows without a matching graph node got NaN from the join; label them -1.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x
# and does not modify `output_df` under Copy-on-Write.
output_df['community'] = output_df['community'].fillna(-1)

output_df['community'].value_counts()

In [None]:
# Rows without a matching graph node have no PageRank; use 0 for them.
# Assigned back rather than filled in place on the column selection, which is
# deprecated in pandas 2.x and ineffective under Copy-on-Write.
output_df['page_rank'] = output_df['page_rank'].fillna(0)

output_df.info()

In [None]:

output_df.to_pickle('NST04_tweet_communities.pickle')

# Graph and community visualizations

In [None]:
# draw the graph of communities

viz.plot_community_graph(G, comms_chinesewhispers, plot_labels=True)

In [None]:
# Calculate node positions, spring

pos = nx.spring_layout(G, k=0.1)


In [None]:
# `get_color` is defined ahead of its first use so the notebook runs top to
# bottom; it used to live in the cell after the plot that calls it.
def get_color(i, r_off=1, g_off=1, b_off=1):
    '''Assign a color to a vertex.

    Maps the integer index `i` to an (r, g, b) tuple. Each channel is
    low + span * (((i + offset) * m) % 16) / 15 with m = 3, 5, 7 for
    red, green and blue, so for integer input every channel lies in
    [0.1, 0.9].

    i: integer index (here, the community label of a node).
    r_off, g_off, b_off: per-channel offsets added to `i` before scaling.
    '''
    n = 16
    low, high = 0.1, 0.9
    span = high - low
    r = low + span * (((i + r_off) * 3) % n) / (n - 1)
    g = low + span * (((i + g_off) * 5) % n) / (n - 1)
    b = low + span * (((i + b_off) * 7) % n) / (n - 1)
    return (r, g, b)


plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
# plt.style.use('dark_background')

# set node color by community
node_color = [get_color(G.nodes[v]['community']) for v in G.nodes]

nx.draw_networkx_nodes(G, pos=pos, node_size=8, node_color= node_color, alpha=0.25)


In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
plt.style.use('dark_background')

# Split edges by their 'community' attribute: 0 marks an edge between two
# communities, a positive value marks an edge inside one.
external, internal = [], []
for edge in G.edges:
    edge_label = G.edges[edge]['community']
    if edge_label == 0:
        external.append(edge)
    elif edge_label > 0:
        internal.append(edge)

# Set community color for internal edges
internal_color = ["black"] * len(internal)
node_color = [get_color(G.nodes[v]['community']) for v in G.nodes]

# external edges (nodes are drawn with size 0 in this pass)
nx.draw_networkx(
    G,
    pos=pos,
    edgelist=external,
    edge_color="silver",
    node_color=node_color,
    node_size=0,
    alpha=0.2,
    with_labels=False)

# internal edges
nx.draw_networkx(
    G,
    pos=pos,
    edgelist=internal,
    edge_color=internal_color,
    node_color=node_color,
    alpha=0.05,
    with_labels=False)