In [None]:
import json
import pickle
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sn
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import wasserstein_distance, energy_distance

triton/project/ecanet

conda install --freeze-installed -c conda-forge networkx

conda env export  --file gt.json
cat gt.json


In [None]:
# Any truthy value makes the "Save results to file" cell write the scores to JSON.
SAVE_MODE = 1

### I. Load the data

In [None]:
def easier_indexing(Gs):
    """
        Map a topic name to its position in ``Gs``.

        The key for each entry is its element ``[1]`` with the last five
        characters stripped (presumably a file suffix such as ".json" --
        TODO confirm); the value is the entry's index in ``Gs``.
    """
    topic_dict = {}
    for position, entry in enumerate(Gs):
        topic_dict[entry[1][:-5]] = position
    return topic_dict

In [None]:
#GL = pickle.load(open("data/graphlistnew.pickle", "rb" ))
#topic_dict = easier_indexing(GL)

# Load the pickled networks. The context manager closes the file handle as
# soon as the object has been read instead of leaving it to the garbage collector.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open("data/garimellanets.pickle", "rb") as fh:
    GL = pickle.load(fh)

### II. Identify the cores

In [5]:
def find_main_core(G, k=None):
    """
    Return the k-core subgraph of ``G`` as computed by ``nx.k_core``.

    Parameters
    ----------
    G : graph accepted by ``nx.k_core``
    k : int or None
        Order of the core, forwarded to ``nx.k_core``. ``None`` (the default)
        lets networkx pick the order itself (documented there as the main core).

    Returns
    -------
    The subgraph returned by ``nx.k_core``.
    """
    # Forward the caller's k; it used to be silently replaced by None.
    return nx.k_core(G, k=k)

In [6]:
def get_colormap(G, core_subgraph, colors):
    """
    Build a per-node colour list for drawing ``G``.

    Parameters
    ----------
    G : iterable of nodes (e.g. a networkx graph)
    core_subgraph : iterable of nodes (e.g. a networkx graph)
        Nodes that belong to the core.
    colors : sequence
        ``colors[0]`` is used for core nodes, ``colors[1]`` for all others.

    Returns
    -------
    list
        One colour per node, in the iteration order of ``G``.
    """
    # Materialise the membership set once instead of re-querying it per node.
    core_nodes = set(core_subgraph)
    # The body used to read the undefined name `color`; the parameter is `colors`.
    return [colors[0] if node in core_nodes else colors[1] for node in G]

In [7]:
def find_n_cores(network, n):
    """
    Peel ``n`` successive main cores off a copy of ``network``.

    After each core is found its nodes are removed from the working copy, so
    the next core is searched for among the remaining nodes only.

    Parameters
    ----------
    network : graph providing ``copy()`` and ``remove_nodes_from()``
    n : int
        Number of cores to extract.

    Returns
    -------
    dict
        ``{0: first_core, 1: second_core, ...}`` as returned by
        ``find_main_core``.

    NOTE(review): if the working copy runs out of nodes before ``n`` cores are
    found, ``find_main_core`` is called on an empty graph -- confirm how the
    installed networkx version reacts.
    """
    remaining = network.copy()  # leave the caller's graph untouched
    cores = dict()

    for i in range(n):
        cores[i] = find_main_core(remaining)
        # Snapshot the node list before removing so the view is not consumed
        # while the graph changes.
        remaining.remove_nodes_from(list(cores[i].nodes))

    return cores

In [8]:
def color_cores(cores, network):
    """
    Colour every node of ``network`` by core membership.

    Nodes in ``cores[0]`` become "red", nodes in ``cores[1]`` become "blue",
    everything else "gray". Returns one colour per node in iteration order.
    """
    def pick(node):
        # cores[0] wins if a node were ever present in both.
        if node in cores[0]:
            return "red"
        if node in cores[1]:
            return "blue"
        return "gray"

    return [pick(node) for node in network]

### III. Measure the core polarization

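1. Updated random-walk-based score. From each core, pick the node with the highest closeness centrality as that core's influencer. Then start random walks from nodes sampled from the two cores and record which influencer each walk reaches first.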

In [9]:
def perform_randomwalk(G, starting_node, li, ri, max_steps=None):
    """
    Walk randomly over ``G`` until a node from ``li`` or ``ri`` is stepped onto.

    Parameters
    ----------
    G : object exposing ``neighbors(node)``
    starting_node : node the walk starts from (it is not itself tested
        against ``li`` / ``ri``; only the nodes stepped onto are)
    li, ri : containers of target nodes for side "A" and side "B"
    max_steps : int or None
        Optional cap on the number of steps. ``None`` (default) keeps the
        original unbounded behaviour.

    Returns
    -------
    "A" if a node of ``li`` is reached first, "B" if a node of ``ri`` is
    reached first, ``None`` if the walk cannot finish (the current node has no
    neighbours, or ``max_steps`` steps were taken without hitting a target).
    """
    current = starting_node
    steps = 0

    while max_steps is None or steps < max_steps:
        neighbors = list(G.neighbors(current))
        if not neighbors:
            # Dead end: random.choice would raise IndexError on an empty list.
            return None

        next_node = random.choice(neighbors)
        if next_node in li:
            return "A"
        if next_node in ri:
            return "B"

        current = next_node
        steps += 1

    return None

def random_walk(backbone, cores, n_iter):
    """
    Random-walk polarization score between ``cores[0]`` and ``cores[1]``.

    The node with the highest closeness centrality inside each core is taken
    as that core's influencer (A for ``cores[0]``, B for ``cores[1]``).
    ``n_iter`` starting nodes are drawn with replacement from both cores, a
    walk is run on ``backbone`` from each, and the walks are tallied by
    (core of the start node, influencer reached).

    Parameters
    ----------
    backbone : graph the walks run on
    cores : mapping with keys 0 and 1 holding the two core graphs
    n_iter : int
        Number of walks.

    Returns
    -------
    float
        ``pAA * pBB - pAB * pBA`` where ``pXY`` is the share of walks started
        in core X that ended at influencer Y. ``nan`` when one of the cores
        contributed no classified walk (the shares are undefined then; this
        used to raise ZeroDivisionError).
    """
    centrality_core1 = nx.closeness_centrality(cores[0])
    centrality_core2 = nx.closeness_centrality(cores[1])

    A = max(centrality_core1, key=centrality_core1.get)
    B = max(centrality_core2, key=centrality_core2.get)

    tally = {"AA": 0, "AB": 0, "BB": 0, "BA": 0}

    starting_nodes = random.choices(list(cores[0]) + list(cores[1]), k=n_iter)

    for node in starting_nodes:

        res = perform_randomwalk(backbone, node, [A], [B])

        # cores[0] is checked first, matching the original branch order.
        if node in cores[0]:
            side = "A"
        elif node in cores[1]:
            side = "B"
        else:
            side = None

        if side is None or res not in ("A", "B"):
            print("Not possible!")
            continue

        tally[side + res] += 1

    from_a = tally["AA"] + tally["AB"]
    from_b = tally["BB"] + tally["BA"]
    if from_a == 0 or from_b == 0:
        # No walk was recorded for one side, so its shares cannot be computed.
        return float("nan")

    pAA = tally["AA"] / from_a
    pAB = 1 - pAA

    pBB = tally["BB"] / from_b
    pBA = 1 - pBB

    return pAA*pBB - pAB*pBA

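2. Accessibility (flow) based score, with p = 0.5 and k = 5 (the implementation sums walk lengths 1 through 5): $A = \sum_{i=1}^{k} p^i M^i$, where $M$ is the adjacency matrix of the two cores. Entry $A_{uv}$ measures the expected number of times that node **v** will hear about a cascade originating at node **u**, using walks of length up to **k**. The distance between the intra-core and inter-core distributions of these values is measured with the *Wasserstein* metric (the energy distance and the variance of **A** are returned as well).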

In [10]:
def accessibility_score(backbone, cores, p=0.5, k=5):
    """
    Accessibility (flow) scores between two cores.

    Builds ``A = sum_{i=1..k} p**i * M**i`` where ``M`` is the adjacency
    matrix of ``backbone`` with the nodes of ``cores[0]`` first and the nodes
    of ``cores[1]`` after them, then compares the intra-core entries of ``A``
    with the inter-core entries.

    Parameters
    ----------
    backbone : networkx graph containing the nodes of both cores
    cores : mapping with keys 0 and 1 holding the two cores
    p : float
        Per-step damping factor.
    k : int
        Longest walk length included in the sum. Defaults to 5, the length
        that used to be hard-coded.

    Returns
    -------
    tuple
        ``(variance of A, Wasserstein distance, energy distance)`` where both
        distances compare the intra-core values (core 1 and core 2 pooled)
        with the core-1-to-core-2 values.
    """
    nodelist = list(cores[0]) + list(cores[1])
    # to_numpy_array replaces to_numpy_matrix, which networkx 3.0 removed.
    M = nx.to_numpy_array(backbone, nodelist=nodelist)

    A = np.zeros_like(M, dtype=float)
    walks = np.eye(M.shape[0])
    for length in range(1, k + 1):
        # Plain ndarrays multiply element-wise with *, so use @ for the
        # matrix product; `walks` now holds M raised to `length`.
        walks = walks @ M
        A += p**length * walks

    n_core1 = len(cores[0])

    core1_intra = A[:n_core1, :n_core1].ravel().tolist()
    core2_intra = A[n_core1:, n_core1:].ravel().tolist()
    core_inter = A[:n_core1, n_core1:].ravel().tolist()

    EMD = wasserstein_distance(core1_intra + core2_intra, core_inter)
    ENR = energy_distance(core1_intra + core2_intra, core_inter)

    return (np.var(A), EMD, ENR)

3. Assortativity

In [11]:
def assortativity(backbone):
    """Return networkx's degree assortativity coefficient of ``backbone``."""
    return nx.degree_assortativity_coefficient(backbone)

4. Spectral approach

In [12]:
def algebraic_connectivity(backbone):
    """
    Return ``(ac, normalized_ac)`` for ``backbone``.

    Both values come from ``nx.algebraic_connectivity``: first with its
    default arguments, then with ``normalized=True``.
    """
    return tuple(
        nx.algebraic_connectivity(backbone, **options)
        for options in ({}, {"normalized": True})
    )

### IV. Analytical pipeline

In [13]:
# Topics skipped by the (currently commented-out) algebraic-connectivity branch
# of compute_core_polarization.
spectral_snails = ["jotainrajaa", "samasuunta", "piraatit"]
# Topics skipped by the (currently commented-out) random-walk branch of
# compute_core_polarization.
disconnected_cores = ["vaalikone"]

In [14]:
def compute_core_polarization(net):
    """
    Compute the core-polarization scores of one network.

    Works on a copy of ``net``: self-loops are removed, the two main cores are
    extracted with ``find_n_cores``, and the scores are computed on the
    subgraph induced by the nodes of both cores.

    Parameters
    ----------
    net : networkx graph

    Returns
    -------
    dict
        Keys "varFLOW", "wasFLOW" and "eneFLOW", holding the three values
        returned by ``accessibility_score``. The random-walk, algebraic
        connectivity and assortativity scores are disabled (commented out).
    """
    G = net.copy()

    # Remove self-retweets. The edge list is materialised first so the graph
    # is not mutated while the self-loop iterator is still being consumed.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # Find two cores
    cores = find_n_cores(G, 2)

    # Take core subgraph: only the node set of the cores is needed to induce it.
    backbone_nodes = [node for core in cores.values() for node in core]
    backbone = G.subgraph(backbone_nodes)

    # Save values into a dictionary
    score_dict = dict()

    # NOTE(review): the disabled branches below read a topic name `t` that is
    # not a parameter of this function; they relied on a notebook-level global.
    #if t not in disconnected_cores:
    #    score_dict["RW"] = random_walk(backbone, cores, 1000)
    #    print("RW completed.")
    #else:
    #    score_dict["RW"] = 0
    #    print("Disconnected core detected...")

    #if t not in spectral_snails:
    #    score_dict["AC"] = algebraic_connectivity(backbone)[0]
    #    score_dict["nAC"] = algebraic_connectivity(backbone)[1]
    #    print("AC completed.")
    #else:
    #    score_dict["AC"] = (0,0)
    #    print("Snail detected...")

    # Evaluate the matrix-power computation once and unpack it, rather than
    # repeating it for every one of the three scores.
    var_flow, was_flow, ene_flow = accessibility_score(backbone, cores)
    score_dict["varFLOW"] = var_flow
    score_dict["wasFLOW"] = was_flow
    score_dict["eneFLOW"] = ene_flow
    print("FLOW completed.")

    #score_dict["ASSOR"] = assortativity(backbone)
    #print("ASSOR completed.")

    return score_dict

Compute the core polarization scores for selected topics

In [15]:
# topic name -> score dict returned by compute_core_polarization
core_polarization = {}

In [16]:
# NOTE(review): topic_dict is only assigned by the commented-out loader line
# (`topic_dict = easier_indexing(GL)`); with the loader as it stands this
# raises NameError on a fresh kernel -- confirm which dataset is intended.
all_topics = list(topic_dict.keys())

# Topics hand-labelled as polarized; used by the "discriminative power" plots.
polarized_topics = ["kokoomus", 
                    "vihreät", 
                    "perussuomalaiset", 
                    "vasemmisto", 
                    "vihapuhe", 
                    "ilmastonmuutos", 
                    "maahanmuutto",
                    "sote",
                    "ilmastovaalit"]

# Topics hand-labelled as unpolarized; used by the "discriminative power" plots.
unpolarized_topics = ["mielenterveys", 
                      "liikenne", 
                      "kunnat", 
                      "tekoäly", 
                      "nuoret", 
                      "urheilu", 
                      "yrittäjät",
                      "yrittäjyys",
                      "rekry",
                      "osaaminen",
                      "johtaminen"]

In [15]:
# Topic names are the keys of the loaded GL object.
garimella_topics = list(GL.keys())

In [16]:
# Display the list of topic names.
garimella_topics

['baltimore',
 'jurassicworld',
 'ultralive',
 'leadersdebate',
 'wcw',
 'ukraine',
 'sxsw',
 'mothersday',
 'nemtsov',
 'beefban',
 'onedirection',
 'russia',
 'gunsense',
 'indiana',
 'germanwings',
 'nepal',
 'indiasdaughter',
 'ff',
 'nationalkissingday',
 'netanyahu']

In [22]:
# Summary (type, node/edge counts, average degree) of the "ukraine" network.
# NOTE(review): nx.info is not available in networkx >= 3.0 -- confirm the
# pinned networkx version before re-running.
nx.info(GL["ukraine"])

'Name: \nType: Graph\nNumber of nodes: 5495\nNumber of edges: 9452\nAverage degree:   3.4402'

In [17]:
# Score every topic and store the result in core_polarization[topic].
# NOTE(review): this indexes GL through topic_dict and a nested [0][0][0]
# layout, i.e. the structure of the commented-out "graphlistnew" loader; the
# active loader reads "garimellanets", for which the commented `GL[t]` line
# below looks like the matching access -- confirm.
for t in all_topics:
    
    print("Starting to process the topic: ", t)
    #Load the network
    network = GL[topic_dict[t]][0][0][0]
    #network = GL[t]
    
    #Compute all scores for core polarization
    core_polarization[t] = compute_core_polarization(network)

    print("Following topic completed: : ", t)
    print("\n")

Starting to process the topic:  aktiivimalli
FLOW completed.
Following topic completed: :  aktiivimalli


Starting to process the topic:  ammatillinenkoulutus
FLOW completed.
Following topic completed: :  ammatillinenkoulutus


Starting to process the topic:  arvot
FLOW completed.
Following topic completed: :  arvot


Starting to process the topic:  demokratia
FLOW completed.
Following topic completed: :  demokratia


Starting to process the topic:  digitalisaatio
FLOW completed.
Following topic completed: :  digitalisaatio


Starting to process the topic:  eduskunta
FLOW completed.
Following topic completed: :  eduskunta


Starting to process the topic:  eduskuntavaalit2019
FLOW completed.
Following topic completed: :  eduskuntavaalit2019


Starting to process the topic:  eduskuntavaalit
FLOW completed.
Following topic completed: :  eduskuntavaalit


Starting to process the topic:  eriarvoisuus
FLOW completed.
Following topic completed: :  eriarvoisuus


Starting to process the topic:

In [None]:
# Persist the per-topic scores as JSON when saving is switched on.
if SAVE_MODE:

    # Save results to file
    with open('results_garimellanets.json', 'w') as fp:
        json.dump(core_polarization, fp)

### V. Visualization pipeline

*later*

### VI. Results

ORDER

In [18]:
# from_dict puts topics in columns; .T flips it to one row per topic and one
# column per score.
df = pd.DataFrame.from_dict(core_polarization).T
df

Unnamed: 0,varFLOW,wasFLOW,eneFLOW
aktiivimalli,71030.200206,315.816919,16.727006
ammatillinenkoulutus,95446.909164,432.666244,23.612231
arvot,360.489091,9.687693,1.675790
demokratia,969.951841,39.251255,6.661550
digitalisaatio,1815.724123,33.016479,4.428810
...,...,...,...
yleastudio,280.195687,14.370284,3.382727
yliopisto,55346.546800,115.109172,6.392755
ympäristö,52876.974724,361.682824,22.654237
yrittäjyys,385.540591,17.406374,3.951542


RW

In [None]:
# 15 topics with the highest "RW" score.
# NOTE(review): "RW" is only written by the commented-out branch of
# compute_core_polarization, so this column may be missing -- confirm.
df.sort_values(by=['RW'], ascending=False).head(15)

AC

In [None]:
# Drop the topics excluded from the spectral / random-walk scores.
# errors="ignore" keeps the cell re-runnable: once the rows are gone (or when a
# dataset never contained them) a plain drop would raise KeyError.
df = df.drop(spectral_snails + disconnected_cores, errors="ignore")
# 20 topics with the lowest "AC" score.
# NOTE(review): "AC" is only written by the commented-out branch of
# compute_core_polarization, so this column may be missing -- confirm.
df.sort_values(by=["AC"], ascending=True).head(20)

nAC

In [None]:
# 20 topics with the lowest "nAC" score.
# NOTE(review): "nAC" is only written by the commented-out branch of
# compute_core_polarization, so this column may be missing -- confirm.
df.sort_values(by=["nAC"], ascending=True).head(20)

varFLOW

In [None]:
# 20 topics with the highest "varFLOW" score.
df.sort_values(by=["varFLOW"], ascending=False).head(20)

wasFLOW

**TOP 25**

In [20]:
# 25 topics with the highest "wasFLOW" score.
df.sort_values(by=["wasFLOW"], ascending=False).head(25)

Unnamed: 0,varFLOW,wasFLOW,eneFLOW
eu,7237009000000.0,2494299.0,1277.342706
ilmasto,4939783000.0,89110.42,309.512004
vaalit,3854019000.0,52394.94,189.199639
vaalit2019,891091200.0,43550.25,230.052107
ilmastovaalit,119187100.0,18129.13,161.652912
ilmastonmuutos,88590270.0,12750.08,123.694577
kokoomus,70494440.0,7936.597,76.726757
sote,34879840.0,6689.002,75.881198
ilmastolakko,17823240.0,5326.84,77.9705
työ,6842755.0,3589.723,65.140704


**LEAST 25**

In [21]:
# 25 topics with the lowest "wasFLOW" score.
df.sort_values(by=["wasFLOW"], ascending=True).head(25)

Unnamed: 0,varFLOW,wasFLOW,eneFLOW
urheilu,0.266167,0.064254,0.044306
rekry,1.658725,0.083363,0.039166
tekoäly,15.611631,0.917212,0.471502
tampere,178.175817,2.873481,0.545717
eriarvoisuus,131.756294,3.130147,0.634189
osallisuus,367.260959,5.392717,1.147858
viestintä,370.916329,5.78343,1.218312
arvot,360.489091,9.687693,1.67579
kunnat,219.654256,10.46625,2.245306
yhdenvertaisuus,206.440531,13.233894,3.12132


eneFLOW

In [None]:
# 15 topics with the highest "eneFLOW" score.
df.sort_values(by=["eneFLOW"], ascending=False).head(15)

ASSOR

In [None]:
# 15 topics with the highest "ASSOR" score.
# NOTE(review): "ASSOR" is only written by a commented-out line of
# compute_core_polarization, so this column may be missing -- confirm.
df.sort_values(by=["ASSOR"], ascending=False).head(15)

PLOTS (Discriminative power)

In [None]:
# Per-group score lists (wasFLOW, AC, RW) for the hand-labelled topic sets.
polarized_flow, unpolarized_flow = [], []
polarized_ac, unpolarized_ac = [], []
polarized_rw, unpolarized_rw = [], []

for t in polarized_topics:
    polarized_flow.append(core_polarization[t]["wasFLOW"])
    polarized_ac.append(core_polarization[t]["AC"])
    polarized_rw.append(core_polarization[t]["RW"])

for t in unpolarized_topics:
    unpolarized_flow.append(core_polarization[t]["wasFLOW"])
    unpolarized_ac.append(core_polarization[t]["AC"])
    unpolarized_rw.append(core_polarization[t]["RW"])

In [None]:
# Three stacked histograms comparing polarized vs. unpolarized topics, one
# panel per score; only the first panel carries a legend.
fig, (ax_rw, ax_ac, ax_flow) = plt.subplots(3, 1, figsize=(12,12))

ax_rw.set_title("Random Walk")
ax_rw.hist(polarized_rw, label="polarized", color="darkred", alpha=0.8, bins=100)
ax_rw.hist(unpolarized_rw, label="unpolarized", alpha=0.8, bins=100)
ax_rw.legend()

ax_ac.set_title("Algebraic Connectivity")
ax_ac.hist(polarized_ac, label="polarized", color="darkred", alpha=0.8, bins=100)
ax_ac.hist(unpolarized_ac, label="unpolarized", alpha=0.8, bins=100)

ax_flow.set_title("wasFlow Score")
ax_flow.hist(polarized_flow, label="polarized", color="darkred", alpha=0.8, bins=100)
_ = ax_flow.hist(unpolarized_flow, label="unpolarized", alpha=0.8, bins=100)

Correlation between the scores

In [None]:
# One list per score across all topics:
# x = varFLOW, y = wasFLOW, z = RW, w = AC, v = ASSOR.
x = []
y = []
z = []
w = []
v = []
for t in core_polarization:
    x.append(core_polarization[t]["varFLOW"])
    y.append(core_polarization[t]["wasFLOW"])
    z.append(core_polarization[t]["RW"])
    w.append(core_polarization[t]["AC"])
    v.append(core_polarization[t]["ASSOR"])

In [None]:
# Float-typed copy of the score table.
df_num = df.astype("float")

In [None]:
# Pairwise Spearman rank correlation between the score columns.
corrmat = df_num.corr(method="spearman")

In [None]:
# Annotated heatmap of the Spearman correlation matrix.
# seaborn (`sn`) comes from the imports cell at the top of the notebook, so a
# missing dependency shows up at start-up rather than halfway through.
sn.heatmap(corrmat, annot=True)
plt.show()

## STOP HERE 

In [None]:
# Draw every Garimella network with its two main cores highlighted
# (red / blue, remaining nodes gray) and write one page per topic to a PDF.
with PdfPages('unpolarized_garimella.pdf') as pdf:

    for t in garimella_topics:

        fig = plt.figure(figsize=(12,12))
        #plt.title(t + ", RW: " + str(round(core_polarization[t]["RW"],3)) + ", ACC: " + str(round(core_polarization[t]["AC"],3)) + ", FLOW: " + str(round(core_polarization[t]["wasFLOW"],3)))
        plt.title(t)

        #Load the network
        #network = GL[topic_dict[t]][0][0][0]
        network = GL[t]
        G = network.copy()

        # Remove self-retweets. Materialise the edge list first so the graph
        # is not mutated while the self-loop iterator is being consumed.
        G.remove_edges_from(list(nx.selfloop_edges(G)))

        # Find two cores
        cores = find_n_cores(G, 2)

        # Colour nodes by core membership
        cm = color_cores(cores, G)

        nx.draw_spring(G, node_size=30, width=0.5, alpha=0.1, node_color=cm)
        pdf.savefig(fig)
        # Release the figure once it is on disk; otherwise one 12x12 figure per
        # topic stays open for the rest of the session.
        # NOTE(review): closing also stops the figures from being rendered
        # inline under the cell -- drop this line if inline previews are wanted.
        plt.close(fig)

        print("Topic visualized: ", t)

In [None]:
# Let pandas display up to 100 rows before truncating.
pd.set_option('display.max_rows', 100)

In [None]:
# Whole score table ordered by "wasFLOW", largest first.
df.sort_values(by=["wasFLOW"], ascending=False)

In [None]:
# Histogram of the wasFLOW values collected in `y`, zoomed to the 0-100 range.
ax = plt.subplot()
_ = ax.hist(y, bins=80)
_ = ax.set_xlim(0,100)

In [None]:
# Median of the wasFLOW values collected in `y`.
np.median(y)

In [None]:
# Remove the "eu" row before standardising the table.
# errors="ignore" keeps the cell re-runnable: a second run (or a dataset
# without that topic) would otherwise raise KeyError.
df_num = df_num.drop(["eu"], errors="ignore")

In [None]:
# Show the column names of the numeric score table.
df_num.columns

In [None]:
# Heatmap of the column-wise standardised scores, one row per topic.
plt.figure(figsize=(12,30)) 
#plt.pcolor(df_num)
# z-score every column: subtract the column mean, divide by the column std.
df_num_final=(df_num-df_num.mean())/df_num.std()
sn.heatmap(df_num_final, annot=True)
# Put a tick at every cell centre and label it with the topic / score name.
_ = plt.yticks(np.arange(0.5, len(df_num.index), 1), df_num.index)
_ = plt.xticks(np.arange(0.5, len(df_num.columns), 1), df_num.columns)

In [None]:
# Topics whose standardised wasFLOW is positive, i.e. above the column mean.
list(df_num_final[df_num_final["wasFLOW"] > 0].index)