Notebook to create topic networks using the Word Mover's Distance (WMD)

In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/MUSE

/content/drive/MyDrive/MUSE


In [None]:
!pip install wmd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wmd
  Downloading wmd-1.3.2.tar.gz (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.6/104.6 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wmd
  Building wheel for wmd (setup.py) ... [?25l[?25hdone
  Created wheel for wmd: filename=wmd-1.3.2-cp38-cp38-linux_x86_64.whl size=629451 sha256=5bdf54517c1eca356c3472148f064f732f85f0976b199f51403d48b5fe44aecf
  Stored in directory: /root/.cache/pip/wheels/eb/4c/cd/40ec1e13bfd149162c9a69f5b07728410ea9af264e66cea28d
Successfully built wmd
Installing collected packages: wmd
Successfully installed wmd-1.3.2


In [None]:
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from wmd import WMD
from networkx.algorithms import community
from google.colab import output
import networkx as nx

In [None]:
def to_list(row):
    """Parse the stringified keyword list held in ``row.topic`` into a list of words.

    Square brackets and single quotes are removed, then the text is split on ", ".
    """
    cleaned = row.topic
    for symbol in ("[", "]", "'"):
        cleaned = cleaned.replace(symbol, "")
    return cleaned.split(", ")


def _make_topic_id(row):
    """Build ``<country>_<period>_<word1>_<word2>_<word3>`` for one topic row."""
    words = to_list(row)
    return "_".join([row["country"], row["period"], words[0], words[1], words[2]])


topics_df = pd.read_csv("../data/topics_df.csv")
topics_df["topic_id"] = topics_df.apply(_make_topic_id, axis=1)

In [None]:
def load_vec(emb_path, nmax=50000):
    """Read up to ``nmax`` word vectors from a text embedding file.

    The first line of the file is skipped; every following line must hold a
    word followed by its space-separated vector components.

    Returns:
        (embeddings, id2word, word2id): the vectors stacked into a 2-D array
        (row ``i`` belongs to word id ``i``) and the two id/word lookup dicts.

    Raises:
        AssertionError: if the same word appears twice in the file.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # the first line is not a word vector
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            assert token not in word2id, 'word found twice'
            # ids are assigned in file order, so they match the row positions
            word2id[token] = len(rows)
            rows.append(np.fromstring(values, sep=' '))
            if len(rows) == nmax:
                break
    id2word = dict(zip(word2id.values(), word2id.keys()))
    return np.vstack(rows), id2word, word2id

In [None]:
# Load the word vectors of every country/language used in the topic data.
# The "us" vectors are the English file, stored in the same dump directory as the French one.
embeddings_fr, id2word_fr, word2id_fr = load_vec(emb_path="./dumped/debug/g3n9myrpjg/vectors-fr.txt")
embeddings_us, id2word_us, word2id_us = load_vec(emb_path="./dumped/debug/g3n9myrpjg/vectors-en.txt")
embeddings_de, id2word_de, word2id_de = load_vec(emb_path="./dumped/debug/q0t6x4r8j2/vectors-de.txt")
embeddings_it, id2word_it, word2id_it = load_vec(emb_path="./dumped/debug/ebomrrgqiy/vectors-it.txt")
embeddings_es, id2word_es, word2id_es = load_vec(emb_path="./dumped/debug/wzpg359r5l/vectors-es.txt")

In [None]:
def add_embedding(t, word2id, embd, reduced_emb, reduced_word2id, idx):
    """Add the vectors of one topic's known words to the reduced vocabulary.

    ``t`` is an ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``.
    Every topic word present in ``word2id`` has its vector from ``embd``
    appended to the returned list and is mapped to position ``idx`` in
    ``reduced_word2id`` (updated in place); ``idx`` advances once per added word.

    Returns:
        (reduced_emb, reduced_word2id, idx) after processing the topic.
    """
    collected = list(reduced_emb)  # work on a copy; the caller's list is left untouched
    for word in to_list(t[1]):
        if word not in word2id:
            continue
        collected.append(embd[word2id[word]])
        reduced_word2id[word] = idx
        idx += 1
    return collected, reduced_word2id, idx

In [None]:
def create_one_nbow(topic, reduced_word2id):
    """Build the bag-of-words ids and position-based weights for one topic.

    Args:
        topic: iterable of the topic's words; a word's position sets its weight.
        reduced_word2id: mapping from word to its id in the reduced vocabulary.

    Returns:
        (bow0, w0): the ids of the words found in ``reduced_word2id`` and, for
        each of them, the weight ``1 - 0.1 * i`` where ``i`` is the word's
        position in ``topic``. Words missing from the mapping are skipped.
    """
    bow0 = []
    w0 = []
    for i, w in enumerate(topic):
        try:
            word_id = reduced_word2id[w]
        except KeyError:
            # Word is not in the reduced vocabulary: leave it out. Only KeyError is
            # caught so that unrelated failures are no longer silently swallowed.
            continue
        bow0.append(word_id)
        w0.append(1 - 0.1 * i)
    return bow0, w0

In [None]:
def compute_graph_data(topics_kw, threshold, threshold_relax):
    """Build the weighted edge list linking topics whose WMD is below a threshold.

    Relies on the module-level per-language lookups (``word2id_fr``,
    ``embeddings_fr``, ...) loaded earlier in the notebook.

    Args:
        topics_kw: DataFrame of topics with ``country``, ``topic`` and
            ``topic_id`` columns.
        threshold: an edge between two topics of the same country is kept when
            their distance is below this value.
        threshold_relax: same, for two topics of different countries.

    Returns:
        DataFrame with columns ``from``, ``to`` and ``weigth`` (spelling kept:
        it is the column name written to the exported CSV files). The weight is
        the applicable threshold minus the distance.
    """
    # (country code, word -> row index, embedding matrix), in the order in which
    # the shared vocabulary is built.
    country_vectors = [
        ("fr", word2id_fr, embeddings_fr),
        ("us", word2id_us, embeddings_us),
        ("it", word2id_it, embeddings_it),
        ("de", word2id_de, embeddings_de),
        ("es", word2id_es, embeddings_es),
    ]

    # aggregate all embeddings together
    reduced_emb = []
    reduced_word2id = dict()
    idx = 0
    for country, word2id, embd in country_vectors:
        for topic_row in topics_kw[topics_kw["country"] == country].iterrows():
            reduced_emb, reduced_word2id, idx = add_embedding(
                topic_row, word2id, embd, reduced_emb, reduced_word2id, idx
            )
    reduced_emb = np.array(reduced_emb, dtype=np.float32)

    # create nbow: every word found in the vocabulary gets the same weight
    nbow = dict()
    for country, _, _ in country_vectors:
        for _, row in topics_kw[topics_kw["country"] == country].iterrows():
            bow, w = create_one_nbow(to_list(row), reduced_word2id)
            nbow[row.topic_id] = (row.topic_id, bow, np.ones(len(w)) / len(w))

    # https://github.com/src-d/wmd-relax
    calc = WMD(reduced_emb, nbow, vocabulary_min=3, vocabulary_max=2000)

    # calculate weight between each topic
    edges = []
    for _, row in topics_kw.iterrows():
        topic = row.topic_id
        topic_country = topic.split("_")[0]
        nn = calc.nearest_neighbors(topic, k=100, early_stop=0.99)
        output.clear()  # presumably discards the output printed by the query above
        for neighbor_id, distance in nn:
            # The neighbour id is itself a topic_id, so its country prefix can be
            # read directly instead of looking the row up again in topics_kw.
            same_country = topic_country == neighbor_id.split("_")[0]
            limit = threshold if same_country else threshold_relax
            if distance < limit:
                edges.append([topic, neighbor_id, limit - distance])

    # Build the frame once rather than growing it one row at a time.
    return pd.DataFrame(edges, columns=["from", "to", "weigth"])

In [None]:
def find_communities(G, seed=None):
    """Score the greedy-modularity and Louvain partitions of ``G``.

    Args:
        G: networkx graph to partition.
        seed: optional random seed forwarded to ``louvain_communities``. Louvain
            is randomized, so pass a fixed value for repeatable results; the
            default ``None`` keeps the previous unseeded behaviour.

    Returns:
        (cov_greedy, perf_greedy, mod_greedy, cov_louvain, perf_louvain, mod_louvain)
    """
    def _score(partition):
        # partition_quality returns (coverage, performance), in that order
        coverage, performance = community.partition_quality(G, partition)
        return coverage, performance, community.modularity(G, partition)

    greedy_scores = _score(community.greedy_modularity_communities(G))
    louvain_scores = _score(community.louvain_communities(G, seed=seed))
    return (*greedy_scores, *louvain_scores)

In [None]:
# Sweep the same-country threshold and the cross-country ("relaxed") threshold and
# record the community-quality metrics of every keyword graph.
thresholds = np.arange(0.5, 2.5, 0.25)
sweep_keywords = ["telegraph", "steel", "elec", "coal"]
community_columns = ["keyword", "threshold", "threshold_relax", "algo", "modularity", "performance", "coverage", "nodes"]
# Rows are collected in a list and turned into a frame in one go:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
community_records = []
communities_df = pd.DataFrame(columns=community_columns)

for t in thresholds:
    for tr in np.arange(t, 2.5, 0.25):
        for keyword in sweep_keywords:
            graph_df = compute_graph_data(topics_df[topics_df["keyword"] == keyword], threshold=t, threshold_relax=tr)
            G = nx.from_pandas_edgelist(graph_df, source='from', target='to')

            try:
                cov_greedy, perf_greedy, mod_greedy, cov_louvain, perf_louvain, mod_louvain = find_communities(G)
            except Exception:
                # Community detection failed for this graph: store real NaN values
                # (not the string "Nan") so the metric columns stay numeric.
                cov_greedy = perf_greedy = mod_greedy = np.nan
                cov_louvain = perf_louvain = mod_louvain = np.nan

            for algo, mod, perf, cov in (
                ("greedy", mod_greedy, perf_greedy, cov_greedy),
                ("louvain", mod_louvain, perf_louvain, cov_louvain),
            ):
                community_records.append({
                    "keyword": keyword,
                    "threshold": t,
                    "threshold_relax": tr,
                    "algo": algo,
                    "modularity": mod,
                    "performance": perf,
                    "coverage": cov,
                    "nodes": G.number_of_nodes(),
                })

        # Checkpoint after every (threshold, threshold_relax) pair.
        communities_df = pd.DataFrame(community_records, columns=community_columns)
        communities_df.to_csv("../data/communities_df.csv")
        

In [None]:
# Reload the sweep results from the location the sweep cell writes them to (../data).
communities_df = pd.read_csv("../data/communities_df.csv")
# Fraction of each keyword's topics that appear as nodes in the graph.
topics_per_keyword = topics_df["keyword"].value_counts()
communities_df["nodes_percentage"] = communities_df["nodes"] / communities_df["keyword"].map(topics_per_keyword)

In [None]:
# Keep only the sweeps whose graph retains more than 45% of the keyword's topics.
# .copy() makes an independent frame, so the column assignment below does not
# write into a slice of communities_df (SettingWithCopyWarning).
reduced_comm_df = communities_df[communities_df["nodes_percentage"] > 0.45].copy()
metric_columns = ["modularity", "performance", "coverage"]
reduced_comm_df[metric_columns] = reduced_comm_df[metric_columns].apply(pd.to_numeric)

In [None]:
def find_best_thresholds(df, keyword):
    """Return the (threshold, threshold_relax) pair with the highest score for ``keyword``.

    The score of a row is ``0.7*modularity + 0.15*performance + 0.15*coverage``.
    Rows whose score is NaN are ignored, and ``df`` is not modified.

    Raises:
        ValueError: if no row for ``keyword`` has a valid score.
    """
    rows = df[df["keyword"] == keyword]
    score = 0.7 * rows["modularity"] + 0.15 * rows["performance"] + 0.15 * rows["coverage"]
    # NaN scores sort last, so they must be dropped before picking the maximum.
    score = score.dropna()
    if score.empty:
        raise ValueError(f"no scored rows for keyword {keyword!r}")
    best = rows.loc[score.idxmax()]
    return best["threshold"], best["threshold_relax"]

In [None]:
# Best (threshold, threshold_relax) pair per keyword.
# Notes carried over from the original cell (presumably the algorithm of the
# winning row — confirm): steel -> louvain, coal -> greedy, telegraph -> greedy, elec -> louvain.
(t_steel, tr_steel), (t_coal, tr_coal), (t_telegraph, tr_telegraph), (t_elec, tr_elec) = [
    find_best_thresholds(reduced_comm_df, keyword)
    for keyword in ("steel", "coal", "telegraph", "elec")
]

In [None]:
# Show the selected (threshold, threshold_relax) pair of each keyword, one pair per line.
for best_pair in (
    (t_steel, tr_steel),
    (t_coal, tr_coal),
    (t_telegraph, tr_telegraph),
    (t_elec, tr_elec),
):
    print(*best_pair)

In [None]:
# Save Graphs: one edge-list CSV per keyword, built with that keyword's best thresholds.
graph_exports = [
    ("steel", t_steel, tr_steel, "graph_steel_df.csv"),
    ("coal", t_coal, tr_coal, "graph_coal_df.csv"),
    ("telegraph", t_telegraph, tr_telegraph, "graph_telegraph_df.csv"),
    ("elec", t_elec, tr_elec, "graph_elec_df.csv"),
]
for keyword, best_t, best_tr, csv_name in graph_exports:
    keyword_topics = topics_df[topics_df["keyword"] == keyword]
    compute_graph_data(keyword_topics, best_t, best_tr).to_csv(csv_name, index=False)