In [26]:
import pandas as pd

# Load the filtered word co-occurrence table from CSV into a DataFrame.
df = pd.read_csv(
    "data/genetics_filtered_surface_cooccurrences.csv",
    encoding='UTF-8',
)

In [27]:
# Preview the first rows to check the column layout before building the graph.
df.head()

Unnamed: 0,Word1,Word2,Frequency
0,aacute,aacute,17.010178
1,gerais,minas,16.99355
2,aires,buenos,16.988152
3,buenos,aires,16.977012
4,compostela,santiago,16.884648


In [28]:
import numpy as np

# Getting vocab set.
# Every distinct word from either column, sorted so that each word's
# row/column position in the matrix is deterministic across runs.
nodes = sorted(set(df['Word1']).union(df['Word2']))

# Word -> matrix position. A dict lookup is O(1), whereas calling
# nodes.index(word) for every row rescans the whole vocabulary each time.
node_index = {word: position for position, word in enumerate(nodes)}

# Creating adjacency matrix.
N = len(nodes)
adjacency_matrix = np.zeros((N, N))

# Each row writes the same frequency to both (i, j) and (j, i), so the matrix
# is symmetric. Rows are applied in file order: when a pair occurs more than
# once (including in reversed order, e.g. "aires, buenos" / "buenos, aires"),
# the later row overwrites the earlier one.
# NOTE(review): confirm that last-write-wins is the intended way to merge the
# two directions of a pair.
for word1, word2, frequency in zip(df['Word1'], df['Word2'], df['Frequency']):
    i, j = node_index[word1], node_index[word2]
    adjacency_matrix[i, j] = adjacency_matrix[j, i] = frequency

print(adjacency_matrix)
    

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.         16.48934624  0.         ...  0.          0.
   0.        ]
 [ 0.          0.         17.0101784  ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ... 14.13570929  0.
   0.        ]
 [ 0.          0.          0.         ...  0.         14.96578428
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
  16.17523765]]


In [29]:
import networkx as nx

# Map each matrix position back to the word it stands for; positions follow
# the order of `nodes`, which is how the adjacency matrix was indexed.
label_mapping = dict(enumerate(nodes))

# Build a directed graph from the adjacency matrix (nodes are integer
# positions at this point), then relabel those integers with their words.
G = nx.relabel_nodes(
    nx.from_numpy_array(adjacency_matrix, create_using=nx.DiGraph()),
    label_mapping,
)

In [30]:
# Save the word graph in GraphML format; the path is relative to the
# notebook's current working directory.
nx.write_graphml(G, "graph.graphml")