In [9]:
# Pin the version that this notebook was run with so a fresh runtime reproduces
# the same environment; %pip installs into the environment of the running kernel.
%pip install node2vec==0.4.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting node2vec
  Downloading node2vec-0.4.3.tar.gz (4.6 kB)
Building wheels for collected packages: node2vec
  Building wheel for node2vec (setup.py) ... [?25l[?25hdone
  Created wheel for node2vec: filename=node2vec-0.4.3-py3-none-any.whl size=5980 sha256=18736ed321e9d5ed8a01ab937b7b1d0363b6eee47295b70329d93285b2c87d06
  Stored in directory: /root/.cache/pip/wheels/07/62/78/5202cb8c03cbf1593b48a8a442fca8ceec2a8c80e22318bae9
Successfully built node2vec
Installing collected packages: node2vec
Successfully installed node2vec-0.4.3


In [10]:
import networkx as nx
from node2vec import Node2Vec as n2v

In [11]:
def run_node2vec(G: nx.Graph, emb_dimension = 64, WINDOW=10, MIN_COUNT=1, WALK_LENGTH=16, NUM_WALKS = 2000, weight_key = 'tweet_id', workers = 1, seed = None):
  '''
  Train node2vec embeddings on a networkx graph and return the fitted model.

  Parameters:
    G: graph handed to Node2Vec.
    emb_dimension: forwarded to Node2Vec as `dimensions`.
    WINDOW: forwarded to `fit` as `window`.
    MIN_COUNT: forwarded to `fit` as `min_count`.
    WALK_LENGTH: forwarded to Node2Vec as `walk_length`.
    NUM_WALKS: forwarded to Node2Vec as `num_walks`.
    weight_key: name of the edge attribute Node2Vec should read as the edge weight.
      NOTE(review): edges lacking this attribute presumably fall back to a
      default weight inside node2vec - confirm against the library.
    workers: forwarded to Node2Vec as `workers`; the default of 1 matches the
      behaviour when the argument is not passed at all.
    seed: optional seed forwarded to Node2Vec; left out of the call entirely
      when None so the default behaviour is unchanged.

  Returns:
    The object returned by `Node2Vec.fit`; the helper functions in this
    notebook read its `.wv` attribute.
  '''
  n2v_kwargs = dict(dimensions=emb_dimension, walk_length=WALK_LENGTH, num_walks=NUM_WALKS,
                    weight_key=weight_key, workers=workers)
  if seed is not None:
    # Only forwarded on request, so callers that never set a seed hit the same code path as before.
    n2v_kwargs['seed'] = seed
  g_emb = n2v(G, **n2v_kwargs)
  model = g_emb.fit(window=WINDOW, min_count=MIN_COUNT)
  return model

In [12]:
def get_node_embedding(n2v_model, node):
  '''
  Return the vector stored in `n2v_model.wv` for `node`.

  The node identifier is converted with str() before the lookup.
  '''
  lookup = n2v_model.wv.get_vector
  node_key = str(node)
  return lookup(node_key)
def get_most_similar_nodes(n2v_model, node, number_of_similar_nodes = 10):
  '''
  Return the `number_of_similar_nodes` entries most similar to `node`.

  The node identifier is converted with str() before the lookup, exactly as
  get_node_embedding does, so both helpers accept the same node objects.
  A list is passed through untouched: a list cannot be a graph node, so it
  can only be a caller deliberately using the multi-key form of `most_similar`.

  Returns:
    list of whatever `n2v_model.wv.most_similar` yields (the recorded output
    in this notebook shows (node, similarity) pairs).
  '''
  key = node if isinstance(node, list) else str(node)
  return list(n2v_model.wv.most_similar(key, topn = number_of_similar_nodes))

# Run node2vec on the misinfo data

In [2]:
import os
import numpy as np
import pandas as pd
import networkx as nx

In [3]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# Directory holding the preprocessed data. The path is relative, so it only
# resolves under the Drive mount if the working directory is /content
# (presumably the Colab default - verify when running elsewhere).
path_to_data_dir = 'drive/MyDrive/master deg/NLP_And_Social_Dynamics/Data/preprocessed_data'
interactions_df_name = 'weighted_interactions.csv'
interactions_df_path = os.path.join(path_to_data_dir, interactions_df_name)
# Load the interactions table from CSV.
interactions_df = pd.read_csv(interactions_df_path)

In [15]:
# Inspect the loaded interactions table.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the recorded output
# look like index columns saved into the CSV - presumably upstream; confirm.
interactions_df

Unnamed: 0.1,Unnamed: 0,userid_hash,target_id,tweet_id
0,0,02fdbbce9f36a93ed8056e0a1b02a857a0bce32e7d96d0...,3177941678,3
1,1,02fdbbce9f36a93ed8056e0a1b02a857a0bce32e7d96d0...,330564612,1
2,2,0994abf9fb8fe1bf699d0e101e9603b30f369e94a0eec1...,0994abf9fb8fe1bf699d0e101e9603b30f369e94a0eec1...,1
3,3,0994abf9fb8fe1bf699d0e101e9603b30f369e94a0eec1...,0e45d2af1bbca512ab1d8fa1c2216c038f9627bb773ab4...,1
4,4,0994abf9fb8fe1bf699d0e101e9603b30f369e94a0eec1...,100638300,7
...,...,...,...,...
80543,80543,fd1c978b6d412419d37b646d5f0ac359db4b38f4879b0e...,26642006,1
80544,80544,fd1c978b6d412419d37b646d5f0ac359db4b38f4879b0e...,29611918,1
80545,80545,fd1c978b6d412419d37b646d5f0ac359db4b38f4879b0e...,480930665,1
80546,80546,fd1c978b6d412419d37b646d5f0ac359db4b38f4879b0e...,74773917,1


In [18]:
# Directed graph built from the interactions table: edges run from
# 'userid_hash' to 'target_id' and carry the 'tweet_id' column as an edge attribute.
trolls_network = nx.from_pandas_edgelist(
  interactions_df,
  source='userid_hash',
  target='target_id',
  edge_attr=['tweet_id'],
  create_using=nx.DiGraph(),
)

In [19]:
# Train node2vec on the interaction graph using run_node2vec's defaults
# (emb_dimension=64, WALK_LENGTH=16, NUM_WALKS=2000, weight_key='tweet_id').
model = run_node2vec(trolls_network)

Computing transition probabilities:   0%|          | 0/43269 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2000/2000 [05:10<00:00,  6.44it/s]


In [20]:
# Build one row per graph node: [node id, emb_1, ..., emb_d].
# The number of embedding columns is read from the trained model instead of
# being hard-coded, so this cell keeps working when emb_dimension changes.
emb_dim = model.wv.vector_size
cols = ['user'] + [f'emb_{i}' for i in range(1, emb_dim + 1)]
df_lst = [[node] + list(get_node_embedding(model, node)) for node in trolls_network.nodes]
emb_df = pd.DataFrame(df_lst, columns = cols)
# NOTE(review): the row index is written as an extra unnamed first column; this is
# kept as-is because readers of node_embeddings.csv may rely on the current layout.
emb_df.to_csv(os.path.join(path_to_data_dir, 'node_embeddings.csv'))

65

# A test — it's not relevant for our project

In [None]:
import urllib.request
import io
import zipfile

import matplotlib.pyplot as plt

url = "http://www-personal.umich.edu/~mejn/netdata/football.zip"

# Download the whole archive into memory; the `with` block closes the
# connection even when the read raises.
with urllib.request.urlopen(url) as sock:  # open URL
  s = io.BytesIO(sock.read())  # read into BytesIO "file"

# The ZipFile is likewise closed as soon as both members have been read.
with zipfile.ZipFile(s) as zf:  # zipfile object
  txt = zf.read("football.txt").decode()  # read info file
  gml = zf.read("football.gml").decode()  # read gml data
# throw away bogus first line with # from mejn files
gml = gml.split("\n")[1:]
G = nx.parse_gml(gml)  # parse gml data

In [None]:
# List the nodes of the parsed football graph.
G.nodes()

NodeView(('BrighamYoung', 'FloridaState', 'Iowa', 'KansasState', 'NewMexico', 'TexasTech', 'PennState', 'SouthernCalifornia', 'ArizonaState', 'SanDiegoState', 'Baylor', 'NorthTexas', 'NorthernIllinois', 'Northwestern', 'WesternMichigan', 'Wisconsin', 'Wyoming', 'Auburn', 'Akron', 'VirginiaTech', 'Alabama', 'UCLA', 'Arizona', 'Utah', 'ArkansasState', 'NorthCarolinaState', 'BallState', 'Florida', 'BoiseState', 'BostonCollege', 'WestVirginia', 'BowlingGreenState', 'Michigan', 'Virginia', 'Buffalo', 'Syracuse', 'CentralFlorida', 'GeorgiaTech', 'CentralMichigan', 'Purdue', 'Colorado', 'ColoradoState', 'Connecticut', 'EasternMichigan', 'EastCarolina', 'Duke', 'FresnoState', 'OhioState', 'Houston', 'Rice', 'Idaho', 'Washington', 'Kansas', 'SouthernMethodist', 'Kent', 'Pittsburgh', 'Kentucky', 'Louisville', 'LouisianaTech', 'LouisianaMonroe', 'Minnesota', 'MiamiOhio', 'Vanderbilt', 'MiddleTennesseeState', 'Illinois', 'MississippiState', 'Memphis', 'Nevada', 'Oregon', 'NewMexicoState', 'SouthCa

In [None]:
# Train node2vec on the football graph with run_node2vec's defaults.
# NOTE: this rebinds `model`, which earlier held the model trained on
# trolls_network; re-run that training cell before using `model` for the
# trolls graph again.
# NOTE(review): the recorded output shows 200 walks while the current default
# NUM_WALKS is 2000 - the output presumably predates a change to the defaults.
model = run_node2vec(G)

Computing transition probabilities:   0%|          | 0/115 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:15<00:00, 12.84it/s]


In [None]:
# Spot-check: embedding vector stored for the 'SouthernCalifornia' node.
get_node_embedding(model, 'SouthernCalifornia')

array([ 0.38229024, -0.09148575, -0.31526002,  0.16141161, -0.24374466,
        0.14576365, -0.16996497,  0.39103973,  0.10152379,  0.34311682,
        0.34396493,  0.00563566,  0.27069488,  0.2554789 , -0.17536959,
        0.14294732,  0.30914894,  0.22419801, -0.02432709,  0.00450403,
       -0.22786587, -0.5212519 , -0.3421619 ,  0.55607146, -0.06473765,
       -0.16062392, -0.00653969, -0.10622738, -0.31545305,  0.4738114 ,
        0.04440127, -0.04152583, -0.3643993 , -0.14808053, -0.15176737,
       -0.05594526,  0.047637  , -0.15302071,  0.05621442,  0.00194352,
        0.4720466 , -0.33625117, -0.04623657, -0.18353303,  0.3659258 ,
       -0.07202698,  0.23674083, -0.42837492, -0.10324931, -0.10424562,
        0.0594402 , -0.21324578, -0.06334193, -0.23506409,  0.01149494,
        0.01001751, -0.5453423 ,  0.06866325,  0.2717852 ,  0.10835633,
       -0.31819394, -0.040488  , -0.00620449,  0.16011234], dtype=float32)

In [None]:
# Spot-check: the 10 nodes (the helper's default) most similar to 'Washington'.
get_most_similar_nodes(model, 'Washington')

[('WashingtonState', 0.757064163684845),
 ('ArizonaState', 0.7516475915908813),
 ('SouthernCalifornia', 0.7418372631072998),
 ('Oregon', 0.7299061417579651),
 ('OregonState', 0.7204449772834778),
 ('Stanford', 0.7150375247001648),
 ('Arizona', 0.701085090637207),
 ('California', 0.6790341734886169),
 ('UCLA', 0.6449398398399353),
 ('Idaho', 0.5427091121673584)]

In [None]:
# Small hand-built directed graph: six string-labelled nodes and six weighted
# edges, all leaving node '1' or node '4'.
DG = nx.DiGraph()
DG.add_nodes_from(str(label) for label in range(1, 7))
toy_weighted_edges = [
  ('1', '2', 0.1), ('1', '3', 0.9), ('1', '6', 0.8),
  ('4', '2', 0.7), ('4', '3', 0.2), ('4', '5', 0.2),
]
DG.add_weighted_edges_from(toy_weighted_edges)

In [None]:
# add_weighted_edges_from stores each weight under the 'weight' edge attribute,
# whereas run_node2vec defaults to weight_key='tweet_id'; name the attribute
# explicitly so the walks actually use the weights defined on DG.
D_model = run_node2vec(DG, weight_key='weight')

Computing transition probabilities:   0%|          | 0/6 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:00<00:00, 4065.09it/s]


In [None]:
# Spot-check: embedding vector stored for node '1' of the toy graph.
# NOTE(review): the recorded output has 16 values while run_node2vec's current
# default emb_dimension is 64 - the output presumably predates that default.
get_node_embedding(D_model, '1')

array([ 0.01299362, -0.00031729, -0.01831908, -0.01978118,  0.0192002 ,
       -0.02572578,  0.01713924,  0.01151392, -0.01446222, -0.02596478,
       -0.02611492,  0.03103474,  0.00239234, -0.01118409, -0.00340295,
        0.02253726], dtype=float32)