In [1]:
#import all the necessary libraries and modules
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
#import re
import matplotlib.pyplot as plt
import csv
import Graph_Sampling
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [2]:
# load nodes details
with open("train.txt") as f:
    reader = csv.reader(f, delimiter = "\t")
    train_nodes = list(reader)
print(len(train_nodes))

    

20000


In [3]:
mydict = {}
for node_line in train_nodes:
    key = node_line.pop(0)
    mydict[key] = node_line




In [4]:
#form our source and target nodes list
node_list1 = []
node_list2 = []
for key,val in mydict.items():
    for i in range(len(val)):
        node_list1.append(key)
        node_list2.append(val[i])
    



In [5]:
network_df = pd.DataFrame({'source':node_list1, 'sink':node_list2})
network_df.shape

(24004361, 2)

In [6]:
#create a list of tuples where each tuple represent an edge
node_list = list(zip(network_df['source'],network_df['sink']))
#remove duplicates from list
node_list = list(set([i for i in node_list])) 


In [7]:
G_complete = nx.DiGraph()
G_complete.add_edges_from(node_list)



In [9]:
G_complete.nodes()

KeyboardInterrupt: 

In [8]:
#Metropolis Hastings Sampling using Graph_Sampling library
import Graph_Sampling

sampler = Graph_Sampling.MHRW()
initial_seed_node = '540762'
nodes_to_sample = 1000
#output is a NetworkX graph of sampled edges
G = sampler.mhrw(G_complete, initial_seed_node, nodes_to_sample)

#unzip the pairs into source and sink
source_sink = list(zip(*G.edges()))

#create a dataframe with the sampled edges
network_df_sample = pd.DataFrame({"source":source_sink[0], "sink":source_sink[1]})

MemoryError: 

In [None]:
#Random Sampling (Either run this or the above cell)
#network_df_sample = network_df.sample(frac = 0.0001)
#network_df_sample.shape
#G = nx.from_pandas_edgelist(network_df_sample,source='source',target='sink', edge_attr=None, create_using=nx.DiGraph())

In [None]:
# plot graph

plt.figure(figsize=(20,20))
pos = nx.random_layout(G, seed=23)
nx.draw(G, with_labels=False,  pos = pos, node_size = 40, alpha = 0.6, width = 0.7, connectionstyle='arc3, rad = 0.1')

plt.show()



# combine all nodes in a list
node_list = node_list1_sampled + node_list2_sampled

# remove duplicate items from the list
node_list = sorted(list(dict.fromkeys(node_list)))

# build adjacency matrix
adj_G = nx.to_numpy_matrix(G, node_list)
adj_G.shape

In [None]:
# In this block, we find all the unconnected pairs and assign the 'link' label as zero
#find unconnected pairs
gEdges = G.edges()
unconnected_pairs = set()
for a in G.nodes():
    for b in G.nodes():
        if a != b and (a,b) not in gEdges:
            unconnected_pairs.add( (a, b) )

#set -> list
for pairs in unconnected_pairs:
    pairs = list(pairs)
    
unconnected_pairs = list(unconnected_pairs)

#create negative samples aka. samples with link = 0
node_1_unlinked = [i[0] for i in unconnected_pairs]
node_2_unlinked = [i[1] for i in unconnected_pairs]

data = pd.DataFrame({'source':node_1_unlinked, 
                     'sink':node_2_unlinked})

# add target variable 'link'
data['link'] = 0

In [None]:
#In this block, we find all the droppable pairs and assign the 'link' label as one, this represents our positive samples
#Remove Links from Connected Node Pairs – Positive Samples
# make sure that in the process of dropping edges, all the nodes of the graph should remain connected.
initial_node_count = len(G.nodes)
network_df_temp = network_df_sample.copy()

# empty list to store removable links
omissible_links_index = []

for i in tqdm(network_df_sample.index.values):
  
  # remove a node pair and build a new graph
  G_temp = nx.from_pandas_edgelist(network_df_temp.drop(index = i), "source", "sink", create_using=nx.DiGraph())
  
  # check there is no spliting of graph and number of nodes is same
  if (nx.number_weakly_connected_components(G_temp) == 1) and (len(G_temp.nodes) == initial_node_count):
    omissible_links_index.append(i)
    network_df_temp = network_df_temp.drop(index = i)

In [None]:
len(omissible_links_index)

In [None]:
# create dataframe of removable edges
network_df_ghost = network_df_sample.loc[omissible_links_index]

# add the target variable 'link'
network_df_ghost['link'] = 1

# Remember data already contains edges labeled '0', now we add edges labeled '1'
data = data.append(network_df_ghost[['source', 'sink', 'link']], ignore_index=True)

In [None]:
data['link'].value_counts()

In [None]:
# drop removable edges
network_df_sample_partial = network_df_sample.drop(index=network_df_ghost.index.values)

# create a new graph after dropping the removable links
G_data = nx.from_pandas_edgelist(network_df_sample_partial, "source", "sink", create_using=nx.DiGraph())

In [None]:
# Generate walks
node2vec = Node2Vec(G_data, dimensions=100, walk_length=16, num_walks=50)

# train node2vec model
n2w_model = node2vec.fit(window=7, min_count=1)

In [None]:
#apply the trained node2vec model on each and every node pair in the dataframe ‘data’
x = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(data['source'], data['sink'])]

In [None]:
#Validate performance of model
xtrain, xtest, ytrain, ytest = train_test_split(np.array(x), data['link'], 
                                                test_size = 0.3, 
                                                random_state = 35)

In [None]:
#train and predict a logistic regression model
lr = LogisticRegression(class_weight="balanced")

lr.fit(xtrain, ytrain)

In [None]:
predictions = lr.predict_proba(xtest)

In [None]:
roc_auc_score(ytest, predictions[:,1])