In [12]:
import networkx as nx
import pandas as pd
from stellargraph.data import EdgeSplitter
from networkx import write_edgelist

In [6]:
# Load the tab-separated edge table and give its two columns canonical names.
# NOTE(review): read_csv treats the first line of the file as a header row, and
# that header is then overwritten below. Confirm edges.csv really has a header
# line; if it does not, the first edge is silently consumed as column names.
edges = pd.read_csv("edges.csv", delimiter="\t")
edges.columns = ["source", "target"]

## Extract biggest connected component

In [9]:
# Build an undirected graph whose edges are the (source, target) rows of `edges`.
G = nx.Graph()
G.add_edges_from(edges.values.tolist())

# Order the connected components (sets of nodes) from largest to smallest.
# list.sort is stable, so ties keep the order in which networkx yielded them.
Gcc = list(nx.connected_components(G))
Gcc.sort(key=len, reverse=True)

# G0 is the subgraph of G induced by the nodes of the largest component.
G0 = G.subgraph(Gcc[0])

## Split in train and test

From the largest connected component G0 of the original graph G, extract a randomly sampled subset of test edges (true and false citation links) and the reduced graph G_test with the positive test edges removed.

The Test Graph is the reduced graph we obtain from removing the test set of links from the full graph.

In [10]:
# Hold out a fraction p=0.1 of the edges of G0 as positive test links; the
# splitter also samples the same number of negative (non-existent) links.
# G_test is G0 with the positive test edges removed; keep_connected=True asks
# the splitter to remove edges only where the reduced graph stays connected.
# The split is random, so a fixed seed is passed to make Restart & Run All
# reproduce the same G_test / test edges (and therefore the same saved files).
edge_splitter_test = EdgeSplitter(G0)
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=42
)

** Sampled 85353 positive and 85353 negative edges. **


The reduced graph G_test, together with the test ground truth set of links (edge_ids_test, edge_labels_test), will be used for testing the model.

 From the reduced graph G_test, extract a randomly sampled subset of train edges (true and false citation links) and the reduced graph G_train with the positive train edges removed:

In [17]:
# Repeat the procedure on G_test: hold out a fraction p=0.2 of its edges as
# positive train links, together with the same number of sampled negative links.
# G_train is G_test with the positive train edges removed; keep_connected=True
# asks the splitter to remove edges only where the reduced graph stays connected.
# The split is random, so a fixed seed is passed to make Restart & Run All
# reproduce the same G_train / train edges (and therefore the same saved files).
edge_splitter_train = EdgeSplitter(G_test)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True, seed=42
)

** Sampled 153637 positive and 153637 negative edges. **


G_train, together with the train ground truth set of links (edge_ids_train, edge_labels_train), will be used for training the model.

G_train and G_test have the same set of nodes, only differing in their edge sets.


**Features for the train_set need to be computed on G_train, while features for the test_set need to be computed on G_test**

In [14]:
# Optional: gather the sampled test edges and their labels into one DataFrame.
# (This is not required; in the Kaggle work, for example, the features were
# built directly from the lists.)
test_edges = pd.DataFrame(edge_ids_test, columns=["Source", "Target"]).assign(
    Edge=edge_labels_test
)
# Preview the first five rows.
test_edges.head()

Unnamed: 0,Source,Target,Edge
0,rGBhluYrbzU,XZ8tdf1gyHI,1
1,ClQHYZqW_Ns,fvjGMoE7oSU,1
2,EfvQAu6NRrk,1Ex1ZcQJEv8,1
3,3jiTk_diJCw,PshwMfxvI-I,1
4,KTNXjOsXpUc,QsHbd8nPFbc,1


In [15]:
# Gather the sampled train edges and their labels into one DataFrame:
# one row per edge, with its two endpoints and the label in the "Edge" column.
train_edges = pd.DataFrame(edge_ids_train, columns=["Source", "Target"]).assign(
    Edge=edge_labels_train
)
# Preview the first five rows.
train_edges.head()

Unnamed: 0,Source,Target,Edge
0,xuOtyEjQ-v4,WUvCeCflehk,1
1,-V1Nw7kIY7o,WNsrs_cjZH8,1
2,zhdUqcvcdDY,3xdO4n-Vk4Y,1
3,RVbDwqSq4Ec,J7X2ieNiixE,1
4,aKWP6f_Wvoc,EQbLNEiM17M,1


In [16]:
# Persist everything needed downstream.
# Labelled edge samples (the DataFrame index is written as the first column).
train_edges.to_csv("train_set.csv")
test_edges.to_csv("test_set.csv")

# Reduced graphs as comma-separated edge lists, without edge attribute data.
nx.write_edgelist(G_train, "train.edgelist", delimiter=',', data=False)
nx.write_edgelist(G_test, "test.edgelist", delimiter=',', data=False)