 Here, we enhance the original graph by introducing additional edges based on the TF-IDF vector similarity between each pair of nodes. We approximate the nearest neighbors for each node and utilize the resulting enhanced graph to create the graph vectors. The classifier is then trained using these vectors to predict the labels for new incoming edges.

## Offline 

### Requires text_vectors.pkl, which is emitted by running D.ipynb

In [11]:
import pickle 
import pandas as pd
# Load the precomputed text vectors.  Further down, the mapping's keys are used
# as the node ids and its values as the nearest-neighbour search queries.
# NOTE(review): pickle.load can execute arbitrary code -- only open a
# text_vectors.pkl that you generated yourself.
sparsevectors={}
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)

Build the index for approximate nearest neighbors using TF-IDF vectors

Install pysparnn

git clone https://github.com/facebookresearch/pysparnn.git

cd pysparnn && python setup.py install

In [None]:
import pysparnn.cluster_index as ci
from sklearn.feature_extraction.text import TfidfVectorizer


# Read the node table, index it by the 'id' column and replace missing cells
# with empty strings.
dfdesc = (
    pd.read_csv('nodes/nodes.tsv', sep='\t', header=0)
    .set_index('id')
    .fillna("")
)

# The first remaining column is the text corpus.
corpus = dfdesc.values[:, 0]

# Fit TF-IDF on the corpus and vectorize it in the same call.
vectorizer = TfidfVectorizer()
Xtfidf = vectorizer.fit_transform(corpus)

# Index the TF-IDF rows, passing the node ids as the data attached to each row.
cp = ci.MultiClusterIndex(Xtfidf, dfdesc.index)

# Persist the index so the query step can reload it from disk.
with open('cluterIndex.bin', 'wb') as fh_out:
    pickle.dump(cp, fh_out)

Load Index and perform queries to build new edges (and store them in new_edges.txt)

In [None]:
def getsimilartups(idn):
    """Append the close approximate neighbours of node ``idn`` to new_edges.txt.

    The module-level index ``cp2`` is searched with ``sparsevectors[idn]``
    (``k=25``, ``k_clusters=1``).  The first hit of the result is skipped; of
    the remaining hits, only those whose first field (used here as the
    distance) is below 0.90 are kept.  Every kept hit is written as one
    ``<hit id>,<idn>`` line.  Nothing is returned.
    """
    hits = cp2.search(sparsevectors[idn], k=25, k_clusters=1, return_distance=True)[0]
    lines = [f"{hit[1]},{idn}\n" for hit in hits[1:] if hit[0] < 0.90]
    with open("new_edges.txt", "a+") as out:
        out.writelines(lines)

# Reload the persisted index; getsimilartups() reads it through the
# module-level name cp2.  The file is closed again before the pool starts.
# NOTE(review): pickle.load can execute arbitrary code -- only load an index
# file that this notebook wrote itself.
with open('cluterIndex.bin', 'rb') as file_:
    cp2 =pickle.load(file_)

import time
import warnings
from multiprocessing import Pool

warnings.filterwarnings("ignore")

# Use `avpool` as the worker count when an earlier cell defined it; otherwise
# pass None, which lets Pool pick its default (one worker per available CPU).
n_workers = globals().get("avpool")

start = time.time()
with Pool(n_workers) as p:
    # Each call appends its own lines to new_edges.txt and returns None.
    data=p.map(getsimilartups, list(sparsevectors.keys()))
end = time.time()
# Elapsed wall-clock seconds for the whole search.
print(end - start)

Build the enhanced graph using new_edges.txt

In [13]:
import pickle 
import networkx as nx
import pandas as pd

# Labelled training pairs; df_edges_all starts as just their two id columns.
df_edges = pd.read_csv('train.csv')
df_edges_all = df_edges[['id1', 'id2']]

# Text vectors; their keys become the node list below.
sparsevectors = {}
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)

# Load the similarity edges from the txt file (new_edges.txt): headerless,
# two comma-separated columns, both cast to int.
dfouredges = pd.read_csv("new_edges.txt", header=None, names=["id1", "id2"])
for col in ("id1", "id2"):
    dfouredges[col] = dfouredges[col].astype(int)

# Training pairs followed by the similarity pairs, re-indexed from 0.
df_edges_all = pd.concat([df_edges_all, dfouredges], ignore_index=True)

nodes = list(sparsevectors.keys())



# Undirected graph containing every node listed in `nodes`.
G = nx.Graph()
G.add_nodes_from(nodes)

# Edges from the training pairs whose label is 1.
positive_pairs = df_edges[df_edges["label"] == 1]
edges = list(zip(positive_pairs["id1"], positive_pairs["id2"]))
G.add_edges_from(edges)

# Additional edges read from new_edges.txt.
edges2 = list(zip(dfouredges["id1"], dfouredges["id2"]))
G.add_edges_from(edges2)


Define graph metrics (same as G solution).

In [14]:
import math
# Adding methods proposed in the https://link.springer.com/content/pdf/10.1038/s41598-019-57304-y.pdf
def shortest_path(G:nx.Graph,source:int,target:int)->float:
    """Return the number of edges on a shortest path from ``source`` to ``target``.

    Gives ``float("inf")`` when networkx reports that no path connects them.
    """
    try:
        return nx.shortest_path_length(G, source=source, target=target)
    except nx.NetworkXNoPath:
        return float("inf")

def distance(G:nx.Graph,source:int,target:int)->float:
    """Distance between two nodes; currently the shortest-path edge count.

    Kept as its own function so a different distance can be substituted here.
    """
    hops = shortest_path(G, source, target)
    return hops

def closeness_centrality(G:nx.Graph, source:int, target:int)->float:
    """Pairwise closeness score: node count of ``G`` divided by the distance.

    Returns 0.0 when the nodes are not connected (the distance is infinite)
    and also when the distance is 0, i.e. ``source == target``.  The latter
    mirrors the zero-distance convention of CND and avoids a
    ZeroDivisionError.
    """
    # One lookup only: shortest_path() already maps "no path" to infinity,
    # whereas calling nx.shortest_path directly here would raise
    # NetworkXNoPath for disconnected pairs.
    hops = shortest_path(G, source, target)
    if hops == 0:
        return 0.0
    return G.number_of_nodes() / hops

def common_neighbors(G:nx.Graph, source:int, target:int)->set:
    """Return the set of nodes adjacent to both ``source`` and ``target``."""
    # Iterating an adjacency view yields the neighbour ids, so the views can
    # be turned into sets directly without intermediate lists.
    return set(G.adj[source]) & set(G.adj[target])

def CCPA(G:nx.Graph, source:int, target:int,a:float = 0.5)->float:
    """Common Neighbor and Centrality based Parameterized Algorithm.

    Weighted sum of the pair's closeness score (weight ``a``) and its number
    of common neighbours (weight ``1 - a``).
    """
    centrality_term = a * closeness_centrality(G, source, target)
    neighbor_term = (1 - a) * len(common_neighbors(G, source, target))
    return centrality_term + neighbor_term

def CND(G:nx.Graph, source:int, target:int)->float:
    """Common Neighbor and Distance score.

    With at least one common neighbour the score is ``(count + 1) / 2``.
    Otherwise it is the reciprocal of the distance, which is 0.0 for an
    unreachable pair (infinite distance); a distance of 0 yields 0.
    """
    shared = len(common_neighbors(G, source, target))
    if shared > 0:
        return (shared + 1) / 2
    # Look the distance up once: every call runs a shortest-path search.
    dist = distance(G, source, target)
    if dist == 0:
        return 0
    return 1 / dist

def PA(G:nx.Graph, source:int, target:int)->float:
    """Preferential Attachment: product of the two nodes' degrees."""
    deg = G.degree
    return deg[source] * deg[target]

def AA(G:nx.Graph, source:int, target:int)->float:
    """Adamic Adar: sum of ``1 / log(degree)`` over the common neighbours.

    Neighbours whose degree is 1 or less are skipped (``log(1)`` is 0).
    """
    score = 0.0
    for node in common_neighbors(G, source, target):
        deg = G.degree[node]
        if deg <= 1:
            continue
        score += 1 / math.log(deg)
    return score

def CN(G:nx.Graph, source:int, target:int)->float:
    """Common Neighbor: how many nodes are adjacent to both endpoints."""
    shared = common_neighbors(G, source, target)
    return len(shared)

def SI(G:nx.Graph, source:int, target:int)->float:
    """Sorensen Index: twice the common-neighbour count over the degree sum.

    Returns 0 when the degree sum is 0.
    """
    degree_sum = G.degree[source] + G.degree[target]
    if degree_sum == 0:
        return 0
    return 2 * CN(G, source, target) / degree_sum

def JI(G:nx.Graph, source:int, target:int)->float:
    """Jaccard Index: shared neighbours divided by all neighbours of the pair.

    Returns 0 when neither node has any neighbour.
    """
    src_side = set(G.adj[source])
    dst_side = set(G.adj[target])
    either = src_side | dst_side
    if not either:
        return 0
    return len(src_side & dst_side) / len(either)

def RA(G:nx.Graph, source:int, target:int)->float:
    """Resource Allocation: sum of ``1 / degree`` over the common neighbours.

    Only neighbours whose degree exceeds 1 contribute to the sum.
    """
    score = 0.0
    for node in common_neighbors(G, source, target):
        deg = G.degree[node]
        if deg <= 1:
            continue
        score += 1 / deg
    return score

def HPI(G:nx.Graph, source:int, target:int)->float:
    """Hub Promoted Index: common-neighbour count over the smaller degree.

    Returns 0 when the smaller of the two degrees is 0.
    """
    smaller = min(G.degree[source], G.degree[target])
    if smaller == 0:
        return 0
    return CN(G, source, target) / smaller

def create_metric_vector(tup):
    """Compute the metric vector for one candidate edge.

    ``tup`` is indexed as ``(graph, source, target)``.  The result is a list
    holding the CND, PA, AA, CN, SI, JI, RA and HPI scores, in that order.
    """
    G, source, target = tup[0], tup[1], tup[2]
    metrics = (CND, PA, AA, CN, SI, JI, RA, HPI)
    return [metric(G, source, target) for metric in metrics]
from multiprocessing import Pool


def makedatasetapply(df):
    """Build the metric table and the label column for a labelled edge frame.

    Each row's ``id1``/``id2`` pair is scored against the module-level graph
    ``G`` and the per-row metric list is expanded into columns.

    Returns ``(metrics_frame, labels)`` where ``labels`` is ``df.label``.
    """
    labels = df.label

    def row_metrics(row):
        return create_metric_vector((G, row['id1'], row['id2']))

    metrics_frame = df.apply(row_metrics, axis=1, result_type='expand')
    return metrics_frame, labels

Create vectors for training data using enhanced graph and store them

In [9]:
# Score every training pair against the enhanced graph.
X, Y = makedatasetapply(df_edges)

# Store the metrics with the label as an extra column so that training can
# start from the file.
Xcopy = X.assign(label=Y)
Xcopy.to_csv("enhance_vectors_with_labels.csv", index=False)

Train classifier 

In [8]:
# Reload the stored training table, then separate the labels from the metrics.
Xcopy = pd.read_csv("enhance_vectors_with_labels.csv")
Y = Xcopy["label"]
X = Xcopy.drop(columns=["label"])

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Classifier created with no arguments, i.e. the library's default settings.
model = XGBClassifier()

# Fit on the whole stored training table; no hold-out split is made here.
model.fit(X, Y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

## Online

Create test vectors

In [15]:
def makedataset_TEST(df):
    """Build the metric table for an unlabelled edge frame.

    Each row's ``id1``/``id2`` pair is scored against the module-level graph
    ``G`` and the per-row metric list is expanded into columns.
    """
    def row_metrics(row):
        return create_metric_vector((G, row['id1'], row['id2']))

    return df.apply(row_metrics, axis=1, result_type='expand')

# Candidate pairs to score, read from test.csv.
df_edges_all_test=pd.read_csv('test.csv')
# One metric vector per test row, computed against the graph G.
X_test=makedataset_TEST(df_edges_all_test)


In [18]:
# Write a copy of the test metric table to disk (no index column).
Xcopy=X_test.copy()
Xcopy.to_csv("enhance_vectors_test.csv",index=False)

produce predictions

In [16]:
# Predicted label for every test row, from the fitted classifier.
preds = model.predict(X_test)

Store them

In [17]:
# Pair each test row's id with its predicted label and write the submission.
test_ids = df_edges_all_test["id"].values
dfexpe = pd.DataFrame(zip(test_ids, preds), columns=["id", "label"])
dfexpe.to_csv("enhance_graph_submission.csv", index=False)