## load sparse vectors

In [4]:
import pickle 
sparsevectors={}
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)

## load Data (and keep trai and test set 0.8/0.2 )

In [5]:
#Import train dataset
import pandas as pd

df_edges_all=pd.read_csv('train.csv')

df_edges= df_edges_all.iloc[:int(len(df_edges_all)*0.8)]
df_edges_test = df_edges_all.iloc[int(len(df_edges_all)*0.8):]

## Build Graph

In [6]:
# The existing graph
import networkx as nx
G= nx.Graph()
#add nodes
nodes = pd.unique(df_edges_all[['id1', 'id2']].values.ravel())
G.add_nodes_from(nodes)
#add edges
edges = [(row.id1,row.id2) for row in df_edges[df_edges["label"]==1].itertuples()]
G.add_edges_from(edges)

# keeping all the negative edges so we can compare results
no_edges = [(row.id1,row.id2) for row in df_edges[df_edges["label"]==0].itertuples()]

# Feature Engineering (Graph and text based)

In [7]:
import math
# Adding methods proposed in the https://link.springer.com/content/pdf/10.1038/s41598-019-57304-y.pdf
def shortest_path(G:nx.Graph,source:int,target:int)->float:
    try:
        p=nx.shortest_path(G,source=source,target=target)
        return len(p)-1
    except nx.NetworkXNoPath:
        return float("inf")

def distance(G:nx.Graph,source:int,target:int)->float:
   '''
   This method can be changed to any distance
   '''
   return shortest_path(G,source,target)

from sklearn.metrics.pairwise import cosine_similarity
def distance_description(G,source:int,target:int)->float:
    '''
    This method can be changed to any distance
    '''
    
    
    sp1=sparsevectors[source]
    sp2=sparsevectors[target]
    
    return cosine_similarity(sp1,sp2)[0][0]


def closeness_centrality(G:nx.Graph, source:int, target:int)->float:
    dxy = nx.shortest_path(G,source=source,target=target)
    return G.number_of_nodes()/shortest_path(G,source,target)

def common_neighbors(G:nx.Graph, source:int, target:int)->list:
    s_neighbors = list(G.adj[source].keys())
    t_neighbors = list(G.adj[target].keys())
    common_neighbors = set(s_neighbors).intersection(t_neighbors)
    return common_neighbors

def CCPA(G:nx.Graph, source:int, target:int,a:float = 0.5)->float:
    '''
    Common Neighbor and Centrality based Parameterized Algorithm
    '''
    return a*closeness_centrality(G,source,target)+(1-a)*len(common_neighbors(G,source,target))

def CND(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor and Distance
    '''
    cn = common_neighbors(G,source,target)
    if len(cn)>0:
        return (len(cn)+1)/2
    else:
        if distance(G,source,target)==0:
            return 0
        return 1/distance(G,source,target)

def PA(G:nx.Graph, source:int, target:int)->float:
    '''
    Preferential Attachment
    '''
    return G.degree[source]*G.degree[target]

def AA(G:nx.Graph, source:int, target:int)->float:
    '''
    Adamic Adar
    '''
    similarity = 0.0
    for neighbor in common_neighbors(G,source,target):
        degree = G.degree[neighbor]
        if degree > 1:
            similarity += 1 / math.log(degree)
    return similarity

def CN(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor
    '''
    return len(common_neighbors(G,source,target))

def SI(G:nx.Graph, source:int, target:int)->float:
    '''
    Sorensen Index
    '''
    if (G.degree[source]+G.degree[target]) ==0:
        return 0
    return 2*CN(G,source,target)/(G.degree[source]+G.degree[target])

def JI(G:nx.Graph, source:int, target:int)->float:
    '''
    Jaccard Index
    '''
    s_neighbors = list(G.adj[source].keys())
    t_neighbors = list(G.adj[target].keys())
    common_neighbors = set(s_neighbors).intersection(t_neighbors)
    all_neighbors = set(s_neighbors).union(t_neighbors)
    if len(all_neighbors)==0:
        return 0
    return len(common_neighbors)/len(all_neighbors)

def RA(G:nx.Graph, source:int, target:int)->float:
    '''
    Resource Allocation
    '''
    similarity = 0.0
    for neighbor in common_neighbors(G,source,target):
        degree = G.degree[neighbor]
        if degree > 1:
            similarity += 1 / degree
    return similarity

def HPI(G:nx.Graph, source:int, target:int)->float:
    '''
    Hub Promoted Index
    '''
    if min([G.degree[source],G.degree[target]]) ==0 :
        return 0
    return CN(G,source,target)/min([G.degree[source],G.degree[target]])

def create_metric_vector(tup):
    G=tup[0]
    source= tup[1]
    target= tup[2]
    functions = [CND, PA, AA, CN, SI, JI, RA, HPI,distance_description]
    outputs = []
    for func in functions:
        output = func(G, source, target)
        outputs.append(output)
    return outputs

# Build Dataset

In [13]:
from multiprocessing import Pool
from tqdm import tqdm

def makedataset(df):
    label=df.label
    data=[]
    allind=len(df.index)
    counter=0
    tups=[]
    for i in tqdm(range(len(df.index))):
        row = df.iloc[i]

        tups.append((G,row.id1,row.id2))
    with Pool(14) as p:
        data=p.map(create_metric_vector, tups)
    
    datadf=pd.DataFrame(data)
    return datadf,label

def makedatasetapply(df):
    label=df.label
    datadf = df.progress_apply(lambda row: create_metric_vector((G,row['id1'],row['id2'])), axis=1,result_type='expand')
    return datadf,label

## Build test and Train 
In case of tqdm used:
!pip install tqdm

In [14]:
import time

start = time.time()
Xtest,Ytest=makedataset(df_edges_test)
end = time.time()
print(f" Test data ready after: {end - start}")



100%|████████████████████████████| 189647/189647 [00:06<00:00, 30079.56it/s]


 Test data ready after: 186.9898030757904


In [15]:
start = time.time()
X,Y=makedataset(df_edges)
end = time.time()
print(f" Train data ready after: {end - start}")

100%|████████████████████████████| 758585/758585 [00:24<00:00, 30638.39it/s]


 Train data ready after: 1263.8054313659668


# Fit Classifier and predic

In [19]:
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

model = XGBClassifier()

#X_train, X_test, y_trainn, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
model.fit(X.values, Y.values)
preds = model.predict(Xtest.values)
f1=f1_score(Ytest.values, preds)
print(f1)

0.007842600825296947


In [20]:
print(Ytest[:10])
print(Xtest[:10])

758585    1
758586    1
758587    0
758588    1
758589    0
758590    1
758591    1
758592    0
758593    0
758594    1
Name: label, dtype: int64
     0  1    2  3    4    5    6    7         8
0  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.233860
1  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.367158
2  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.015094
3  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.503240
4  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.001706
5  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.351744
6  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.061002
7  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.005097
8  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.041449
9  0.0  0  0.0  0  0.0  0.0  0.0  0.0  0.204484


# SIMPLE TF IDF

In [31]:
mypreds=[vec[-1]>0.0505 for vec in Xtest.values]

f1=f1_score(Ytest.values, mypreds)
print(f1)

0.9323874044136888
