In [2]:
#Import train dataset
import pandas as pd

# Read the labelled pair list exactly once; the file was previously parsed
# twice and the first result thrown away.
df_edges_all = pd.read_csv('train.csv')

# Positional split: the leading TRAIN_FRACTION of the rows is used to build
# the graph, the remaining rows are held out for testing.
TRAIN_FRACTION = 0.8
split_at = int(len(df_edges_all) * TRAIN_FRACTION)

df_edges = df_edges_all.iloc[:split_at]
df_edges_test = df_edges_all.iloc[split_at:]

df_edges.head()


Unnamed: 0,id,id1,id2,label
0,1,9202,9202,1
1,2,410411,460254,0
2,3,211858,312074,1
3,6,253901,504325,0
4,7,415071,63239,0


In [3]:
# Preview the first five held-out rows.
df_edges_test.head(n=5)

Unnamed: 0,id,id1,id2,label
758585,976717,345657,467621,1
758586,976719,825394,20456,1
758587,976720,720678,447520,0
758588,976721,181991,649051,1
758589,976723,663566,821902,0


In [4]:
# Node descriptions: a tab-separated file whose first line holds the column names.
dfdesc = pd.read_csv(
    "nodes/nodes.tsv",
    sep="\t",
    header=0,
)



NameError: name 'df' is not defined

In [6]:
print(dfdesc.shape)
# Index the descriptions by node id so they can be looked up with .loc[id].
# Guarded so that re-running this cell (when 'id' is already the index and no
# longer a column) does not raise KeyError.
if 'id' in dfdesc.columns:
    dfdesc = dfdesc.set_index('id')
dfdesc.head()


(837834, 2)


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1,{{infobox person | name = clayton jacobson | i...
2,a '''cobra probe''' is a device to measure the...
3,the '''harmon foundation''' was established in...
4,'''structured finance''' is a sector of financ...
5,'''al-shohada'a stadium''' is a multi-use stad...


In [21]:
# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing values with empty strings before vectorising.
dfdesc = dfdesc.fillna("")

# The corpus is the first column of the frame, one entry per row.
corpus = dfdesc.to_numpy()[:, 0]
print(corpus[0])

# Fit the vocabulary / IDF weights on the corpus and keep the resulting matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)



In [7]:
# The existing graph
import networkx as nx

G = nx.Graph()

# Nodes: every distinct id appearing in either endpoint column of the full
# pair list (training and held-out rows alike).
nodes = pd.unique(df_edges_all[['id1', 'id2']].values.ravel())
G.add_nodes_from(nodes)

# Edges: only the training pairs whose label is 1.
edges = list(
    df_edges.loc[df_edges["label"] == 1, ["id1", "id2"]]
    .itertuples(index=False, name=None)
)
G.add_edges_from(edges)

# keeping all the negative edges so we can compare results
no_edges = list(
    df_edges.loc[df_edges["label"] == 0, ["id1", "id2"]]
    .itertuples(index=False, name=None)
)

In [29]:
import math
# Adding the methods proposed in https://link.springer.com/content/pdf/10.1038/s41598-019-57304-y.pdf
def shortest_path(G:nx.Graph,source:int,target:int)->float:
    """Number of edges on a shortest path between two nodes.

    Args:
        G: graph to search.
        source: start node.
        target: end node.

    Returns:
        The path length in edges, or ``float("inf")`` when networkx reports
        that no path exists between the two nodes.
    """
    try:
        node_sequence = nx.shortest_path(G, source=source, target=target)
    except nx.NetworkXNoPath:
        return float("inf")
    else:
        # A path visiting k nodes traverses k - 1 edges.
        return len(node_sequence) - 1

def distance(G:nx.Graph,source:int,target:int)->float:
    """Distance between two nodes of ``G``.

    Currently delegates to ``shortest_path``; this is the single place to
    change if another distance measure should be used.
    """
    hops = shortest_path(G, source, target)
    return hops

from sklearn.metrics.pairwise import cosine_similarity
def distance_description(G,source:int,target:int)->float:
    """Cosine similarity between the vectorised descriptions of two nodes.

    Looks both ids up in the module-level ``dfdesc`` frame, transforms each
    lookup result with the module-level fitted ``vectorizer`` and returns the
    top-left entry of the cosine-similarity matrix of the two results.

    Args:
        G: not used; accepted so the signature matches the other metrics.
        source: id of the first node (a label of ``dfdesc``'s index).
        target: id of the second node (a label of ``dfdesc``'s index).
    """
    source_vec = vectorizer.transform(dfdesc.loc[source])
    target_vec = vectorizer.transform(dfdesc.loc[target])

    similarity_matrix = cosine_similarity(source_vec, target_vec)
    return similarity_matrix[0][0]


def closeness_centrality(G:nx.Graph, source:int, target:int)->float:
    """Pairwise closeness term: number of nodes divided by the pair's distance.

    Args:
        G: graph containing both nodes.
        source: first node.
        target: second node.

    Returns:
        ``G.number_of_nodes() / d`` where ``d`` is the value returned by
        ``shortest_path``. Because that helper yields ``inf`` for disconnected
        pairs, the result is ``0.0`` for them. When the distance is zero
        (``source`` and ``target`` are the same node) ``float("inf")`` is
        returned instead of raising ``ZeroDivisionError``.
    """
    # Use only the helper: it already converts "no path" into inf. The former
    # extra nx.shortest_path call was unused and raised NetworkXNoPath for
    # disconnected pairs before the helper could handle them.
    hops = shortest_path(G, source, target)
    if hops == 0:
        return float("inf")
    return G.number_of_nodes() / hops

def common_neighbors(G:nx.Graph, source:int, target:int)->set:
    """Nodes adjacent to both ``source`` and ``target``.

    Args:
        G: graph exposing an ``adj`` mapping from node to its neighbours.
        source: first node.
        target: second node.

    Returns:
        A set with the neighbours shared by the two nodes (empty when they
        have none in common).
    """
    # Iterating an adjacency mapping yields the neighbour ids, so the sets can
    # be built directly without intermediate lists.
    return set(G.adj[source]) & set(G.adj[target])

def CCPA(G:nx.Graph, source:int, target:int,a:float = 0.5)->float:
    """Common Neighbor and Centrality based Parameterized Algorithm.

    Weighted sum of the pair's closeness term and its common-neighbour count.

    Args:
        G: graph containing both nodes.
        source: first node.
        target: second node.
        a: weight of the closeness term; the common-neighbour count is
            weighted by ``1 - a``.
    """
    centrality_part = a * closeness_centrality(G, source, target)
    neighbour_part = (1 - a) * len(common_neighbors(G, source, target))
    return centrality_part + neighbour_part

def CND(G:nx.Graph, source:int, target:int)->float:
    """Common Neighbor and Distance score.

    Args:
        G: graph containing both nodes.
        source: first node.
        target: second node.

    Returns:
        ``(c + 1) / 2`` when the nodes share ``c > 0`` neighbours; otherwise
        the reciprocal of their distance. Since ``distance`` yields ``inf``
        for disconnected pairs, those score ``0.0``. A distance of zero
        (``source`` and ``target`` are the same node and it has no
        neighbours) yields ``float("inf")`` instead of raising
        ``ZeroDivisionError``.
    """
    shared = common_neighbors(G, source, target)
    if len(shared) > 0:
        return (len(shared) + 1) / 2

    hops = distance(G, source, target)
    if hops == 0:
        # Same node on both ends: 1/0 is undefined, report the limit value.
        return float("inf")
    return 1 / hops

def PA(G:nx.Graph, source:int, target:int)->float:
    """Preferential Attachment: product of the two nodes' degrees."""
    source_degree = G.degree[source]
    target_degree = G.degree[target]
    return source_degree * target_degree

def AA(G:nx.Graph, source:int, target:int)->float:
    """Adamic Adar index.

    Sums ``1 / log(degree)`` over the common neighbours of the two nodes,
    ignoring any common neighbour whose degree is not greater than 1.
    """
    score = 0.0
    for shared_node in common_neighbors(G, source, target):
        shared_degree = G.degree[shared_node]
        if shared_degree <= 1:
            # log(1) == 0 would make the reciprocal undefined.
            continue
        score += 1 / math.log(shared_degree)
    return score

def CN(G:nx.Graph, source:int, target:int)->float:
    """Common Neighbor: how many neighbours the two nodes share."""
    shared = common_neighbors(G, source, target)
    return len(shared)

def SI(G:nx.Graph, source:int, target:int)->float:
    """Sorensen Index.

    Twice the common-neighbour count divided by the sum of the two degrees;
    ``0`` when that sum is zero.
    """
    degree_total = G.degree[source] + G.degree[target]
    if degree_total == 0:
        return 0
    return 2 * CN(G, source, target) / degree_total

def JI(G:nx.Graph, source:int, target:int)->float:
    """Jaccard Index.

    Size of the intersection of the two neighbour sets divided by the size of
    their union; ``0`` when neither node has any neighbour.
    """
    source_adj = set(G.adj[source])
    target_adj = set(G.adj[target])

    union = source_adj | target_adj
    if not union:
        return 0

    overlap = source_adj & target_adj
    return len(overlap) / len(union)

def RA(G:nx.Graph, source:int, target:int)->float:
    """Resource Allocation index.

    Sums ``1 / degree`` over the common neighbours of the two nodes, ignoring
    any common neighbour whose degree is not greater than 1.
    """
    score = 0.0
    for shared_node in common_neighbors(G, source, target):
        shared_degree = G.degree[shared_node]
        if shared_degree <= 1:
            continue
        score += 1 / shared_degree
    return score

def HPI(G:nx.Graph, source:int, target:int)->float:
    """Hub Promoted Index.

    Common-neighbour count divided by the smaller of the two degrees; ``0``
    when that smaller degree is zero.
    """
    smaller_degree = min(G.degree[source], G.degree[target])
    if smaller_degree == 0:
        return 0
    return CN(G, source, target) / smaller_degree

def create_metric_vector(G: nx.Graph, source: int, target: int):
    """Evaluate every pair metric for ``(source, target)``.

    Returns:
        A list with one value per metric, in this fixed order:
        CND, PA, AA, CN, SI, JI, RA, HPI, distance_description.
    """
    metrics = (CND, PA, AA, CN, SI, JI, RA, HPI, distance_description)
    return [metric(G, source, target) for metric in metrics]




In [30]:
# metrics names, one per entry of the vector returned by create_metric_vector
# (the description-similarity entry was previously missing from this header)
print("[CND, PA, AA, CN, SI, JI, RA, HPI, distance_description]")
#This for connected nodes
print(create_metric_vector(G,211858,312074))
#This is for unconnected nodes
print(create_metric_vector(G,410411,460254))


[CND, PA, AA, CN, SI, JI, RA, HPI]
[1.0, 18, 0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.11322475417027192]
[0.0, 0, 0.0, 0, 0.0, 0.0, 0.0, 0, 0.021393415850265]


In [31]:
# TEST SET

# metrics names, one per entry of the vector returned by create_metric_vector
# (the description-similarity entry was previously missing from this header)
print("[CND, PA, AA, CN, SI, JI, RA, HPI, distance_description]")
#This for connected nodes
print(create_metric_vector(G,345657,467621))
#This is for unconnected nodes
print(create_metric_vector(G,720678,447520))

[CND, PA, AA, CN, SI, JI, RA, HPI]
[0.0, 0, 0.0, 0, 0, 0, 0.0, 0, 0.23386042157952985]
[0.0, 0, 0.0, 0, 0, 0, 0.0, 0, 0.015094458857096909]


In [14]:
# Testing Tf-idf calculation





{{infobox person | name = clayton jacobson | image = | imagesize = | caption = | birth_name = | birth_date = | birth_place = | death_date = <!-- death date then birth --> | death_place = | other_names = | occupation = director, producer, writer, actor, editor | years_active = 1989 – present | spouse = | partner = | website = }} '''clayton jacobson''' (born 26 october 1963) is an australian film director, writer, producer, actor, musician and editor. his debut feature film was'' kenny (2006 film)''. ''kenny'' was released in 2006 in australia to critical acclaim, winning a number of awards. jacobson has also acted in a number of films, including 2010's ''animal kingdom (film)''. he is the brother of ''kenny'' actor shane jacobson. he also plays bass in the appalachian folk band the duck downpickers.


<1x1655398 sparse matrix of type '<class 'numpy.float64'>'
	with 75 stored elements in Compressed Sparse Row format>

In [16]:
# Vectorise the description rows of node ids 1 and 2 with the fitted vectorizer.
s1, s2 = (vectorizer.transform(dfdesc.loc[node_id]) for node_id in (1, 2))

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# Similarity of the two example vectors; the result matrix is 1x1, take its only entry.
pair_similarity = cosine_similarity(s2, s1)
pair_similarity[0][0]

0.022432842577040344