## Offline time of TF-IDF in all descriptions (nodes), plus similarities of training pairs

In [5]:
avpool=4

Time of tf-idf vectors

In [1]:
import pandas as pd
import time
dfdesc= pd.read_csv('nodes/nodes.tsv', sep='\t', header=0)

dfdesc=dfdesc.set_index('id')


from sklearn.feature_extraction.text import TfidfVectorizer
start = time.time()
dfdesc = dfdesc.fillna("")
corpus = dfdesc.values[:,0]

vectorizer = TfidfVectorizer()


Xtfidf = vectorizer.fit_transform(corpus)
end = time.time()
print(end-start)

In [3]:
# write in file
from tqdm import tqdm
tqdm.pandas()

sparsevectors={}

for i in tqdm(range(X.shape[0])):
    sparsevectors[dfdesc.index[i]]=Xtfidf[i]

import pickle 

with open('text_vectors.pkl', 'wb') as f:
    pickle.dump(sparsevectors, f)

100%|████████████████████████████████| 837834/837834 [01:00<00:00, 13880.49it/s]


Time for pair similarities in train

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

df_edges_test=pd.read_csv('train.csv')
#df_edges_test= df_edges_test.iloc[:int(len(df_edges_test)*0.01)]

def distance_description(tup)->float:
    '''
    This method can be changed to any distance
    '''
    source=tup[0]
    target=tup[1]
    sp1=sparsevectors[source]
    sp2=sparsevectors[target]
    
    return cosine_similarity(sp1,sp2)[0][0]
import time

from tqdm import tqdm
from multiprocessing import Pool

my_preds=[]
start = time.time()

tups=[]
for ind, row in tqdm(df_edges_test.iterrows()):
    tups.append((row["id1"],row["id2"]))
with Pool(avpool) as p:
    data=p.map(distance_description, tups)
    end = time.time()
    print(end-start)
    dfsimilarities=pd.DataFrame(zip(df_edges_test["id"],data),columns=["id","similarity"])
    dfsimilarities.to_csv("train_similarities.csv")

9482it [00:00, 23603.19it/s]


18.718411445617676


AttributeError: 'builtin_function_or_method' object has no attribute 'get_indexer'

## Time to build graph amd vector calculation in train

In [1]:
#Import train dataset
import pandas as pd

df_edges_all=pd.read_csv('train.csv')


df_edges= df_edges_all

import pickle 
import pandas as pd
sparsevectors={}
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)

Time for building graph in train nodes

In [2]:
# The existing graph
import networkx as nx
import time
start = time.time()


G= nx.Graph()
#add nodes
nodes = pd.unique(df_edges_all[['id1', 'id2']].values.ravel())
G.add_nodes_from(nodes)
#add edges
edges = [(row.id1,row.id2) for row in df_edges[df_edges["label"]==1].itertuples()]
G.add_edges_from(edges)

# keeping all the negative edges so we can compare results
no_edges = [(row.id1,row.id2) for row in df_edges[df_edges["label"]==0].itertuples()]

end = time.time()
print(end-start)

3.832526922225952


In [3]:
import math
# Adding methods proposed in the https://link.springer.com/content/pdf/10.1038/s41598-019-57304-y.pdf
def shortest_path(G:nx.Graph,source:int,target:int)->float:
    try:
        p=nx.shortest_path(G,source=source,target=target)
        return len(p)-1
    except nx.NetworkXNoPath:
        return float("inf")

def distance(G:nx.Graph,source:int,target:int)->float:
   '''
   This method can be changed to any distance
   '''
   return shortest_path(G,source,target)

from sklearn.metrics.pairwise import cosine_similarity
def distance_description(G,source:int,target:int)->float:
    '''
    This method can be changed to any distance
    '''
    
    
    sp1=sparsevectors[source]
    sp2=sparsevectors[target]
    
    return cosine_similarity(sp1,sp2)[0][0]


def closeness_centrality(G:nx.Graph, source:int, target:int)->float:
    dxy = nx.shortest_path(G,source=source,target=target)
    return G.number_of_nodes()/shortest_path(G,source,target)

def common_neighbors(G:nx.Graph, source:int, target:int)->list:
    s_neighbors = list(G.adj[source].keys())
    t_neighbors = list(G.adj[target].keys())
    common_neighbors = set(s_neighbors).intersection(t_neighbors)
    return common_neighbors

def CCPA(G:nx.Graph, source:int, target:int,a:float = 0.5)->float:
    '''
    Common Neighbor and Centrality based Parameterized Algorithm
    '''
    return a*closeness_centrality(G,source,target)+(1-a)*len(common_neighbors(G,source,target))

def CND(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor and Distance
    '''
    cn = common_neighbors(G,source,target)
    if len(cn)>0:
        return (len(cn)+1)/2
    else:
        if distance(G,source,target)==0:
            return 0
        return 1/distance(G,source,target)

def PA(G:nx.Graph, source:int, target:int)->float:
    '''
    Preferential Attachment
    '''
    return G.degree[source]*G.degree[target]

def AA(G:nx.Graph, source:int, target:int)->float:
    '''
    Adamic Adar
    '''
    similarity = 0.0
    for neighbor in common_neighbors(G,source,target):
        degree = G.degree[neighbor]
        if degree > 1:
            similarity += 1 / math.log(degree)
    return similarity

def CN(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor
    '''
    return len(common_neighbors(G,source,target))

def SI(G:nx.Graph, source:int, target:int)->float:
    '''
    Sorensen Index
    '''
    if (G.degree[source]+G.degree[target]) ==0:
        return 0
    return 2*CN(G,source,target)/(G.degree[source]+G.degree[target])

def JI(G:nx.Graph, source:int, target:int)->float:
    '''
    Jaccard Index
    '''
    s_neighbors = list(G.adj[source].keys())
    t_neighbors = list(G.adj[target].keys())
    common_neighbors = set(s_neighbors).intersection(t_neighbors)
    all_neighbors = set(s_neighbors).union(t_neighbors)
    if len(all_neighbors)==0:
        return 0
    return len(common_neighbors)/len(all_neighbors)

def RA(G:nx.Graph, source:int, target:int)->float:
    '''
    Resource Allocation
    '''
    similarity = 0.0
    for neighbor in common_neighbors(G,source,target):
        degree = G.degree[neighbor]
        if degree > 1:
            similarity += 1 / degree
    return similarity

def HPI(G:nx.Graph, source:int, target:int)->float:
    '''
    Hub Promoted Index
    '''
    if min([G.degree[source],G.degree[target]]) ==0 :
        return 0
    return CN(G,source,target)/min([G.degree[source],G.degree[target]])

def create_metric_vector(tup):
    G=tup[0]
    source= tup[1]
    target= tup[2]
    functions = [CND, PA, AA, CN, SI, JI, RA, HPI]
    outputs = []
    for func in functions:
        output = func(G, source, target)
        outputs.append(output)
    return outputs
from multiprocessing import Pool


def makedataset(df):
    label=df.label
    data=[]
    allind=len(df.index)
    counter=0
    tups=[]
    for i in tqdm(range(len(df.index))):
        row = df.iloc[i]

        tups.append((G,row.id1,row.id2))
    with Pool(avpool) as p:
        data=p.map(create_metric_vector, tups)
    
    datadf=pd.DataFrame(data)
    return datadf,label

In [6]:
from tqdm import tqdm

start = time.time()
X,Y=makedataset(df_edges)
end = time.time()
print(end - start)

Xcopy=X.copy()
Xcopy["label"]=Y
Xcopy.to_csv("simple_vectors_with_labels.csv",index=False)


100%|████████████████████████████████| 948232/948232 [00:43<00:00, 21553.27it/s]


97.30477118492126


Train classifier time

In [21]:
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
model = XGBClassifier()

X_train = X
y_trainn=Y
start = time.time()

model.fit(X_train, y_trainn)
end = time.time()
print(end - start)

NameError: name 'Y' is not defined

## Enhance time ( (1) + NN + Query Time )

In [None]:
import pickle 
import pandas as pd
sparsevectors={}
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)
    
dfdesc = dfdesc.fillna("")


Time to build index

In [None]:
import pysparnn.cluster_index as ci

start = time.time()

cp = ci.MultiClusterIndex(Xtfidf, dfdesc.index)

end = time.time()
print(end - start)

with open('cluterIndex.bin', 'wb') as fh_out:
    pickle.dump(cp, fh_out)

Query Time

In [22]:
def getsimilartups(idn):
    res=cp2.search(sparsevectors[idn], k=25, k_clusters=1, return_distance=True)
    tups=[ (iddd[1],idn) for iddd in res[0][1:] if iddd[0]<0.90]
    with open("new_edges.txt","a+") as f:
        for tup in tups:
            f.write(f"{tup[0]},{tup[1]}\n")

In [23]:
with open('cluterIndex.bin', 'rb') as file_:
    cp2 =pickle.load(file_)
    import warnings
    warnings.filterwarnings("ignore")
    from multiprocessing import Pool
    start = time.time()
    with Pool(avpool) as p:
        data=p.map(getsimilartups, list(sparsevectors.keys()))
    end = time.time()
    print(end - start)

KeyboardInterrupt: 

## building enhance graph vectors

Build graph time

In [None]:
import pickle 
import networkx as nx


sparsevectors={}
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)

import pandas as pd

df_edges=pd.read_csv('train.csv')

df_edges_all = df_edges[['id1', 'id2']]

# load edges from tzt file (new_edges.txt)
dfouredges=pd.read_csv("new_edges.txt",header=None,names=["id1","id2"])

dfouredges["id1"]=dfouredges["id1"].astype(int)
dfouredges["id2"]=dfouredges["id2"].astype(int)

df_edges_all=pd.concat([df_edges_all, dfouredges], ignore_index=True)


nodes=list(sparsevectors.keys())

start = time.time()


G= nx.Graph()
#add nodes
G.add_nodes_from(nodes)
#add edges
edges = [(row.id1,row.id2) for row in df_edges[df_edges["label"]==1].itertuples()]
G.add_edges_from(edges)
edges2 = [(row.id1,row.id2) for row in dfouredges.itertuples()]
G.add_edges_from(edges2)
end = time.time()
print(end -start)

In [24]:
import math
from tqdm import tqdm
# Adding methods proposed in the https://link.springer.com/content/pdf/10.1038/s41598-019-57304-y.pdf
def shortest_path(G:nx.Graph,source:int,target:int)->float:
    try:
        p=nx.shortest_path(G,source=source,target=target)
        return len(p)-1
    except nx.NetworkXNoPath:
        return float("inf")

def distance(G:nx.Graph,source:int,target:int)->float:
   '''
   This method can be changed to any distance
   '''
   return shortest_path(G,source,target)

from sklearn.metrics.pairwise import cosine_similarity
def distance_description(G,source:int,target:int)->float:
    '''
    This method can be changed to any distance
    '''
    
    
    sp1=sparsevectors[source]
    sp2=sparsevectors[target]
    
    return cosine_similarity(sp1,sp2)[0][0]


def closeness_centrality(G:nx.Graph, source:int, target:int)->float:
    dxy = nx.shortest_path(G,source=source,target=target)
    return G.number_of_nodes()/shortest_path(G,source,target)

def common_neighbors(G:nx.Graph, source:int, target:int)->list:
    s_neighbors = list(G.adj[source].keys())
    t_neighbors = list(G.adj[target].keys())
    common_neighbors = set(s_neighbors).intersection(t_neighbors)
    return common_neighbors

def CCPA(G:nx.Graph, source:int, target:int,a:float = 0.5)->float:
    '''
    Common Neighbor and Centrality based Parameterized Algorithm
    '''
    return a*closeness_centrality(G,source,target)+(1-a)*len(common_neighbors(G,source,target))

def CND(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor and Distance
    '''
    cn = common_neighbors(G,source,target)
    if len(cn)>0:
        return (len(cn)+1)/2
    else:
        if distance(G,source,target)==0:
            return 0
        return 1/distance(G,source,target)

def PA(G:nx.Graph, source:int, target:int)->float:
    '''
    Preferential Attachment
    '''
    return G.degree[source]*G.degree[target]

def AA(G:nx.Graph, source:int, target:int)->float:
    '''
    Adamic Adar
    '''
    similarity = 0.0
    for neighbor in common_neighbors(G,source,target):
        degree = G.degree[neighbor]
        if degree > 1:
            similarity += 1 / math.log(degree)
    return similarity

def CN(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor
    '''
    return len(common_neighbors(G,source,target))

def SI(G:nx.Graph, source:int, target:int)->float:
    '''
    Sorensen Index
    '''
    if (G.degree[source]+G.degree[target]) ==0:
        return 0
    return 2*CN(G,source,target)/(G.degree[source]+G.degree[target])

def JI(G:nx.Graph, source:int, target:int)->float:
    '''
    Jaccard Index
    '''
    s_neighbors = list(G.adj[source].keys())
    t_neighbors = list(G.adj[target].keys())
    common_neighbors = set(s_neighbors).intersection(t_neighbors)
    all_neighbors = set(s_neighbors).union(t_neighbors)
    if len(all_neighbors)==0:
        return 0
    return len(common_neighbors)/len(all_neighbors)

def RA(G:nx.Graph, source:int, target:int)->float:
    '''
    Resource Allocation
    '''
    similarity = 0.0
    for neighbor in common_neighbors(G,source,target):
        degree = G.degree[neighbor]
        if degree > 1:
            similarity += 1 / degree
    return similarity

def HPI(G:nx.Graph, source:int, target:int)->float:
    '''
    Hub Promoted Index
    '''
    if min([G.degree[source],G.degree[target]]) ==0 :
        return 0
    return CN(G,source,target)/min([G.degree[source],G.degree[target]])

def create_metric_vector(tup):
    G=tup[0]
    source= tup[1]
    target= tup[2]
    functions = [CND, PA, AA, CN, SI, JI, RA, HPI]
    outputs = []
    for func in functions:
        output = func(G, source, target)
        outputs.append(output)
    return outputs

def makedataset(df):
    label=df.label
    data=[]
    allind=len(df.index)
    counter=0
    tups=[]
    for i in tqdm(range(len(df.index))):
        row = df.iloc[i]

        tups.append((G,row.id1,row.id2))
    with Pool(avpool) as p:
        data=p.map(create_metric_vector, tups)
    
    datadf=pd.DataFrame(data)
    return datadf,label


Build vectors

In [None]:
start = time.time()
X,Y=makedataset(df_edges)
end = time.time()
print(end - start)

Xcopy=X.copy()
Xcopy["label"]=Y
Xcopy.to_csv("enhance_vectors_with_labels.csv",index=False)

train in enhaced

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
start = time.time()

model = XGBClassifier()

model.fit(X, Y)
end = time.time()
print(end - start)