In this approach, we solely utilize the descriptions of the nodes. We construct TF-IDF vectors based on the node descriptions and calculate the similarity between nodes. By applying an optimized threshold, we predict the presence of a link between two nodes if their similarity exceeds this threshold.

## Offline processing

Load data and create TF-IDF vectors

In [1]:
import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Node table: tab-separated, header on the first row, indexed by the 'id' column.
dfdesc = pd.read_csv('nodes/nodes.tsv', sep='\t', header=0)
dfdesc = dfdesc.set_index('id')

# Timing starts after the file has been read, so it covers only the
# missing-value fill and the TF-IDF fit below.
start = time.time()

# Replace missing values with empty strings so every document is a string.
dfdesc = dfdesc.fillna("")
# The first non-index column supplies the text of each node
# (presumably the description -- confirm against nodes.tsv).
corpus = dfdesc.values[:, 0]

vectorizer = TfidfVectorizer()

# One sparse TF-IDF row per node, in the same order as dfdesc.index.
Xtfidf = vectorizer.fit_transform(corpus)

# Report elapsed seconds; `start` was previously recorded but never used.
print(time.time() - start)


53.81601428985596


Save vectors for future use

In [3]:
# Store one sparse TF-IDF row per node id and write the lookup to disk
import pickle

from tqdm import tqdm

tqdm.pandas()

# Row `pos` of Xtfidf belongs to the node id at position `pos` of dfdesc.index.
n_rows = Xtfidf.shape[0]
sparsevectors = {
    dfdesc.index[pos]: Xtfidf[pos]
    for pos in tqdm(range(n_rows))
}

# Serialise the whole id -> vector dictionary in a single pickle file.
with open('text_vectors.pkl', 'wb') as out_file:
    pickle.dump(sparsevectors, out_file)

100%|████████████████████████████| 837834/837834 [00:44<00:00, 18637.41it/s]


Calculate description similarities on the training data, using cosine similarity between the TF-IDF vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Training pairs read from train.csv. NOTE: despite the "_test" suffix this
# frame holds the training file; the rest of this cell reads it under this name
# (columns "id", "id1" and "id2" are used below).
df_edges_test=pd.read_csv('train.csv')
# Debug aid: uncomment to keep only the first 1% of the rows.
#df_edges_test= df_edges_test.iloc[:int(len(df_edges_test)*0.01)]

def distance_description(tup)->float:
    '''
    Cosine similarity between the stored vectors of two nodes.

    tup: indexable whose items 0 and 1 are the source and target node ids.
         Both ids must be keys of the global ``sparsevectors``; an unknown id
         raises KeyError.

    Returns the top-left entry of the matrix computed by ``cosine_similarity``
    for the two vectors. Note that this is a similarity, not a distance,
    despite the function name. Replace the ``cosine_similarity`` call to score
    pairs with a different measure.
    '''
    first_id, second_id = tup[0], tup[1]
    pair_matrix = cosine_similarity(sparsevectors[first_id], sparsevectors[second_id])
    return pair_matrix[0][0]
import time

from multiprocessing import Pool
from tqdm import tqdm

# Number of worker processes. This cell used `avpool` without defining it, so
# it raised NameError unless the online cell further down had been run first.
avpool = 4

my_preds = []
start = time.time()

# Build the (id1, id2) work list directly from the two columns. iterrows()
# constructs a Series for every row, which is much slower for the same pairs.
tups = list(tqdm(zip(df_edges_test["id1"], df_edges_test["id2"]),
                 total=len(df_edges_test)))

# NOTE(review): the workers need access to distance_description and
# sparsevectors defined in this notebook; this relies on a process start
# method that shares them with the children (e.g. fork) -- confirm on the
# target platform.
with Pool(avpool) as p:
    # map() preserves input order, so data[k] is the similarity of tups[k].
    data = p.map(distance_description, tups)

# The pool is only needed for the map; timing and saving happen after it closes.
end = time.time()
print(end - start)

# Pair every edge id with its similarity and save it for later use.
dfsimilarities = pd.DataFrame(zip(df_edges_test["id"], data), columns=["id", "similarity"])
dfsimilarities.to_csv("train_similarities.csv")

## Online

Load description vectors of nodes

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle 
import pandas as pd
# NOTE: unpickling can execute arbitrary code. Only load a text_vectors.pkl
# that was written by the offline step of this notebook.
with open('text_vectors.pkl', 'rb') as f:
    # Dictionary mapping node id -> sparse TF-IDF row. No empty-dict
    # initialisation is needed first: the name is bound directly by the load.
    sparsevectors = pickle.load(f)

# Pairs to score, read from test.csv.
df_edges_test = pd.read_csv('test.csv')
# Debug aid: uncomment to keep only the first 1% of the rows.
#df_edges_test= df_edges_test.iloc[:int(len(df_edges_test)*0.01)]

def distance_description(tup)->float:
    '''
    Cosine similarity between the stored vectors of two nodes.

    tup: indexable whose items 0 and 1 are the source and target node ids.
         Both ids must be keys of the global ``sparsevectors``; an unknown id
         raises KeyError.

    Returns the top-left entry of the matrix computed by ``cosine_similarity``
    for the two vectors. Note that this is a similarity, not a distance,
    despite the function name. Replace the ``cosine_similarity`` call to score
    pairs with a different measure.
    '''
    src_vector = sparsevectors[tup[0]]
    dst_vector = sparsevectors[tup[1]]
    similarity_matrix = cosine_similarity(src_vector, dst_vector)
    return similarity_matrix[0][0]
import time



Calculate the similarity of each test pair and apply the threshold to predict whether an edge exists

In [5]:
from multiprocessing import Pool

from tqdm import tqdm

# Number of worker processes (multiprocessing.Pool starts processes, not threads).
avpool = 4

# A pair is predicted as an edge when its similarity is strictly greater than
# this value. It was previously an unnamed literal inside the prediction line.
SIMILARITY_THRESHOLD = 0.0505

my_preds = []
start = time.time()

# Build the (id1, id2) work list directly from the two columns. iterrows()
# constructs a Series for every row, which is much slower for the same pairs.
tups = list(tqdm(zip(df_edges_test["id1"], df_edges_test["id2"]),
                 total=len(df_edges_test)))

# NOTE(review): the workers need access to distance_description and
# sparsevectors defined in this notebook; this relies on a process start
# method that shares them with the children (e.g. fork) -- confirm on the
# target platform.
with Pool(avpool) as p:
    # map() preserves input order, so data[k] is the similarity of tups[k].
    data = p.map(distance_description, tups)

# The pool is only needed for the map; everything below runs after it closes.
end = time.time()
print(end - start)

# Keep the raw similarities alongside the edge ids.
dfsimilarities = pd.DataFrame(zip(df_edges_test["id"], data), columns=["id", "similarity"])
dfsimilarities.to_csv("test_similarities.csv")

# produce submission: label 1 when the similarity exceeds the threshold, else 0
preds = [int(d > SIMILARITY_THRESHOLD) for d in data]
dfexpe = pd.DataFrame(zip(df_edges_test["id"], preds), columns=["id", "label"])
dfexpe.to_csv("D_submission.csv", index=False)

238364it [00:06, 35109.64it/s]


181.13813757896423
