# BUILD GRAPH (WITH ALL NODES THIS TIME) AND LOAD RESOURCES

In [7]:
import pickle 
import networkx as nx


import pandas as pd

# Vectors keyed by node id; the keys double as the graph's node set below.
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load 'text_vectors.pkl' if it comes from a trusted source.
with open('text_vectors.pkl', 'rb') as f:
    sparsevectors = pickle.load(f)

# Labelled training pairs; rows with label == 1 become graph edges below.
df_edges = pd.read_csv('train.csv')
df_edges_all = df_edges[['id1', 'id2']]

# Load the extra edges from the text file (new_edges.txt); it has no header
# row and is parsed as two comma-separated columns cast to int.
dfouredges = pd.read_csv(
    "new_edges.txt", header=None, names=["id1", "id2"]
).astype({"id1": int, "id2": int})

df_edges_all = pd.concat([df_edges_all, dfouredges], ignore_index=True)

# Every id that has a vector becomes a node, even if no edge touches it.
nodes = list(sparsevectors.keys())

G = nx.Graph()
G.add_nodes_from(nodes)

# itertuples(index=False, name=None) yields plain (id1, id2) tuples directly,
# avoiding a namedtuple plus attribute lookups for every row.
edges = list(
    df_edges.loc[df_edges["label"] == 1, ["id1", "id2"]].itertuples(index=False, name=None)
)
G.add_edges_from(edges)
edges2 = list(dfouredges[["id1", "id2"]].itertuples(index=False, name=None))
G.add_edges_from(edges2)

In [8]:
import math
from tqdm import tqdm
# Methods proposed in https://link.springer.com/content/pdf/10.1038/s41598-019-57304-y.pdf
def shortest_path(G:nx.Graph,source:int,target:int)->float:
    '''
    Number of edges on a shortest path between source and target in G.

    Returns float("inf") when networkx reports that no path exists.
    '''
    try:
        node_path = nx.shortest_path(G, source=source, target=target)
    except nx.NetworkXNoPath:
        return float("inf")
    else:
        # A path visiting k nodes crosses k - 1 edges.
        return len(node_path) - 1

def distance(G:nx.Graph,source:int,target:int)->float:
    '''
    Graph distance between source and target.

    Currently the hop count returned by shortest_path; swap the call below
    to plug in any other distance.
    '''
    hop_count = shortest_path(G, source, target)
    return hop_count

from sklearn.metrics.pairwise import cosine_similarity
def distance_description(G,source:int,target:int)->float:
    '''
    Cosine similarity between the stored vectors of source and target.

    Despite the name, the returned value is a similarity (the output of
    cosine_similarity), not a distance. G is not used. Both vectors are
    looked up in the module-level sparsevectors mapping.
    '''
    vec_source, vec_target = sparsevectors[source], sparsevectors[target]
    # The result is indexed as a 2-D matrix; [0][0] picks its first entry.
    # Assumes each stored vector is a single-row 2-D array -- TODO confirm.
    similarity_matrix = cosine_similarity(vec_source, vec_target)
    return similarity_matrix[0][0]


def closeness_centrality(G:nx.Graph, source:int, target:int)->float:
    '''
    Closeness term used by CCPA: the number of nodes in G divided by the
    shortest-path distance between source and target.

    Returns 0.0 when the distance is 0 (source == target), the same value
    CND returns for a zero distance. When no path exists the distance is
    float("inf"), so the result is 0.0 as well.
    '''
    # Single search through the shortest_path helper, which maps "no path"
    # to infinity instead of letting nx.NetworkXNoPath escape.
    hops = shortest_path(G, source, target)
    if hops == 0:
        # Guard against ZeroDivisionError for a node paired with itself.
        return 0.0
    return G.number_of_nodes() / hops

def common_neighbors(G:nx.Graph, source:int, target:int)->set:
    '''
    Set of nodes adjacent to both source and target in G.

    The result is a set (possibly empty); callers take its len() or
    iterate over it.
    '''
    # Iterating an adjacency mapping yields the neighbor ids, so the sets can
    # be built without intermediate lists.
    return set(G.adj[source]).intersection(G.adj[target])

def CCPA(G:nx.Graph, source:int, target:int,a:float = 0.5)->float:
    '''
    Common Neighbor and Centrality based Parameterized Algorithm.

    Weighted sum of two terms: a * closeness_centrality plus
    (1 - a) * the number of common neighbors.
    '''
    centrality_term = closeness_centrality(G, source, target)
    neighbor_term = len(common_neighbors(G, source, target))
    return a * centrality_term + (1 - a) * neighbor_term

def CND(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor and Distance.

    With at least one common neighbor the score is (count + 1) / 2.
    Otherwise it is 0 when the distance is 0, else 1 / distance; an
    infinite distance (no path) therefore scores 0.0.
    '''
    cn = common_neighbors(G, source, target)
    if cn:
        return (len(cn) + 1) / 2
    # Compute the distance once: each call runs a shortest-path search.
    d = distance(G, source, target)
    if d == 0:
        return 0
    return 1 / d

def PA(G:nx.Graph, source:int, target:int)->float:
    '''
    Preferential Attachment: product of the degrees of source and target.
    '''
    degree_of = G.degree
    return degree_of[source] * degree_of[target]

def AA(G:nx.Graph, source:int, target:int)->float:
    '''
    Adamic Adar: sum of 1 / log(degree) over the common neighbors of
    source and target.
    '''
    total = 0.0
    degree_of = G.degree
    for shared in common_neighbors(G, source, target):
        shared_degree = degree_of[shared]
        if shared_degree <= 1:
            # log(1) == 0 would divide by zero, so such neighbors are skipped.
            continue
        total += 1 / math.log(shared_degree)
    return total

def CN(G:nx.Graph, source:int, target:int)->float:
    '''
    Common Neighbor: how many nodes are adjacent to both source and target.
    '''
    shared = common_neighbors(G, source, target)
    return len(shared)

def SI(G:nx.Graph, source:int, target:int)->float:
    '''
    Sorensen Index: twice the common-neighbor count divided by the sum of
    the two degrees. Returns 0 when both nodes have degree 0.
    '''
    degree_sum = G.degree[source] + G.degree[target]
    if degree_sum == 0:
        return 0
    return 2 * CN(G, source, target) / degree_sum

def JI(G:nx.Graph, source:int, target:int)->float:
    '''
    Jaccard Index: size of the intersection of the two neighbor sets
    divided by the size of their union. Returns 0 when the union is empty.
    '''
    neighbors_s = set(G.adj[source])
    neighbors_t = set(G.adj[target])
    either = neighbors_s | neighbors_t
    if not either:
        return 0
    both = neighbors_s & neighbors_t
    return len(both) / len(either)

def RA(G:nx.Graph, source:int, target:int)->float:
    '''
    Resource Allocation: sum of 1 / degree over the common neighbors of
    source and target. Neighbors with degree <= 1 are skipped.
    '''
    total = 0.0
    degree_of = G.degree
    for shared in common_neighbors(G, source, target):
        shared_degree = degree_of[shared]
        if shared_degree <= 1:
            continue
        total += 1 / shared_degree
    return total

def HPI(G:nx.Graph, source:int, target:int)->float:
    '''
    Hub Promoted Index: common-neighbor count divided by the smaller of
    the two degrees. Returns 0 when that smaller degree is 0.
    '''
    smaller_degree = min(G.degree[source], G.degree[target])
    if smaller_degree == 0:
        return 0
    return CN(G, source, target) / smaller_degree

def create_metric_vector(tup):
    '''
    Compute every feature for one node pair.

    tup is indexed as (G, source, target). Returns a list with one value
    per metric, in this fixed order: CND, PA, AA, CN, SI, JI, RA, HPI,
    distance_description.
    '''
    graph, src, dst = tup[0], tup[1], tup[2]
    metrics = (CND, PA, AA, CN, SI, JI, RA, HPI, distance_description)
    return [metric(graph, src, dst) for metric in metrics]




# TRAIN CLASSIFIER ON THE TRAINING VECTORS

In [9]:
import pandas as pd
# Labelled vectors from CSV: the "label" column is the target and every
# remaining column is used as a feature.
# (The former mid-cell `Xcopy.head()` was dropped: only a cell's last
# expression is displayed, so its result was silently discarded.)
Xcopy = pd.read_csv("vectotr_with_labels.csv")
Y = Xcopy["label"]
X = Xcopy.drop(columns=["label"])

In [11]:
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled vectors (fixed seed) to score the classifier.
X_train, X_test, y_trainn, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit XGBoost with default settings and report F1 on the held-out split.
model = XGBClassifier()
model.fit(X_train, y_trainn)
preds = model.predict(X_test)
f1 = f1_score(y_test, preds)
print(f1)

0.998829308603434


# BUILD SUBMISSION VECTORS

In [12]:
# Node pairs to predict; the id, id1 and id2 columns are used further down.
df_edges_test = pd.read_csv("test.csv")
df_edges_test.head()

Unnamed: 0,id,id1,id2
0,4,253077,253077
1,5,235274,65408
2,10,172772,677546
3,14,378856,175720
4,20,825250,35839


In [15]:
tqdm.pandas()  # makes DataFrame.progress_apply available


def makedatasetapply_TEST(df):
    '''
    Build the feature matrix for the node pairs in df.

    Each row's id1/id2 pair is passed, together with the module-level
    graph G, to create_metric_vector; result_type='expand' spreads the
    returned list over one column per metric. progress_apply reports
    progress through tqdm.
    '''
    def _row_features(row):
        return create_metric_vector((G, row['id1'], row['id2']))

    return df.progress_apply(_row_features, axis=1, result_type='expand')


# Slow step: the recorded run took about 54 minutes for 238364 rows.
Xsub = makedatasetapply_TEST(df_edges_test)


100%|███████████████████████████████████| 238364/238364 [54:36<00:00, 72.75it/s]


In [16]:
# Save the feature vectors together with the test-row ids; Xsub itself is
# left untouched because assign() works on a copy.
Xtest = Xsub.assign(id=df_edges_test["id"])
Xtest.to_csv("submission_vectors_with_ids.csv", index=False)

In [17]:
# Classifier predictions for the test pairs, written as (id, label) rows.
my_preds = model.predict(Xsub)
dfmysub = pd.DataFrame(
    list(zip(df_edges_test["id"].values, my_preds)),
    columns=["id", "label"],
)
dfmysub.to_csv("enhanced_submission.csv", index=False)

In [35]:
# Labels stored in the "submission" file (first CSV column is the index),
# loaded so the new predictions can be compared against them.
dfoldsub = pd.read_csv("submission", index_col=0)

# Peek at the first feature rows, then display the loaded frame's shape.
print(Xsub.head())
dfoldsub.shape

           0      1         2     3         4         5         6    7  \
0  14.500000  784.0  5.709591  28.0  1.000000  1.000000  0.289523  1.0   
1   0.200000  168.0  0.000000   0.0  0.000000  0.000000  0.000000  0.0   
2   0.166667  957.0  0.000000   0.0  0.000000  0.000000  0.000000  0.0   
3   0.200000  696.0  0.000000   0.0  0.000000  0.000000  0.000000  0.0   
4   6.000000  407.0  3.169911  11.0  0.458333  0.297297  0.345267  1.0   

          8  
0  1.000000  
1  0.003589  
2  0.042467  
3  0.005677  
4  0.634040  


(238364, 1)

In [39]:
# Similarity cut-off on feature column 8 (the distance_description value):
# pairs above it are labelled 1 regardless of the classifier's output.
DESCRIPTION_SIMILARITY_THRESHOLD = 0.0505

# NOTE: `preds` and `f1` reuse the names of the validation-split results
# computed when the classifier was trained; those values are overwritten.
preds = model.predict(Xsub)
description_similarity = Xsub[Xsub.columns[8]]
preds2 = [1 if d > DESCRIPTION_SIMILARITY_THRESHOLD else 0 for d in description_similarity]

# Logical OR of both votes. int() keeps every entry a plain Python int;
# a bare `p or p2` produces a mix of numpy and Python integers.
final_preds = [int(p or p2) for p, p2 in zip(preds, preds2)]

# This F1 is computed against the labels read from the "submission" file, so
# it measures agreement with that file -- presumably an earlier submission,
# not ground truth (TODO confirm).
f1 = f1_score(dfoldsub.label, final_preds)

In [40]:
# Display the most recently computed F1 score.
f1

0.9886038155490868

In [34]:
# Per-feature importances of the fitted classifier, one value per input column
# (presumably in the metric order used by create_metric_vector -- TODO confirm).
print(model.feature_importances_)

[9.9799126e-01 2.0889821e-04 2.2802259e-04 0.0000000e+00 3.9583005e-04
 1.2665906e-04 5.8138010e-04 2.0926987e-04 2.5863256e-04]


In [41]:
# Write the combined (classifier OR threshold) predictions as (id, label) rows.
dfexpe = pd.DataFrame(
    list(zip(df_edges_test["id"].values, final_preds)),
    columns=["id", "label"],
)
dfexpe.to_csv("not_submission.csv", index=False)