### Loading & preparing edgelist

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib
import os
from time import sleep
from tqdm import tqdm
ROOTPATH = os.getcwd()[:-3]

In [169]:
wikipedia = pd.read_csv(ROOTPATH + "data\\preprocessed\\wikipedia_edges.csv")
wikipedia_no_self_references=wikipedia.loc[wikipedia['from']!=wikipedia['to'],]
wikipedia.head(3)

Unnamed: 0,from,to,sign,timestamp
0,1,2,1,1095171960
1,3,2,-1,1095173580
2,4,2,1,1095174480


In [3]:
G = nx.Graph([tuple(x) for x in wikipedia[['from','to']].values])
wikipedia_undir_edgelist=nx.convert_matrix.to_pandas_edgelist(G)
wikipedia_undir_edgelist_switched=wikipedia_undir_edgelist[['target','source']]
wikipedia_undir_edgelist_switched.columns=['source','target']
wikipedia_undir_edgelist_both_dir=pd.concat([wikipedia_undir_edgelist,wikipedia_undir_edgelist_switched])
wikipedia_undir_edgelist_both_dir.drop_duplicates(inplace=True)
wikipedia_undir_edgelist_both_dir.head(3)

Unnamed: 0,source,target
0,1,2
1,1,26
2,1,169


In [5]:
wiki_triads=wikipedia_undir_edgelist_both_dir.merge(wikipedia_undir_edgelist_both_dir, how='inner',left_on='target',right_on='source')[['source_x','target_x','target_y']]
wiki_triads.columns=['node_1','node_2','node_3']
wiki_triads.head(3)

Unnamed: 0,node_1,node_2,node_3
0,1,2,3
1,1,2,4
2,1,2,5


### Calculating list of eventual triads

In [36]:
list_dfs=[]

for node in tqdm(list(wiki_triads['node_3'].unique())):
    
    if np.isin(node,wikipedia_undir_edgelist_both_dir['source'].unique()):
        
        df_temp_1=wiki_triads.loc[wiki_triads['node_3']==node]
        df_temp_2=wikipedia_undir_edgelist_both_dir.loc[wikipedia_undir_edgelist_both_dir['source']==node]
        df_temp_3=df_temp_1.merge(df_temp_2,how='left',left_on='node_3',right_on='source')
        df_temp_3=df_temp_3.loc[df_temp_3['node_1']==df_temp_3['target']]
        
        list_dfs.append(df_temp_3)

eventual_triads=pd.concat(list_dfs)
eventual_triads.head(3)

100%|██████████████████████████████████████████████████████████████████████████████| 7118/7118 [06:48<00:00, 17.44it/s]


Unnamed: 0,node_1,node_2,node_3,source,target
86,1,2,3,3,1
353,6,2,3,3,6
530,8,2,3,3,8


### Removing duplicated triads & saving

In [42]:
eventual_triads=eventual_triads[['node_1','node_2','node_3']]

list_row_sets=[]

for index, row in tqdm(eventual_triads.iterrows()):
    
    list_row_sets.append(frozenset(row))
    
set_of_triads=[list(e) for e in list(set(list_row_sets))]

eventual_triads_no_duplicates=pd.DataFrame(set_of_triads,columns=['node_1','node_2','node_3'])
eventual_triads_no_duplicates=eventual_triads_no_duplicates[~eventual_triads_no_duplicates.isnull().any(axis=1)]
eventual_triads_no_duplicates=eventual_triads_no_duplicates.astype(int)

In [85]:
eventual_triads_no_duplicates.to_csv('eventual_triads_wikipedia.csv')

### Adding sequence of linkages to eventual triads

In [154]:
eventual_triads_no_duplicates[['first_link_node_1','first_link_node_2','first_link_sign',
                               'second_link_node_1','second_link_node_2','second_link_sign',
                               'third_link_node_1','third_link_node_2','third_link_sign',
                               'fourth_link_node_1','fourth_link_node_2','fourth_link_sign',
                               'fifth_link_node_1','fifth_link_node_2','fifth_link_sign',
                               'sixth_link_node_1','sixth_link_node_2','sixth_link_sign']]=np.nan
eventual_triads_no_duplicates.head(3)

Unnamed: 0,node_1,node_2,node_3,first_link_node_1,first_link_node_2,first_link_sign,second_link_node_1,second_link_node_2,second_link_sign,third_link_node_1,...,third_link_sign,fourth_link_node_1,fourth_link_node_2,fourth_link_sign,fifth_link_node_1,fifth_link_node_2,fifth_link_sign,sixth_link_node_1,sixth_link_node_2,sixth_link_sign
0,2762,2349,2431,,,,,,,,...,,,,,,,,,,
1,2466,5051,757,,,,,,,,...,,,,,,,,,,
2,1673,52,863,,,,,,,,...,,,,,,,,,,


In [183]:
new_rows=[]

#eventual_triads_no_duplicates=eventual_triads_no_duplicates[['node_1','node_2','node_3','sign']]

for index, row in tqdm(eventual_triads_no_duplicates.iterrows()):
    
    tmp_list_nodes=list(row[['node_1','node_2','node_3']])
    
    df_temp=wikipedia_no_self_references.loc[wikipedia_no_self_references['from'].isin(tmp_list_nodes)&wikipedia_no_self_references['to'].isin(tmp_list_nodes),]
    df_temp=df_temp[['from','to','sign']]
    col_len=len(sum(df_temp.values.tolist(), []))

    row.iloc[3:3+col_len]=sum(df_temp.values.tolist(), [])
    
    new_rows.append(row)

eventual_triads_no_duplicates_with_sequence=pd.DataFrame(new_rows)
#eventual_triads_no_duplicates_with_sequence.to_csv('eventual_triads_list_final.csv')

607279it [12:28:31, 13.52it/s] 


PermissionError: [Errno 13] Permission denied: 'eventual_triads_list_final.csv'

### Filtering out only those triads in the df for which the sets of two nodes for the first three links are different (i.e. become a triad after the first three linkages)

In [None]:
eventual_triads_indices=[]

list_row_sets=[]

for index, row in tqdm(eventual_triads_no_duplicates_with_sequence.iterrows()):
    
    if len(set([frozenset(tuple([row['first_link_node_1'],row['first_link_node_2']])),
               frozenset(tuple([row['second_link_node_1'],row['second_link_node_2']])),
               frozenset(tuple([row['third_link_node_1'],row['third_link_node_2']]))]))==3:
        
        eventual_triads_indices.append(index)
        
eventual_triads_no_duplicates_with_sequence=eventual_triads_no_duplicates_with_sequence.filter(eventual_triads_indices,axis=0)
#eventual_triads_no_duplicates_with_sequence.to_csv('eventual_triads_list_final.csv')