# Feature extraction from preprocessed data
The following code extracts local features from preprocessed graph relations. It is structured to handle a large number of relations within the memory limits of a standard notebook. The extracted features describe a directed, multirelational graph or, in a certain sense, a weighted graph with a finite and discrete number of weights.

The input relations have to be preprocessed so that, for each node, all of its edges are stored side by side. This means that every relation needs to be listed twice so that it can be gathered next to each of its two nodes. This preprocessing trades storage capacity for computation speed. A Unix script implementation for this is provided.

In [1]:
import numpy as np
import pandas as pd
import csv
from matplotlib import pyplot as plt
import time
import sys
import os
import subprocess
import datetime
import math
import timeit

def restart_line():
    """Emit a carriage return on stdout and flush it immediately.

    The next write therefore starts at the beginning of the current console
    line, which the chunk loop uses to overwrite its status line in place.
    """
    print('\r', end='', file=sys.stdout, flush=True)

# location of the preprocessed relation list that is read chunk by chunk
filename = 'relations_red_ext.csv'
path = "/Users/jonasmuller/Local_Folder/Project NTDS/"
file = os.path.join(path, filename)

# column names assigned to the input file when it is read
columns = ["src", "dst", "relation"]

The following code reads the input *.csv* file in several chunks. The defined features are then computed locally for each chunk and appended to the feature DataFrame. An estimate of the remaining time is printed periodically.

The features have the following interpretation where "#" denotes the feature is calculated for each relation individually.
* dtot_#: total degree
* dout_#: outgoing degree
* duni_#: number of unique neighbors
* dnbi_#: number of bidirectional edges
* n_bidir: total number of bidirectional edges

Note that multiple relations between two nodes are possible and therefore the number of unique neighbors is in general not equal to the degrees. The difference between *dtot* and *duni* represents the number of multiple edges. In other words dtot counts the number of "communications" from a node while duni counts the number of "recipients".

The number of bidirectional edges dnbi is a measure of how many interactions are actually answered at least once. This too, is in general different from the total degree and the unique neighbors. For example, in a social network a "like" from one node might not elicit a "like" from the recipient.

Furthermore, n_bidir is generally different from the sum of the dnbi, since a given interaction might cause an interaction of a different kind (relation). For example, the "like" of one node might induce a "message" from the recipient as a response. Note also that n_bidir can in fact even be smaller than the sum of the dnbi: when a certain edge is bidirectional in multiple relations, it is counted once per relation in dnbi but only as one bidirectional relation in n_bidir.

In [2]:
# select execution mode
calc_stat = False          # accumulate the per-relation edge counts in rel_dist
debug = True               # stop the chunk loop after a few chunks / ~30 s
save_features = False      # export the feature table to relations_features.csv
bidirectionality = True    # compute dnbi_# and n_bidir
find_unique_neigh = True   # compute duni_#
weights = False            # attach w_out / w_in columns
weight_KL = False          # attach the w_KL column
# Read by the chunk loop and the export step; it was never initialised, so a
# fresh kernel failed with a NameError on the first chunk.
save_bidirect_rel = False  # collect and export the bidirectional relations

In [3]:
# initialise chunking parameters, accumulators and timers
chunksize = 4000000   # rows requested from read_csv per chunk
i_chunk = 0           # index of the chunk currently being processed
L = 1000000000
l = 0

# empty accumulators: held-back rows, per-chunk buffer, final feature table
carry, df_buffer, features = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
# one zero-initialised counter per relation id 1..7
rel_dist = pd.DataFrame({"count": np.zeros(7)}, index=list(range(1, 8)))

# start time of the run and of the current chunk
t1 = told = time.time()

# expected number of chunks, derived from a hard-coded total row count
n_chunks = np.uint32(np.ceil(1716494198 / chunksize))
print(f"Number of Chunks: {n_chunks:}")
t_chunk = np.zeros(n_chunks)   # processing time per chunk in seconds

Number of Chunks: 430


In [4]:
def get_degrees(chunk):
    """Return total and outgoing degree per source node and relation.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Edge list with the columns ``src`` and ``relation``. Relation ids
        below 10 are treated as outgoing edges; ids of 10 and above are
        treated as incoming edges and folded onto ``id % 10``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``src`` with the columns ``dtot_<r>`` (outgoing plus
        incoming edge count) followed by ``dout_<r>`` (outgoing edge count)
        for every folded relation id ``r`` that occurs in ``chunk``.
    """
    # edge count per (src, relation), one column per relation id
    counts = chunk.groupby(['src', 'relation']).size().unstack(level=-1, fill_value=0)
    counts.columns.name = None

    # Split by relation id instead of by column position: a chunk that lacks
    # some relation ids has fewer columns, so a fixed "first seven" slice
    # would mix outgoing and incoming relations.
    is_incoming = counts.columns.to_numpy() >= 10
    deg_rel_out = counts.loc[:, ~is_incoming]
    deg_rel_in = counts.loc[:, is_incoming].copy()
    deg_rel_in.columns = np.mod(deg_rel_in.columns.to_numpy(), 10)

    # Align both directions on the same relation ids so that a relation seen
    # in only one direction contributes 0 instead of NaN to the sum.
    rel_ids = deg_rel_out.columns.union(deg_rel_in.columns)
    deg_rel_out = deg_rel_out.reindex(columns=rel_ids, fill_value=0)
    deg_rel_in = deg_rel_in.reindex(columns=rel_ids, fill_value=0)

    deg_rel = deg_rel_out.add(deg_rel_in).add_prefix('dtot_')
    return deg_rel.join(deg_rel_out.add_prefix('dout_'))

In [5]:
def find_unique_neighbors(chunk):
    """Count the distinct neighbours of every source node per relation.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Edge list with the columns ``src``, ``dst`` and ``relation``. The
        frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``src`` with one ``duni_<r>`` column per relation id
        ``r = relation % 10`` present in ``chunk``; each cell holds the number
        of distinct ``dst`` values, 0 where the combination does not occur.
    """
    # Work on the rows that were passed in (not on a module-level frame) and
    # fold the relation ids onto id % 10 on a private copy.
    folded = chunk.copy()
    folded['relation'] = folded['relation'].mod(10)

    unique_neighbors = folded.groupby(['src', 'relation'])['dst'].nunique()
    unique_neighbors = unique_neighbors.unstack(level=1, fill_value=0)
    # clear the axis label by assignment; deleting the attribute is not
    # possible on every pandas version
    unique_neighbors.columns.name = None
    return unique_neighbors.add_prefix('duni_')

In [6]:
def check_bidirectionality(chunk):
    """Count bidirectional edges per source node, overall and per relation.

    A (src, dst) pair counts as bidirectional when ``chunk`` holds at least
    one outgoing row (relation id < 10) and at least one incoming row
    (relation id >= 10) for it.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Edge list with the columns ``src``, ``dst`` and ``relation``. The
        frame is not modified.

    Returns
    -------
    nr_of_bidirect : pandas.Series
        Indexed by ``src``; number of neighbours connected in both
        directions, regardless of the relation ids involved.
    df_nbi : pandas.DataFrame
        Indexed by ``src`` with one ``dnbi_<r>`` column per relation id
        ``r = relation % 10``; number of neighbours connected in both
        directions within that relation.
    """
    edges = chunk.copy()
    edges["in"] = (edges["relation"] >= 10).astype(int)

    # Number of distinct directions per (src, dst) minus one: 1 when both
    # directions occur, 0 otherwise. Selecting the 'in' column explicitly
    # avoids the removed `squeeze` keyword and does not depend on whether
    # nunique() returns the grouping columns.
    pair_dirs = edges.groupby(['src', 'dst'])['in'].nunique() - 1
    nr_of_bidirect = pair_dirs.groupby(level='src').sum()

    # bidirectionality per relation
    edges['relation'] = edges['relation'].mod(10)
    rel_dirs = edges.groupby(['src', 'relation', 'dst'])['in'].nunique() - 1
    bidirect = rel_dirs.groupby(level=['src', 'relation']).sum()

    bidirect = bidirect.unstack(level=1, fill_value=0)
    # clear the axis label by assignment; deleting the attribute is not
    # possible on every pandas version
    bidirect.columns.name = None
    df_nbi = bidirect.add_prefix('dnbi_')
    return nr_of_bidirect, df_nbi

In [7]:
# kick off the ordeal

# `save_bidirect_rel` is read below; default it to "off" when no earlier cell
# has set it, so a fresh kernel does not stop with a NameError.
if 'save_bidirect_rel' not in globals():
    save_bidirect_rel = False


def _stack_rows(acc, part):
    """Return ``part`` appended below ``acc``; an empty ``acc`` is skipped."""
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return part if acc.empty else pd.concat([acc, part])


def _chunk_features(chunk):
    """Return the uint32 feature table for all sources contained in ``chunk``.

    Joins the per-relation degrees with the feature groups enabled through
    the module-level flags ``find_unique_neigh``, ``bidirectionality``,
    ``weights`` and ``weight_KL``; missing values are filled with 0.
    """
    df_buffer = get_degrees(chunk)
    if find_unique_neigh:
        df_buffer = df_buffer.join(find_unique_neighbors(chunk), on='src')
    if bidirectionality:
        nr_of_bidirect, df_nbi = check_bidirectionality(chunk)
        df_buffer = df_buffer.join(df_nbi, on='src')
        df_buffer['n_bidir'] = pd.Series(nr_of_bidirect)
    if weights:
        # NOTE(review): w_out / w_in are not computed in the visible code
        df_buffer['w_out'] = pd.Series(w_out)
        df_buffer['w_in'] = pd.Series(w_in)
    if weight_KL:
        # NOTE(review): w_KL is not computed in the visible code
        df_buffer['w_KL'] = pd.Series(w_KL)
    return df_buffer.fillna(0).astype('uint32')


def _relation_counts(chunk):
    """Count the rows of ``chunk`` with relation id < 10, per relation id.

    The result is aligned to ``rel_dist.index`` with 0 for relations that do
    not occur, so adding it to ``rel_dist`` cannot introduce NaN.
    """
    values, counts = np.unique(chunk.loc[chunk["relation"] < 10].relation.values, return_counts=True)
    return pd.Series(counts, index=values).reindex(rel_dist.index, fill_value=0)


for chunk in pd.read_csv(file, chunksize=chunksize, sep="\t", names=columns):

    # prepend the rows that were held back from the previous chunk
    chunk = _stack_rows(carry, chunk)
    # copy with relation ids folded onto id % 10, kept at module level for
    # helper code that reads `chunk_copy` from the enclosing scope
    chunk_copy = chunk.copy()
    chunk_copy['relation'] = chunk_copy['relation'].mod(10)

    # remove last source in chunk to add it to the next one
    last_src = chunk["src"].iloc[-1]
    carry = chunk.loc[chunk["src"] == last_src]
    chunk = chunk.loc[chunk["src"] != last_src]

    # a chunk that holds a single source is carried over completely
    if not chunk.empty:
        features = _stack_rows(features, _chunk_features(chunk))

        # relation distribution count
        if calc_stat:
            rel_dist = rel_dist.add(_relation_counts(chunk), axis='index')

    if save_bidirect_rel:
        # NOTE(review): `bidirect` and `relations_bidirect` are not defined at
        # module level in the visible code; this branch needs them to exist.
        l2 = len(bidirect)
        relations_bidirect[l:(l+l2),:] = bidirect.to_numpy()
        if l2 >= 0:
            l += l2+1

    # print status
    s = (time.time()-told)
    told = time.time()
    # n_chunks comes from a hard-coded row count, so guard the timing buffer
    if i_chunk < len(t_chunk):
        t_chunk[i_chunk] = s
    chunks_left = max(int(n_chunks) - i_chunk, 0)
    if i_chunk < 10:
        t_est = s*chunks_left/60
    else:
        # average over the ten preceding chunks
        t_est = np.mean(t_chunk[(i_chunk-10):i_chunk])*chunks_left/60

    # break for debugging purpose
    if debug:
        if i_chunk >= 5 or (time.time()-t1) > 30:
            break

    restart_line()
    sys.stdout.write(f'Chunk Nr: {i_chunk}; \t feat_shape: {features.shape} \t tc: {s:.2f} sec \t te: {t_est:.1f} m')
    sys.stdout.flush()
    i_chunk += 1
else:
    # The reader is exhausted (no debug break): the rows still held back are
    # the complete edge list of the final source, so process them as well
    # instead of silently dropping that source.
    if not carry.empty:
        chunk_copy = carry.copy()
        chunk_copy['relation'] = chunk_copy['relation'].mod(10)
        features = _stack_rows(features, _chunk_features(carry))
        if calc_stat:
            rel_dist = rel_dist.add(_relation_counts(carry), axis='index')
        carry = carry.iloc[0:0]

if save_bidirect_rel:
    relations_bidirect = relations_bidirect[0:(l-1),:]

# print relevant key data
# sample the clock once so minutes and seconds refer to the same instant
elapsed = time.time()-t1
m = math.floor(elapsed/60)
s = elapsed-m*60
print(f'\n\nExecution time {m} min {s:.2f} sec')

# statistics
# rel_dist is only filled when calc_stat is set; normalising the untouched
# all-zero table would divide 0 by 0 and write a table of NaN.
if calc_stat:
    total_relations = np.sum(rel_dist.values)
    if total_relations > 0:
        rel_dist = rel_dist.div(total_relations)
    with open("Output.txt", "w") as text_file:
        print(f"Nr of total relations: {total_relations}\n", file=text_file)
        print("Relations Distribution:\n", file=text_file)
        print(rel_dist, file=text_file)

# save features
if save_features:
    exportName = "relations_features.csv"
    features.to_csv(exportName, sep = '\t')
    if save_bidirect_rel:
        np.savetxt("relations_bidirectional.csv", relations_bidirect, fmt='%i', delimiter="\t")

print("Happy Ending!")

Chunk Nr: 2; 	 feat_shape: (37436, 29) 	 tc: 9.64 sec 	 te: 68.8 mm

Execution time 0 min 39.26 sec
Happy Ending!


In [None]:
# visualise the per-chunk processing times: line plot, then histogram
strip_size = (15, 1)   # wide, flat figure for both views

ax = plt.figure(1, figsize=strip_size)
plt.plot(t_chunk)

ax = plt.figure(2, figsize=strip_size)
plt.hist(t_chunk, bins=100);