# Feature extraction from preprocessed data
Relevant features are extracted directly from the relations data

In [1]:
import numpy as np
import pandas as pd
import csv
from matplotlib import pyplot as plt
import time
import sys
import os
import subprocess
import datetime
import math
import timeit

def restart_line():
    sys.stdout.write('\r')
    sys.stdout.flush()

filenames =['relations_extended.csv','relations_red_ext.csv']

path = "/Users/jonasmuller/Local_Folder/Project NTDS/"
file = path + filenames[1]

# using dataset without timestamps
reducedDataset = True
if reducedDataset:
    columns=["src", "dst","relation"]
else:
    columns=["day", "time_ms", "src", "dst","relation"]

In [4]:
chunksize = 1000000
i = 0
carry = pd.DataFrame()
df_buffer = pd.DataFrame()
features = pd.DataFrame()
rel_dist = pd.DataFrame(np.zeros(7),list(range(1,8)),columns=["count"])
t1 = time.time()
print(f"Number of Chunks: {np.ceil(1716494198/chunksize):}")
for chunk in pd.read_csv(file, chunksize=chunksize, sep = "\t", names=columns):
    
    # slice chunk at specific src
    chunk = carry.append(chunk)
    
    # remove last source in chunk
    last_src = np.unique(chunk["src"].values)[-1]
    carry = chunk.loc[chunk["src"] == last_src]
    
    chunk = chunk.loc[chunk["src"] != last_src]
    sources_in_chunk = np.unique(chunk["src"].values)
    
    # extract features        
    degree_tot = chunk.groupby('src').dst.count()
    degree_out = chunk.loc[chunk["relation"] < 10].groupby('src').dst.count()
    unique_neighbors = chunk.groupby('src').dst.nunique()
    
    chunk["relation"] = ((chunk["relation"] - 10) >= 0).astype(int)
    bidirect = chunk.groupby(['src','dst']).max() - chunk.groupby(['src','dst']).min()

    nr_of_bidirect = bidirect.groupby('src').relation.sum()
    
    # intersection between in and out relation nodes
    
    # weights acording to relation scarcity (then deg in deg out) / number of strong connections per node
    
    # relation distribution count
    values, counts = np.unique(chunk.loc[chunk["relation"] < 10].relation.values, return_counts=True)
    rel_dist = rel_dist.add(pd.Series(counts, index=values),axis='index')
    
    # append degrees
    df_buffer = df_buffer[0:0]
    df_buffer['deg_tot'] = pd.Series(degree_tot)
    df_buffer['deg_out'] = pd.Series(degree_out)
    df_buffer['uni_neigh'] = pd.Series(unique_neighbors)
    df_buffer['nr_of_bidirect'] = pd.Series(nr_of_bidirect)
    
    df_buffer = df_buffer.fillna(0)
    
    features = features.append(df_buffer)
    
    # break for debugging
    i = i+1
    if i < 1:
        break
        
    restart_line()
    sys.stdout.write(f'Chunk Nr: {i}; feature shape: {features.shape}')
    sys.stdout.flush()

# print relevant key data
m = math.floor((time.time()-t1)/60)
s = (time.time()-t1)-m*60
print(f'\nExecution time {m} min {s:.2f} sec')

# statistics
total_relations = np.sum(rel_dist.values)
rel_dist = rel_dist.div(total_relations)
with open("Output.txt", "w") as text_file:
    print(f"Nr of total relations: {total_relations}\n", file=text_file)
    print("Relations Distribution:\n", file=text_file)
    print(rel_dist, file=text_file)

# save features
if True:
    exportName = filenames[0][:-4] + "_features.csv"
    print(exportName)
    features.to_csv(exportName, sep = '\t')

Number of Chunks: 1717.0
Chunk Nr: 1717; feature shape: (5321960, 4)
Execution time 37 min 44.21 sec
relations_extended_features.csv


In [4]:
# save features
exportName = filenames[0][:-4] + "_features.csv"
print(exportName)
features.to_csv(exportName, sep = '\t')

relations_extended_features.csv


In [None]:
# visualize
ax = plt.figure(1, figsize=(15, 3))
plt.title("DegreeOut histogram")
plt.hist(dout.values, bins=200);
plt.ylabel('Number of Nodes')
plt.xlabel('Degree Out')

In [3]:
print(features.memory_usage)

<bound method DataFrame.memory_usage of       deg_tot  deg_out  uni_neigh
src                              
1       275.0     90.0      166.0
2       294.0     59.0      189.0
3        57.0     34.0       34.0
4        36.0     13.0       21.0
5        38.0     16.0       23.0
...       ...      ...        ...
3322      NaN      NaN        NaN
3323      NaN      NaN        NaN
3324      NaN      NaN        NaN
3325      NaN      NaN        NaN
3326      NaN      NaN        NaN

[632145 rows x 3 columns]>
