In [138]:
import os

import pandas as pd
import stem
from stem.descriptor import DocumentHandler, parse_file
import matplotlib.pyplot as plt

In [158]:
# DEFAULT VARIABLES
# Default column names shared by the helper functions of this notebook.
fingerprint_default = 'fingerprint'  # column identifying a relay
name_default = 'name'                # column holding the relay nickname
bandwidth_default = 'bw'             # column holding the relay bandwidth

In [186]:
# Directory holding the consensus files to load (one DataFrame is built per file).
files_path = "data/consensuses-2019-12/10/"
# os.listdir() returns entries in arbitrary order. Sort them so that the
# position of every consensus (and therefore every column derived from it
# further down) is the same on each run and on each platform.
# NOTE(review): presumably the file names start with a timestamp, in which
# case the sorted order is also chronological -- confirm against the data.
file_names = sorted(os.listdir(files_path))
file_paths = [os.path.join(files_path, file_name) for file_name in file_names]

In [187]:
# METHODS

def getConsensusDataFrame(path, fp_column=fingerprint_default, name=name_default, bandwidth=bandwidth_default):
    '''
    Returns given consensus path as DataFrame with one row per relay.

    Inputs: *path       : path to consensus file
            fp_column   : (default: 'fingerprint') name of df column identifying the relay.
                          NOTE: the stored value is '<nickname> <fingerprint>' (nickname and
                          fingerprint joined by a single space), not the bare fingerprint
            name        : (default: 'name') name of df column containing the nickname of relay
            bandwidth   : (default: 'bw') name of df column containing the bandwidth of relay
    Returns: df         : DataFrame object containing the 3 columns above
    '''
    # DocumentHandler.DOCUMENT makes parse_file yield the consensus document
    # itself; only the first yielded document is used.
    consensus = next(parse_file(path,
                                descriptor_type='network-status-consensus-3 1.0',
                                document_handler=DocumentHandler.DOCUMENT))

    # Only the relay entries are needed; the keys of `routers` are not used
    relays = list(consensus.routers.values())

    # Build the DF column by column, in the order fp_column, name, bandwidth
    df = pd.DataFrame()
    df[fp_column] = [r.nickname + ' ' + r.fingerprint for r in relays]
    df[name] = [r.nickname for r in relays]
    df[bandwidth] = [r.bandwidth for r in relays]
    return df

def getPresentValueList(dfs, column_name=fingerprint_default):
    '''
    Returns the values of a column that are present in every dataframe of the input list,
    together with every distinct value found in any of them.

    Inputs:  *dfs        : list object containing all dataframes of interest (must not be empty)
             column_name : (default: 'fingerprint') string of the name of the column of interest
    Returns: values_present : list object containing the values found present in all input
                              dataframes, in order of first appearance
             values_all     : unique values of the column over all input dataframes
                              (as returned by Series.unique())
    Raises:  ValueError     : if dfs is empty (raised by pd.concat)
    '''
    # Unique values of the column across all dataframes, in order of first appearance
    df_all = pd.concat(dfs)
    values_all = df_all[column_name].unique()

    # One vectorised membership test per dataframe instead of one scan per
    # (value, dataframe) pair: a value is kept only if every dataframe has it.
    candidates = pd.Series(values_all)
    in_every_df = pd.Series(True, index=candidates.index, dtype=bool)
    for df in dfs:
        in_every_df &= candidates.isin(df[column_name])

    # Iterate over values_all itself so the returned items keep their original type
    values_present = [v for v, keep in zip(values_all, in_every_df) if keep]
    return values_present, values_all


def rearrangeDataFrames(dfs, key_name=fingerprint_default, values_name=bandwidth_default, name_name=name_default):
    '''
    Returns a dataframe using the selected column as the key (index) and the 2nd selected
    column as the values, with one column per input dataframe.

    IMPORTANT all input dataframes must be same length and keys must be unique and equal
    in all dataframes.

    Inputs   *dfs        : list of dataframes
             key_name    : (default: 'fingerprint') String name of column to be used for key.
                           (Must be consistent through all dfs)
             values_name : (default: 'bw') String name of column to be used as values
             name_name   : (default: 'name') currently unused; kept so existing calls keep working
    Returns: df          : Dataframe indexed by key_name with columns '0' .. str(len(dfs) - 1),
                           column i holding the values taken from dfs[i].
                           An empty dataframe is returned (after printing an error) when the
                           input dataframes do not all have the same length, or when dfs is empty.
    '''
    n = len(dfs)

    # Check conditions are met
    lengths = [frame.shape[0] for frame in dfs]
    if len(set(lengths)) != 1:
        print('ERROR: Not all datframes have same length!')
        print('Returning empty dataframe ...')
        return pd.DataFrame()

    # Concat all dfs in a big dataframe and collect, for every key, the list of
    # its values. The list follows the concatenation order, so entry i comes
    # from dfs[i] as long as each key appears exactly once in every dataframe.
    df_all = pd.concat(dfs)
    df_temp = df_all.groupby(key_name)[values_name].apply(list).reset_index(name='list')

    # Spread each list over n columns and index the result by the key column.
    # key_name (not the global default) is used so that custom keys work too.
    df_answer = pd.DataFrame(df_temp['list'].tolist(), columns=[str(i) for i in range(n)])
    df_answer[key_name] = df_temp[key_name]
    df_answer = df_answer.set_index(key_name)

    return df_answer

In [188]:
# Parse every consensus file into its own dataframe (same order as file_paths)
dfs = list(map(getConsensusDataFrame, file_paths))

In [189]:
# Relay identifiers present in every dataframe (fp_present) and in at least one (fp_all)
fp_present, fp_all = getPresentValueList(dfs, column_name=fingerprint_default)

In [190]:
# Check if there are missing relays: both counts are equal only when every
# relay is present in every dataframe
print(len(fp_present), '==', len(fp_all))

5717 == 6943


In [191]:
# Keep, in each dataframe, only the relays that are present in all of them
dfs_present = [
    consensus_df[consensus_df[fingerprint_default].isin(fp_present)]
    for consensus_df in dfs
]

In [192]:
# One row per relay, one bandwidth column per input dataframe
t = rearrangeDataFrames(
    dfs_present,
    key_name=fingerprint_default,
    values_name=bandwidth_default,
)

In [193]:
# Display the rearranged table; it must stay the last bare expression of the
# cell for the notebook to render it.
t

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
fingerprint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0 21239CC313A28B3E6CEE48D949BF7787471B1519,5860,5840,5820,5790,5770,5740,5740,5760,5750,5710,...,5690,5700,5700,5720,5750,5770,5500,5500,5500,5230
0000001dxx B9609624E26C705289E57F95E30FD88D159BD8AD,798,798,798,798,798,950,950,950,950,810,...,828,828,828,828,828,720,720,720,720,720
0001 2BD1936E0B4D5BB615CF99B0CFF74EAF19426888,9200,9200,9200,9200,9200,9200,9200,9200,9200,9200,...,9300,9200,9200,9200,9200,9200,9200,9200,9200,9390
0ZQIX7g6 BB0C636DE89CAC6C995CB380AAC8C4AAAB731BA8,4100,4100,4100,4100,4100,4100,4100,4100,4000,4000,...,4000,4000,4000,4000,4000,4000,4000,4000,4000,4000
0s1nt0 60275B52E810407658FD4278EDB1749C75F956F9,110000,110000,110000,110000,110000,109000,109000,110000,110000,109000,...,109000,109000,109000,109000,110000,110000,110000,110000,110000,110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zwiebelringXX DA4B488C2826DFBBD04D635DA1E71A2BA5B20747,73000,72000,72000,72000,72000,72000,72000,72000,72000,75000,...,74300,74400,74700,75200,75200,75400,76100,76500,76700,77000
zwiebeltoralf 63BF46A63F9C21FD315CD061B3EAA3EB05283A0A,83200,83000,82700,82300,81900,81500,81500,81600,81200,80700,...,79000,79000,79000,79000,79000,79000,79000,79000,79000,79000
zwiebeltoralf2 509EAB4C5D10C9A9A24B4EA0CE402C047A2D64E6,80800,80900,80600,81000,91600,91300,91000,90500,90000,89500,...,88600,81000,81000,81000,81000,81000,81000,81000,81000,81000
zwiubel B0553175AADB0501E5A61FC61CEA3970BE130FF2,31700,31800,31700,31500,31400,41000,44000,44000,44000,63100,...,62800,62800,63000,63300,63400,45000,45000,45000,45000,45000


In [185]:
#t.to_csv('csvs/09-12-19.csv')