In [6]:
import os

import pandas as pd
import stem
from stem.descriptor import DocumentHandler, parse_file

In [23]:
files_path = "data/consensuses-2019-12/01/"
file_names = os.listdir(files_path)
file_paths = [files_path + file_name for file_name in file_names]

In [65]:
# METHODS

'''
Returns given consensus path as DataFrame with nicknames and bandwidths of relays
Inputs: *path     : path to consensus file
        name      : (default: 'name') name of df column containing the nickname of relay
        bandwidth : (default: 'bw') name of df columns containing the bandwidth of relay
Returns: df       : DataFrame object contatining 2 columns
'''
def getConsensusDataFrame(path, name='name', bandwidth='bw'):
    consensus = next(parse_file(file, descriptor_type = 'network-status-consensus-3 1.0',\
                                document_handler = DocumentHandler.DOCUMENT,))
    # Get list of relays
    routers = consensus.routers.items()
    relays = []
    for fingerprint, relay in routers:
        relays.append(relay)

    # Return DF containing name and bandwidth
    df = pd.DataFrame()
    df[name] = [r.nickname for r in relays]
    df[bandwidth] = [r.bandwidth for r in relays]
    return df

'''
Returns a list of the values that are present in every dataframe in the input list
Inputs:  *dfs        : list object containing all dataframes of interest
         column_name : (default: 'name') string of the name of the column of interest
Returns: values_present, values_all      : List object cotaining the values found present in all input dataframes
'''
def getPresentValueList(dfs, column_name='name'):
    # Get list of unique values for nicknames in all dataframes
    df_all = pd.concat(dfs)
    values_all = df_all[column_name].unique()
    values_present = []
    
    # For each value check that it exists in every df and add it to the present value list
    for v in values_all:
        for i,df in enumerate(dfs):
            if df[column_name].isin([v]).any():
                if i==len(dfs)-1:
                    values_present.append(v)
            else:
                break
    return values_present, values_all


'''
Returns a dataframe using the selected column as the key and the 2nd selected column as the values.
IMPORTANT all input dataframes must be same length and names must be unique and equal in all dataframes
Inputs   *dfs        : list of dataframes
         key_name    : (default:'name') String name of column to be used for key. (Must be consistent through all dfs)
         values_name : (default:'bw') String name of columnt to be used as values
Returns: df          : Dataframe
'''
def rearrangeDataFrames(dfs, key_name='name', values_name='bw'):
    df = pd.DataFrame()
    
    # Check conditions are met
    lengths = [df.shape[0] for df in dfs]
    if len(set(lengths)) != 1:
        print('ERROR: Not all datframes have same length!')
        print('Returning empty dataframe ...')
        return df
    
    # Concat all dfs in a big dataframe and rearrange
    df_all = pd.concat(dfs)
    df_temp = df_all.groupby(key_name)[values_name].apply(list).reset_index(name='list')
    
    # Add the lists as columns
    temp_var = []
    
    df[key_name] = list(df_temp[key_name])
    for i,name in enumerate(list(df_temp[key_name])):
        # Test
        temp_var.append(list(df_temp.iloc[i])[1])
        # End test
        #df[str(i)] = list(df_temp.iloc[i])[1]
    
    return temp_var

In [78]:
# Get a list with all dataframes in file_paths
# dfs = [getConsensusDataFrame(n) for n in file_paths]
# [df[df['name']=='0x616e6f6e'] for df in dfs]

t = dfs[0]
t[t['name']=='0x616e6f6e']

Unnamed: 0,name,bw
3,0x616e6f6e,11200
1496,0x616e6f6e,430
4476,0x616e6f6e,4130
4508,0x616e6f6e,11100
5336,0x616e6f6e,4200


In [31]:
# Get the names of the relays present in every dataframe
names_present, names_all = getPresentValueList(dfs)

In [38]:
# Check if theres missing relays
len(names_present)
print(len(names_present),'==',len(names_all))

5088 == 5088


In [75]:
df_temp.iloc[14]

name                                           0x616e6f6e
list    [11200, 430, 4130, 11100, 4200, 11200, 430, 41...
Name: 14, dtype: object

In [74]:
l = rearrangeDataFrames(dfs)
#[len(i) for i in l]
l[14]

[11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200,
 11200,
 430,
 4130,
 11100,
 4200]

In [59]:
df_temp[df_temp['list']

0                        0
1               0000001dxx
2                     0001
3                 0ZQIX7g6
4       0e4f10e596ff613288
               ...        
5083         zwiebelringXX
5084         zwiebeltoralf
5085        zwiebeltoralf2
5086               zwiubel
5087                  zyan
Name: name, Length: 5088, dtype: object

In [69]:
df_temp

Unnamed: 0,name,list
0,0,"[5760, 5760, 5760, 5760, 5760, 5760, 5760, 576..."
1,0000001dxx,"[886, 886, 886, 886, 886, 886, 886, 886, 886, ..."
2,0001,"[6720, 6720, 6720, 6720, 6720, 6720, 6720, 672..."
3,0ZQIX7g6,"[5700, 5700, 5700, 5700, 5700, 5700, 5700, 570..."
4,0e4f10e596ff613288,"[297, 297, 297, 297, 297, 297, 297, 297, 297, ..."
...,...,...
5083,zwiebelringXX,"[79900, 79900, 79900, 79900, 79900, 79900, 799..."
5084,zwiebeltoralf,"[80000, 80000, 80000, 80000, 80000, 80000, 800..."
5085,zwiebeltoralf2,"[81000, 81000, 81000, 81000, 81000, 81000, 810..."
5086,zwiubel,"[81700, 81700, 81700, 81700, 81700, 81700, 817..."
