In [1]:
from pandas import HDFStore, read_hdf
import pandas as pd
from os.path import expanduser

In [2]:
# Build the HDF5 file names from matching (dataset type, version) pairs:
# the i-th type is combined with the i-th version.
df_types = ['lit', 'gw']
versions = [8, 11]

fnames = list(map('edges_all_{0}{1}.h5'.format, df_types, versions))
print(fnames)

['edges_all_lit8.h5', 'edges_all_gw11.h5']


## Check the keys in the HDF5 files

In [3]:
# List the table keys stored in every HDF5 file.
#   keys       : file name -> sorted list of its keys
#   index_list : all keys of all files, concatenated in file order
keys = {}
index_list = []
for f in fnames:
    # Open read-only inside a context manager so the file handle is released
    # as soon as the keys are read, even if reading them raises.
    with HDFStore(expanduser(f), mode='r') as store:
        store_keys = store.keys()
    keys[f] = sorted(store_keys)
    print('{}: \n{}\n'.format(f, store_keys))
    index_list += store_keys

edges_all_lit8.h5: 
['/y1947', '/y1950', '/y1951', '/y1952', '/y1960', '/y1963', '/y1964', '/y1965', '/y1966', '/y1967', '/y1968', '/y1969', '/y1970', '/y1971', '/y1972', '/y1973', '/y1974', '/y1975', '/y1976', '/y1977', '/y1978', '/y1979', '/y1980', '/y1981', '/y1982', '/y1983', '/y1984', '/y1985', '/y1986', '/y1987', '/y1988', '/y1989', '/y1990', '/y1991', '/y1992', '/y1993', '/y1994', '/y1995', '/y1996', '/y1997', '/y1998', '/y1999', '/y2000', '/y2001', '/y2002', '/y2003', '/y2004', '/y2005', '/y2006', '/y2007', '/y2008', '/y2009', '/y2010', '/y2011', '/y2012', '/y2013']

edges_all_gw11.h5: 
['/y1975', '/y1976', '/y1977', '/y1978', '/y1979', '/y1980', '/y1981', '/y1982', '/y1983', '/y1984', '/y1985', '/y1986', '/y1987', '/y1988', '/y1989', '/y1990', '/y1991', '/y1992', '/y1993', '/y1994', '/y1995', '/y1996', '/y1997', '/y1998', '/y1999', '/y2000', '/y2001', '/y2002', '/y2003', '/y2004', '/y2005', '/y2006', '/y2007', '/y2008', '/y2009', '/y2010', '/y2011', '/y2012', '/y2013']



## Merge the two datasets

In [4]:
def merge(year):
    """
    Merge the 'lit' and 'gw' edge tables of one year into a directed,
    weighted adjacency list and print summary statistics.

    The table '/y<year>' is read from 'edges_all_lit8.h5' and, for years from
    1975 on, also from 'edges_all_gw11.h5'. Each row is an edge from column
    'up' to column 'dn' whose weight is stored in column 0. When an edge is
    present in both files the larger weight is kept.

    Parameters
    ----------
    year : int
        Year to load; must satisfy 1947 <= year <= 2013.

    Returns
    -------
    dict
        ``{vertex: {neighbor: weight}}`` with one (possibly empty) entry for
        every vertex that occurs as head or tail of an edge. Loops
        (vertex -> vertex) are kept.

    Raises
    ------
    ValueError
        If ``year`` is outside [1947, 2013].

    Notes
    -----
    ``read_hdf`` itself raises if a store has no table for the requested
    year; the 'lit' store does not contain every year in the accepted range.
    """
    if (year < 1947) or (year > 2013):
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError('year has to be >= 1947 and <=2013')

    hdf_key = '/y{}'.format(year)
    vertices = set()
    edges = {}  # (head, tail) -> weight

    # Literature edges. Rows are walked positionally, which does not depend
    # on the frame's index labels being unique.
    lit = read_hdf('edges_all_lit8.h5', hdf_key)
    for head, tail, value in zip(lit['up'].values, lit['dn'].values, lit[0].values):
        edges[(head, tail)] = value
        vertices.add(head)
        vertices.add(tail)

    # The 'gw' store has no table before '/y1975', so it is only consulted
    # from 1975 onwards.
    if year >= 1975:
        gw = read_hdf('edges_all_gw11.h5', hdf_key)
        for head, tail, value in zip(gw['up'].values, gw['dn'].values, gw[0].values):
            pair = (head, tail)
            if pair in edges:
                edges[pair] = max(edges[pair], value)
            else:
                edges[pair] = value
            vertices.add(head)
            vertices.add(tail)

    # Build the adjacency list and classify every edge.
    adjacency_list = {v: {} for v in vertices}
    num_loops, num_two_ways, num_one_ways = 0, 0, 0

    for (head, tail), weight in edges.items():
        up, down = int(head), int(tail)
        adjacency_list[up][down] = weight
        if up == down:
            num_loops += 1
        elif (tail, head) in edges:
            # Counted once per direction; halved when reported below.
            num_two_ways += 1
        else:
            num_one_ways += 1

    print('number of vertices = {},'.format(len(vertices)))
    print('total number of edges = {},'.format(len(edges)))
    print('number of loops = {},\nnumber of pairs of two-way edges = {},\nnumber of one-way edges = {}.\n'
          .format(num_loops, num_two_ways // 2, num_one_ways))

    return adjacency_list

In [5]:
# Year to analyse; also used in the names of the pickle files written below.
year = 1980
# Directed adjacency list {vertex: {neighbor: weight}} for that year
# (merge also prints summary statistics).
adjacency_list = merge(year)

number of vertices = 1015,
total number of edges = 2043,
number of loops = 54,
number of pairs of two-way edges = 281,
number of one-way edges = 1427.



In [6]:
def remove_direction(adjacency_list):
    """
    Turn a directed, weighted adjacency list into an undirected one.

    Loops (vertex -> vertex) are dropped. Each remaining edge is stored in
    both directions; when the input contains both u -> v and v -> u, the
    undirected weight is the sum of the two directed weights.

    Parameters
    ----------
    adjacency_list : dict
        ``{vertex: {neighbor: weight}}``. It is not modified.

    Returns
    -------
    dict
        ``{vertex: {neighbor: weight}}`` with symmetric entries. Every key of
        the input is present (possibly with an empty neighborhood); a
        neighbor that is not a key of the input gets its own entry as well.
    """
    undirected_adjacency_list = {vertex: {} for vertex in adjacency_list}
    # Pairs already handled, stored as tuples: unlike concatenated strings,
    # tuples stay unambiguous for labels that themselves contain '_'.
    handled_pairs = set()

    for vertex, neighborhood in adjacency_list.items():
        for neighbor, weight in neighborhood.items():
            if neighbor == vertex:
                continue  # drop loops
            if (neighbor, vertex) in handled_pairs:
                continue  # already covered when visiting the opposite direction

            # Add the weight of the opposite edge when there is one. A
            # neighbor without its own entry has no outgoing edges.
            opposite = adjacency_list.get(neighbor, {})
            if vertex in opposite:
                weight = weight + opposite[vertex]

            undirected_adjacency_list[vertex][neighbor] = weight
            undirected_adjacency_list.setdefault(neighbor, {})[vertex] = weight
            handled_pairs.add((vertex, neighbor))

    return undirected_adjacency_list

In [7]:
# Undirected counterpart of adjacency_list (loops are removed by remove_direction).
undirected_adjacency_list = remove_direction(adjacency_list)

In [8]:
# Preview the first ten vertices: for each one print its directed
# neighborhood followed by its undirected neighborhood.
count = 0
for count, key in enumerate(adjacency_list, start=1):
    d_nbh, und_nbh = adjacency_list[key], undirected_adjacency_list[key]
    print('{}:\n\t{}\n\t{}'.format(key, d_nbh, und_nbh))
    if count == 10:
        break

51201:
	{3859: 1, 3973: 1, 27102: 1, 58163: 2}
	{3859: 3, 3973: 1, 27102: 1, 58163: 4}
2:
	{213: 1, 973: 2, 1991: 1, 3818: 1, 5340: 3, 5972: 1, 7018: 1, 10993: 1, 11012: 1, 64006: 2}
	{213: 2, 973: 2, 1991: 1, 3818: 2, 5340: 7, 5972: 1, 7018: 1, 10993: 1, 11012: 1, 64006: 3, 12: 1}
2051:
	{1363: 1, 9623: 1}
	{1363: 1, 9623: 1, 56955: 1}
2052:
	{909: 1, 27306: 1}
	{909: 1, 27306: 2}
2056:
	{174: 1, 5741: 1, 8288: 1, 2056: 3, 5972: 1}
	{174: 1, 5741: 1, 8288: 1, 5972: 3, 353: 1}
10249:
	{}
	{7035: 1, 3930: 1}
12:
	{2: 1, 1506: 1}
	{2: 1, 1506: 2}
15:
	{}
	{7200: 1}
55312:
	{342184: 1}
	{342184: 1}
17:
	{}
	{11275: 1}


In [9]:
import pickle

# Save the directed adjacency list to 'adjacency_list_<year>'.
# The with-block closes the file even if pickling fails part-way.
filename = 'adjacency_list_{}'.format(year)
with open(filename, 'wb') as pickle_out:
    pickle.dump(adjacency_list, pickle_out)

In [10]:
# Save the undirected adjacency list to 'adjacency_list_<year>_undirected'.
# The with-block closes the file even if pickling fails part-way.
filename = 'adjacency_list_{}_undirected'.format(year)
with open(filename, 'wb') as pickle_out:
    pickle.dump(undirected_adjacency_list, pickle_out)