In [1]:
import codes2network
import json
import random

# Relative path to the TRIP CSV that the network is built from.
data = "../data/TRIP_combined3.csv"

# csv2network is defined in the project-local codes2network module (not shown here).
# Of its three results only edge_dict is used by the visible cells, which read each of its
# items as ((node, node), value).
node_names, node_weights, edge_dict = codes2network.csv2network(data)

Processed 6605 articles


In [2]:
# Seed the generator so the random year assignment below is the same on every
# Restart & Run All; without it the generated JSON changes on each run.
random.seed(0)

# Group the edges by a randomly drawn year in 2000-2016 (inclusive).
# Each item of edge_dict is ((node, node), value); it is stored as [node, node, value].
yr_dict = {}
for pair, strength in edge_dict.items():  # .items() works on both Python 2 and 3
    yr = str(random.randint(2000, 2016))
    yr_dict.setdefault(yr, []).append([pair[0], pair[1], strength])
        


In [5]:
def network2nodes(outfile_path, yr_dict, keep_strengths = False):

    """
    Build a per-year adjacency list from edge records and save it as JSON.

    Params:

    String outfile_path: File path to the file that you want to write to. Will be read by D3 force graphs

    Dictionary yr_dict: A dictionary with years as keys and a list of edges as values. Each edge is a
    sequence [source, target, strength]; source and target must be non-negative integer node ids.

    Bool keep_strengths: boolean flag on whether to keep the strength of a connection as part of the data.
    When True every neighbour is stored as a (target, strength) tuple (JSON writes it as a two-element
    list); when False only the target id is stored.

    Returns:

    A dictionary with the following structure, which is also written to outfile_path:
    {
        "2000": [[.....],[.....]],
        "2001": [[.....],[.....]]
    }

    For each year as a key, there is a list of nodes, denoted by the index of the list. At each index there is
    a sublist with the ids of the nodes that that node is connected to. For example, if we wanted to see what
    node 50 was connected to in 2015, we would type connects['2015'][50]. This returns a sublist of the other
    ids that node 50 is connected to.

    The list for a year is long enough to hold every id that occurs in that year, as a source or as a
    target, so the position in the list always equals the node id and no neighbour id can be out of
    range. Nodes without outgoing edges get an empty sublist. (Taking dict.values() instead would drop
    those nodes and shift every later node to the wrong position.)

    Raises:

    ValueError if an edge contains a negative node id, which would otherwise silently index from the
    end of the list.
    """
    connects = {}

    for year, edges in yr_dict.items():
        # Find how many slots are needed so that every id seen this year is a valid index.
        num_nodes = 0
        for edge in edges:
            source, target = edge[0], edge[1]
            if source < 0 or target < 0:
                raise ValueError("node ids must be non-negative, got %r -> %r" % (source, target))
            num_nodes = max(num_nodes, source + 1, target + 1)

        # One independent sublist per node id; ids with no outgoing edge keep an empty sublist.
        adjacency = [[] for _ in range(num_nodes)]
        for edge in edges:
            source, target = edge[0], edge[1]
            if keep_strengths:
                # keep the strength next to the neighbour it belongs to
                adjacency[source].append((target, edge[2]))
            else:
                adjacency[source].append(target)
        connects[year] = adjacency

    #save file as a json
    with open(outfile_path, 'w') as outfile:
        json.dump(connects, outfile)

    return connects


In [6]:
# Build the per-year adjacency lists, keeping each connection's strength, and write them to JSON.
with_strengths = network2nodes('../data/years_with_strength.json', yr_dict, keep_strengths=True)

In [7]:
# Show the returned structure: year -> list of sublists of (neighbour id, strength) tuples.
with_strengths

{'2000': [[(74, 3.194549583648751),
   (35, 0.408781226343679),
   (98, 0.1968205904617714),
   (21, 3.951551854655564),
   (93, 0.09084027252081757),
   (95, 0.01514004542013626),
   (84, 0.03028009084027252)],
  [(62, 0.2573807721423164),
   (90, 0.5601816805450416),
   (16, 0.6964420893262679),
   (20, 0.9689629068887207),
   (107, 1.4837244511733536)],
  [(95, 0.10598031794095382),
   (5, 0.6056018168054504),
   (31, 0.27252081756245267),
   (109, 0.5450416351249053),
   (74, 0.8932626797880394),
   (15, 2.906888720666162),
   (16, 0.757002271006813)],
  [(12, 0.21196063588190764),
   (73, 0.5450416351249053),
   (72, 0.06056018168054504),
   (110, 3.8607115821347464),
   (61, 3.785011355034065),
   (63, 0.27252081756245267),
   (5, 1.3474640423921271),
   (7, 0.48448145344436033)],
  [(112, 0.757002271006813),
   (82, 0.34822104466313397),
   (90, 1.756245268735806),
   (91, 0.3028009084027252),
   (69, 0.3028009084027252),
   (109, 1.1506434519303559),
   (27, 0.6358819076457229)

In [25]:
# Number of sublists stored for the year 2000.
# NOTE(review): `strength_connects` is not assigned in any cell visible here, so this cell fails on a
# fresh kernel. It may be an earlier name for `with_strengths` - confirm and rename.
len(strength_connects['2000'])

117

Make a dictionary with topics as keys; each value is a list of tuples pairing a node that the topic connects to with the weight of that connection.

In [68]:
# Show the adjacency lists without strengths: year -> list of sublists of neighbour ids.
# NOTE(review): `just_connects` is not assigned in any cell visible here; presumably it is the
# keep_strengths=False counterpart of `with_strengths` - confirm and define it before this cell.
just_connects

{'2000': [[75, 46, 53, 55, 20, 88],
  [27, 128, 134, 127, 119],
  [123, 120, 33, 58, 76, 21, 39],
  [13, 69, 18, 61, 63, 118, 33, 88, 4, 43, 99, 95, 94],
  [14, 43, 120, 92, 16, 103, 19, 70, 24, 131, 67, 33, 78],
  [105, 74],
  [132, 102, 14, 119, 80, 107],
  [78, 30, 120, 122, 18, 118, 90, 112, 87, 36],
  [39, 64, 102],
  [124, 17, 49, 45, 82, 28, 118],
  [25, 77, 118, 34, 80, 88, 68, 120],
  [56, 108, 111, 38, 61, 69, 12, 112, 78],
  [91, 122, 27, 79, 109, 110, 55, 134, 15, 64, 40, 67],
  [48, 83, 33, 91],
  [98, 19, 110, 111],
  [122, 40, 89, 68, 42, 22, 134, 60],
  [69, 130],
  [116, 29, 104, 72, 131, 132, 115],
  [111, 67, 34, 27, 52, 69, 132, 87],
  [64, 92, 78, 111, 84, 112, 31],
  [26, 63, 128],
  [37, 36],
  [73, 113, 29, 78, 64, 99],
  [47, 81, 38, 107, 61],
  [106, 36],
  [36, 46, 44, 96, 124, 56, 26, 32],
  [94, 62, 51, 77, 101, 90],
  [86, 104, 105, 77, 34, 78, 98, 48, 51, 88],
  [89, 81],
  [98, 97, 101, 65, 52, 74, 111, 56, 117],
  [36, 84, 75, 89, 68, 98, 50],
  [108, 1

**Problem:** The index of each sublist is supposed to be the id of a node, and the values in that sublist are supposed to be the ids of the nodes it connects to. However, some of those ids point to indexes that are outside the range of the list. For example, the sublist at index 1 is [27, 128, 134, 127, 119], but the full list has only 116 entries. So when the network is being constructed, it will look at index 1 (node 1) and think that it is connected to node 128, but that node is not in our network.

In [69]:
# Number of sublists for the year 2000; a neighbour id must be below this to be a valid index.
len(just_connects['2000'])

116

In [74]:
# Display the adjacency lists again.
# NOTE(review): the output no longer contains ids >= 116, so the out-of-range ids were presumably
# removed in a cell that is not shown here - confirm.
just_connects

{'2000': [[75, 46, 53, 55, 20, 88],
  [27],
  [33, 58, 76, 21, 39],
  [13, 69, 18, 61, 63, 33, 88, 4, 43, 99, 95, 94],
  [14, 43, 92, 16, 103, 19, 70, 24, 67, 33, 78],
  [105, 74],
  [102, 14, 80, 107],
  [78, 30, 18, 90, 112, 87, 36],
  [39, 64, 102],
  [17, 49, 45, 82, 28],
  [25, 77, 34, 80, 88, 68],
  [56, 108, 111, 38, 61, 69, 12, 112, 78],
  [91, 27, 79, 109, 110, 55, 15, 64, 40, 67],
  [48, 83, 33, 91],
  [98, 19, 110, 111],
  [40, 89, 68, 42, 22, 60],
  [69],
  [29, 104, 72, 115],
  [111, 67, 34, 27, 52, 69, 87],
  [64, 92, 78, 111, 84, 112, 31],
  [26, 63],
  [37, 36],
  [73, 113, 29, 78, 64, 99],
  [47, 81, 38, 107, 61],
  [106, 36],
  [36, 46, 44, 96, 56, 26, 32],
  [94, 62, 51, 77, 101, 90],
  [86, 104, 105, 77, 34, 78, 98, 48, 51, 88],
  [89, 81],
  [98, 97, 101, 65, 52, 74, 111, 56],
  [36, 84, 75, 89, 68, 98, 50],
  [108, 109, 66, 80, 53, 75, 104],
  [68, 101, 34, 83],
  [91, 46, 87, 35],
  [105, 37, 53],
  [70, 46, 108],
  [102, 98, 58],
  [88, 55],
  [92, 61, 82, 111, 

In [75]:
# Save the strength-free adjacency lists as JSON.
# The path is relative, like the other data paths in this notebook, instead of pointing into one
# user's home directory, so the notebook also runs on other machines.
# NOTE(review): assumes ../data is the same directory as the former absolute .../dynamic_graphs/data.
with open('../data/years.json', 'w') as outfile:
    json.dump(just_connects, outfile)
