### Read in the data

In [78]:
import pandas as pd
import numpy as np

# Load the character co-occurrence table from CSV.
df = pd.read_csv('../data/cooccurrence.csv')

In [3]:
# Preview the first five rows to check how the CSV was parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,A. Dippet,A. Kirke,A. Lynch,A. Pye,A. Sinistra,Aberforth D.,Abraxas M.,Adrian P.,Alastor M.,...,Whomping Willow,William S.,William the Pukwudgie,Winky,Xenophilius L.,Yaxley,Zacharias S.,Zacharias S.] Megan J.,Zhang Fei,Úrsula F.
,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A. Dippet,0,24,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
A. Kirke,0,0,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A. Lynch,0,0,0,13,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A. Pye,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Some minor clean up 
See 'co-occurrence cleaned' notebook for reasoning

In [4]:
# Drop the blank leading row and the leftover 'Unnamed: 0' column.
# The guard makes this cell safe to re-run: without it, a second run would
# silently discard another (real) row and then fail with a KeyError because
# the column is already gone.
if 'Unnamed: 0' in df.columns:
    df = df.drop(df.index[0]).drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,A. Dippet,A. Kirke,A. Lynch,A. Pye,A. Sinistra,Aberforth D.,Abraxas M.,Adrian P.,Alastor M.,Albert R.,...,Whomping Willow,William S.,William the Pukwudgie,Winky,Xenophilius L.,Yaxley,Zacharias S.,Zacharias S.] Megan J.,Zhang Fei,Úrsula F.
A. Dippet,24,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
A. Kirke,0,11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A. Lynch,0,0,13,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A. Pye,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A. Sinistra,0,0,0,0,175,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# A character's own count is read from the diagonal: column `name`, row label `name`.
# NOTE(review): this lookup needs the row labels to be character names --
# confirm how df is indexed when it is loaded.
all_characters = [
    {'name': character, 'count': df[character][character]}
    for character in df.keys()
]

# Highest counts first.
sorted_characters = sorted(all_characters, key=lambda entry: entry['count'], reverse=True)

### Build node link json
To draw this as a graph, we need nodes and links. I based the structure on [miserables.json](https://github.com/d3/d3-plugins/blob/master/graph/data/miserables.json).

We'll only look at the 20 most frequent characters and their strongest relations, so that the graph doesn't get too busy.

In [34]:
# How many of the top-ranked characters seed the graph.
num_characters = 20
sorted_characters[:num_characters]

[{'count': 151951, 'name': 'Harry P.'},
 {'count': 127244, 'name': 'Hermione G.'},
 {'count': 110555, 'name': 'Draco M.'},
 {'count': 50641, 'name': 'Severus S.'},
 {'count': 46677, 'name': 'Lily Evans P.'},
 {'count': 45143, 'name': 'James P.'},
 {'count': 44733, 'name': 'Sirius B.'},
 {'count': 44599, 'name': 'Ginny W.'},
 {'count': 38037, 'name': 'Ron W.'},
 {'count': 37010, 'name': 'Remus L.'},
 {'count': 32822, 'name': 'OC'},
 {'count': 13813, 'name': 'Scorpius M.'},
 {'count': 12353, 'name': 'Voldemort'},
 {'count': 11944, 'name': 'George W.'},
 {'count': 11735, 'name': 'Luna L.'},
 {'count': 10712, 'name': 'Fred W.'},
 {'count': 10372, 'name': 'Albus D.'},
 {'count': 10266, 'name': 'Rose W.'},
 {'count': 8514, 'name': 'N. Tonks'},
 {'count': 7498, 'name': 'Tom R. Jr.'}]

Let's keep a list of the names of all of our characters that will appear on the graph. This will form the basis of our `nodes` list later.

In [52]:
# Names of the top characters, in ranking order.
names_only = [entry['name'] for entry in sorted_characters[:num_characters]]

Here are some groupings I put together for the characters. They only influence the color of each node, so they aren't that important. Some characters, like Tonks, were harder to place in a generation than others. I also refrained from sorting the next-generation kids into houses, since the fandom is quite split on whether *Cursed Child* is part of the canon.

In [57]:
# Generation of each character; only used to color the nodes.
generations = {
    'main_gen': ['Harry P.', 'Hermione G.', 'Draco M.', 'Ginny W.', 'Ron W.', 'George W.', 'Luna L.', 'Fred W.',
                 'Angelina J.', 'Neville L.', 'N. Tonks', 'Charlie W.'],
    'marauder_gen': ['Severus S.', 'Lily Evans P.', 'James P.', 'Sirius B.', 'Remus L.', 'Bellatrix L.'],
    'riddle_gen': ['Voldemort', 'Tom R. Jr.', 'Albus D.', 'Minerva M.', 'Gellert G.'],
    'next_gen': ['Scorpius M.', 'Rose W.', 'Albus S. P.', 'Lily Luna P.', 'James S. P.'],
    'other': ['OC'],
}

# Hogwarts house of each character; only used to color the nodes.
# Luna is a Ravenclaw (she was previously listed under Gryffindor by mistake).
houses = {
    'Gryffindor': ['Harry P.', 'Hermione G.', 'Ginny W.', 'Ron W.', 'George W.', 'Fred W.',
                   'Angelina J.', 'Neville L.', 'Charlie W.', 'Lily Evans P.', 'James P.', 'Sirius B.',
                   'Remus L.', 'Albus D.', 'Minerva M.'],
    'Slytherin': ['Draco M.', 'Severus S.', 'Bellatrix L.', 'Voldemort', 'Tom R. Jr.'],
    'Hufflepuff': ['N. Tonks'],
    'Ravenclaw': ['Luna L.'],
    'other': ['Gellert G.', 'Scorpius M.', 'Rose W.', 'Albus S. P.', 'Lily Luna P.', 'James S. P.', 'OC'],
}

This helper function returns a character's affiliation (the key of the group that contains them), given the group mapping and the character's name.

In [61]:
def findAffiliation(group, character):
    """Return the key of the first entry in `group` that contains `character`.

    `group` maps an affiliation name to a collection of character names.
    Returns None when no entry contains the character.
    """
    for affiliation, members in group.items():
        if character in members:
            return affiliation
    return None
    
# Quick sanity checks. Only the last expression is echoed as the cell output,
# so the result of the first call is not displayed.
findAffiliation(generations, 'Scorpius M.')
findAffiliation(houses, 'N. Tonks')

'Hufflepuff'

Now we can build our links!

In [87]:
def alreadyLinked(source, target, links):
    """Return True if `links` already holds an entry going from `source` to `target`.

    The check is directional: an entry from `target` to `source` does not count.
    Each element of `links` is a dict with 'source' and 'target' keys.
    """
    return any(
        entry['source'] == source and entry['target'] == target
        for entry in links
    )

# Sample data for exercising alreadyLinked. It is deliberately NOT named `links`:
# that name holds the real link list that ends up in the exported JSON, and
# re-running this cell after building it would silently replace it with these
# five rows.
sample_links = [{'source': 'Harry P.', 'target': 'Draco M.', 'value': 34761},
 {'source': 'Harry P.', 'target': 'Hermione G.', 'value': 27888},
 {'source': 'Harry P.', 'target': 'Ginny W.', 'value': 22153},
 {'source': 'Harry P.', 'target': 'Severus S.', 'value': 12273},
 {'source': 'Harry P.', 'target': 'Ron W.', 'value': 9173}]

source = 'Harry P.'
target1 = 'Ron W.'   # present in sample_links -> expect True
target2 = 'asdf'     # not present -> expect False

print(alreadyLinked(source, target1, sample_links))
print(alreadyLinked(source, target2, sample_links))

True
False


In [90]:
links = []
for character in sorted_characters[:num_characters]:
    # The six largest entries in this character's column; one of them is
    # normally the character's own count, which is skipped below.
    relations = df[character['name']].nlargest(6)
    for relation in relations.keys():
        # skip yourself
        if relation == character['name']:
            continue
        # Related characters outside the initial list still need a node.
        if relation not in names_only:
            names_only.append(relation)
        # Don't add relation -> character when the opposite direction is already recorded.
        if not alreadyLinked(relation, character['name'], links):
            links.append({'source': character['name'],
                          'target': relation,
                          # json can't serialize numpy scalars; .item() converts to a
                          # native Python number. (np.asscalar, used here before, was
                          # deprecated in NumPy 1.16 and removed in 1.23.)
                          'value': relations[relation].item()
                         })
                
                

In [91]:
# Inspect the generated links.
links

[{'source': 'Harry P.', 'target': 'Draco M.', 'value': 34761},
 {'source': 'Harry P.', 'target': 'Hermione G.', 'value': 27888},
 {'source': 'Harry P.', 'target': 'Ginny W.', 'value': 22153},
 {'source': 'Harry P.', 'target': 'Severus S.', 'value': 12273},
 {'source': 'Harry P.', 'target': 'Ron W.', 'value': 9173},
 {'source': 'Hermione G.', 'target': 'Draco M.', 'value': 44837},
 {'source': 'Hermione G.', 'target': 'Ron W.', 'value': 26687},
 {'source': 'Hermione G.', 'target': 'Severus S.', 'value': 11213},
 {'source': 'Hermione G.', 'target': 'Ginny W.', 'value': 3583},
 {'source': 'Draco M.', 'target': 'Ginny W.', 'value': 12946},
 {'source': 'Draco M.', 'target': 'OC', 'value': 4130},
 {'source': 'Draco M.', 'target': 'Ron W.', 'value': 2612},
 {'source': 'Severus S.', 'target': 'Lily Evans P.', 'value': 6619},
 {'source': 'Severus S.', 'target': 'OC', 'value': 2816},
 {'source': 'Severus S.', 'target': 'Remus L.', 'value': 2149},
 {'source': 'Lily Evans P.', 'target': 'James P.',

And our nodes!

In [92]:
# One node per collected character name, tagged with its generation and house.
nodes = [
    {
        'id': character_name,
        'generation': findAffiliation(generations, character_name),
        'house': findAffiliation(houses, character_name),
    }
    for character_name in names_only
]
nodes

[{'generation': 'main_gen', 'house': 'Gryffindor', 'id': 'Harry P.'},
 {'generation': 'main_gen', 'house': 'Gryffindor', 'id': 'Hermione G.'},
 {'generation': 'main_gen', 'house': 'Slytherin', 'id': 'Draco M.'},
 {'generation': 'marauder_gen', 'house': 'Slytherin', 'id': 'Severus S.'},
 {'generation': 'marauder_gen', 'house': 'Gryffindor', 'id': 'Lily Evans P.'},
 {'generation': 'marauder_gen', 'house': 'Gryffindor', 'id': 'James P.'},
 {'generation': 'marauder_gen', 'house': 'Gryffindor', 'id': 'Sirius B.'},
 {'generation': 'main_gen', 'house': 'Gryffindor', 'id': 'Ginny W.'},
 {'generation': 'main_gen', 'house': 'Gryffindor', 'id': 'Ron W.'},
 {'generation': 'marauder_gen', 'house': 'Gryffindor', 'id': 'Remus L.'},
 {'generation': 'other', 'house': 'other', 'id': 'OC'},
 {'generation': 'next_gen', 'house': 'other', 'id': 'Scorpius M.'},
 {'generation': 'riddle_gen', 'house': 'Slytherin', 'id': 'Voldemort'},
 {'generation': 'main_gen', 'house': 'Gryffindor', 'id': 'George W.'},
 {'gen

### Output as JSON

In [93]:
# Bundle nodes and links into a single node-link structure.
out_json = dict(nodes=nodes, links=links)

In [94]:
import json

# Write the node-link structure to disk as JSON.
with open('../data/potter.json', 'w') as potter_file:
    json.dump(out_json, potter_file)