# Types of Centrality

**Betweenness Centrality:** The number of shortest paths connecting other nodes that pass through a given node.

**Closeness Centrality:** The reciprocal of the sum of the shortest-path distances from a node to all other nodes in the network.

**Eigenvector Centrality:** A score proportional to the sum of the centrality scores of the nodes that a node is connected to.

**Degree Centrality:** The degree of a node, i.e. the number of edges attached to it.

<img src="images/Screen Shot 2019-02-05 at 3.48.30 PM.png" />


In [1]:
# One record per animal, laid out as:
#   (name, speed in mph, weight in kg, lifespan in years, brain mass in g)
animals = [
    ("dog", 46, 35, 13, 280),
    ("elephant", 30, 3500, 50, 6250),
    ("frog", 5, 0.5, 8, 3),
    ("hippopotamus", 45, 1600, 45, 573),
    ("horse", 40, 385, 30, 642),
    ("human", 27, 80, 78, 2000),
    ("lion", 50, 250, 30, 454),
    ("mouse", 8, 0.025, 2, 0.625),
    ("rabbit", 25, 4, 12, 40),
    ("shark", 26, 230, 20, 92),
    ("sparrow", 16, 0.024, 7, 2),
]

def importance_rank(items, weights):
    """Rank items by a weighted sum of their numeric attributes.

    Each item is a sequence whose first element is a name and whose
    remaining elements are numeric attributes. The attributes are paired
    positionally with ``weights`` (pairing stops at the shorter of the two)
    and the products are summed to give the item's score.

    Returns a list of ``(score, name)`` tuples sorted in ascending order,
    i.e. by score first and by name when scores are equal.
    """
    ranked = []
    for item in items:
        name, attributes = item[0], item[1:]
        score = sum([value * weight for value, weight in zip(attributes, weights)])
        ranked.append((score, name))
    # Tuples compare element-wise, so this orders by score, then by name.
    ranked.sort()
    return ranked

# Score every animal with the weights 2, 3, 7 and 1 applied, in order, to
# the numeric columns of each record.
answer = importance_rank(animals, (2, 3, 7, 1))

# Print position, name and score, lowest score first.
for i, (score, name) in enumerate(answer):
    print(i, name, "(", score, ")")

0 mouse ( 30.7 )
1 frog ( 70.5 )
2 sparrow ( 83.072 )
3 rabbit ( 186 )
4 dog ( 568 )
5 shark ( 974 )
6 lion ( 1514 )
7 horse ( 2087 )
8 human ( 2840 )
9 hippopotamus ( 5778 )
10 elephant ( 17160 )


# Computing Statistics

<img src="images/Screen Shot 2019-02-05 at 4.12.28 PM.png" />

<img src="images/Screen Shot 2019-02-05 at 4.14.13 PM.png" />


In [11]:
# Collect every record whose second comma-separated field is "F". Lines are
# split as read, so the last field of each record keeps its trailing newline.
matrix = []
with open("data/names.txt") as f:
    for line in f:
        fields = line.split(",")
        if fields[1] != "F":
            continue
        matrix.append(tuple(fields))

# Sort ascending by the third field parsed as an integer (int() tolerates the
# trailing newline), then flip the list so the largest count comes first.
name_counts = sorted(matrix, key=lambda record: int(record[2]))
name_counts.reverse()
name_counts

[('Jessica', 'F', '27931\n'),
 ('Ashley', 'F', '26596\n'),
 ('Emily', 'F', '24374\n'),
 ('Samantha', 'F', '21639\n'),
 ('Sarah', 'F', '21346\n'),
 ('Taylor', 'F', '20422\n'),
 ('Hannah', 'F', '17001\n'),
 ('Brittany', 'F', '16477\n'),
 ('Amanda', 'F', '16339\n'),
 ('Elizabeth', 'F', '16173\n'),
 ('Kayla', 'F', '16081\n'),
 ('Rachel', 'F', '16030\n'),
 ('Megan', 'F', '15523\n'),
 ('Alexis', 'F', '14327\n'),
 ('Lauren', 'F', '13442\n'),
 ('Stephanie', 'F', '12979\n'),
 ('Courtney', 'F', '12770\n'),
 ('Jennifer', 'F', '12682\n'),
 ('Nicole', 'F', '12271\n'),
 ('Victoria', 'F', '12251\n'),
 ('Brianna', 'F', '11872\n'),
 ('Amber', 'F', '10961\n'),
 ('Morgan', 'F', '10875\n'),
 ('Danielle', 'F', '10645\n'),
 ('Jasmine', 'F', '10278\n'),
 ('Alexandra', 'F', '10158\n'),
 ('Alyssa', 'F', '10098\n'),
 ('Rebecca', 'F', '9887\n'),
 ('Madison', 'F', '9776\n'),
 ('Katherine', 'F', '8984\n'),
 ('Anna', 'F', '8538\n'),
 ('Haley', 'F', '8202\n'),
 ('Kelsey', 'F', '8181\n'),
 ('Allison', 'F', '8126\n'),

# Induction: Analysis of Top K Via Partitioning

<img src="images/Screen Shot 2019-02-05 at 5.26.17 PM.png" />


# Top K Summary

Although the inductive analysis yields an expected running time proportional to n, the running time may be quadratic if the worst-case scenario happens at every step:

`n(n-1)/2`

However, in practice, the running time is much more likely to be close to linear.

<img src="images/Screen Shot 2019-02-06 at 9.09.43 AM.png" />


# Introduction to Heaps

<img src="images/Screen Shot 2019-02-06 at 9.12.42 AM.png" />

<img src="images/Screen Shot 2019-02-06 at 9.35.55 AM.png" />


# Heap Properties

<img src="images/Screen Shot 2019-02-06 at 9.38.31 AM.png" />

<img src="images/Screen Shot 2019-02-06 at 9.38.52 AM.png" />

<img src="images/Screen Shot 2019-02-06 at 9.42.58 AM.png" />

<img src="images/Screen Shot 2019-02-06 at 10.02.11 AM.png" />

<img src="images/Screen Shot 2019-02-06 at 10.04.14 AM.png" />


In [1]:
#
# Implement remove_min
#

def remove_min(L):
    """Remove the root of the heap stored in list ``L``, in place.

    The root is swapped with the last element and dropped from the end of
    the list; ``down_heapify`` is then run from index 0 to restore the heap
    order. The list is printed before and after the operation.

    Returns the same list object ``L`` (not the removed value). Raises
    ``IndexError`` when ``L`` is empty.
    """
    print("Before:", L)
    last = len(L) - 1
    # Move the root to the end, where it can be removed without shifting.
    L[0], L[last] = L[last], L[0]
    L.pop()
    down_heapify(L, 0)
    print("After:", L)
    return L

def parent(i):
    """Return the index of the parent of node ``i`` in an array-backed heap."""
    # Floor division keeps the result an integer on Python 3 as well as on
    # Python 2; "/" would return a float such as 1.5 on Python 3, which
    # cannot be used as a list index.
    return (i - 1) // 2


def left_child(i):
    """Return the index of the left child of node ``i``."""
    return 2 * i + 1


def right_child(i):
    """Return the index of the right child of node ``i``."""
    return 2 * i + 2


def is_leaf(L, i):
    """Return True when node ``i`` has no children inside ``L``."""
    return (left_child(i) >= len(L)) and (right_child(i) >= len(L))


def one_child(L, i):
    """Return True when node ``i`` has a left child but no right child."""
    return (left_child(i) < len(L)) and (right_child(i) >= len(L))


def down_heapify(L, i):
    """Sift the value at index ``i`` down until the min-heap order holds.

    Call this when the heap rooted at ``i`` satisfies the heap property
    *except* perhaps between ``i`` and its immediate children. ``L`` is
    modified in place; nothing is returned.
    """
    # A leaf has no children, so the heap property holds trivially.
    while not is_leaf(L, i):
        left = left_child(i)

        if one_child(L, i):
            # The only child is necessarily a leaf: one swap at most.
            if L[i] > L[left]:
                L[i], L[left] = L[left], L[i]
            return

        right = right_child(i)
        # Two children: done if neither is smaller than the parent.
        if min(L[left], L[right]) >= L[i]:
            return

        # Swap with the smaller child (the right one on ties) and continue
        # from there, since that subtree may now violate the heap property.
        smaller = left if L[left] < L[right] else right
        L[i], L[smaller] = L[smaller], L[i]
        i = smaller

#########
# Testing Code
#

# build_heap
def build_heap(L):
    """Rearrange list ``L`` in place into a heap and return it.

    ``down_heapify`` is applied to every index from the last down to the
    first, so each subtree is already ordered when its parent is processed.
    """
    for index in reversed(range(len(L))):
        down_heapify(L, index)
    return L

def test():
    """Build a heap from 0..9, remove the minimum, and check the new root."""
    # list(...) is required: on Python 3 range() is an immutable sequence,
    # so the in-place swaps done while heapifying would raise TypeError.
    L = list(range(10))
    build_heap(L)
    remove_min(L)
    # now, the new minimum should be 1
    assert L[0] == 1

test()

('Before:', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
('After:', [1, 3, 2, 7, 4, 5, 6, 9, 8])


# Building a Heap

<img src="images/Screen Shot 2019-02-06 at 12.22.44 PM.png" />


# Inserting into a Heap

<img src="images/Screen Shot 2019-02-06 at 12.22.44 PM.png" />






In [10]:
def minimize_square(L):
    """Return the candidate with the smallest sum of squared differences to ``L``.

    The mean of ``L`` is the initial candidate. Every element of ``L`` is
    then tried in turn and replaces the current candidate only when its sum
    of squared differences is strictly smaller. The initial error and every
    improvement are printed.

    Raises ``ZeroDivisionError`` when ``L`` is empty.
    """
    def squared_error(candidate):
        # Sum of squared differences between the candidate and each element.
        return sum([(candidate - x) ** 2 for x in L])

    best_val = sum(L) / float(len(L))
    best_err = squared_error(best_val)
    print("min_diff:", best_err)

    for candidate in L:
        err = squared_error(candidate)
        if err < best_err:
            best_err, best_val = err, candidate
            print("new min_diff:", best_err)
            print("new min_val:", best_val)
    return best_val

minimize_square([2, 2, 3, 4])

('min_diff:', 2.75)


2.75

In [1]:
import numpy as np

arr = [8969, 10783, 6779, 641, 242, 8300, 10004, 12049, 5174, 10455, 8831, 1069, 2922, 2973, 3233, 6440, 3809, 821, 226, 9829, 1031, 8421, 1953, 940, 9741, 5194, 2677, 8316, 4178, 179, 252, 293, 1160, 10890, 6777, 11550, 5803, 1799, 5036, 3251, 5742, 2799, 7501, 7302, 475, 2186, 8949, 6660, 3637, 7033, 4661, 7902, 4774, 602, 1889, 9466, 11970, 10695, 7428, 5320, 887, 10666, 5848, 1212, 2268, 3169, 835, 4000, 11394, 324, 6198, 4659, 4942, 4966, 2741, 10901, 5056, 2048, 6615, 7209, 541, 1461, 11856, 11512, 11762, 6271, 3643, 1875, 1998, 1953, 1984, 10114, 3217, 1214, 1262, 7726, 104, 11975, 974, 7483, 8073, 648, 7954, 3323, 1782, 290, 4587, 480, 3554, 3738, 810, 1834, 1270, 7160, 3999, 166, 533, 12053, 9326, 11758, 2437, 461, 7581, 8945, 1974, 1323, 11086, 215, 9519, 3784, 10869, 7177, 1932, 1258, 745, 1340, 3906, 7638, 9528, 260, 1296, 2553, 728, 7728, 2260, 1238, 3974, 5042, 2791, 8101, 357, 423, 3532, 6018, 339, 10280, 8776, 6627, 10733, 5482, 2238, 11702, 649, 10638, 635, 1664, 10631, 136, 2740, 4075, 3730, 5166, 785, 757, 1403, 301, 4480, 6710, 8603, 157, 401, 1869, 113, 3666, 428, 5508, 1614, 2478, 7006, 609, 430, 5900, 1537, 685, 1199, 10260, 1718, 4616, 3368, 111, 1984, 7272, 7823, 1956, 1025, 403, 131, 235, 10533, 6268, 11254, 174, 2209, 7555, 2876, 7084, 6791, 7834, 10450, 3187, 5984, 10309, 1982, 289, 5443, 731, 4257, 236, 6328, 10689, 10621, 812, 1010, 5023, 409, 11170, 156, 8206, 1900, 203, 3575, 324, 10242, 6837, 434, 1544, 304, 10124, 4524, 6616, 6955, 618, 207, 11248, 9815, 3486, 5675, 1921, 5366, 10507, 9788, 3941, 100, 7162, 744, 892, 902, 2862, 3636, 599, 3821, 2898, 977, 3272, 10715, 10505, 2391, 407, 152, 3860, 794, 7490, 2764, 7001, 5278, 5792, 1081, 11230, 7371, 7850, 309, 10177, 11768, 10661, 512, 7613, 11633, 2745, 884, 3398, 10021, 8709, 2850, 3185, 481, 118, 11580, 2002, 319, 8252, 2780, 10704, 6422, 7518, 100, 123, 4632, 8404, 3394, 3460, 2949, 578, 4193, 5979, 8146, 4498, 7178, 141, 1453, 7497, 188, 9582, 8972, 2916, 1446, 8694, 1886, 
1492, 153, 2397, 6588, 9159, 6533, 5931, 5358, 5153, 705, 3894, 7257, 4926, 4013, 7012, 6504, 6249, 464, 400, 9832, 3065, 12074, 912, 7092, 3722, 577, 675, 10197, 6359, 3242, 12059, 1095, 6071, 3983, 1627, 4100, 1496, 6319, 5450, 552, 9508, 11961, 10315, 4620, 436, 6183, 5701, 8700, 5620, 1570, 3199, 1642, 1532, 2168, 3181, 2054, 2711, 4789, 1002, 931, 9273, 10833, 4734, 11477, 9197, 1030, 262, 5259, 3884, 2534, 766, 9946, 7478, 9205, 5082, 3096, 470, 7509, 5332, 7450, 1844, 3549, 1129, 2818, 1800, 10995, 2347, 3775, 3673, 2053, 1748, 170, 359, 1183, 1382, 5471, 4871, 233, 1032, 315, 2224, 1283, 256, 7125, 142, 4850, 9883, 7687, 573, 3783, 5929, 11868, 199, 10891, 6271, 9826, 3401, 6215, 9701, 11837, 5795, 7146, 935, 884, 10106, 5155, 7135, 4995, 3121, 1572, 566, 4965, 6323, 916, 5838, 760, 4002, 1878, 5737, 941, 386, 379, 112, 715, 11625, 4477, 246, 3008, 303, 3010, 3979, 4140, 479, 104, 3720, 6694, 10671, 9943, 8457, 5531, 2112, 838, 522, 5227, 2317, 471, 738, 2357, 290, 7981, 4264, 445, 2789, 8762, 8297, 8068, 2707, 9427, 8989, 482, 6410, 7012, 5660, 6405, 5116, 3775, 2572, 959, 4237, 4396, 101, 3079, 9287, 5061, 1796, 620, 10814, 8007, 7617, 441, 1151, 3265, 3904, 4711, 524, 2161, 7368, 140, 8926, 7809, 3294, 4227, 7613, 4086, 117, 10279, 1084, 8818, 2158, 107, 11277, 10138, 9528, 10938, 10402, 1213, 836, 12084, 2115, 105, 9989, 4955, 8999, 7551, 8622, 5203, 6811, 2913, 110, 7167, 9336, 503, 4715, 1523, 472, 4650, 4131, 10929, 1240, 9908, 7713, 2237, 8571, 210, 6383, 8933, 9010, 2937, 826, 8764, 3113, 247, 6971, 2937, 6717, 5323, 312, 7589, 1293, 5838, 152, 432, 200, 7728, 8045, 4035, 267, 6332, 1511, 879, 4579, 887, 10454, 943, 10965, 2627, 187, 2328, 10617, 4971, 9918, 210, 693, 1175, 2693, 3649, 3511, 922, 11917, 6900, 4228, 6091, 1878, 2137, 1452, 615, 4989, 5780, 1375, 10225, 6500, 9211, 1150, 1224, 9655, 5583, 6916, 3738, 3368, 528, 7801, 11371, 4949, 2002, 6660, 4196, 190, 2231, 7118, 4258, 2825, 9850, 4513, 231, 10667, 623, 3964, 1569, 8556, 141, 6949, 
1087, 6779, 2026, 6494, 11347, 1341, 7163, 7190, 7064, 9666, 9012, 10872, 10663, 10840, 7521, 2494, 167, 5447, 129, 11100, 6178, 1983, 1660, 1821, 2848, 174, 4817, 881, 5800, 5610, 3810, 9716, 9730, 2274, 279, 11516, 2786, 930, 5396, 1897, 2825, 8787, 6409, 3365, 11182, 3630, 508, 806, 5861, 317, 9527, 4377, 212, 3563, 6159, 6072, 572, 8272, 156, 11350, 8914, 8567, 4949, 8007, 1678, 208, 2586, 9312, 1503, 1679, 7932, 473, 4866, 882, 149, 912, 4954, 7966, 5678, 1233, 9293, 429, 220, 11823, 6006, 736, 2413, 11654, 11420, 234, 3937, 11527, 1159, 4080, 577, 7010, 11743, 169, 10832, 3554, 5333, 12087, 4637, 5629, 2075, 4835, 3747, 335, 6583, 1868, 7099, 1293, 3062, 1649, 5164, 6003, 10342, 4856, 205, 9579, 4364, 4621, 7655, 120, 2893, 3959, 11973, 7140, 6441, 8033, 6613, 5554, 778, 3788, 3645, 9675, 384, 542, 11141, 2451, 2488, 11093, 618, 592, 1301, 511, 7217, 4587, 11183, 5398, 246, 1653, 1581, 11667, 1930, 4076, 736, 3825, 173, 11481, 6886, 290, 807, 9428, 5692, 11924, 5189, 7782, 874, 3735, 240, 1415, 1808, 7786, 2634, 529, 4893, 7478, 1477, 4431, 1498, 3413, 2505, 601, 4232, 1229, 11906, 566, 3361, 7459, 6629, 4619, 984, 6447, 7434, 6538, 1237, 11810, 1979, 11139, 8706, 4783, 3531, 10313, 1548, 11541, 3404, 5205, 8826, 2032, 2661, 819, 9811, 6007, 773, 6912, 2982, 297, 9877, 2475, 4321, 11806, 4042, 451, 6456, 1070, 2982, 8613, 10331, 1384, 10190, 3719, 5863, 2625, 1143, 1284, 2154, 629, 12066, 7303, 10647, 8153, 10543, 249, 591, 7261, 1919, 1974, 4137, 365, 4741, 5712, 3688, 2617, 366, 7801, 8135, 226, 10257, 2581, 1595, 938, 10244, 3913, 665, 768, 5973, 2180, 8855, 11213, 8867, 125, 10089, 105, 2038, 1587, 195, 6886, 465, 394, 4165, 714, 11878, 3618, 5908, 177, 3109, 1992, 5799, 1680, 4404, 173, 4361, 2006, 779, 1048, 3798, 1146, 1783, 4069, 2392, 116, 5962, 8403, 1239, 11389, 8289, 3097, 3991, 6056, 7878, 2926, 5252, 399, 10352, 7044, 5724, 250, 1569, 648, 6467, 4877]
# Compute both statistics from `arr` itself rather than from separately
# pasted copies of the same literal, so they are guaranteed to describe
# exactly the data that the deviation sums below iterate over.
test_median = np.median(np.array(arr))
test_mean = np.mean(np.array(arr))

# Total absolute deviation of the data from its mean and from its median.
print("Mean diff:", sum(abs(test_mean - x) for x in arr))
print("Median diff:", sum(abs(test_median - x) for x in arr))

('Mean diff:', 3130809.399999999)
('Median diff:', 3089610.0)


In [2]:
# Mean and median of the small four-element example.
sample = np.array([2, 2, 3, 4])
print("Mean:", np.mean(sample))
print("Median:", np.median(sample))

('Mean:', 2.75)
('Median:', 2.5)


In [3]:
# Total absolute deviation of the four values from 2.75 and from 2.5.
values = [2, 2, 3, 4]
print("Mean diff:", sum(abs(2.75 - x) for x in values))
print("Median diff:", sum(abs(2.5 - x) for x in values))

('Mean diff:', 3.0)
('Median diff:', 3.0)


In [4]:
# Mean of the data set already bound to `arr` in an earlier cell; reusing it
# avoids another pasted copy of the same literal.
np.mean(np.array(arr))

4583.9139999999998

In [5]:
import pandas as pd

df = pd.read_csv("data/imdb.tsv", sep="\t", header=None, names=["actor", "movie", "year"])
df.head()

Unnamed: 0,actor,movie,year
0,"McClure, Marc (I)",Freaky Friday,2003
1,"McClure, Marc (I)",Coach Carter,2005
2,"McClure, Marc (I)",Superman II,1980
3,"McClure, Marc (I)",Apollo 13,1995
4,"McClure, Marc (I)",Superman,1978


In [6]:
df.shape

(31383, 3)

In [27]:
sorted(pd.unique(df.actor))

['Aaron, Caroline',
 'Aarons, Bonnie',
 'Abadie, William',
 'Abbott, Deborah',
 'Abdoo, Rose',
 'Abdullah, Haji',
 'Abell, Alistair',
 'Abercrombie, Ian',
 'Abergel, Rakefet',
 'Abernathy, Don',
 'Aboutboul, Alon',
 'Abraham, F. Murray',
 'Abrahams, Doug',
 'Abrahams, Jon (I)',
 'Abrell, Brad',
 'Abustan, Jason',
 'Acheson, Mark',
 'Ackland, Joss',
 'Acovone, Jay',
 'Acres, Isabella',
 'Adair-Rios, Mark',
 'Adams, Amy (III)',
 'Adams, Christine (I)',
 'Adams, Jane (II)',
 'Adams, January',
 'Adams, Joey Lauren',
 'Adams, Lillian',
 'Adamson, Christopher (I)',
 'Adamthwaite, Michael',
 'Addison, Walter',
 'Addy, Mark',
 'Adelstein, Paul',
 'Adkins, Scott (II)',
 'Adler, Charles (I)',
 'Adler, Jonathan (II)',
 'Adler, Lauren',
 'Adler, Matt',
 'Adlon, Pamela',
 'Adoti, Razaaq',
 'Adsit, Scott',
 'Aduramo, Israel',
 'Affleck, Ben',
 'Affleck, Casey',
 'Affleck, Rab',
 'Afonso, Diana',
 'Afshar, Ali',
 'Aghdashloo, Shohreh',
 'Agranov, David',
 'Aguilar, G.A.',
 'Aguirre, Cesar (II)',
 'Ah

In [57]:
df.loc[df.actor == 'Aaron, Caroline', ["movie", "year"]]

Unnamed: 0,movie,year
31262,Just Like Heaven,2005
31263,Primary Colors,1998
31264,Sleepless in Seattle,1993
31265,Along Came Polly,2004
31266,Cellular,2004
31267,21 Jump Street,2012


In [9]:
# Build the co-star graph.
#
# G maps each actor to {"costars": set of other actors who appear in at least
# one of the same films, "degree": size of that set}.  A film is identified by
# its (movie, year) pair, so two films sharing a title but released in
# different years are kept apart.
#
# The casts are indexed once up front instead of re-filtering the whole
# DataFrame for every actor and every film.
# NOTE(review): assumes the actor/movie/year columns contain no missing
# values -- confirm against the data file.
cast_by_film = {}
films_by_actor = {}
for actor, movie, year in zip(df.actor, df.movie, df.year):
    cast_by_film.setdefault((movie, year), set()).add(actor)
    films_by_actor.setdefault(actor, set()).add((movie, year))

G = {}

for actor in sorted(films_by_actor):
    costars = set()
    for film in films_by_actor[actor]:
        costars |= cast_by_film[film]
    # The cast of each film includes the actor; an actor is not their own costar.
    costars.discard(actor)

    if not costars:
        # Actors with no costars are reported and left out of the graph.
        print("No costars for:", actor)
        continue

    G[actor] = {
        "costars": costars,
        "degree": len(costars)
    }


In [39]:
# Build the actor/film bipartite graph.
#
# Keys are either an actor name or a (movie, year) tuple; each value is a dict
# whose keys are the neighbors on the other side (the stored value is always 1).
# Actors are inserted in sorted order, each followed by any films first seen
# through that actor.
#
# Each actor's films are collected in a single pass over the rows instead of
# re-filtering the DataFrame once per actor.
films_by_actor = {}
for actor, movie, year in zip(df.actor, df.movie, df.year):
    films_by_actor.setdefault(actor, []).append((movie, year))

bipartite_graph = {}

for actor in sorted(films_by_actor):
    bipartite_graph[actor] = {}
    for film in films_by_actor[actor]:
        # Link film -> actor (creating the film node on first sight) and actor -> film.
        bipartite_graph.setdefault(film, {})[actor] = 1
        bipartite_graph[actor][film] = 1


In [46]:
def bfs_bipartite(G, root):
    """Return the mean hop count from ``root`` to every node reachable from it.

    Args:
        G: mapping from node to a container of neighbor nodes; iterating
            ``G[node]`` must yield the neighbors.
        root: node to start the breadth-first search from.

    Returns:
        The average of the breadth-first distances to all reached nodes, with
        the root itself excluded.  ``float("inf")`` when nothing is reachable
        from ``root``, so an isolated node ranks last in an ascending sort
        instead of raising ZeroDivisionError.

    Raises:
        KeyError: if ``root`` (or any node reached during the search) is not a
            key of ``G``.
    """
    from collections import deque  # local import keeps this cell self-contained

    distance = {root: 0}
    frontier = deque([root])
    while frontier:
        current_node = frontier.popleft()  # O(1); list.pop(0) shifts the whole list
        for neighbor in G[current_node]:
            if neighbor not in distance:
                distance[neighbor] = distance[current_node] + 1
                frontier.append(neighbor)

    # The root's own distance (0) must not dilute the average.
    del distance[root]
    if not distance:
        return float("inf")
    return sum(distance.values()) / float(len(distance))

bfs_bipartite(bipartite_graph, 'Morrison, Rana')

3.7969242405143073

In [47]:
bfs_bipartite(bipartite_graph, 'Tatasciore, Fred')

3.6179251229043237

In [37]:
def bfs_alternative(G, root):
    """Return the mean breadth-first distance over ``root`` and all nodes it reaches.

    Args:
        G: mapping from node to a dict holding a ``'costars'`` iterable of
            neighbor nodes.
        root: node to start the breadth-first search from.

    Returns:
        Sum of hop counts divided by the number of visited nodes.  The root is
        counted in the denominator with distance 0, so a root with no costars
        yields 0.0.

    Raises:
        KeyError: if ``root`` (or any node reached during the search) is not a
            key of ``G``.
    """
    from collections import deque  # local import keeps this cell self-contained

    distance = {root: 0}
    frontier = deque([root])
    while frontier:
        current_node = frontier.popleft()  # O(1); list.pop(0) shifts the whole list
        for costar in G[current_node]['costars']:
            if costar not in distance:
                distance[costar] = distance[current_node] + 1
                frontier.append(costar)

    # NOTE(review): the root stays in the average here, so the result is
    # slightly lower than a root-excluded mean -- confirm this is intended.
    return sum(distance.values()) / float(len(distance))


bfs_alternative(G, 'Morrison, Rana')      

1.848935192278229

In [31]:
def bfs(G, root):
    """Summarize breadth-first distances from ``root`` in a co-star graph.

    The search proceeds one level at a time: every node in the current frontier
    is stamped with the current step, then the next frontier is the set of
    their not-yet-visited costars.

    Args:
        G: mapping from node to a dict holding a ``'costars'`` iterable of
            neighbor nodes.
        root: node to start from.

    Returns:
        A dict with keys:
          "Graph Size"          -- ``len(G)``
          "Connected Nodes"     -- number of nodes reached, root included
          "Average Path Length" -- mean distance to reached nodes, root excluded
          "Min Path Length" / "Max Path Length" -- extremes of those distances
        When nothing is reachable from ``root`` the average is ``float("inf")``
        and both extremes are ``None`` (instead of raising ZeroDivisionError).

    Raises:
        KeyError: if ``root`` (or any node reached during the search) is not a
            key of ``G``.
    """
    visited = {root: 0}
    frontier = set(G[root]['costars'])
    step = 0
    while frontier:
        step += 1
        # Stamp the whole level first so nodes on the same level do not
        # re-enqueue each other.
        for node in frontier:
            visited[node] = step

        next_frontier = set()
        for node in frontier:
            next_frontier.update(
                costar for costar in G[node]['costars'] if costar not in visited
            )
        frontier = next_frontier

    del visited[root]
    lengths = list(visited.values())
    if lengths:
        average = sum(lengths) / float(len(lengths))
        shortest, longest = min(lengths), max(lengths)
    else:
        average = float("inf")
        shortest = longest = None

    return_value = {
        "Graph Size": len(G),
        "Connected Nodes": len(lengths) + 1,
        "Average Path Length": average,
        "Min Path Length": shortest,
        "Max Path Length": longest
    }

    return return_value
bfs(G, 'Tatasciore, Fred')

{'Average Path Length': 1.8492185105730923,
 'Connected Nodes': 6527,
 'Graph Size': 6527,
 'Max Path Length': 3,
 'Min Path Length': 1}

In [38]:
bfs(G, 'Morrison, Rana')

{'Average Path Length': 1.9261415874961691,
 'Connected Nodes': 6527,
 'Graph Size': 6527,
 'Max Path Length': 3,
 'Min Path Length': 1}

In [19]:
# Sanity check: print every actor whose costar collection contains duplicates.
# Comparing the collection to set(collection) cannot detect this (a set always
# equals its own set copy, a list never equals a set), so compare sizes instead.
for k, v in G.items():
    if len(v['costars']) != len(set(v['costars'])):
        print(k)

In [15]:
# Score every actor in the co-star graph by its average BFS path length and
# report the actor with the smallest score.
paths = {}
count = 0
total_nodes = len(G)
for count, node in enumerate(G, 1):
    if count % 100 == 0:
        # One pre-formatted argument, so the call prints the same text whether
        # print is a statement or a function.
        print("Completion rate: {}%".format(100 * count / float(total_nodes)))
    paths[node] = bfs(G, node)["Average Path Length"]

# min() returns the first smallest entry, the same one a stable sort puts first.
print("Most Central Actor: {}".format(min(paths.items(), key=lambda x: x[1])))

Completion rate: 1.5320974414%
Completion rate: 3.06419488279%
Completion rate: 4.59629232419%
Completion rate: 6.12838976559%
Completion rate: 7.66048720699%
Completion rate: 9.19258464838%
Completion rate: 10.7246820898%
Completion rate: 12.2567795312%
Completion rate: 13.7888769726%
Completion rate: 15.320974414%
Completion rate: 16.8530718554%
Completion rate: 18.3851692968%
Completion rate: 19.9172667382%
Completion rate: 21.4493641796%
Completion rate: 22.981461621%
Completion rate: 24.5135590624%
Completion rate: 26.0456565038%
Completion rate: 27.5777539452%
Completion rate: 29.1098513865%
Completion rate: 30.6419488279%
Completion rate: 32.1740462693%
Completion rate: 33.7061437107%
Completion rate: 35.2382411521%
Completion rate: 36.7703385935%
Completion rate: 38.3024360349%
Completion rate: 39.8345334763%
Completion rate: 41.3666309177%
Completion rate: 42.8987283591%
Completion rate: 44.4308258005%
Completion rate: 45.9629232419%
Completion rate: 47.4950206833%
Completion 

In [49]:
# Score every node of the bipartite graph (actors and films alike) by its
# average BFS path length, then report the best-scoring actor.
bipartite_paths = {}
count = 0
total_nodes = len(bipartite_graph)
for count, node in enumerate(bipartite_graph, 1):
    if count % 100 == 0:
        # One pre-formatted argument, so the call prints the same text whether
        # print is a statement or a function.
        print("Completion rate: {}%".format(100 * count / float(total_nodes)))
    bipartite_paths[node] = bfs_bipartite(bipartite_graph, node)

# Film nodes are keyed by (movie, year) tuples; leave them out so the line
# labelled "Most Central Actor" can never report a film.
print("Most Central Actor: {}".format(
    min(
        (item for item in bipartite_paths.items() if not isinstance(item[0], tuple)),
        key=lambda x: x[1],
    )
))

Completion rate: 1.5320974414%
Completion rate: 3.06419488279%
Completion rate: 4.59629232419%
Completion rate: 6.12838976559%
Completion rate: 7.66048720699%
Completion rate: 9.19258464838%
Completion rate: 10.7246820898%
Completion rate: 12.2567795312%
Completion rate: 13.7888769726%
Completion rate: 15.320974414%
Completion rate: 16.8530718554%
Completion rate: 18.3851692968%
Completion rate: 19.9172667382%
Completion rate: 21.4493641796%
Completion rate: 22.981461621%
Completion rate: 24.5135590624%
Completion rate: 26.0456565038%
Completion rate: 27.5777539452%
Completion rate: 29.1098513865%
Completion rate: 30.6419488279%
Completion rate: 32.1740462693%
Completion rate: 33.7061437107%
Completion rate: 35.2382411521%
Completion rate: 36.7703385935%
Completion rate: 38.3024360349%
Completion rate: 39.8345334763%
Completion rate: 41.3666309177%
Completion rate: 42.8987283591%
Completion rate: 44.4308258005%
Completion rate: 45.9629232419%
Completion rate: 47.4950206833%
Completion 

In [51]:
# List the best-ranked actors (smallest average path length first).
# Film nodes, keyed by (movie, year) tuples, are skipped.  The loop stops once
# count exceeds 20, so 21 actors are printed.
count = 0

for node in sorted(bipartite_paths.items(), key=lambda x: x[1]):
    if isinstance(node[0], tuple):
        continue
    if count > 20:
        break
    count += 1
    print(count, node)
    

(1, ('Tatasciore, Fred', 3.6179251229043237))
(2, ('Jackson, Samuel L.', 3.699609227278457))
(3, ('Welker, Frank', 3.715240136140174))
(4, ('Harnell, Jess', 3.734904827933947))
(5, ('Willis, Bruce', 3.7444850623975796))
(6, ('Hanks, Tom', 3.7467540652968614))
(7, ('Blum, Steve (IX)', 3.7641497541913527))
(8, ('Papajohn, Michael', 3.7717130971889574))
(9, ('Voight, Jon', 3.771965208622211))
(10, ('Starr, Arne', 3.7727215429219716))
(11, ('Damon, Matt', 3.776503214420774))
(12, ('Cruise, Tom', 3.7767553258540274))
(13, ('Downes, Robin Atkin', 3.777511660153788))
(14, ('Abernathy, Don', 3.7787722173200553))
(15, ('Wilson, Owen (I)', 3.782553888818858))
(16, ('Pitt, Brad', 3.787596117483928))
(17, ('Baldwin, Alec', 3.7893608975167026))
(18, ('Diaz, Cameron', 3.790117231816463))
(19, ('North, Nolan', 3.7944031261817726))
(20, ('Hoffman, Dustin', 3.7949073490482794))
(21, ('Morrison, Rana', 3.7969242405143073))


In [11]:
# For every actor, count the links that run between that actor's own costars.
# Each costar's costar list is scanned for members of the central actor's
# neighborhood; a link between two neighbors is therefore seen once from each
# end when both list each other.
for central_node in G:
    # Snapshot as a set so each membership test is O(1) even when the stored
    # costars are a list.
    neighborhood = set(G[central_node]["costars"])
    neighbor_connections = 0
    for costar in G[central_node]["costars"]:
        neighbor_connections += sum(
            1
            for connected_node in G[costar]["costars"]
            if connected_node in neighborhood and connected_node != central_node
        )
    G[central_node]["neighbor_connections"] = neighbor_connections
    

In [12]:
# Store, per actor, the ratio of counted neighbor-to-neighbor links to
# 2 * degree * (degree - 1).
# NOTE(review): the value is stored under 'closeness_centrality', but the
# formula looks like a (halved) local clustering coefficient -- confirm the
# intended metric and scaling.
for node in G:
    degree = G[node]["degree"]
    pair_slots = 2 * degree * (degree - 1)
    if pair_slots == 0:
        # Fewer than two costars: there are no neighbor pairs to link, and the
        # division below would raise ZeroDivisionError.
        G[node]['closeness_centrality'] = 0.0
    else:
        G[node]['closeness_centrality'] = G[node]["neighbor_connections"] / float(pair_slots)
    

In [13]:
# Find the node with the largest stored 'closeness_centrality' value.
# Only a strictly larger score replaces the current best, so the first node
# holding the maximum wins, and the defaults (0, "") survive if no score is
# above zero.
max_centrality, most_popular_actor = 0, ""
for node, attributes in G.items():
    score = attributes['closeness_centrality']
    if score > max_centrality:
        max_centrality, most_popular_actor = score, node

print("max_centrality:", max_centrality)
most_popular_actor


('max_centrality:', 0.5)


'Featherston, Katie'

In [68]:
print(most_popular_actor)

Bhardwaj, Macéo


In [69]:
G['Bhardwaj, Macéo']

{'closeness_centrality': 0.966140350877193,
 'costars': ['Blanc, JB',
  'Boo Khoo, Ian',
  'Carter, Jim (I)',
  'Close, Glenn',
  'Depardieu, G\xc3\xa9rard',
  'Gruffudd, Ioan',
  'Harmon, Linda (II)',
  'Idle, Eric',
  'Keyes, David (II)',
  'McInnerny, Tim',
  'Shale, Kerry',
  'Attenborough, Tom',
  'Biggerstaff, Sean',
  'Bradley, David (IV)',
  'Branagh, Kenneth',
  'Chillin, Gregg',
  'Cleese, John',
  'Coker, David (I)',
  'Coltrane, Robbie',
  'Columbus, Eleanor',
  'Columbus, Violet',
  'Corduner, Allan',
  'Cronin, Sean (I)',
  'Crosby, Chris (III)',
  'Davis, Warwick (I)',
  'Doyle, Louis',
  'Enoch, Alfie',
  'Fairhall, Gary',
  'Felton, Tom',
  'Fry, Stephen (I)',
  'Glover, Julian',
  'Griffiths, Richard (I)',
  'Grint, Rupert',
  'Hardy, Robert (I)',
  'Harris, Richard (I)',
  'Henderson, Shirley (I)',
  'Herdman, Josh',
  'Isaacs, Jason',
  'Jones, Gemma (I)',
  'Jones, Toby (I)',
  'Karpf, \xc3\x88ve',
  'Kydd, Jonathan',
  'Lewis, Matthew (III)',
  'Macleod, Lewis',
 

In [67]:
for k in G:
    print(k, ":", G[k])
    break

('McClure, Marc (I)', ':', {'neighbor_connections': 7414, 'closeness_centrality': 0.11634548992530287, 'degree': 179, 'costars': ['Boyd, Cayden', 'Chao, Rosalind', 'Curtis, Jamie Lee', 'Elrod, Lu', 'Garson, Willie', 'Gonzalo, Julie', 'Gould, Harold (I)', 'Lohan, Lindsay', 'Raymont, Daniel', 'Scott, Lorna', 'Soong, Lucille', 'Tank, Hayden', 'Tobolowsky, Stephen', 'Trainor, Mary Ellen', 'Waters, Dina', 'Ashanti (II)', 'Baker, Ray (I)', 'Beeman, Lacey', 'Boehle, Michelle', 'Byrd, Terrell', 'Callahan, Julius', 'Clark, Adam (I)', 'Clendenin, Curt', 'Costas, Bob', 'Dew, Marty', 'Dowse, Denise', 'Eddy, Sonya', 'Fukuzaki, Rob', 'Gilliard, Carl', 'Gonzalez, Rick (I)', 'Hall, Justin Rodgers', 'Hoffman, Robert (X)', 'Jackson, Samuel L.', 'Laresca, Vincent', 'Levine, Floyd', 'Lewis, Chauntal', 'Lim, Roger (I)', 'McCall-Short, Tanee', 'McKee, Danielle', 'McMurrer, Robert', 'Moreno Jr., Carlos', 'Morgan, Debbi', 'Sagastizado I, Tony', 'Sountsov, Aleksandr', 'Spencer, Octavia', 'Tatum, Channing', 'Th

In [39]:
sum(len([connected_node for connected_node in G[costar]["costars"] if connected_node in G['McClure, Marc (I)']["costars"] and connected_node != 'McClure, Marc (I)']) for costar in G['McClure, Marc (I)']["costars"])

5786

In [None]:
# NOTE(review): unexecuted scratch cell. The brackets in this expression are
# unbalanced, so it does not parse as written; it also refers to `actor`,
# which this cell does not define.
sorted(pd.unique(df.loc[(df.actor != sorted(pd.unique(df.loc[(df.actor != actor) & (df.movie.isin(pd.unique(df.loc[df.actor == 'Aaron, Caroline', "movie"])))]))) & (df.movie.isin(pd.unique(df.loc[df.actor == 'Aaron, Caroline', "movie"])))]))) & (df.movie.isin(pd.unique(df.loc[df.actor == 'Aaron, Caroline', "movie"])))]))) & (df.movie.isin(pd.unique(df.loc[df.actor == 'Aaron, Caroline', "movie"])))]))

In [None]:

# NOTE(review): unexecuted scratch cell. The brackets in this expression are
# unbalanced (a `]` arrives where a `)` is expected), so it does not parse as
# written.
sorted(pd.unique(df.loc[(df.actor.isin(df.loc[(df.actor != 'Aaron, Caroline') & (df.movie.isin(pd.unique(df.loc[df.actor == 'Aaron, Caroline', "movie"]))==False)]]))
                    
                

In [None]:
# Print 2.7 once per entry of the sequence; the entries themselves are unused.
for x in (2, 2, 3, 4):
    print(2.7)

In [None]:
def minimize_absolute(L):
    """Return the element of ``L`` with the smallest total absolute distance to all elements.

    When several elements tie for the smallest total, the one that appears
    first in ``L`` is returned.  An empty ``L`` raises IndexError.
    """
    def total_deviation(candidate):
        return sum(abs(candidate - other) for other in L)

    best_val = L[0]
    best_cost = total_deviation(best_val)
    # The first element is already the incumbent, so only the rest can beat it.
    for candidate in L[1:]:
        cost = total_deviation(candidate)
        if cost < best_cost:
            best_val, best_cost = candidate, cost
    return best_val

# 



<img src="images/" />

<img src="images/" />


# 

<img src="images/" />

<img src="images/" />

<img src="images/" />


# 

<img src="images/" />

<img src="images/" />

<img src="images/" />


# 

<img src="images/" />

<img src="images/" />

<img src="images/" />


# 

<img src="images/" />

<img src="images/" />

<img src="images/" />


# 

<img src="images/" />

<img src="images/" />

<img src="images/" />


# 

<img src="images/" />

<img src="images/" />

<img src="images/" />
