In [100]:
try:
    import graphlab as gl
    import graphlab.aggregate as agg
    import matplotlib.pyplot as plt
    from matplotlib import rcParams
    import datetime as dt
    from graphlab import SFrame
    %matplotlib inline
    
except:
    raise ImportError("Key libraries cannot be loaded.")

In [44]:
# Notebook-wide matplotlib defaults: square 10x10 figures, large label and
# title fonts, and tick marks that point away from the plotting area.
rcParams.update({
    'figure.figsize': (10, 10),
    'axes.labelsize': 20,
    'axes.titlesize': 22,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'xtick.direction': 'out',
    'ytick.direction': 'out',
})

def clean_plot(ax):
    """Restyle an axes in place: no spines, light-grey panel, white grid.

    Parameters
    ----------
    ax : axes-like object
        Must expose ``spines``, ``grid``, ``patch``, ``set_axisbelow``,
        ``xaxis`` and ``yaxis``, as a matplotlib ``Axes`` does.

    Returns
    -------
    None. The axes is modified in place.
    """
    # Hide every border line around the plotting area.
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Light-grey background with solid white major gridlines, drawn
    # underneath the plotted data.
    ax.patch.set_facecolor('0.92')
    ax.grid(True, 'major', color='w', linestyle='-', linewidth=1.4)
    ax.set_axisbelow(True)

    # Ticks only along the bottom (x) and left (y) edges.
    for axis, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
        axis.set_ticks_position(side)

In [4]:
import os

f_userdata = 'user_edges_2011-07-13'

# The transaction table is cached on disk under f_userdata. When no cached
# copy exists yet, read the tab-separated file from the remote URL, give the
# unnamed columns readable names, and save the result for later runs.
if not os.path.exists(f_userdata):
    url_userdata = 'https://static.turi.com/datasets/bitcoin/{}.txt'.format(f_userdata)
    raw_column_types = {'X1': int, 'X2': int, 'X3': float}
    readable_names = {'X1': 'src', 'X2': 'dst', 'X3': 'btc', 'X4': 'timestamp'}

    sf = gl.SFrame.read_csv(url_userdata, delimiter='\t', header=False,
                            column_type_hints=raw_column_types)
    sf.rename(readable_names)
    sf.save(f_userdata)
else:
    # A cached copy is present: load it directly.
    sf = gl.SFrame(f_userdata)

In [45]:
# Show graphs and SFrames inside the IPython notebook rather than in a
# separate view.
gl.canvas.set_target('ipynb')

# Open the visual summary of the transaction table.
sf.show()

In [6]:
# Convert the timestamp strings (format 'YYYY-MM-DD-HH-MM-SS') to datetimes.
# NOTE(review): this cell is not re-runnable as written; a second run would
# try to parse an already-converted column and re-add existing columns.
ts_parsed = sf['timestamp'].str_to_datetime('%Y-%m-%d-%H-%M-%S')
sf['timestamp'] = ts_parsed

# Add 'year', 'month' and 'day' columns derived from the parsed timestamp;
# the last expression displays the updated table.
ymd_columns = ts_parsed.split_datetime(column_name_prefix=None, limit=['year', 'month', 'day'])
sf.add_columns(ymd_columns)

src,dst,btc,timestamp,year,month,day
1,5994,8.94,2011-07-04 09:05:56,2011,7,4
905914,20572,0.01,2011-06-23 19:10:01,2011,6,23
905914,622803,220.07592886,2011-06-23 19:10:01,2011,6,23
823336,118969,2.12,2011-05-16 01:58:01,2011,5,16
823336,330686,0.56210609,2011-05-16 01:58:01,2011,5,16
2,282877,0.15,2011-05-23 04:48:17,2011,5,23
2,902253,1.35,2011-05-23 04:48:17,2011,5,23
448634,40297,0.88,2011-06-11 06:00:01,2011,6,11
448634,71810,0.11,2011-06-11 06:00:01,2011,6,11
373684,2909,35.0,2010-10-19 10:37:48,2010,10,19


In [7]:
f_price = 'https://static.turi.com/datasets/bitcoin/market-price.csv'

# The price file has no header row: the first column is read as a string
# (the date) and the second as a float (the closing price).
sf_price = gl.SFrame.read_csv(
    f_price,
    delimiter=',',
    header=False,
    column_type_hints=[str, float],
)
sf_price.rename({'X1': 'timestamp', 'X2': 'close-price'})

# Dates are written day/month/year; parse them, then add year/month/day
# columns so the prices can be matched to transactions by calendar day.
price_dates = sf_price['timestamp'].str_to_datetime('%d/%m/%Y')
sf_price['timestamp'] = price_dates
price_ymd = price_dates.split_datetime(column_name_prefix=None, limit=['year', 'month', 'day'])
sf_price.add_columns(price_ymd)

sf_price.head(5)

timestamp,close-price,year,month,day
2009-01-03 00:00:00,0.0,2009,1,3
2009-01-04 00:00:00,0.0,2009,1,4
2009-01-05 00:00:00,0.0,2009,1,5
2009-01-06 00:00:00,0.0,2009,1,6
2009-01-07 00:00:00,0.0,2009,1,7


In [8]:
# Attach the closing price of the matching calendar day to every transaction.
# A left join keeps every transaction row, including any whose day has no
# entry in sf_price.
# NOTE(review): this cell rebinds `sf`, so running it twice joins the price
# table a second time; re-run the notebook from the top instead.
sf = sf.join(sf_price, on=['year', 'month', 'day'], how='left')
# Both tables have a 'timestamp' column; drop the copy that came from
# sf_price, which the join exposes as 'timestamp.1'.
sf.remove_column('timestamp.1')

# Dollar value of each transaction. Element-wise column arithmetic replaces
# the per-row Python lambda: it avoids calling back into Python for every
# row, and does not fail with a TypeError on a row whose price is missing
# (presumably the result is simply missing there — confirm against the
# SArray documentation).
sf['dollar'] = sf['btc'] * sf['close-price']
# Human-readable label such as '$18.02' (value rounded to 2 decimals).
sf['dollar_label'] = sf['dollar'].apply(lambda x: '$' + str(round(x, 2)))

In [47]:
# Keep the first 500 rows as a small sample and display it.
sf_new = sf.head(500)
sf_new

src,dst,btc,timestamp,year,month,day,close-price,dollar,dollar_label
1,5994,8.94,2011-07-04 09:05:56,2011,7,4,15.85,141.699,$141.7
905914,20572,0.01,2011-06-23 19:10:01,2011,6,23,16.0,0.16,$0.16
905914,622803,220.07592886,2011-06-23 19:10:01,2011,6,23,16.0,3521.21486176,$3521.21
823336,118969,2.12,2011-05-16 01:58:01,2011,5,16,8.5,18.02,$18.02
823336,330686,0.56210609,2011-05-16 01:58:01,2011,5,16,8.5,4.777901765,$4.78
2,282877,0.15,2011-05-23 04:48:17,2011,5,23,7.45,1.1175,$1.12
2,902253,1.35,2011-05-23 04:48:17,2011,5,23,7.45,10.0575,$10.06
448634,40297,0.88,2011-06-11 06:00:01,2011,6,11,30.0,26.4,$26.4
448634,71810,0.11,2011-06-11 06:00:01,2011,6,11,30.0,3.3,$3.3
373684,2909,35.0,2010-10-19 10:37:48,2010,10,19,0.1024,3.584,$3.58


In [48]:
# Build the transaction graph from the whole table: every row becomes an
# edge whose endpoints are given by the 'src' and 'dst' columns.
g = gl.SGraph().add_edges(sf, src_field='src', dst_field='dst')
# Alternative: build the graph from the 500-row sample (sf_new) instead.
# g = gl.SGraph().add_edges(sf_new, src_field='src', dst_field='dst')

In [49]:
# Report the number of vertices and edges in the full graph.
g.summary()

{'num_edges': 1961636, 'num_vertices': 881678}

In [42]:
# g.show(arrows=False, node_size=100, ewidth = 1)
# g.show(highlight=[2, 3], vlabel='id', arrows=True)

# Next, try to display the hub vertices - ideally labelled with metrics such as their in-degree or out-degree

In [50]:
# def count_degree(src, edge, dst):
#     dst['in_degree'] += 1
#     src['out_degree'] += 1
#     return (src, edge, dst)

# def get_degree(g):
#     new_g = gl.SGraph(g.vertices, g.edges)
#     new_g.vertices['in_degree'] = 0
#     new_g.vertices['out_degree'] = 0
#     return new_g.triple_apply(count_degree, ['in_degree', 'out_degree']).get_vertices()

# sf_degree = get_degree(g)
# sf_degree['total_degree'] = sf_degree['in_degree'] + sf_degree['out_degree']

In [126]:
# print sf_degree

In [53]:
# Run PageRank on the full graph (progress output suppressed) and fetch the
# model's 'pagerank' field, a table of per-vertex scores.
pr = gl.pagerank.create(g, verbose=False)
pr_out = pr.get('pagerank')
# Show the five rows with the highest 'pagerank' value.
# NOTE(review): the output saved under this cell is an AttributeError about
# '__id', which these three lines do not obviously produce - it is probably
# stale from an earlier version of the cell; re-run to confirm.
pr_out.topk('pagerank', k=5)

AttributeError: 'SFrame' object has no attribute '__id'

In [120]:
# Ids of 30 highly ranked vertices, skipping the single top-ranked one.
# The slice [1:31] needs 31 ranked rows to return 30 ids; with k=30 it could
# only ever return 29 (ranks 2-30).
# NOTE(review): assumes skipping the #1 vertex is intentional - if the plain
# top 30 was meant, use k=30 with no slice offset instead.
most_influencial_id = pr_out.topk('pagerank', k=31)[1:31]['__id']

In [73]:
# Sanity check: is vertex id 18 among the selected ids?
18 in most_influencial_id

True

In [121]:
# Gather every transaction that involves one of the selected vertices,
# starting from an empty table that has the same columns as sf.
# NOTE(review): a row whose src and dst are both selected ids (including a
# self-loop on a selected id) is appended twice, once per direction.
sf_temp = sf[:0]
for vertex_id in most_influencial_id:
    outgoing = sf[sf['src'] == vertex_id]   # rows sent by this vertex
    incoming = sf[sf['dst'] == vertex_id]   # rows received by this vertex
    sf_temp = sf_temp.append(outgoing).append(incoming)

In [122]:
# Drop self-loops: keep only the rows whose source and destination differ.
sf_most_influencial = sf_temp[sf_temp['src'] != sf_temp['dst']]

# Get the ids of the top-k influential vertices, select the matching rows of sf, then build graph g and plot it.

Remove the rows whose src and dst are the same (self-loops).

In [125]:
# Preview the filtered edge table (up to its first 1000 rows).
sf_most_influencial.head(1000)

src,dst,btc,timestamp,year,month,day,close-price,dollar,dollar_label
18,11020,0.01113424,2011-07-01 05:43:25,2011,7,1,17.0,0.18928208,$0.19
18,204144,5.0,2011-07-01 05:43:25,2011,7,1,17.0,85.0,$85.0
18,820702,1.0,2011-07-08 09:18:09,2011,7,8,15.64276,15.64276,$15.64
18,632032,144.9,2011-07-02 21:25:00,2011,7,2,16.49,2389.401,$2389.4
18,64304,4.5,2011-06-25 17:58:48,2011,6,25,17.6,79.2,$79.2
18,46974,0.49,2011-06-25 17:58:48,2011,6,25,17.6,8.624,$8.62
18,405155,10.92,2010-12-10 14:17:13,2010,12,10,0.204,2.22768,$2.23
18,836233,25.0,2010-12-10 14:17:13,2010,12,10,0.204,5.1,$5.1
18,441758,1.67496,2011-06-29 14:31:00,2011,6,29,17.3,28.976808,$28.98
18,64530,0.01236146,2011-06-29 14:31:00,2011,6,29,17.3,0.213853258,$0.21


In [131]:
# Build a graph from only the first 200 filtered edges
# (presumably to keep the drawing readable - confirm).
g_most_influencial = gl.SGraph().add_edges(sf_most_influencial.head(200), src_field='src', dst_field='dst')

In [132]:
# Draw the graph without arrow heads, with node size 100 and edge width 1.
g_most_influencial.show(arrows=False, node_size=100, ewidth = 1)

In [138]:
# Comparison graph built from the first 500 rows of the full table,
# without any PageRank-based selection.
g_1 = gl.SGraph().add_edges(sf.head(500), src_field='src', dst_field='dst')

In [139]:
# Draw the comparison graph with the same display settings as above cells use:
# no arrow heads, node size 100, edge width 1.
g_1.show(arrows=False, node_size=100, ewidth = 1)

In [137]:
# Report the number of vertices and edges in g_1.
# NOTE(review): the saved output (999 edges) does not match a graph built
# from 500 rows, and this cell's execution count (137) is lower than that of
# the cell that builds g_1 (138) - the output is probably stale; re-run to
# confirm.
g_1.summary()

{'num_edges': 999, 'num_vertices': 1323}