In [543]:
import pandas as pd
import networkx as nx
import numpy as np
import community
import matplotlib.pyplot as plt
import vincent
from vincent.values import ValueRef
from vincent.legends import LegendProperties
from vincent.properties import PropertySet
import json
vincent.core.initialize_notebook()
#from mpl_toolkits.basemap import Basemap

%matplotlib inline

In [544]:
def update_nodes(node, plays):
    """Create or refresh ``node`` in the module-level ``graph`` and set its weight.

    Parameters
    ----------
    node : hashable
        Node key; the callers in this notebook pass an artist name.
    plays : number
        Value stored under the node's ``'weight'`` attribute. An existing
        weight is overwritten, not accumulated.
    """
    # add_node() on an existing node only updates its attribute dict, so a
    # single call covers both the "new node" and "existing node" cases.
    graph.add_node(node, weight=plays)

def update_edges(src, dst, n_common_users):
    """Add ``n_common_users`` to the weight of edge ``src``-``dst`` in ``graph``.

    The edge is created with that weight when it does not exist yet;
    otherwise the value is added to the weight already stored.

    Parameters
    ----------
    src, dst : hashable
        Endpoints of the edge in the module-level ``graph``.
    n_common_users : number
        Amount to store (new edge) or to add (existing edge).
    """
    if graph.has_edge(src, dst):
        # graph[src][dst] is the edge's attribute dict; update it in place.
        graph[src][dst]['weight'] += n_common_users
    else:
        graph.add_edge(src, dst, weight=n_common_users)

In [545]:
def get_group(artist):
    """Map an artist to its 1-based group label, or 0 when it has none.

    Looks ``artist`` up in the module-level ``part`` mapping. Returns the
    stored id plus one when that id is a member of the module-level
    ``comm_id`` collection; returns 0 when the artist is absent from ``part``
    or its id is not in ``comm_id``.
    """
    gid = part.get(artist)
    if gid is None or gid not in comm_id:
        return 0
    # Shift by one so that 0 stays reserved for "no group".
    return gid + 1

In [546]:
def get_artists(gid):
    """Return the nodes of ``graph`` whose entry in ``part`` equals ``gid``."""
    members = []
    for candidate in graph.nodes():
        if part.get(candidate) == gid:
            members.append(candidate)
    return members

In [547]:
def norm(x):
    """Divide ``x.plays`` by ``total`` when ``x.userid`` matches ``userid``.

    ``x`` is mutated and returned. When the ids differ, ``x.plays`` is
    re-assigned its own value.

    NOTE(review): ``userid`` and ``total`` are read as globals here; no
    module-level definition of either is visible in this notebook, so calling
    this as-is presumably raises NameError -- confirm before use.
    """
    if x.userid == userid:
        new_plays = x.plays / total
    else:
        new_plays = x.plays
    x.plays = new_plays
    return x
    

In [548]:
def normalize(data):
    """Rescale ``data['plays']`` in place so each user's plays sum to 1.

    Every row's ``plays`` is divided by the sum of ``plays`` over all rows
    sharing its ``userid``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``userid`` and ``plays`` columns. The ``plays`` column is
        replaced on this very object.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # One grouped pass yields, for every row, the total plays of its user.
    # This replaces a full-frame scan per user and the deprecated ``.ix``.
    totals = data.groupby('userid')['plays'].transform('sum')
    shares = data['plays'] / totals
    # Rows that received no total (e.g. a missing userid) keep their value.
    data['plays'] = shares.where(totals.notnull(), data['plays'])
        

In [549]:
# Load the per-user listening records; the path is relative to the working directory.
data = pd.read_csv("../data/10000users.csv")

In [527]:
#d = normalize(data)

NameError: name 'gid' is not defined

In [550]:
# Display the (rows, columns) size of the loaded frame.
data.shape

(488518, 8)

In [551]:
# How many of the most-played artists are kept (passed to groups.head()).
top = 30

In [552]:
# Total plays per artist, most-played artist first. The aggregate produces a
# two-level column ('plays', 'sum'), which is also the sort key.
groups = (
    data.groupby('artist_name')
        .aggregate({'plays': [np.sum]})
        .sort([('plays', 'sum')], ascending=False)
)

In [553]:
# Names (index) and play totals (values) of the first `top` rows of `groups`.
top_artists = groups.head(top).index
top_plays = groups.head(top)['plays']['sum'].values

In [554]:
# Display the selected artist names as one comma-separated string.
','.join(top_artists)

'the beatles,radiohead,coldplay,muse,pink floyd,linkin park,metallica,nine inch nails,red hot chili peppers,system of a down,depeche mode,placebo,death cab for cutie,in flames,rammstein,the killers,bob dylan,arctic monkeys,nirvana,tool,queen,led zeppelin,the cure,iron maiden,nightwish,u2,sigur r\xc3\xb3s,bj\xc3\xb6rk,david bowie,rise against'

In [296]:
# No-op as written: the division by the maximum is commented out, so the raw
# play totals are kept.
top_plays = top_plays#/float(top_plays.max())

In [555]:
# Build an undirected graph over the top artists:
#   * one node per artist, weight = that artist's total plays;
#   * one edge per artist pair for which more than one of the first artist's
#     rows belongs to a user who also has a row for the second artist,
#     weight = summed plays of both artists over the users they share.
graph = nx.Graph()

# Filter the listening rows once per artist instead of once per pair; only
# the two columns needed below are kept.
rows_by_artist = {}
for artist, plays in zip(top_artists, top_plays):
    update_nodes(artist, int(plays))
    rows_by_artist[artist] = data.loc[data['artist_name'] == artist, ['userid', 'plays']]

# Visit every unordered pair exactly once. The graph is undirected and
# update_edges() adds to an existing weight, so handling both (A, B) and
# (B, A) would add the same total to each edge twice.
for i, artistA in enumerate(top_artists):
    users_per_artistA = rows_by_artist[artistA]
    for artistB in top_artists[i + 1:]:
        users_per_artistB = rows_by_artist[artistB]
        # Number of A's rows whose user also appears among B's rows.
        count = users_per_artistA.userid.isin(users_per_artistB.userid).sum()
        if count > 1:
            # The inner join keeps the shared users; the overlapping 'plays'
            # columns come back as plays_x (A) and plays_y (B).
            temp = pd.merge(users_per_artistA, users_per_artistB, on='userid', how='inner')
            update_edges(artistA, artistB, int((temp['plays_x'] + temp['plays_y']).sum()))

In [556]:
# Export the graph to GraphML in the working directory. `graph` is built with
# nx.Graph(), so to_undirected() is applied to an already-undirected graph.
nx.write_graphml(graph.to_undirected(), '10000users_latest.graphml')

In [557]:
# Partition the graph into communities; `part` is used below as a mapping
# from node to community id (via part.get).
# NOTE(review): no random seed is set here -- presumably the partition can
# differ between runs; confirm against the `community` package in use.
part = community.best_partition(graph)

In [558]:
# Display the modularity score of the partition found above.
community.modularity(part, graph)

0.12777137293373447

In [559]:
# Community id of every node, in graph.nodes() order; then display the
# distinct ids.
values = [part.get(node) for node in graph.nodes()]
np.unique(values)

array([0, 1, 2])

In [560]:
comm_id = set()
for i in  np.unique(values):
    for node in graph.nodes():
        if part.get(node) == i:
            if values.count(i) > 3:
                comm_id.add(i)
            print node,
            print "--",
    print
    print
comm_id

rammstein -- red hot chili peppers -- linkin park -- rise against -- iron maiden -- nightwish -- tool -- system of a down -- nirvana -- metallica -- in flames --

bob dylan -- queen -- the beatles -- u2 -- pink floyd -- david bowie -- led zeppelin --

depeche mode -- nine inch nails -- sigur rós -- death cab for cutie -- radiohead -- björk -- coldplay -- muse -- the cure -- the killers -- arctic monkeys -- placebo --



{0, 1, 2}

In [561]:
# Label every row with its artist's group via get_group(): 0 when the artist
# is not in `part` or its community id is not in `comm_id`, otherwise id + 1.
data['group'] = data['artist_name'].apply(get_group)

In [562]:
import colorsys

def get_N_HexCol(N=5):
    """Return ``N`` ``'#rrggbb'`` colour strings with evenly spaced hues.

    Hue ``x / N`` is used for ``x`` in ``0 .. N-1``, with saturation and
    value both fixed at 0.5. Each RGB channel is scaled to 0-255 and
    truncated to an int before being written as two lowercase hex digits.

    Parameters
    ----------
    N : int
        Number of colours to generate; ``0`` gives an empty list.

    Returns
    -------
    list of str
    """
    hex_out = []
    for x in range(N):
        rgb = colorsys.hsv_to_rgb(x * 1.0 / N, 0.5, 0.5)
        # '%02x' gives the same two-digit hex as the Python-2-only
        # chr(x).encode('hex'), and also works on Python 3.
        hex_out.append('#%02x%02x%02x' % tuple(int(channel * 255) for channel in rgb))
    return hex_out

def get_colors(N=5):
    """Return ``N`` ``'#rrggbb'`` strings forming a light-to-dark ramp.

    Colour ``i`` is RGB ``(225, 237, 234)`` with ``int(i * 200 / N)``
    subtracted from every channel, so the first entry is ``'#e1edea'`` and
    later entries get progressively darker.

    Parameters
    ----------
    N : int
        Number of colours to generate; ``0`` gives an empty list.

    Returns
    -------
    list of str
    """
    hex_out = []
    for i in range(N):
        # The same offset is applied to all three channels.
        step = int(i * 200 / float(N))
        # '%02x' gives the same two-digit hex as the Python-2-only
        # chr(x).encode('hex'), and also works on Python 3.
        hex_out.append('#%02x%02x%02x' % (225 - step, 237 - step, 234 - step))
    return hex_out

In [563]:
def plot_map(x, title, data_col='plays'):
    """Display a vincent world map coloured by ``x[data_col]`` per country.

    Parameters
    ----------
    x : pandas.DataFrame
        Needs a ``'country'`` column (used as ``data_key``) and the column
        named by ``data_col``.
    title : str
        Legend title.
    data_col : str
        Column bound to the colour scale.
    """
    topo_url = '../data/world-countries.topo.json'
    # Two geo layers that read the same feature of the same topology file.
    layers = [
        {'name': layer_name, 'url': topo_url, 'feature': 'world-countries'}
        for layer_name in ('countries', 'countries_outline')
    ]
    world = vincent.Map(
        data=x,
        geo_data=layers,
        scale=500,
        projection='winkel3',
        data_bind=data_col,
        data_key='country',
        map_key={'countries': 'properties.name'},
    )
    world.legend(title=title)

    # Second mark: drop its update properties and draw its stroke in black.
    del world.marks[1].properties.update
    world.marks[1].properties.enter.stroke.value = '#000'

    # Colour scale: domain from 1 to the column maximum, over a 200-step ramp.
    world.scales[0].domain = [1, x[data_col].max()]
    world.scales[0].range = get_colors(N=200)

    # Enlarge the legend's label and title fonts.
    world.legends[0].properties = LegendProperties(
        labels=PropertySet(font_size=ValueRef(value=30)),
        title=PropertySet(font_size=ValueRef(value=30)),
    )
    world.display()

In [564]:
# Draw one map of total plays per country for every label in data['group'].
# The labels come from get_group(): 0 for rows whose artist has no community,
# otherwise community id + 1. Iterating over the stored labels (rather than
# the raw community ids) ensures the highest-numbered group is plotted too,
# and avoids a KeyError for a label that is absent from the frame.
for i in np.unique(data['group']):
    map_data = data.loc[data['group'] == i, ['country', 'plays']]

    # Sum plays per country, largest first, with 'country' as a plain column
    # so plot_map() can use it as its data key.
    x = map_data.groupby('country').sum().sort('plays', ascending=False)
    x = x.reset_index()
    plot_map(x, title="Group %d" % i)
# Disabled by the original author: renaming 'United States' to the longer
# country name before plotting.
#x['country'] = x.country.apply(lambda x: 'United States of America' if x == 'United States' else x)

In [565]:
# Total plays per (artist, country) pair, largest totals first. The aggregate
# produces a two-level column ('plays', 'sum'), which is also the sort key.
d = (
    data.groupby(['artist_name', 'country'])
        .aggregate({'plays': [np.sum]})
        .sort([('plays', 'sum')], ascending=False)
)

In [566]:
# Display the largest and smallest per-(artist, country) play totals.
d['plays']['sum'].max(),d['plays']['sum'].min()

(326503, 1)

In [567]:
# Move the (artist_name, country) index levels back into ordinary columns.
# NOTE(review): this mutates `d` in place, so re-running only this cell
# resets the index a second time.
d.reset_index(inplace=True)

In [568]:
# Keep only the per-country rows of a single artist.
artist = 'pink floyd'
artist_popularity = d[d['artist_name']==artist]

In [569]:
# Flatten the two-level column index to its first level so the columns are
# plain names. get_level_values(0) follows the actual column order, so this
# does not depend on how the level's internal values happen to be ordered.
artist_popularity.columns = artist_popularity.columns.get_level_values(0)

In [570]:
# Map the selected artist's plays per country.
# NOTE(review): the title is hard-coded and does not follow the `artist` variable.
plot_map(artist_popularity, title='Popularity of Pink Floyd')

In [333]:
# Display how many edges the graph has.
graph.number_of_edges()

435

In [535]:
# Display the distinct group labels present in the frame.
np.unique(data.group)

array([1, 2, 3])

In [529]:
# Spot check: look up one artist name in `part`; .get() yields None when absent.
part.get('betty blowtorch')

None


In [541]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,userid,artistid,artist_name,plays,gender,age,country,dos,group
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137,f,22,Germany,"Feb 1, 2007",0
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099,f,22,Germany,"Feb 1, 2007",0
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897,f,22,Germany,"Feb 1, 2007",0
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717,f,22,Germany,"Feb 1, 2007",0
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706,f,22,Germany,"Feb 1, 2007",0


In [542]:
# Write the frame to the working directory (not the ../data path it was read from).
# NOTE(review): with pandas' default index=True the row index is written as an
# extra leading column -- confirm that is wanted.
data.to_csv('10000users.csv')