In [1]:
import numpy as np
from tqdm import tqdm_notebook as tqdm
import msgpack

## Preprocessing
Here we're going to loop over all of the stored lyrics to itemize the features and producers (hereby referred to as *associates*).  

We will store a tuple of two values for each artist-associate pair:
1. The frequency of the associate within the artist's corpus
2. The relative weight of that frequency (absolute / total associate count)

First, we build a dictionary containing all of the lyric paths, organized by artist.

In [2]:
from collections import defaultdict
import os

def grab_lyric_paths():
    paths = defaultdict(list)
    for artist in os.listdir('lyrics'):
        artist_path = 'lyrics/{}'.format(artist)
        if os.path.isdir(artist_path):
            song_list = os.listdir(artist_path)
            for song in song_list:
                song_path = 'lyrics/{}/{}'.format(artist, song)
                paths[artist].append(song_path)
    return paths
lyric_paths = grab_lyric_paths()

Then we loop over those paths, and count the total number of songs for each artist.

In [3]:
song_counts = dict()
for artist in lyric_paths:
    song_counts[artist] = len(lyric_paths[artist])
top_ten_prolific_artists = sorted(song_counts.items(), key=lambda item: item[1], reverse=True)[:10]
top_ten_prolific_artists

[('William-shakespeare', 1192),
 ('Lil-wayne', 1152),
 ('Gucci-mane', 1004),
 ('Lil-b', 951),
 ('Eminem', 764),
 ('The-game', 713),
 ('Chief-keef', 649),
 ('Snoop-dogg', 598),
 ('Kanye-west', 576),
 ('2pac', 575)]

> Note: Keep in mind that these numbers are biased towards artists that simply have more lyrics on the Genius platform.  These numbers will be pretty indicative of the "real" number of songs, but there are a few outliers, notably [Lil B](https://www.reddit.com/r/ThankYouBasedGod/comments/1wttyi/does_anyone_have_an_official_count_of_how_much/).  We're just gonna gloss over this, since tabulating the true number of songs by The Based God would amount to a wild-goose chase.  The other issue we face is one of duplicate lyrics posted to the website under slightly different names.  We could assign a duplicate likeliness threshold with fuzzy string matching to get rid of those if we wanted to, but those approaches aren't always very reliable.

Let's make a function to itemize all of the producers and featured artists tied to an artist by looping over the artist's corpus and tallying up everyone they've worked with.

> *Note*: In the case of producers, they aren't always listed on the lyric page, especially if they aren't well known.  This is just part of the game.

In [4]:
import json
from collections import Counter, OrderedDict
from slugify import slugify

def process_associates(associates):
    ass_counted = Counter(associates)
    ass_sorted = sorted(ass_counted.items(), key=lambda item: item[1])[::-1]
    max_weight = ass_sorted[0][1] if ass_sorted else 1
    ass_weighted = [(i[0], (i[1], i[1] / max_weight)) for i in ass_sorted]
    processed_associates = OrderedDict(ass_weighted)
    return processed_associates

def grab_name(d): return [slugify(i['name']) for i in d]

def itemized_associates():
    artists = defaultdict(dict)        
    for artist in tqdm(lyric_paths):
        artist_name = slugify(artist)
        producers, features = list(), list()
        for song_path in lyric_paths[artist]:
            with open(song_path, 'rb') as lfile:
                lyric = msgpack.load(lfile, encoding='utf-8')
                producers += grab_name(lyric.get('producer_artists', []))
                features += grab_name(lyric.get('featured_artists', []))
        artists[artist_name]['producers'] = process_associates(producers)
        artists[artist_name]['features'] = process_associates(features)

    return artists

In [5]:
lyric = msgpack.load(open(lyric_paths['Lil-wayne'][0], 'rb'), encoding='utf-8')
lyric['producer_artists']

[{'api_path': '/artists/43274',
  'header_image_url': 'https://s3.amazonaws.com/rapgenius/Screen-Shot-2013-08-16-at-12.47.56-PM-550x550.jpg',
  'id': 43274,
  'image_url': 'https://s3.amazonaws.com/rapgenius/Screen-Shot-2013-08-16-at-12.47.56-PM-550x550.jpg',
  'is_meme_verified': False,
  'is_verified': False,
  'name': 'DVLP',
  'url': 'http://genius.com/artists/Dvlp'}]

In [6]:
artist_associates = itemized_associates()




Before we proceed, let's grab a quick count of how many features each artist has.

In [7]:
feature_counts = dict()
for artist, associates in artist_associates.items():
    feature_counts[artist] = len(associates['features'])
sorted(feature_counts.items(), key=lambda i: i[1], reverse=True)

[('the-game', 266),
 ('e-40', 221),
 ('2pac', 219),
 ('gucci-mane', 206),
 ('tech-n9ne', 197),
 ('snoop-dogg', 192),
 ('lil-wayne', 174),
 ('dj-kay-slay', 159),
 ('kanye-west', 155),
 ('busta-rhymes', 142),
 ('talib-kweli', 142),
 ('rick-ross', 140),
 ('nas', 137),
 ('funkmaster-flex', 133),
 ('wale', 133),
 ('dj-khaled', 132),
 ('jay-z', 131),
 ('trae-tha-truth', 130),
 ('eminem', 122),
 ('meek-mill', 120),
 ('ti', 119),
 ('statik-selektah', 119),
 ('waka-flocka-flame', 118),
 ('wyclef-jean', 118),
 ('ghostface-killah', 113),
 ('twista', 112),
 ('ludacris', 111),
 ('pitbull', 111),
 ('chris-brown', 109),
 ('z-ro', 108),
 ('drake', 107),
 ('wiz-khalifa', 105),
 ('master-p', 105),
 ('fat-joe', 103),
 ('dj-screw', 102),
 ('fabolous', 102),
 ('french-montana', 102),
 ('raekwon', 102),
 ('juicy-j', 101),
 ('kurupt', 98),
 ('rza', 98),
 ('gordon-parks', 98),
 ('ll-cool-j', 96),
 ('nore', 95),
 ('young-thug', 94),
 ('jadakiss', 93),
 ('snowgoons', 92),
 ('lupe-fiasco', 91),
 ('apathy', 90),


In [24]:
artist_associates['lil-wayne']['producers']

OrderedDict([('mannie-fresh', (49, 1.0)),
             ('infamous', (20, 0.40816326530612246)),
             ('streetrunner', (20, 0.40816326530612246)),
             ('cool', (18, 0.3673469387755102)),
             ('dre', (18, 0.3673469387755102)),
             ('kanye-west', (10, 0.20408163265306123)),
             ('t-mix', (9, 0.1836734693877551)),
             ('mike-will-made-it', (9, 0.1836734693877551)),
             ('dj-drama', (7, 0.14285714285714285)),
             ('marques-houston', (6, 0.12244897959183673)),
             ('boi-1da', (6, 0.12244897959183673)),
             ('dj-nasty', (6, 0.12244897959183673)),
             ('metro-boomin', (6, 0.12244897959183673)),
             ('l-v-m', (5, 0.10204081632653061)),
             ('juicy-j', (5, 0.10204081632653061)),
             ('jim-jonsin', (5, 0.10204081632653061)),
             ('swizz-beatz', (5, 0.10204081632653061)),
             ('detail', (5, 0.10204081632653061)),
             ('jahlil-beats', (5, 0.10204081

Now that we have the associates tied to each artist, let's build a graph to connect them all together.  First up we will add each primary artist, then all associates, as nodes on the graph.  We will refer to the combined domain of objects as *entities*.

In [8]:
import networkx as nx
G = nx.Graph()

In [9]:
all_entities = set(artist_associates.keys())
print('{}: primary artists'.format(len(all_entities)))
for associates in artist_associates.values():
    features = set(associates['features'].keys())
    producers = set(associates['producers'].keys())
    comp = features.union(producers)
    all_entities = all_entities.union(comp)
print('{}: entities after adding associates'.format(len(all_entities)))

10437: primary artists
30972: entities after adding associates


In [10]:
weighted_edges = defaultdict(int)
for artist, associates in artist_associates.items():
    for feature, score in associates['features'].items():
        composite_key = ''.join(sorted([artist, feature]))
        weighted_edges[composite_key] += score[0]

In [11]:
len(list(weighted_edges.items()))

48605

In [12]:
G.add_nodes_from(artist_associates.keys())

Now we will add the edges between all entities, represented as the relationship between the primary artist and each associate.

In [13]:
def iterate_feature_edges(artists):
    for artist, associates in artist_associates.items():
        for feature in associates['features']:
            composite_key = ''.join(sorted([artist, feature]))
            weight = weighted_edges[composite_key]
            yield (artist, feature, {'weight': weight})
G.add_edges_from(iterate_feature_edges(artist_associates))

In [16]:
G.neighbors('lil-wayne')[:10]

['dre',
 'koustav',
 'ransom',
 'weezer',
 'lil-mouse',
 'pac-millie',
 'shane-heyl',
 'jc',
 'beyoownboss',
 'young-tripp']

In [17]:
eigenvectors = nx.eigenvector_centrality(G)
sorted(eigenvectors.items(), key=lambda x: x[1], reverse=True)

[('lil-wayne', 0.398751811946201),
 ('rick-ross', 0.27421767254599855),
 ('dj-khaled', 0.20367339217497815),
 ('gucci-mane', 0.20126221840514064),
 ('birdman', 0.19143958468599745),
 ('the-game', 0.18680848922563334),
 ('drake', 0.16845261361251426),
 ('french-montana', 0.15699218871584897),
 ('2-chainz', 0.13701658890141108),
 ('meek-mill', 0.1351549291608813),
 ('chris-brown', 0.1308589944378187),
 ('future', 0.1306812883604435),
 ('wiz-khalifa', 0.12302036800639604),
 ('jeezy', 0.12099455270583989),
 ('wale', 0.11635473948169693),
 ('nicki-minaj', 0.1153717471324089),
 ('kanye-west', 0.1115899612577216),
 ('busta-rhymes', 0.11144590715496708),
 ('jay-z', 0.10807070374863799),
 ('snoop-dogg', 0.1069078646148289),
 ('tyga', 0.10545555782362681),
 ('juicy-j', 0.102739997048475),
 ('jadakiss', 0.10009879995004893),
 ('curren-y', 0.09538357106077265),
 ('young-thug', 0.09536192498424381),
 ('waka-flocka-flame', 0.09113150016933709),
 ('fabolous', 0.09006150999918441),
 ('t-pain', 0.08966

In [19]:
katz = nx.katz_centrality_numpy(G)
sorted(katz.items(), key=lambda x: x[1], reverse=True)

ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/bvb/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-3ee2c7cdceda>", line 1, in <module>
    katz = nx.katz_centrality_numpy(G)
  File "<decorator-gen-151>", line 2, in katz_centrality_numpy
  File "/home/bvb/anaconda3/lib/python3.5/site-packages/networkx/utils/decorators.py", line 68, in _not_implemented_for
    return f(*args,**kwargs)
  File "/home/bvb/anaconda3/lib/python3.5/site-packages/networkx/algorithms/centrality/katz.py", line 327, in katz_centrality_numpy
    centrality = np.linalg.solve( np.eye(n,n) - (alpha * A) , b)
  File "/home/bvb/anaconda3/lib/python3.5/site-packages/numpy/matrixlib/defmatrix.py", line 349, in __rmul__
    return N.dot(other, self)
MemoryError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/bvb/anaconda3/lib


Unfortunately, your original traceback can not be constructed.



MemoryError: 

Now that we have a functioning graph, the first question I'd like to ask is simply which artist has the most connections? Given that we're currently only tracking breadth, this treats the associates as a set, rather than a collection.  We may add the frequency of the associated as weights later on.

In [77]:
n_neighbors = dict()
for artist in artist_associates:
    n_neighbors[artist] = len(G.neighbors(artist))
artists_sorted_features = sorted(n_neighbors.items(), key=lambda item: item[1], reverse=True)
artists_sorted_features[:10]

[('The-game', 432),
 ('E-40', 423),
 ('Drake', 420),
 ('Nas', 414),
 ('Lil-wayne', 395),
 ('Wale', 388),
 ('Eminem', 357),
 ('Raekwon', 351),
 ('Ludacris', 338),
 ('Fabolous', 332)]

In [90]:
def epicenter_factor(nodes, graph):
    results = dict()
    for node in tqdm(nodes):
        distances = nx.single_source_shortest_path_length(graph, source=artist)
        values = [int(x) for x in distances.values()]
        mean = np.mean(values)
        results[node] = mean
    return results
sorted_names = [x[0] for x in artists_sorted_features]
artist_distances = epicenter_factor(sorted_names[:10], G)




In [91]:
sorted(artist_distances.items(), key=lambda x: x[1])

[('Nas', 4.0095805205907498),
 ('The-game', 4.0095805205907498),
 ('Fabolous', 4.0095805205907498),
 ('Lil-wayne', 4.0095805205907498),
 ('Raekwon', 4.0095805205907498),
 ('E-40', 4.0095805205907498),
 ('Wale', 4.0095805205907498),
 ('Ludacris', 4.0095805205907498),
 ('Drake', 4.0095805205907498),
 ('Eminem', 4.0095805205907498)]

In [100]:
nx.shortest_path_length(G, source='Denzel-curry')

33923