# Analyzing the Use of the Word Babe

This notebook contains analysis quantifying how frequently Babe/Baby are used in songs.

## Imports and Basic Setup

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import copy
import h5py
import numpy as np
import pandas as pd
import requests
import sqlite3
import string

In [3]:
# Plotting imports

import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
matplotlib.use('PDF')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pu_colormaps as pu_cm
import matplotlib.transforms as transforms
import matplotlib.patheffects as path_effects

import general_plotting as gen_plot



In [4]:
# Default changes to matplotlib

# Make the x and y ticks bigger
matplotlib.rcParams['xtick.labelsize'] = 20
matplotlib.rcParams['xtick.major.size'] = 10
matplotlib.rcParams['xtick.major.width'] = 2
matplotlib.rcParams['ytick.labelsize'] = 20
matplotlib.rcParams['ytick.major.size'] = 10
matplotlib.rcParams['ytick.major.width'] = 2

# Global Parameters of the Data Set

In [5]:
n_songs = 237662

# Load the Data

In [6]:
# Load the file containing all the names
filename = './full_word_list.txt'
names = ['Word', 'Count']
full_word_list = pd.read_csv(filename, sep='<SEP>', names=names, skiprows=7)
full_word_list



Unnamed: 0,Word,Count
0,i,2078808
1,the,1863782
2,you,1744257
3,to,1067578
4,and,1055748
5,a,974499
6,it,821152
7,me,771755
8,not,735396
9,in,626410


In [7]:
# As an example, look at the number of times the words is said per song, on average
float(full_word_list['Count'][0])/float(n_songs)

8.74690947648341

# Find the Number of Times Babe Is Used

In [8]:
# Find The Index of a targeted word, probablye babe
targeted_word = 'bab'
selected_is = []
for i in range(5000):
  if targeted_word in full_word_list['Word'][i]:
    selected_is.append(i)

In [9]:
# Print out information about the targeted word
print '{:<10} {:<10} {:<10} {:<10}'.format('index', 'word', 'count', 'times_per_song')
print ''
for i in selected_is:
  word = full_word_list['Word'][i]
  count = full_word_list['Count'][i]
  times_per_song = float(count)/float(n_songs)
  print '{:<10} {:<10} {:<10} {:<10}'.format(i, word, count, times_per_song)

index      word       count      times_per_song

69         babi       124647     0.524471728758
674        babe       8359       0.0351717986047
1427       probabl    3080       0.012959581254
2376       babylon    1483       0.0062399542207


## Takeaway
The following facts are useful....

In [10]:
print 'Babe, or equivalents, is the {}th most common word.'.format(i+1)
print 'It is used {:.3g} times per song'.format(times_per_song)

Babe, or equivalents, is the 2377th most common word.
It is used 0.00624 times per song


# Process the Contributing Songs

In [11]:
# Get the training set data.
filename = './mxm_dataset_train.txt'
names = ['Full']
train_songs = pd.read_csv(filename, sep='<SEP>', skiprows=18, names=names)



In [12]:
# Process a line in the training set
def process_song(i):
  song = train_songs['Full'][i]
  song_split = string.split(song, ',')
  track_id = song_split[0]
  mxm_track_id = song_split[1]
  data = song_split[2:]
  
  # put the data in a dictionary format
  # Turn into a dictionary
  data_dict = {}
  for word_count in data:
    word_id, word_count = string.split(word_count, ':')
    data_dict[int(word_id)] = int(word_count)

  return track_id, mxm_track_id, data_dict

In [13]:
# Test my function out
process_song(0)

('TRAAAAV128F421A322',
 '4623710',
 {1: 6,
  2: 4,
  3: 2,
  4: 2,
  5: 5,
  6: 3,
  7: 1,
  8: 1,
  11: 1,
  12: 2,
  13: 3,
  14: 1,
  15: 1,
  18: 2,
  19: 2,
  20: 2,
  21: 2,
  23: 4,
  25: 1,
  26: 2,
  28: 1,
  30: 1,
  36: 2,
  42: 1,
  45: 1,
  54: 2,
  56: 1,
  57: 1,
  68: 1,
  99: 1,
  192: 2,
  249: 1,
  264: 1,
  356: 1,
  389: 1,
  561: 1,
  639: 1,
  656: 1,
  687: 1,
  761: 1,
  773: 1,
  804: 1,
  869: 2,
  914: 1,
  1035: 1,
  1156: 1,
  1221: 1,
  1287: 1,
  1364: 1,
  1407: 1,
  1533: 2,
  1857: 1,
  2096: 1,
  2117: 1,
  2482: 2,
  2548: 1,
  2705: 1,
  2723: 1,
  2868: 2,
  2992: 2,
  3455: 1,
  3717: 1,
  3851: 1,
  4322: 1,
  4382: 1,
  4613: 1,
  4713: 1,
  4906: 1})

In [14]:
# Process the full dataset
track_ids = []
mxm_track_ids = []
datas = []
for i in range(len(train_songs['Full'])):
  
  track_id, mxm_track_id, data = process_song(i)
  
  track_ids.append(track_id)
  mxm_track_ids.append(mxm_track_id)
  datas.append(data)

In [15]:
# Add them to the dataset
train_songs['ID'] = pd.Series(track_ids, index=train_songs.index)
train_songs['MXM_ID'] = pd.Series(mxm_track_ids, index=train_songs.index)
train_songs['Data'] = pd.Series(datas, index=train_songs.index)

# Identify Songs with Certain Words in Them

In [16]:
# Get all the words.
f = open(filename)
f.readline(2)

'# '

In [17]:
# Get the full list of words
with open(filename) as f:
    content = f.readlines()
words_string = content[17][1:]
words = string.split(words_string, ',')
words

['i',
 'the',
 'you',
 'to',
 'and',
 'a',
 'me',
 'it',
 'not',
 'in',
 'my',
 'is',
 'of',
 'your',
 'that',
 'do',
 'on',
 'are',
 'we',
 'am',
 'will',
 'all',
 'for',
 'no',
 'be',
 'have',
 'love',
 'so',
 'know',
 'this',
 'but',
 'with',
 'what',
 'just',
 'when',
 'like',
 'now',
 'que',
 'time',
 'can',
 'come',
 'de',
 'there',
 'go',
 'up',
 'oh',
 'la',
 'one',
 'they',
 'out',
 'down',
 'get',
 'she',
 'was',
 'see',
 'if',
 'got',
 'never',
 'from',
 'he',
 'feel',
 'want',
 'let',
 'make',
 'way',
 'say',
 'take',
 'would',
 'as',
 'ca',
 'day',
 'at',
 'babi',
 'away',
 'life',
 'yeah',
 'y',
 'back',
 'by',
 'her',
 'heart',
 'here',
 'how',
 'could',
 'night',
 'need',
 'our',
 'look',
 'where',
 'en',
 'eye',
 'thing',
 'world',
 'more',
 'caus',
 'gonna',
 'die',
 'right',
 'been',
 'tell',
 'think',
 'un',
 'who',
 'el',
 'through',
 'man',
 'live',
 'again',
 'give',
 'too',
 'onli',
 'te',
 'tri',
 'tu',
 'or',
 'whi',
 'se',
 'keep',
 'dream',
 'well',
 'mind',

In [18]:
words[72]

'babi'

In [19]:
words[729]

'babe'

In [20]:
# Find all songs with the targeted word in them
target_ind = 67
relevant_inds = []
for i, data in enumerate(train_songs['Data']):
  if target_ind in data:
    relevant_inds.append(i)
relevant_inds

[2,
 4,
 5,
 15,
 17,
 19,
 21,
 24,
 26,
 33,
 41,
 46,
 47,
 52,
 55,
 56,
 65,
 70,
 71,
 74,
 80,
 84,
 89,
 93,
 94,
 95,
 96,
 100,
 103,
 105,
 110,
 114,
 120,
 121,
 123,
 129,
 133,
 142,
 146,
 147,
 154,
 156,
 157,
 159,
 161,
 166,
 174,
 175,
 178,
 185,
 186,
 189,
 190,
 193,
 197,
 206,
 208,
 214,
 216,
 217,
 218,
 225,
 227,
 232,
 235,
 239,
 247,
 248,
 251,
 253,
 257,
 266,
 269,
 271,
 284,
 287,
 290,
 292,
 295,
 297,
 299,
 307,
 309,
 320,
 324,
 329,
 332,
 334,
 336,
 338,
 340,
 353,
 356,
 357,
 358,
 363,
 364,
 367,
 371,
 375,
 379,
 381,
 384,
 386,
 389,
 404,
 405,
 412,
 419,
 425,
 435,
 440,
 445,
 446,
 447,
 449,
 456,
 457,
 463,
 467,
 479,
 481,
 483,
 486,
 499,
 501,
 502,
 503,
 512,
 528,
 538,
 543,
 548,
 552,
 554,
 558,
 566,
 570,
 576,
 577,
 591,
 598,
 616,
 617,
 620,
 624,
 628,
 629,
 631,
 632,
 641,
 642,
 647,
 648,
 649,
 656,
 660,
 662,
 667,
 678,
 679,
 682,
 696,
 697,
 700,
 705,
 707,
 709,
 717,
 722,
 726,
 731

In [21]:
len(relevant_inds)

49683

In [22]:
relevant_songs = train_songs.iloc[relevant_inds]
relevant_songs

Unnamed: 0,Full,ID,MXM_ID,Data
2,"TRAAAED128E0783FAB,2516445,1:28,2:15,3:2,4:12,...",TRAAAED128E0783FAB,2516445,"{1: 28, 2: 15, 3: 2, 4: 12, 5: 22, 6: 2, 7: 2,..."
4,"TRAAAEW128F42930C0,3783760,1:4,4:5,5:7,6:2,7:4...",TRAAAEW128F42930C0,3783760,"{1: 4, 258: 1, 4: 5, 5: 7, 6: 2, 7: 4, 264: 1,..."
5,"TRAAAFD128F92F423A,6640025,1:16,2:4,4:1,5:3,6:...",TRAAAFD128F92F423A,6640025,"{1: 16, 2: 4, 299: 1, 4: 1, 5: 3, 6: 5, 7: 5, ..."
15,"TRAABHB12903CAFC2F,9357016,1:2,2:7,4:4,5:1,6:8...",TRAABHB12903CAFC2F,9357016,"{512: 2, 1: 2, 2: 7, 4: 4, 5: 1, 6: 8, 8: 1, 9..."
17,"TRAABIG128F9356C56,678806,1:28,2:77,3:31,4:41,...",TRAABIG128F9356C56,678806,"{1: 28, 2: 77, 3: 31, 4: 41, 5: 5, 6: 13, 8: 1..."
19,"TRAABJV128F1460C49,851082,1:6,2:5,3:10,4:4,5:6...",TRAABJV128F1460C49,851082,"{1: 6, 2: 5, 3: 10, 4: 4, 5: 6, 6: 2, 7: 6, 8:..."
21,"TRAABOA128F933684A,3329603,1:15,2:24,3:1,4:19,...",TRAABOA128F933684A,3329603,"{1: 15, 2: 24, 3: 1, 4: 19, 5: 16, 6: 2, 520: ..."
24,"TRAABVM128F92CA9DC,2097569,1:19,2:5,3:2,4:4,5:...",TRAABVM128F92CA9DC,2097569,"{1: 19, 2: 5, 3: 2, 4: 4, 5: 5, 6: 6, 7: 3, 8:..."
26,"TRAACER128F4290F96,8700148,1:4,2:11,3:19,4:4,5...",TRAACER128F4290F96,8700148,"{512: 1, 1: 4, 2: 11, 3: 19, 4: 4, 5: 4, 6: 2,..."
33,"TRAACQW128F428854F,1694363,1:5,2:11,3:11,4:4,5...",TRAACQW128F428854F,1694363,"{1: 5, 2: 11, 3: 11, 4: 4, 5: 3, 6: 3, 257: 2,..."


# Get a sampling of the relevant songs

## Example Parsing of One Song

### For A Request Based on Track ID

In [23]:
# Get just one song out.
parameters={
  'apikey' : 'dbd9ed973aa54279874ac56cc1546a5a',
  'track_id' : 2516445,
}
response = requests.get("http://api.musixmatch.com/ws/1.1/track.get", params=parameters)
response

<Response [200]>

In [24]:
# Get the data
json = response.json()
data = json['message']['body']['track']
data

{u'album_coverart_100x100': u'http://s.mxmcdn.net/images-storage/albums/nocover.png',
 u'album_coverart_350x350': u'',
 u'album_coverart_500x500': u'',
 u'album_coverart_800x800': u'',
 u'album_id': 15797268,
 u'album_name': u'Twentysomething',
 u'artist_id': 53213,
 u'artist_mbid': u'b10806de-2198-4313-af6a-13df4acb912f',
 u'artist_name': u'Jamie Cullum',
 u'commontrack_id': 3005009,
 u'commontrack_vanity_id': u'Jamie-Cullum/It-s-About-Time',
 u'explicit': 0,
 u'first_release_date': u'2003-01-01T00:00:00Z',
 u'has_lyrics': 1,
 u'has_lyrics_crowd': 0,
 u'has_richsync': 0,
 u'has_subtitles': 1,
 u'instrumental': 0,
 u'lyrics_id': 1516822,
 u'num_favourite': 0,
 u'primary_genres': {u'music_genre_list': [{u'music_genre': {u'music_genre_id': 11,
     u'music_genre_name': u'Jazz',
     u'music_genre_name_extended': u'Jazz',
     u'music_genre_parent_id': 34,
     u'music_genre_vanity': u'Jazz'}}]},
 u'restricted': 0,
 u'secondary_genres': {u'music_genre_list': []},
 u'subtitle_id': 9730608,

### For a Search Request

In [25]:
# Get just one song out.
parameters={
  'apikey' : 'dbd9ed973aa54279874ac56cc1546a5a',
  'q_lyrics' : 'baby',
  'page_size' : 100,
}
response = requests.get("http://api.musixmatch.com/ws/1.1/track.search", params=parameters)
response

<Response [200]>

In [26]:
track_data = response.json()['message']['body']['track_list'][0]['track']
track_data

{u'album_coverart_100x100': u'http://s.mxmcdn.net/images-storage/albums/nocover.png',
 u'album_coverart_350x350': u'',
 u'album_coverart_500x500': u'',
 u'album_coverart_800x800': u'',
 u'album_id': 14328685,
 u'album_name': u'Jai Paul',
 u'artist_id': 13772423,
 u'artist_mbid': u'312183e0-64ff-48df-a9f4-0804ca23a7d3',
 u'artist_name': u'Jai Paul',
 u'commontrack_id': 12239288,
 u'commontrack_vanity_id': u'Jai-Paul/Baby-Beats',
 u'explicit': 0,
 u'first_release_date': u'2013-04-13T00:00:00Z',
 u'has_lyrics': 1,
 u'has_lyrics_crowd': 0,
 u'has_richsync': 0,
 u'has_subtitles': 1,
 u'instrumental': 0,
 u'lyrics_id': 11347912,
 u'num_favourite': 0,
 u'primary_genres': {u'music_genre_list': []},
 u'restricted': 0,
 u'secondary_genres': {u'music_genre_list': []},
 u'subtitle_id': 6673878,
 u'track_edit_url': u'https://www.musixmatch.com/lyrics/Jai-Paul/Baby-Beats/edit?utm_source=application&utm_campaign=api&utm_medium=Northwestern+University%3A1409614557260',
 u'track_id': 19881155,
 u'track

In [27]:
# Get the Artist out
artist = track_data['artist_name']
artist

u'Jai Paul'

In [28]:
# Get the song tile out
song_title = track_data['track_name']
song_title

u'Baby Beats'

### What Next??

This Notebook is so close to plotting some information. It needs to be finished up though.