In [1]:
# NOTE: original import order is kept on purpose -- names imported explicitly
# after the star import below take precedence over whatever `pbl` exports.
import os
import os.path
import re
from itertools import chain
from pbl import *
import array
from scipy.io import wavfile
import numpy as np
import pandas as pd

### Getting the genre related to the given artist using Echonest API

In [2]:
from pyechonest import song, track, artist

In [3]:
a = artist.Artist('hugh laurie')

In [4]:
a.terms

[{u'frequency': 1.0, u'name': u'blues', u'weight': 1.0},
 {u'frequency': 0.6555445685581386,
  u'name': u'comedy',
  u'weight': 0.8634624581539933},
 {u'frequency': 0.7890925247882442,
  u'name': u'house',
  u'weight': 0.5853658536585366},
 {u'frequency': 0.49534963086953726,
  u'name': u'jazz',
  u'weight': 0.37374461979913914},
 {u'frequency': 0.2814431479110384,
  u'name': u'soul',
  u'weight': 0.2697274031563845},
 {u'frequency': 0.061417137095553305,
  u'name': u'piano blues',
  u'weight': 0.2068388330942133},
 {u'frequency': 0.16820015207998337,
  u'name': u'singer-songwriter',
  u'weight': 0.20157819225251075},
 {u'frequency': 0.07294488419943496,
  u'name': u'delta blues',
  u'weight': 0.1874701099952176},
 {u'frequency': 0.4065324320235649,
  u'name': u'rock',
  u'weight': 0.17025346724055476},
 {u'frequency': 0.10006501718912103,
  u'name': u'swing',
  u'weight': 0.15136298421807748},
 {u'frequency': 0.17460130296195855,
  u'name': u'country',
  u'weight': 0.14968914395026303

### Converting mp3 file to wav file

In [5]:
from pydub import AudioSegment

# Source mp3 and the wav file decoded from it (same name, ".wav" extension).
filepath = "../dl_songs/chrisBrown_nextToYou.mp3"
# splitext swaps only the real extension; the previous re.sub(".mp3", ...)
# treated "." as a wildcard and would rewrite any "<char>mp3" in the path.
wavfilepath = os.path.splitext(filepath)[0] + ".wav"

# export() returns the still-open output file; close it so the handle is not
# leaked and the wav is fully written before later cells read it back.
AudioSegment.from_mp3(filepath).export(wavfilepath, format="wav").close()

<open file '../dl_songs/chrisBrown_nextToYou.wav', mode 'wb+' at 0x110e9d8a0>

In [8]:
from utils import wavfilePlayer
# NOTE(review): %pylab star-imports numpy/matplotlib into the notebook
# namespace. Later cells use `np` without importing it, so they presumably
# rely on this line having been run first.
%pylab inline
# wavPlayer is a project helper not shown here; presumably it returns an
# inline audio player for the given wav file -- confirm in utils.
display(wavfilePlayer.wavPlayer(wavfilepath))

Populating the interactive namespace from numpy and matplotlib


### Accessing the audio signal from the mp3 file

In [10]:
from pydub.utils import mediainfo

# Read the stream metadata of the mp3 and keep the two fields used later on.
info = mediainfo(filepath)
sample_rate, channels = (int(info[field]) for field in ('sample_rate', 'channels'))
# channels: 1 if mono, 2 if stereo
print("File:{}\nSampling Rate: {}\nChannels:{}".format(filepath, sample_rate, channels))

File:../dl_songs/chrisBrown_nextToYou.mp3
Sampling Rate: 44100
Channels:2


In [11]:
sample = AudioSegment.from_mp3(filepath)
audio_bytestring = sample._data
len(audio_bytestring)

49167360

In [12]:
# The raw PCM bytes are interleaved per frame (ch0, ch1, ch0, ch1, ...).
# Decode them with their real sample type and keep the first channel only.
# The previous int32 -> int16 truncation produced the same samples, but only
# for 16-bit stereo data on a little-endian machine, and it relied on the
# deprecated binary mode of np.fromstring.
assert sample.sample_width == 2, "expected 16-bit samples from pydub"
audio = (np.frombuffer(audio_bytestring, dtype=np.int16)
           .reshape(-1, sample.channels)[:, 0]
           .copy())  # contiguous, writable int16 so wavplayer can function
audio

array([0, 0, 0, ..., 0, 0, 0], dtype=int16)

In [13]:
# Write the extracted channel next to the exported wav, with a "1" appended to
# the base name. splitext touches only the real extension, whereas the previous
# re.sub(".wav", ...) used "." as a wildcard on the whole path.
newwavfilepath = os.path.splitext(wavfilepath)[0] + '1.wav'
wavfile.write(newwavfilepath, sample_rate, audio)
display(wavfilePlayer.wavPlayer(newwavfilepath))

### Compare the signal in the wav file created with pydub's `AudioSegment.export()` against the wav file written from the raw `._data` samples

In [14]:
wav1 = wavfile.read(wavfilepath)
wav2 = wavfile.read(newwavfilepath)

In [15]:
wav1 #the signal is two-dimensional

(44100, array([[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 0],
        [0, 0],
        [0, 0]], dtype=int16))

In [16]:
wav2 #the signal is one-dimensional

(44100, array([0, 0, 0, ..., 0, 0, 0], dtype=int16))

In [17]:
len(wav2[1])

12291840

In [18]:
np.all(wav1[1][:,0]==wav2[1]) #the first column of audio of wav1 is equal to audio of wav2

True

### Get the songs from Top 40 Albums Week 20 2013 album

In [19]:
basedir = os.path.join("../Top 40 Albums Week 20 2013/")

In [20]:
def extract_names(directory):
    """Collect the mp3 tracks that sit directly inside ``directory``.

    Each file name is split on '-' and the first piece that ends in ".mp3"
    (after stripping whitespace) is used as the song name; the file itself is
    decoded with ``extract_data``.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).

    Returns
    -------
    (music_names, music_data) : tuple of two parallel lists
        ``music_names[i]`` is the song name (still carrying ".mp3") and
        ``music_data[i]`` is whatever ``extract_data`` returned for that file.
        The tuple itself is always truthy; test ``music_names`` to find out
        whether anything was found.
    """
    music_names = []
    music_data = []
    for file_ in os.listdir(directory):
        for piece in file_.split('-'):
            name = piece.strip()
            # Literal suffix test. The previous pattern ".*.mp3" used "." as a
            # wildcard and matched any piece merely containing "<char>mp3"
            # (e.g. "my mp3s.txt"), which then got fed to the mp3 decoder.
            if name.endswith(".mp3"):
                music_names.append(name)
                music_data.append(extract_data(os.path.join(directory, file_)))
                # One entry per file: never decode the same file twice.
                break
    return music_names, music_data

def extract_data(directory):
    """Decode one mp3 file into a single-channel int16 signal plus metadata.

    Parameters
    ----------
    directory : str
        Despite the name, the path of an mp3 *file*.

    Returns
    -------
    (audio_signal, sample_rate, channels)
        ``audio_signal`` is a 1-D ``int16`` array holding the first channel
        only; ``sample_rate`` and ``channels`` describe the file, so
        ``channels`` can be 2 even though the signal has a single channel.
    """
    audio = AudioSegment.from_mp3(directory)
    if audio.sample_width != 2:
        # Normalise to 16-bit so the int16 view below is valid.
        audio = audio.set_sample_width(2)

    # Raw PCM is interleaved per frame; one row per frame, one column per
    # channel. The previous int32 -> int16 truncation only gave the first
    # channel for stereo input and dropped every other sample of mono files.
    frames = np.frombuffer(audio._data, dtype=np.int16).reshape(-1, audio.channels)
    audio_signal = frames[:, 0].copy()  # contiguous, writable copy

    info = mediainfo(directory)
    # Fall back to the decoded segment when the probe reports nothing.
    sample_rate = int(info.get('sample_rate', audio.frame_rate))
    channels = int(info.get('channels', audio.channels))
    return audio_signal, sample_rate, channels

    

def addstodct(singer, dct, dir_):
    """Add the tracks found in ``dir_`` to ``dct[singer]`` (mutates ``dct``).

    ``dct[singer]`` is always a ``(names, data)`` pair of parallel lists, as
    returned by ``extract_names``. When the singer already has an entry, the
    new names and data are appended to the existing lists.

    Parameters
    ----------
    singer : str
        Key under which the tracks are stored.
    dct : dict
        Mapping singer -> (names, data); updated in place.
    dir_ : str
        Folder whose mp3 files are read via ``extract_names``.
    """
    names, data = extract_names(dir_)
    if singer in dct:
        known_names, known_data = dct[singer]
        # Merge element-wise. The previous `tuple += tuple` produced
        # (names, data, names, data), which broke the later zip(*...).
        dct[singer] = (known_names + names, known_data + data)
    else:
        dct[singer] = (names, data)
        
def _contains_mp3(directory):
    """Tell, without decoding anything, whether ``directory`` directly holds an mp3."""
    return any(piece.strip().endswith(".mp3")
               for file_ in os.listdir(directory)
               for piece in file_.split('-'))


def process(path):
    """Build a ``{singer: (names, data)}`` mapping from an album collection.

    Only sub-folders of ``path`` whose name starts with two digits are
    considered (this excludes the last album, "All that Jazz and Sum"). The
    singer is the second '-'-separated field of the folder name. Tracks are
    taken from the album folder itself or, when it holds no mp3 directly, from
    its immediate sub-folders.

    Parameters
    ----------
    path : str
        Root folder of the collection, with or without a trailing separator.

    Returns
    -------
    dict
        singer -> value stored by ``addstodct``.
    """
    dct = {}
    for entry in os.listdir(path):
        dir_ = os.path.join(path, entry)
        # Match on the folder name itself. The previous pattern interpolated
        # `path` into the regex unescaped and required a trailing separator.
        if not (os.path.isdir(dir_) and re.match("[0-9][0-9]", entry)):
            continue

        fields = entry.split('-')
        # Fall back to the whole folder name when there is no '-' field.
        singer = (fields[1] if len(fields) > 1 else entry).strip()

        # `not extract_names(dir_)` was never true (it returns a non-empty
        # tuple) and decoded every album twice; use a cheap name check.
        if _contains_mp3(dir_):
            addstodct(singer, dct, dir_)
            continue

        # No tracks directly inside: the album is split into sub-folders.
        for sub in os.listdir(dir_):
            # Join with the parent; the bare name was resolved against the
            # current working directory before.
            subcategory = os.path.join(dir_, sub)
            if os.path.isdir(subcategory):
                addstodct(singer, dct, subcategory)

    return dct

In [21]:
a = process(basedir)

In [22]:
# For every artist, pair the parallel sequences stored in a[x] element-wise
# (song name with its extracted data tuple).
b= {x:zip(*a[x]) for x in a}

In [23]:
# One row per (artist, song entry): flatten the per-artist sequences of b.
df1 = pd.DataFrame(
    list((singer_, entry_) for singer_, entries_ in b.items() for entry_ in entries_),
    columns=('artist', 'data'),
)

In [24]:
# Each 'data' cell is indexed as (x[0], (x[1][0], x[1][1], x[1][2])); given how
# extract_data returns its values these are presumably
# (song name, (signal, sample_rate, channels)).
df1['song'] = df1['data'].map(lambda x: x[0])
df1['signal'] = df1['data'].map(lambda x: x[1][0])
df1['sample_rate'] = df1['data'].map(lambda x: x[1][1])
df1['channel'] = df1['data'].map(lambda x: x[1][2])
# NOTE: dropping 'data' in place means this cell cannot be re-run as-is; the
# lines above read df1['data'], which no longer exists after the first run.
df1.drop('data', axis =1, inplace = True)

In [25]:
df1

Unnamed: 0,artist,song,signal,sample_rate,channel
0,Pink,Are We All We Are.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
1,Pink,Blow Me One Last Kiss .mp3,"[-9, -14, -10, -9, -11, -11, -9, -12, -11, -10...",44100,2
2,Pink,Try.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
3,Pink,Just Give Me A Reason.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
4,Pink,True Love.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
5,Pink,How Come Youre Not Here.mp3,"[-4, 3, 8, -1, 0, 7, 10, 8, 5, 1, 4, 11, 3, -6...",44100,2
6,Pink,Slut Like You.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
7,Pink,The Truth About Love.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
8,Pink,Beam Me Up.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
9,Pink,Walk Of Shame.mp3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2


In [26]:
#some final cleaning
df1.loc[df1["artist"]=="Taylor Swift Red 2012", "artist"] = "Taylor Swift"
df1.loc[df1["artist"]=="The Script  #3", "artist"] = "The Script"
df1.loc[df1["artist"]=="Paramore  Paramore(2013)", 'artist'] = "Paramore"
df1.loc[df1["artist"]=="Bastille Bad Blood", 'artist'] = "Bastille"
df1['song'] = map(lambda x: re.sub(".mp3", "", x).strip(), df1['song'])

In [27]:
df1

Unnamed: 0,artist,song,signal,sample_rate,channel
0,Pink,Are We All We Are,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
1,Pink,Blow Me One Last Kiss,"[-9, -14, -10, -9, -11, -11, -9, -12, -11, -10...",44100,2
2,Pink,Try,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
3,Pink,Just Give Me A Reason,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
4,Pink,True Love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
5,Pink,How Come Youre Not Here,"[-4, 3, 8, -1, 0, 7, 10, 8, 5, 1, 4, 11, 3, -6...",44100,2
6,Pink,Slut Like You,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
7,Pink,The Truth About Love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
8,Pink,Beam Me Up,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
9,Pink,Walk Of Shame,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2


In [28]:
df1[df1['artist']=='Fall Out Boy']

Unnamed: 0,artist,song,signal,sample_rate,channel
260,Fall Out Boy,The Phoenix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
261,Fall Out Boy,My Songs Know What You Did In The Dark,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
262,Fall Out Boy,Alone Together,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
263,Fall Out Boy,Where Did The Party Go,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
264,Fall Out Boy,Just One Yesterday feat Foxes,"[0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0,...",44100,2
265,Fall Out Boy,The Mighty Fall feat Big Sean,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
266,Fall Out Boy,Miss Missing You,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
267,Fall Out Boy,Death Valley,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
268,Fall Out Boy,Young Volcanoes,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
269,Fall Out Boy,Rat A Tat feat Courtney Love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2


Example: 

In [38]:
index = 261
# .ix was removed in pandas 1.0. With the default integer index it selected by
# label, so .loc picks exactly the same row.
wavfile.write("sample.wav", int(df1.loc[index, 'sample_rate']), df1.loc[index, 'signal'])
display(wavfilePlayer.wavPlayer("sample.wav"))

### Get the songs from dl_songs/ directory.

In [31]:
import os
from utils import data_extraction
# NOTE: rebinds `basedir`, which earlier pointed at the Top 40 album folder.
basedir = "../dl_songs/"
pwd = os.getcwd()
# Path of the song folder built from the current working directory
# (not normalised: it still contains the "..").
dir_ = os.path.join(pwd, basedir)
# process_music is a project helper not shown here; judging by the output
# below it presumably returns a DataFrame with the same columns as df1 -- verify.
df2 = data_extraction.process_music(dir_)

In [32]:
df2

Unnamed: 0,artist,song,signal,sample_rate,channel
0,Kelly Clarkson,Because Of You,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
1,Kelly Clarkson,Heartbeat Song,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
2,Kelly Clarkson,Since You Been Gone,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
3,Kelly Clarkson,What Doesn't Kill You,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
4,Tove Lo,Habits,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
5,Tove Lo,Heroes,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
6,Tove Lo,Not On Drugs,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
7,Tove Lo,Talking Body,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
8,The Cascades,First Love Never Dies,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
9,The Cascades,Rhythm Of The Rain,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2


Example

In [33]:
index = 0
# .ix was removed in pandas 1.0. With the default integer index it selected by
# label, so .loc picks exactly the same row.
wavfile.write("sample.wav", int(df2.loc[index, 'sample_rate']), df2.loc[index, 'signal'])
display(wavfilePlayer.wavPlayer("sample.wav"))

In [34]:
# Duration of the selected song. Work in whole seconds with integer arithmetic
# so the result depends neither on Python 2 vs 3 division semantics nor on
# float truncation. (.ix is also gone from pandas >= 1.0, hence .loc.)
total_seconds = len(df2.loc[index, 'signal']) // int(df2.loc[index, 'sample_rate'])
length = total_seconds / 60.0  # duration in minutes, kept for later use
minutes, seconds = divmod(total_seconds, 60)
print("{} min, {} sec".format(minutes, seconds))

3 min, 39 sec


### Combine the 2 dataframes

In [35]:
df = pd.concat([df1, df2]).reset_index(drop = True)
df

Unnamed: 0,artist,song,signal,sample_rate,channel
0,Pink,Are We All We Are,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
1,Pink,Blow Me One Last Kiss,"[-9, -14, -10, -9, -11, -11, -9, -12, -11, -10...",44100,2
2,Pink,Try,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
3,Pink,Just Give Me A Reason,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
4,Pink,True Love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
5,Pink,How Come Youre Not Here,"[-4, 3, 8, -1, 0, 7, 10, 8, 5, 1, 4, 11, 3, -6...",44100,2
6,Pink,Slut Like You,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
7,Pink,The Truth About Love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
8,Pink,Beam Me Up,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2
9,Pink,Walk Of Shame,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44100,2


### Pickle the File

In [36]:
import cPickle

In [37]:
df.to_pickle("../others/songs_paul.pkl")