Data Quality assessment task
-------------------

At this point we count with all the lyrics from the different sources saved on disk, so the first step is to load them togheder with the metadata

**Load lyrics from different sources**


In [141]:
import json, os
import re

# Songs are called works, load the metadata of songs:
works = json.load(open('./data/clean/works.json'))

# load all the lyrics from the different sources
lyrics = {}
for w in works:
    mypath = "./data/lyrics/%d" % (w['id'])
    if os.path.exists(mypath):
        sources =[f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
        lyrics_sources = {}
        for source in sources:
            lyrics_sources[source.replace('.json', '')] = json.load(open(os.path.join(mypath, source)))
        lyrics[w['id']] = lyrics_sources

In [152]:
# load metadata of artists and recordings
artists = json.load(open('./data/clean/artists.json'))
artists[1794]['name'] = 'Carlos Gardel'
recordings = json.load(open('./data/clean/recordings.json'))

# Load 12k source
sourcek = json.load(open('./data/lyric.json'))

** Define auxiliary methods for calculating jaccard index and comparing name of songs **

In [143]:
from math import*
  
def jaccard_similarity(x,y):
  
 intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
 union_cardinality = len(set.union(*[set(x), set(y)]))
 if float(union_cardinality) == 0:
    return 0
 return intersection_cardinality/float(union_cardinality)

def compare_names(a, b):
    if a in b or b in a:
        return True
    sim,_,_ = calculate_distances(a, b)
    return sim >= 0.9

** Define method to compare two lyrics, here we replace upper case for lower case and remove special characters before comparing**

In [144]:
import Levenshtein, difflib
replace_elems = [u'!',u'¡',u'?',u'¿',u'.',u',',u'*',u'\'',u'"',u'`',u'(',u')', u':', u';', u'“']
replace_tilde = {u'á': 'a', u'é': 'e', u'í': 'i', u'ó': 'o', u'ú': 'u',}
def calculate_distances(lyric1, lyric2):
    for e in replace_elems:
        lyric1 = lyric1.replace(e, '')
        lyric2 = lyric2.replace(e, '')
    for e in replace_tilde.keys():
        lyric1 = lyric1.replace(e, replace_tilde[e])
        lyric2 = lyric2.replace(e, replace_tilde[e])
    lyric1 = lyric1.lower().split()
    lyric2 = lyric2.lower().split()
    
    jac = jaccard_similarity(lyric1, lyric2) 
    inter1 =  len(lyric1)
    inter2 = len(lyric2)
    return jac, inter1, inter2

** For each source choose right lyric and normalize **

For the source 12K we will try to the title with the metadata of songs, then we normalize it by removing the text which is not part of the lyric

In [145]:
for work in works:
    for l in sourcek:
        #For each candidate in 12K compare the title 
        sim = compare_names(work['title'], l['title'][0].replace('\n', '').strip())
        #sim,_,_ = calculate_distances(work['title'], l['title'][0].replace('\n', '').strip())
        if sim :
            save = False
            saved = ''
            # Now that the title matched remove the part of the text that is not lyrics
            if work['lyric']:
                for s2 in l['body'][0].split('\n'):
                    if save:
                        saved += s2 + '\n'
                    else:
                        for s in work['lyric']:
                            for line in s:
                                sim,_,_ = calculate_distances(line, s2)
                                if sim >=0.90:
                                    save = True
                                    saved = s2 + '\n'
                                
            if save:
                l = saved
            else:
                l = l['body'][0]
            lyrics[work['id']]['12k'] = l
            break
    if work['lyric'] and len(work['lyric']):
        lyrics[work['id']]['todotango'] = work['lyric']

Select a candidate for the rest of the sources and store all the lyrics on 'lyrics_clean'

In [146]:
lyrics_clean = {}

for work in works:
    w = work['id']
    lyrics_clean[w] = {}
    found = False
    if '12k' in lyrics[w]:
        lyrics_clean[w]['12k'] = lyrics[w]['12k']
    if lyrics[w]['minilyrics']:
        best_lyric = None
        best_score = -1
        if lyrics[w]['minilyrics'] and 'todotango' in lyrics[w] and lyrics[w]['todotango']:
            lyric = None
            for l in lyrics[w]['minilyrics']:
                for i in l:
                    if i['lyric']:
                        lyric = i['lyric']
                        # When the lyrics contains brachets with the time we remove it
                        if re.search('\[(.*)\]', lyric):
                            lyric = re.sub('\[(.*)\]', '', lyric)
                            lyric = lyric.encode('utf-8').decode('utf-8')
                        else:    
                            # Try to identify the the right encoding and load it as unicode
                            try:
                                lyric = lyric.encode('latin-1').decode('utf-8')
                            except UnicodeDecodeError:
                                lyric = lyric.encode('utf8').decode('utf8')
                            pass

                        # calculate the similarity with Todotango, from the different results we keep the best one 
                        l2 = '\n'.join([item for sublist in lyrics[w]['todotango'] for item in sublist])
                        sim,_,_ = calculate_distances(lyric, l2)

                        if best_score<sim:
                            best_score = sim
                            best_lyric = lyric

        if best_lyric and best_score >= 0.7:
            lyrics_clean[w]['minilyrics'] = best_lyric
        else:
            #If we didn't found a good candidate we try to match the artist name
            lyric = None
            for l in lyrics[w]['minilyrics']:
                for i in l:
                    if i['artist']:
                        for c in work['composers']:
                            a = artists[c]
                            sim = compare_names(a['name'], i['artist'])
                            if sim:
                                lyric = i['lyric']
                        for r in recordings:
                            if r['work'] == w:
                                for a in r['artist']: 

                                    #if artists[a['id']]['name'].upper() == i['artist'].upper():
                                    sim = compare_names(artists[a['id']]['name'], i['artist'])
                                    if sim:
                                        lyric = i['lyric']
            if lyric:
                if re.search('\[(.*)\]', lyric):
                    lyric = re.sub('\[(.*)\]', '', lyric)
                    lyric = lyric.encode('utf-8').decode('utf-8')
                else:    
                    try:
                        lyric = lyric.encode('latin-1').decode('utf-8')
                    except UnicodeDecodeError:
                        lyric = lyric.encode('utf8').decode('utf8')
                    pass
                lyrics_clean[w]['minilyrics'] = lyric
    if 'chartlyrics' in lyrics[w] and lyrics[w]['chartlyrics']:
        lyric = None
        # For all the possible lyrics on Chartlyrics we try to match the artists name with the metadata
        for l in lyrics[w]['chartlyrics']:
            for i in l:
                for c in work['composers']:
                    a = artists[c]
                    sim = compare_names(a['name'], i['artist'])
                    #if a['name'].upper() == i['artist'].upper():
                    if sim :
                        lyric = i['lyric']
                for r in recordings:
                    if r['work'] == w:
                        for a in r['artist']: 
                            sim = compare_names(artists[a['id']]['name'], i['artist'])
                            if sim :
                                #if artists[a['id']]['name'].upper() == i['artist'].upper():
                                lyric = i['lyric']
        if lyric:
            
            lyrics_clean[w]['chartlyrics'] = lyric
    if lyrics[w]['lyricswikia']:
        lyrics_clean[w]['lyricswikia'] = lyrics[w]['lyricswikia'][0]['lyrics']
    if 'todotango' in lyrics[w] and lyrics[w]['todotango']:
        lyrics_clean[w]['todotango'] = '\n'.join([item for sublist in lyrics[w]['todotango'] for item in sublist])
    

**Profiling**

Now that we have a single candidate from each source for each song we start the profiling

In [159]:
initial_sources = {
    'songlyrics': 0,
    'musixmatch': 0,
    'lyricsmania': 0,
    'chartlyrics': 0,
    'lyricswikia': 0,
    'minilyrics': 0,
    'todotango': 0,
    '12k': 0
}
sources = {
    'chartlyrics': 0,
    'lyricswikia': 0,
    'minilyrics': 0,
    'todotango': 0,
    '12k': 0
}
initial_count = {
    1: 0,
    2: 0,
    3: 0,
    4: 0,
    5: 0,
    6: 0,
    7: 0,
}
count = {
    1: 0,
    2: 0,
    3: 0,
    4: 0,
    5: 0
}
initial_total = 0
count_total = 0
for work in works:
    w = work['id']
    initial_found = 0
    found = 0
    for s in sources.keys():
        if s in lyrics_clean[w] and lyrics_clean[w][s]:
            sources[s] += 1
            found += 1
    for s in initial_sources.keys():
        if s in lyrics[w] and lyrics[w][s]:
            initial_sources[s] += 1
            initial_found +=1
    if found > 0:
        count[found] += 1
        count_total += 1
    if initial_found > 0:
        initial_count[initial_found] += 1
        initial_total +=1
        
print 'Inital sources:'
print initial_sources
print initial_count
print initial_total
print ""
print 'Cleaned sources:'
print sources
print count
print count_total

Inital sources:
{'songlyrics': 0, 'chartlyrics': 2740, 'lyricswikia': 577, 'musixmatch': 62, 'minilyrics': 2914, 'todotango': 5647, '12k': 2926, 'lyricsmania': 0}
{1: 3231, 2: 2517, 3: 1476, 4: 468, 5: 59, 6: 1, 7: 0}
7752

Cleaned sources:
{'12k': 2926, 'minilyrics': 1005, 'todotango': 5647, 'chartlyrics': 60, 'lyricswikia': 577}
{1: 3545, 2: 1830, 3: 782, 4: 161, 5: 4}
6322


** Calculate distance between all the candidate lyrics of a given song **

In [148]:
distances = {}
lengths = {}
lengths_counts = {}
lengths_max = {}
lengths_min = {}
for work in works:
    w = work['id']
    distances[w] = {}

    for s in sources.keys():
        if s not in distances[w]:
            distances[w][s] = {}
        if s not in lengths:
            lengths[s] = 0
            lengths_counts[s] = 0
            lengths_max[s] = 0
            lengths_min[s] = 999999
        for s2 in sources.keys():
            if s != None and s2 != None and s != s2:
                if s in lyrics_clean[w] and lyrics_clean[w][s] and s2 in lyrics_clean[w] and lyrics_clean[w][s2]:
                    if s in distances[w] and s2 in distances[w][s]:
                        distances[w][s2][s] = {
                            'jac': distances[w][s][s2]['jac'],
                            'inter1': distances[w][s][s2]['diff2'],
                            'inter2': distances[w][s][s2]['diffl'], 
                        }
                    else:
                        jac, inter1, inter2 = calculate_distances(lyrics_clean[w][s],lyrics_clean[w][s2])
                        distances[w][s][s2] = {'jac': jac, 'inter1': inter1, 'inter2': inter2}
                        lengths[s] += inter1
                        lengths_counts[s] += 1
                        if inter1> lengths_max[s]:
                            lengths_max[s] = inter1
                        elif inter1<lengths_min[s]:
                            lengths_min[s] = inter1

** Calculate mean distance between all the sources **

In [149]:
values = ['jac','inter1','inter2']
vals = {}
counts = {}
for w in distances.keys():
    for s in distances[w].keys():
        for s2 in distances[w][s]:
            for elem in distances[w][s][s2].keys():
                if s not in vals:
                    vals[s] = {}
                    counts[s] = {}
                if s2 not in vals[s]:
                    vals[s][s2] = {}
                    counts[s][s2] = {}
                if elem not in vals[s][s2]:
                    vals[s][s2][elem] = 0
                    counts[s][s2][elem] = 0
                vals[s][s2][elem] += distances[w][s][s2][elem]
                counts[s][s2][elem] += 1

for s in vals.keys():
    for s2 in vals[s]:
        print '%s - %s, Jaccard index: \x1b[1;31m%f\x1b[0m' % (s,s2,vals[s][s2]['jac'] / counts[s][s2]['jac'])
        print '%s - %s, Average length for the first source: %f' % (s,s2,vals[s][s2]['inter1'] / counts[s][s2]['inter1'])
        print '%s - %s, Average length for the second source: %f' % (s,s2,vals[s][s2]['inter2'] / counts[s][s2]['inter2'])
        
    print "Average length in words for %s : %f" % (s, lengths[s]/lengths_counts[s])
    print "Min length in words for %s : %f" % (s, lengths_min[s])
    print "Max length in words for %s : %f" % (s, lengths_max[s])

12k - todotango, Jaccard index: [1;31m0.775914[0m
12k - todotango, Average length for the first source: 173.000000
12k - todotango, Average length for the second source: 160.000000
12k - minilyrics, Jaccard index: [1;31m0.732773[0m
12k - minilyrics, Average length for the first source: 189.000000
12k - minilyrics, Average length for the second source: 179.000000
12k - lyricswikia, Jaccard index: [1;31m0.829158[0m
12k - lyricswikia, Average length for the first source: 181.000000
12k - lyricswikia, Average length for the second source: 168.000000
12k - chartlyrics, Jaccard index: [1;31m0.249605[0m
12k - chartlyrics, Average length for the first source: 263.000000
12k - chartlyrics, Average length for the second source: 247.000000
Average length in words for 12k : 178.000000
Min length in words for 12k : 12.000000
Max length in words for 12k : 2040.000000
todotango - 12k, Jaccard index: [1;31m0.775914[0m
todotango - 12k, Average length for the first source: 160.000000
todotango

**Finally, choose one candidate for each song and save the results**

In [150]:
for work in works:
    w= work['id']
    if not w in lyrics_clean:
        work['lyric'] = None
    elif 'minilyrics' in lyrics_clean[w]:
        work['lyric'] = lyrics_clean[w]['minilyrics'].replace('\r\n', '\n')
    elif 'todotango' in lyrics_clean[w]:
        work['lyric'] = lyrics_clean[w]['todotango']
    elif '12k' in lyrics_clean[w]:
        work['lyric'] = lyrics_clean[w]['12k']
 
import csv
with open('tango_lyrics.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for work in works:
        if 'lyric' in work and work['lyric']:
            writer.writerow([work['title'].encode('utf-8'), work['lyric'].encode('utf-8')])
    