In [1]:
import sys
import operator
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import re
import unicodedata
from operator import add
from itertools import combinations

In [2]:
spark = SparkSession \
            .builder \
            .appName("clustering") \
            .getOrCreate()
sc = spark.sparkContext

In [3]:
data = spark.read.csv('spofity_cleaned.csv', header = True)

In [4]:
data2 = data.rdd.map(lambda x:(x['track_name'], x['artist_name']))\
.distinct().map(lambda x:(x[0], x[0]+ ' '+x[1])).cache()

In [5]:
names = data2.map(lambda x: (x[0], x[1].strip()))\
                .map(lambda x: (x[0], x[1].lower()))\
                .map(lambda x: (x[0], re.sub(r'[^\w\s]','', x[1])))\
                .map(lambda x: (x[0], unicodedata.normalize('NFC',x[1])))\
                .map(lambda x: (x[0], x[1].split(' ')))\
                .map(lambda x: (x[0], list(filter(None, x[1])))).cache()

In [6]:
print(names.take(10))

[('Le petit souper aux chandelles', ['le', 'petit', 'souper', 'aux', 'chandelles', 'henri', 'salvador']), ('Let Me Let Go', ['let', 'me', 'let', 'go', 'laura', 'mayne']), ('Les bisous des bisounours', ['les', 'bisous', 'des', 'bisounours', 'le', 'club', 'des', 'juniors']), ('Symphony No.4 In E Minor Op.98 : IV. Allegro Energico E Passionato', ['symphony', 'no4', 'in', 'e', 'minor', 'op98', 'iv', 'allegro', 'energico', 'e', 'passionato', 'leopold', 'stokowski']), ('The Hanging (Maverick - Original Motion Picture Score) - Remastered', ['the', 'hanging', 'maverick', 'original', 'motion', 'picture', 'score', 'remastered', 'randy', 'newman']), ('Keys of Love', ['keys', 'of', 'love', 'richard', 'm', 'sherman']), ('Diane', ['diane', 'jean', 'claude', 'corbel']), ('Pourquoi aller plus loin ?', ['pourquoi', 'aller', 'plus', 'loin', 'jean', 'claude', 'corbel']), ('For the Game', ['for', 'the', 'game', 'chorus']), ('La Vie grise', ['la', 'vie', 'grise', 'henri', 'salvador'])]


In [7]:
all_names=names.collect()
name_dict ={}
for each in all_names:
    name_dict[each[0]]=each[1]

In [8]:
tokens = names.map(lambda x: x[1]).collect()

In [9]:
shingles = []
for each in tokens:
    shingles.extend(each)
shingles = sorted(list(set(shingles)))
rep_shingles = {}
i = 1
for each in shingles:
    rep_shingles[each] = i
    i = i+1

In [10]:
print(len(shingles))

57177


In [11]:
print(rep_shingles)



In [12]:
trans_names = names.mapValues(lambda x: [rep_shingles[i] for i in x]).cache()

In [13]:
print(trans_names.take(20))

[('Le petit souper aux chandelles', [29011, 38588, 47599, 4120, 9376, 22963, 44023]), ('Let Me Let Go', [29320, 32189, 29320, 20830, 28910, 32022]), ('Les bisous des bisounours', [29298, 6036, 13652, 6035, 29011, 10513, 13652, 26598]), ('Symphony No.4 In E Minor Op.98 : IV. Allegro Energico E Passionato', [49677, 35604, 24708, 15628, 33132, 36735, 25497, 2225, 16461, 15628, 37940, 29278, 48553]), ('The Hanging (Maverick - Original Motion Picture Score) - Remastered', [50732, 22266, 31975, 36880, 33987, 38833, 44707, 42089, 41337, 35260]), ('Keys of Love', [27375, 36311, 30284, 42613, 30673, 45703]), ('Diane', [13989, 25920, 10341, 11506]), ('Pourquoi aller plus loin ?', [39804, 2234, 39318, 30030, 25920, 10341, 11506]), ('For the Game', [18903, 50732, 19897, 9976]), ('La Vie grise', [28398, 54123, 21463, 22963, 44023]), ('Dancing with Gene', [12725, 55763, 20238, 27276, 37387]), ('Penn Station - Seesaw', [38310, 48315, 44956, 27276, 37387]), ('Sketch producteur (suite)', [46606, 40247,

In [14]:
def min_hash(values):
    sign = []
    for i in range(1, 101):
        hash_list = []
        for v in values:
            h_value = (2*v + 11*i) % 100
            hash_list.append(h_value)
        if hash_list!=[]:
            sign.append(min(hash_list))
    return sign
def band(item):
    band = []
    for i in range(0, 2):
        band.append((i, (item[0], item[1][i])))
    return band



In [15]:
def cluster(values):
    Output = {} 
    for x, y in values: 
        if tuple(y) in Output: 
            Output[tuple(y)].append((x)) 
        else: 
            Output[tuple(y)] = [(x)]
    return Output

In [16]:
def clean(l):
    output = []
    for each in l:
        if len(each[1])>=2:
            output.extend(each(1))
    return output

In [17]:
signiture = trans_names.mapValues(lambda x: min_hash(x))\
                        .mapValues(lambda x: [x[i:i+50] for i in range(0, 100, 50)])\
                        .flatMap(lambda x: band(x))\
                        .groupByKey()\
                        .map(lambda x: (x[0],list(x[1])))\
                        .mapValues(lambda x: cluster(x))\
                        .map(lambda x: list(x[1].items())).cache()

In [18]:
candidates = signiture.collect()

In [19]:

cand = []
for each in candidates:
    cand.extend(each)
candidate = []
for each in cand:
    if len(each[1])>=2:
        candidate.append(each[1])


In [20]:
print(len(candidate))

35645


In [21]:
def similarity(values):
    result = []
    permutations = list(combinations(values, 2))
    for each in permutations:
        u1 = each[0]
        u2 = each[1]
        m1 = name_dict[u1]
        m2 = name_dict[u2]
        inter = set(m1).intersection(set(m2))
        union = set(m1).union(set(m1))
        if len(union)!=0:
            simi = float(len(inter)/len(union))
            if simi>=0.9:
                result.append(each)
    return result

In [22]:
sc_cand = sc.parallelize(candidate).map(lambda x: similarity(x))

In [33]:
results = sc_cand.collect()
result = []
for each in results:
    if len(each)>0:
        result.append(each)
output = []
for i in result:
    clusters = []
    for each in i:
        clusters.append(each[0])
        clusters.append(each[1])
#     clusters = list(set(clusters))
    output.append(clusters)
output2 = list(set([tuple(t) for t in output]))

In [34]:
for each in output:
    print(each)

['Le petit souper aux chandelles', 'Le Petit souper aux chandelles', 'Le petit souper aux chandelles', 'Le Petit Souper Aux Chandelles', 'Le petit souper aux chandelles', 'Le Petit Souper aux Chandelles', 'Le Petit souper aux chandelles', 'Le Petit Souper Aux Chandelles', 'Le Petit souper aux chandelles', 'Le Petit Souper aux Chandelles', 'Le Petit Souper Aux Chandelles', 'Le Petit Souper aux Chandelles']
['La Vie grise', 'La Vie Grise']
["Augustin! Y'a quelqu'un qui te demande", "Augustin! Y'a Quelqu'un Qui Te Demande"]
['Never Ever', 'Never Ever', 'Never Ever', 'Never Ever', 'Never Ever', 'Never Ever']
['Chanson surrealiste', 'Chanson Surrealiste']
['Buff Baby', 'Buff Baby']
['Le Marchand de sable', 'Le marchand de sable', 'Le Marchand de sable', 'Le Marchand De Sable', 'Le marchand de sable', 'Le Marchand De Sable']
['Re-bonjour', 'Re-Bonjour']
['Tylko ty', 'Tylko Ty']
['Adieu Foulards Adieu Madras', 'Adieu foulards adieu madras']
["J'aimerais tellement ca", "J'aimerais Tellement Ca

['"Theme - From ""Schindler\'s List"""', '"Theme from ""Schindler\'s List"""']
['Blood on My Hands', 'Blood On My Hands']
['Aria 1 - Pt. 2', 'Aria 2 - Pt. 1']
['"The Raiders March - From ""Raiders Of The Lost Ark"""', '"Raiders March (From ""Raiders of the Lost Ark"")"']
["I'm Not A Hero", "I'm Not a Hero"]
['"Superman March - From ""Superman"""', '"March (From ""Superman"")"']
['Main Title', 'Main Title']
['I Am The Batman', 'I Am the Batman']
['Forever Reign - Live', 'Forever Reign - Live']
['It Is Well with My Soul', 'It Is Well With My Soul']
['We Have Overcome', 'We Have Overcome - Instrumental']
['My All In All', 'My All In All - Acoustic']
['Because He Lives', 'Because He Lives']
['Viento del Arena', 'Viento Del Arena']
["All My Ex's Live In Texas", "All My Ex's Live In Texas - Live"]
["Don't Rock the Jukebox", "Don't Rock The Jukebox"]
['Believe', 'Believe']
['Rock My World - Little Country Girl', 'Rock My World (Little Country Girl)']
["I Think I'll Just Stay Here And Drink", 

["Don't Lose Touch", "Don't Lose Touch - Live"]
['A-OK', 'Aok']
['Lo De Nosotros (feat. Arcangel)', 'Lo de Nosotros (feat. Arcangel)']
['La Llave de Mi Corazon', 'La Llave De Mi Corazon']
['Me gusta todo de ti', 'Me Gusta Todo de Ti']
["Don't Go Breaking My Heart", "Don't Go Breaking My Heart - Remastered"]
['Chitty Chitty Bang Bang: Chitty Chitty Bang Bang', 'Chitty Chitty Bang Bang: Overture', 'Chitty Chitty Bang Bang: Chitty Chitty Bang Bang', 'Chitty Chitty Bang Bang: Posh']
['The Ends Of The Earth', '"The Ends Of The Earth - From ""Aladdin""/Score"']
["Where's the Girl?", "Where's The Girl?"]
['A Secret Revealed', 'A Secret Revealed - Score']
["I'm Not Afraid", "I'm Not Afraid (Live)"]
['Kill All the White Man', 'Kill All The White Man']
['"Put On A Happy Face (from ""Bye Bye Birdie"")"', '"Put on a Happy Face - from ""Bye Bye Birdie"""']
['Fall on Me', 'Fall On Me']
['"Mermaids - From ""Pirates of the Caribbean: On Stranger Tides""/Score"', '"Blackbeard - From ""Pirates of the Ca

In [35]:
print(len(output2))

2151


In [36]:
import csv
with open('sp_mat_lsh.csv', "w") as f:
    w = csv.writer(f)
    w.writerows(output2)