

# Import Libraries

In [None]:
!git clone https://github.com/cabell132/CM4706-Advanced-Data-Science.git

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import ngrams
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import numpy as np
from tqdm import tqdm
from collections import Counter
import unidecode
import re
import json
import spacy
from rapidfuzz import fuzz
from spacy.tokens import DocBin
from spacy.util import filter_spans
import random
import json

# download spaCy en_core_web_lg model
!python -m spacy download en_core_web_lg


The purpose of this code is to match together song releases by the song title and artists. 

For the first pipeline, we shall try and use Kmeans clustering to see whether we can group the tracks into similar releases.

For the second pipeline, we shall try and use a similar algorithm to Kmeans which is K nearest neighbours to group the releases together.

In the third pipeline, we shall use name entity recognition to identify key features in the text of the song name and then use fuzzy matching to identify groups of releases.

# Load Data Sets


In [2]:
data = pd.read_csv('track_name_list.csv')

# set the index to be the unique id
full_names = data.set_index('id')

# count the number of track names in the full_names dataframe with comma formatting

print(f'Number of track names in the full_names dataframe: {len(data):,}')

Number of track names in the full_names dataframe: 1,674


In [3]:
full_names['full_name'] = full_names['name'] + ' ' + full_names['artist']

In [4]:
# We will now normalise the full_name column by removing all non-alphanumeric characters, converting all characters to lowercase and translate text to 7-bit ASCII

# Normalise the full_name column by removing all non-alphanumeric characters, converting all characters to lowercase and translate text to 7-bit ASCII
full_names['full_name'] = full_names['full_name'].apply(lambda x: unidecode.unidecode(re.sub(r'[^a-zA-Z0-9\s]', '', x.lower())))

# create a new column called 'full_name_tokens' which contains the full_name column tokenised
full_names['full_name_tokens'] = full_names['full_name'].apply(lambda x: word_tokenize(x))

full_names.head(10)

Unnamed: 0_level_0,name,artist,platform,full_name,full_name_tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
70001,Dreams About You - Dreamy Remix,"Christian Drost, Dreamy",Track,dreams about you dreamy remix christian drost...,"[dreams, about, you, dreamy, remix, christian,..."
70009,Lost In Your Eyes,Love N Frequency,Track,lost in your eyes love n frequency,"[lost, in, your, eyes, love, n, frequency]"
70010,Lost In Your Eyes - Manuel Rocca Remix,"Love N Frequency, Manuel Rocca",Track,lost in your eyes manuel rocca remix love n f...,"[lost, in, your, eyes, manuel, rocca, remix, l..."
70035,Perception,Alae Khaldi,Track,perception alae khaldi,"[perception, alae, khaldi]"
70040,Ethereal - Kago Pengchi Remix,"Kago Pengchi, Odonbat",Track,ethereal kago pengchi remix kago pengchi odonbat,"[ethereal, kago, pengchi, remix, kago, pengchi..."
70045,Sunrise - Lence & Pluton Remix,"Denis Kenzo, Lence & Pluton, PLUTON",Track,sunrise lence pluton remix denis kenzo lence...,"[sunrise, lence, pluton, remix, denis, kenzo, ..."
70046,Sunrise,Denis Kenzo,Track,sunrise denis kenzo,"[sunrise, denis, kenzo]"
70078,Starclad,"Christian Zechner, Lira Yin",Track,starclad christian zechner lira yin,"[starclad, christian, zechner, lira, yin]"
70099,Drifting Through Darkness - Photographer Remix,"Chris Turner, Luke Terry, Photographer",Track,drifting through darkness photographer remix ...,"[drifting, through, darkness, photographer, re..."
70114,Peak Experience,Jo Micali,Track,peak experience jo micali,"[peak, experience, jo, micali]"


In [5]:
# train a Word2Vec model on the full_name_tokens column
model = Word2Vec(full_names['full_name_tokens'], min_count=1, vector_size=100, window=5, workers=4, seed=42)

In [6]:
# create a function that converts a track name to a vector
def vectoriser(tokenized_track_name, model):
    zero_vector = np.zeros(model.vector_size)
    vectors = []
    for token in tokenized_track_name:
        if token in model.wv:
            vectors.append(model.wv[token])
    
    if vectors:
        vectors = np.array(vectors)
        avg_vec = np.mean(vectors, axis=0)
        return avg_vec
    else:
        return zero_vector

# test the vectoriser function
vectoriser(full_names['full_name_tokens'].iloc[0], model)


array([ 8.32445640e-03, -6.02931017e-04, -8.53211619e-03,  8.02706461e-03,
       -1.13724926e-02,  1.24447579e-02,  7.72069674e-03,  7.69176334e-03,
       -6.76097174e-04, -2.58977059e-03, -2.24564672e-02,  9.74831171e-03,
        4.55621351e-03,  2.09161341e-02, -1.08838663e-03, -1.06598055e-02,
        4.66754660e-03, -1.78964194e-02, -8.92541837e-03,  4.71695745e-03,
        2.55849387e-04,  7.04311207e-03, -2.19302601e-03,  1.45929996e-02,
        6.15287013e-03, -1.78040657e-03, -9.64092743e-03,  8.86085629e-03,
       -4.57303645e-03, -1.87658751e-03,  4.07472160e-03, -9.14063305e-04,
       -5.90784999e-04,  4.78117913e-03,  9.33441392e-04,  3.72556678e-05,
        1.04957782e-02, -2.41209846e-02, -1.57294655e-03,  8.11378472e-03,
       -1.86485518e-03,  3.29029444e-03,  1.72652770e-02, -6.97229384e-03,
        1.87671313e-03,  2.11843988e-03,  8.35631043e-03,  1.08153410e-02,
        2.46760678e-02, -1.40059153e-02, -8.36214609e-03, -2.62745302e-02,
       -1.21485011e-03, -

In [7]:
# create a new column called 'full_name_vector' which contains the vector representation of the full_name column
full_names['full_name_vector'] = full_names['full_name_tokens'].apply(lambda x: vectoriser(x, model))

In [14]:
# fit a KMeans model to the full_name_vector column
kmeans = KMeans(n_clusters=699, random_state=42).fit(full_names['full_name_vector'].tolist())

# create a new column called 'cluster' which contains the cluster number for each track name
full_names['cluster'] = kmeans.labels_

# create a new column called 'cluster_size' which contains the number of track names in each cluster
full_names['cluster_size'] = full_names.groupby('cluster')['cluster'].transform('count')

full_names.head(10)

Unnamed: 0_level_0,name,artist,platform,full_name,full_name_tokens,full_name_vector,cluster,cluster_size
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
70001,Dreams About You - Dreamy Remix,"Christian Drost, Dreamy",Track,dreams about you dreamy remix christian drost...,"[dreams, about, you, dreamy, remix, christian,...","[0.008324456, -0.000602931, -0.008532116, 0.00...",558,2
70009,Lost In Your Eyes,Love N Frequency,Track,lost in your eyes love n frequency,"[lost, in, your, eyes, love, n, frequency]","[0.007382974, -0.0016506583, -0.0051318393, 0....",201,5
70010,Lost In Your Eyes - Manuel Rocca Remix,"Love N Frequency, Manuel Rocca",Track,lost in your eyes manuel rocca remix love n f...,"[lost, in, your, eyes, manuel, rocca, remix, l...","[0.0079321675, -0.0024371252, -0.0076448093, 0...",625,2
70035,Perception,Alae Khaldi,Track,perception alae khaldi,"[perception, alae, khaldi]","[0.0011704974, 0.00026999367, 0.002106758, 0.0...",467,1
70040,Ethereal - Kago Pengchi Remix,"Kago Pengchi, Odonbat",Track,ethereal kago pengchi remix kago pengchi odonbat,"[ethereal, kago, pengchi, remix, kago, pengchi...","[0.0075254096, -0.0026752353, -0.0051851417, 0...",507,2
70045,Sunrise - Lence & Pluton Remix,"Denis Kenzo, Lence & Pluton, PLUTON",Track,sunrise lence pluton remix denis kenzo lence...,"[sunrise, lence, pluton, remix, denis, kenzo, ...","[0.0059694434, -0.0021464734, -0.0027971028, 0...",684,2
70046,Sunrise,Denis Kenzo,Track,sunrise denis kenzo,"[sunrise, denis, kenzo]","[-0.0012884577, -0.00022606661, -0.007079489, ...",415,1
70078,Starclad,"Christian Zechner, Lira Yin",Track,starclad christian zechner lira yin,"[starclad, christian, zechner, lira, yin]","[0.0013698487, -0.0032376896, -0.002174544, 0....",653,1
70099,Drifting Through Darkness - Photographer Remix,"Chris Turner, Luke Terry, Photographer",Track,drifting through darkness photographer remix ...,"[drifting, through, darkness, photographer, re...","[0.012485746, -0.0048313877, -0.008024739, 0.0...",162,4
70114,Peak Experience,Jo Micali,Track,peak experience jo micali,"[peak, experience, jo, micali]","[-0.0011994136, 0.0036464352, -0.00013312529, ...",217,1


In [61]:
test_data = pd.read_csv('test_data.csv')
for row in tqdm(test_data.index):
    target = test_data.loc[row,'target_id']
    cluster = full_names.loc[target,'cluster']
    cluster_df = full_names[full_names['cluster'] == cluster]
    cluster_df = cluster_df[cluster_df.index != target]
    prediction = cluster_df.index
    answer = json.loads(test_data.loc[row,'match_id'])

    # true positive is the number of track names in the prediction that are also in the answer
    test_data.loc[row,'true_positive'] = len([i for i in prediction if i in answer])

    # false positive is the number of track names in the prediction that are not in the answer
    test_data.loc[row,'false_positive'] = len([i for i in prediction if i not in answer])

    test_data.loc[row,'total_predictions'] = len(prediction)
    # false negative is the number of tracks in the answer that are not in the prediction
    test_data.loc[row,'false_negative'] = len([i for i in answer if i not in prediction])
    
    
# calculate the accuracy of the model
acc = test_data['true_positive'].sum() / (test_data['total_predictions'].sum())

print(f'Accuracy of the model: {acc:.2%}')

# calculate the precision of the model
precision = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_positive'].sum())

print(f'Precision of the model: {precision:.2%}')

# calculate the recall of the model
recall = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_negative'].sum())

print(f'Recall of the model: {recall:.2%}')

# calculate the f1 score of the model
f1 = 2 * (precision * recall) / (precision + recall)

print(f'F1 score of the model: {f1:.2%}')


100%|██████████| 1674/1674 [00:01<00:00, 1291.48it/s]

Accuracy of the model: 71.54%
Precision of the model: 71.54%
Recall of the model: 80.60%
F1 score of the model: 75.80%





In [19]:
# create knn model
knn = NearestNeighbors(n_neighbors=100, metric='cosine', algorithm='auto', n_jobs=-1)

# fit knn model to the full_name_vector column
knn.fit(full_names['full_name_vector'].tolist())

In [62]:
THESHOLD = 0.01

for THESHOLD in [i/1000 for i in range(1, 4)]:

    test_data = pd.read_csv('test_data.csv').set_index('target_id')

    for row in tqdm(full_names.index, desc=f"distence threshold: {THESHOLD}"):
        track_name_vector = full_names.loc[row,'full_name_vector']
        # get the 100 nearest neighbours
        dist, indx = knn.kneighbors([track_name_vector])
        nearest_neighbours = full_names.iloc[indx[0]].index
        prediction = [i for i, d in zip(nearest_neighbours, dist[0]) if d < THESHOLD and i != row]

        answer = json.loads(test_data.loc[row,'match_id'])

        # true positive is the number of track names in the prediction that are also in the answer
        test_data.loc[row,'true_positive'] = len([i for i in prediction if i in answer])

        # false positive is the number of track names in the prediction that are not in the answer
        test_data.loc[row,'false_positive'] = len([i for i in prediction if i not in answer])

        # false negative is the number of tracks in the answer that are not in the prediction
        test_data.loc[row,'false_negative'] = len([i for i in answer if i not in prediction])

        test_data.loc[row,'total_predictions'] = len(prediction)
        
        
    # calculate the accuracy of the model
    acc = test_data['true_positive'].sum() / (test_data['total_predictions'].sum())

    print(f'Accuracy of the model: {acc:.2%} with threshold {THESHOLD}')

    # calculate the precision of the model
    precision = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_positive'].sum())

    print(f'Precision of the model: {precision:.2%} with threshold {THESHOLD}')

    # calculate the recall of the model
    recall = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_negative'].sum())

    print(f'Recall of the model: {recall:.2%} with threshold {THESHOLD}')

    # calculate the f1 score of the model
    f1 = 2 * (precision * recall) / (precision + recall)

    print(f'F1 score of the model: {f1:.2%} with threshold {THESHOLD}')

distence threshold: 0.001: 100%|██████████| 1674/1674 [00:27<00:00, 60.11it/s]


Accuracy of the model: 99.55% with threshold 0.001
Precision of the model: 99.55% with threshold 0.001
Recall of the model: 70.48% with threshold 0.001
F1 score of the model: 82.53% with threshold 0.001


distence threshold: 0.002: 100%|██████████| 1674/1674 [00:27<00:00, 61.71it/s]


Accuracy of the model: 97.50% with threshold 0.002
Precision of the model: 97.50% with threshold 0.002
Recall of the model: 74.43% with threshold 0.002
F1 score of the model: 84.42% with threshold 0.002


distence threshold: 0.003: 100%|██████████| 1674/1674 [00:27<00:00, 60.71it/s]

Accuracy of the model: 90.48% with threshold 0.003
Precision of the model: 90.48% with threshold 0.003
Recall of the model: 78.56% with threshold 0.003
F1 score of the model: 84.10% with threshold 0.003





In [249]:


# load training data

# file removed because it's too large to upload to github
with open('training_data.json', 'r') as f:
    training_data = json.load(f)

# split training data into train and dev
random.shuffle(training_data)

split = 0.2

dev_len = round(len(training_data) * split)
train_data = training_data[dev_len:]
dev_data = training_data[:dev_len]

def annotations_to_doc_bin(annotations):
    nlp = spacy.blank("en") # load a new blank spacy model
    doc_bin = DocBin()

    for data in annotations:
        doc = nlp.make_doc(data['text'])
        ents = []
        for start, end, label in data['entities']:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                pass
            else:
                ents.append(span)
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)
    return doc_bin



annotations_to_doc_bin(train_data).to_disk("train.spacy")
annotations_to_doc_bin(dev_data).to_disk("dev.spacy")

In [250]:
!python -m spacy init fill-config base_config.cfg config.cfg

✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [251]:
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./dev.spacy

ℹ Saving to output directory: .
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     85.67   16.23   13.18   21.13    0.16
  2     200         58.09   3785.71   90.50   92.35   88.71    0.90
  6     400         57.76    997.97   91.46   92.26   90.68    0.91
 10     600         57.60    521.99   90.62   91.22   90.03    0.91
 15     800         63.76    373.44   92.25   93.17   91.34    0.92
 21    1000         83.63    285.57   91.09   92.32   89.90    0.91
 29    1200         69.76    168.63   90.96   91.50   90.42    0.91
 39    1400         50.85    106.59   92.17   92.47   91.86    0.92
 50    1600         33.81     49.64   91.86   91.86   91.86    0.92
 64    1800         53.08     64.29   93.54   93.92   93.18    0.94
 81    2000         39.16     40.59   93.13   93

[2022-11-17 17:05:08,187] [INFO] Set up nlp object from config
[2022-11-17 17:05:08,195] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-11-17 17:05:08,199] [INFO] Created vocabulary
[2022-11-17 17:05:09,707] [INFO] Added vectors: en_core_web_lg
[2022-11-17 17:05:11,064] [INFO] Finished initializing nlp object
[2022-11-17 17:05:11,703] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [2]:
nlp_ner = spacy.load(r"F:\track_name_match\model-best")

In [256]:
# convert spotify d to a list of json objects
tracks = pd.read_csv('track_name_list.csv').to_dict('records')

for track in tqdm(tracks):
    doc = nlp_ner(track['name'])
    for e in doc.ents:
        if e.label_ not in track:
            track[e.label_] = [e.text]
        else:
            track[e.label_].append(e.text)
    track['artist'] = track['artist'].split(', ')

100%|██████████| 1674/1674 [00:12<00:00, 134.04it/s]


In [255]:
def produce_clean_name(track):

    artists = track['artist']
    artists.sort()


    # if there are any featured artists in the track make sure they are not in the list of artist


    if 'featArtist' in track:
        for featArtist in track['featArtist']:
            if featArtist in artists:
                artists.remove(featArtist)

    # if there are any remix artists in the track make sure they are not in the list of artist

    if 'remixer' in track:
        for remixArtist in track['remixer']:
            if remixArtist in artists:
                artists.remove(remixArtist)

    if "title" in track:
        if 'remixer' in track:
            track['remixer'].sort()
            clean_name = f"{' '.join(track['title'])} {' '.join(track['remixer'])} Remix {' '.join(artists)}".lower()
        else:
            clean_name = f"{' '.join(track['title'])} {' '.join(artists)}".lower()

        return clean_name

for track in tqdm(tracks):
    clean_name = produce_clean_name(track)
    track['clean_name'] = clean_name

tracks = pd.DataFrame(tracks).set_index('id')

100%|██████████| 1674/1674 [00:00<00:00, 418305.92it/s]


In [106]:

def fuzzy_match(target, threshold, process):
    results = []
    for id in tracks.index:

        comparison = tracks.loc[id, 'clean_name']
        score = getattr(fuzz,process)(target, comparison)
        if score > threshold:
            results.append({"id":id, "name":comparison, "score":score})

    results = pd.DataFrame(results).sort_values('score', ascending=False)
    return results['id'].tolist()

THESHOLD = 90
PROCESS = "ratio"
results = []
for PROCESS in ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio']:
    for THESHOLD in [80, 85, 90, 95]:
        test_data = pd.read_csv('test_data.csv').set_index('target_id')

        for row in tqdm(tracks.index):

            clean_name = tracks.loc[row, 'clean_name']
            prediction = fuzzy_match(clean_name, THESHOLD, PROCESS)
            prediction = [x for x in prediction if x != row]

            answer = json.loads(test_data.loc[row,'match_id'])

            # true positive is the number of track names in the prediction that are also in the answer
            test_data.loc[row,'true_positive'] = len([i for i in prediction if i in answer])

            # false positive is the number of track names in the prediction that are not in the answer
            test_data.loc[row,'false_positive'] = len([i for i in prediction if i not in answer])

            # false negative is the number of tracks in the answer that are not in the prediction
            test_data.loc[row,'false_negative'] = len([i for i in answer if i not in prediction])

            test_data.loc[row,'total_predictions'] = len(prediction)
                
                
        # calculate the accuracy of the model
        acc = test_data['true_positive'].sum() / (test_data['total_predictions'].sum())

        print(f'Accuracy of the model: {acc:.2%} for process {PROCESS} with threshold {THESHOLD}')

        # calculate the precision of the model
        precision = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_positive'].sum())

        print(f'Precision of the model: {precision:.2%} for process {PROCESS} with threshold {THESHOLD}')

        # calculate the recall of the model
        recall = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_negative'].sum())

        print(f'Recall of the model: {recall:.2%} for process {PROCESS} with threshold {THESHOLD}')

        # calculate the f1 score of the model
        f1 = 2 * (precision * recall) / (precision + recall)

        print(f'F1 score of the model: {f1:.2%} for process {PROCESS} with threshold {THESHOLD}')

        results.append({'process':PROCESS, 'threshold':THESHOLD, 'accuracy':acc, 'precision':precision, 'recall':recall, 'f1':f1})

results = pd.DataFrame(results)
results.sort_values('f1', ascending=False)

100%|██████████| 1674/1674 [00:25<00:00, 65.03it/s]


Accuracy of the model: 65.86% for process ratio with threshold 80
Precision of the model: 65.86% for process ratio with threshold 80
Recall of the model: 91.92% for process ratio with threshold 80
F1 score of the model: 76.74% for process ratio with threshold 80


100%|██████████| 1674/1674 [00:24<00:00, 67.55it/s]


Accuracy of the model: 84.79% for process ratio with threshold 85
Precision of the model: 84.79% for process ratio with threshold 85
Recall of the model: 89.69% for process ratio with threshold 85
F1 score of the model: 87.17% for process ratio with threshold 85


100%|██████████| 1674/1674 [00:23<00:00, 70.74it/s]


Accuracy of the model: 95.30% for process ratio with threshold 90
Precision of the model: 95.30% for process ratio with threshold 90
Recall of the model: 86.51% for process ratio with threshold 90
F1 score of the model: 90.70% for process ratio with threshold 90


100%|██████████| 1674/1674 [00:23<00:00, 70.95it/s]


Accuracy of the model: 97.10% for process ratio with threshold 95
Precision of the model: 97.10% for process ratio with threshold 95
Recall of the model: 83.14% for process ratio with threshold 95
F1 score of the model: 89.58% for process ratio with threshold 95


100%|██████████| 1674/1674 [00:35<00:00, 47.38it/s]


Accuracy of the model: 6.03% for process partial_ratio with threshold 80
Precision of the model: 6.03% for process partial_ratio with threshold 80
Recall of the model: 95.87% for process partial_ratio with threshold 80
F1 score of the model: 11.34% for process partial_ratio with threshold 80


100%|██████████| 1674/1674 [00:36<00:00, 46.18it/s]


Accuracy of the model: 11.95% for process partial_ratio with threshold 85
Precision of the model: 11.95% for process partial_ratio with threshold 85
Recall of the model: 92.05% for process partial_ratio with threshold 85
F1 score of the model: 21.16% for process partial_ratio with threshold 85


100%|██████████| 1674/1674 [00:36<00:00, 45.90it/s]


Accuracy of the model: 50.38% for process partial_ratio with threshold 90
Precision of the model: 50.38% for process partial_ratio with threshold 90
Recall of the model: 89.69% for process partial_ratio with threshold 90
F1 score of the model: 64.52% for process partial_ratio with threshold 90


100%|██████████| 1674/1674 [00:37<00:00, 45.19it/s]


Accuracy of the model: 91.84% for process partial_ratio with threshold 95
Precision of the model: 91.84% for process partial_ratio with threshold 95
Recall of the model: 88.04% for process partial_ratio with threshold 95
F1 score of the model: 89.90% for process partial_ratio with threshold 95


100%|██████████| 1674/1674 [00:34<00:00, 48.62it/s]


Accuracy of the model: 75.35% for process token_sort_ratio with threshold 80
Precision of the model: 75.35% for process token_sort_ratio with threshold 80
Recall of the model: 92.56% for process token_sort_ratio with threshold 80
F1 score of the model: 83.07% for process token_sort_ratio with threshold 80


100%|██████████| 1674/1674 [00:31<00:00, 53.43it/s]


Accuracy of the model: 87.86% for process token_sort_ratio with threshold 85
Precision of the model: 87.86% for process token_sort_ratio with threshold 85
Recall of the model: 90.27% for process token_sort_ratio with threshold 85
F1 score of the model: 89.05% for process token_sort_ratio with threshold 85


100%|██████████| 1674/1674 [00:31<00:00, 53.85it/s]


Accuracy of the model: 95.47% for process token_sort_ratio with threshold 90
Precision of the model: 95.47% for process token_sort_ratio with threshold 90
Recall of the model: 87.09% for process token_sort_ratio with threshold 90
F1 score of the model: 91.08% for process token_sort_ratio with threshold 90


100%|██████████| 1674/1674 [00:31<00:00, 53.16it/s]


Accuracy of the model: 97.11% for process token_sort_ratio with threshold 95
Precision of the model: 97.11% for process token_sort_ratio with threshold 95
Recall of the model: 83.46% for process token_sort_ratio with threshold 95
F1 score of the model: 89.77% for process token_sort_ratio with threshold 95


100%|██████████| 1674/1674 [00:33<00:00, 49.68it/s]


Accuracy of the model: 15.74% for process token_set_ratio with threshold 80
Precision of the model: 15.74% for process token_set_ratio with threshold 80
Recall of the model: 99.87% for process token_set_ratio with threshold 80
F1 score of the model: 27.20% for process token_set_ratio with threshold 80


100%|██████████| 1674/1674 [00:36<00:00, 46.21it/s]


Accuracy of the model: 54.65% for process token_set_ratio with threshold 85
Precision of the model: 54.65% for process token_set_ratio with threshold 85
Recall of the model: 99.87% for process token_set_ratio with threshold 85
F1 score of the model: 70.64% for process token_set_ratio with threshold 85


100%|██████████| 1674/1674 [00:35<00:00, 46.69it/s]


Accuracy of the model: 80.21% for process token_set_ratio with threshold 90
Precision of the model: 80.21% for process token_set_ratio with threshold 90
Recall of the model: 99.81% for process token_set_ratio with threshold 90
F1 score of the model: 88.95% for process token_set_ratio with threshold 90


100%|██████████| 1674/1674 [00:36<00:00, 45.27it/s]

Accuracy of the model: 82.56% for process token_set_ratio with threshold 95
Precision of the model: 82.56% for process token_set_ratio with threshold 95
Recall of the model: 99.11% for process token_set_ratio with threshold 95
F1 score of the model: 90.08% for process token_set_ratio with threshold 95





Unnamed: 0,process,threshold,accuracy,precision,recall,f1
10,token_sort_ratio,90,0.954672,0.954672,0.870865,0.910845
2,ratio,90,0.953048,0.953048,0.86514,0.906969
15,token_set_ratio,95,0.825649,0.825649,0.991094,0.900838
7,partial_ratio,95,0.918381,0.918381,0.880407,0.898993
11,token_sort_ratio,95,0.971132,0.971132,0.834606,0.897708
3,ratio,95,0.971025,0.971025,0.831425,0.895819
9,token_sort_ratio,85,0.878638,0.878638,0.902672,0.890493
14,token_set_ratio,90,0.802147,0.802147,0.998092,0.889456
1,ratio,85,0.847865,0.847865,0.896947,0.871716
8,token_sort_ratio,80,0.753496,0.753496,0.925573,0.830717


In [108]:
PROCESS = 'token_sort_ratio'
THESHOLD = 90

test_data = pd.read_csv('test_data.csv').set_index('target_id')

for row in tqdm(tracks.index):

    clean_name = tracks.loc[row, 'clean_name']
    prediction = fuzzy_match(clean_name, THESHOLD, PROCESS)
    prediction = [x for x in prediction if x != row]

    answer = json.loads(test_data.loc[row,'match_id'])

    # true positive is the number of track names in the prediction that are also in the answer
    test_data.loc[row,'true_positive'] = len([i for i in prediction if i in answer])

    # false positive is the number of track names in the prediction that are not in the answer
    test_data.loc[row,'false_positive'] = len([i for i in prediction if i not in answer])

    # false negative is the number of tracks in the answer that are not in the prediction
    test_data.loc[row,'false_negative'] = len([i for i in answer if i not in prediction])

    test_data.loc[row,'total_predictions'] = len(prediction)
        
        
# calculate the accuracy of the model
acc = test_data['true_positive'].sum() / (test_data['total_predictions'].sum())

print(f'Accuracy of the model: {acc:.2%} for process {PROCESS} with threshold {THESHOLD}')

# calculate the precision of the model
precision = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_positive'].sum())

print(f'Precision of the model: {precision:.2%} for process {PROCESS} with threshold {THESHOLD}')

# calculate the recall of the model
recall = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_negative'].sum())

print(f'Recall of the model: {recall:.2%} for process {PROCESS} with threshold {THESHOLD}')

# calculate the f1 score of the model
f1 = 2 * (precision * recall) / (precision + recall)

print(f'F1 score of the model: {f1:.2%} for process {PROCESS} with threshold {THESHOLD}')

100%|██████████| 1674/1674 [00:31<00:00, 53.53it/s]

Accuracy of the model: 95.47% for process token_sort_ratio with threshold 90
Precision of the model: 95.47% for process token_sort_ratio with threshold 90
Recall of the model: 87.09% for process token_sort_ratio with threshold 90
F1 score of the model: 91.08% for process token_sort_ratio with threshold 90





In [110]:
test_data.sort_values('false_negative', ascending=False).head(10)

Unnamed: 0_level_0,match_id,true_positive,false_positive,false_negative,total_predictions
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15988821,"[70927, 15984752, 15980885, 15460830, 14500502...",0.0,0.0,12.0,0.0
70927,"[15984752, 15988821, 15980885, 15460830, 14500...",0.0,0.0,12.0,0.0
1854197,"[72138, 2170039, 1639274, 1616286, 3793719]",0.0,0.0,5.0,0.0
70560,"[16000923, 7464140, 5175030, 5060778, 1250989]",0.0,1.0,5.0,1.0
8267950,"[70633, 10073343, 8300776, 4595138]",0.0,0.0,4.0,0.0
16000923,"[70560, 7464140, 5175030, 5060778, 1250989]",1.0,3.0,4.0,4.0
70553,"[10136028, 6220514, 2198842, 773918]",0.0,1.0,4.0,1.0
10136028,"[70553, 6220514, 2198842, 773918]",0.0,3.0,4.0,3.0
72508,"[7366291, 6466036, 4461422, 2443112]",0.0,0.0,4.0,0.0
70630,"[7394702, 5057761, 4434020, 3868177]",0.0,0.0,4.0,0.0


In [111]:
tracks.loc[15988821]

name                                     Love Is The Drug
artist                          [Bryan Ferry, Roxy Music]
platform                                     SpotifyTrack
title                                  [Love Is The Drug]
remixer                                               NaN
clean_name        love is the drug bryan ferry roxy music
subTitle                                              NaN
version                                               NaN
featuredArtist                                        NaN
mood                                                  NaN
quality                                               NaN
Name: 15988821, dtype: object

In [112]:
tracks.loc[json.loads(test_data.loc[15988821,"match_id"])]

Unnamed: 0_level_0,name,artist,platform,title,remixer,clean_name,subTitle,version,featuredArtist,mood,quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
70927,Love Is The Drug,"[Bryan Ferry, Roxy Music, Todd Terje]",Track,[Love Is The Drug],,love is the drug bryan ferry roxy music todd t...,,,,,
15984752,Love Is The Drug - Remastered 1999,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,[Remastered 1999],,,,
15980885,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,
15460830,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,
14500502,Love Is The Drug - Remastered 1999,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,[Remastered 1999],,,,
11281410,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,
10017574,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,
8196637,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,
7655811,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,
7555597,Love Is The Drug,[Roxy Music],SpotifyTrack,[Love Is The Drug],,love is the drug roxy music,,,,,


In [91]:
# upon examining the test results from the highest scorring model, 
# we can see that the model gets confused if there are extra artists
# perhaps if we fuzzy match the artists,  as well as the track name we can improve the model

In [5]:
# convert spotify d to a list of json objects
tracks = pd.read_csv('track_name_list.csv').to_dict('records')

for track in tqdm(tracks):
    doc = nlp_ner(track['name'])
    for e in doc.ents:
        if e.label_ not in track:
            track[e.label_] = [e.text]
        else:
            track[e.label_].append(e.text)
    track['artist'] = track['artist'].split(', ')

100%|██████████| 1674/1674 [00:13<00:00, 122.88it/s]


In [6]:
def produce_clean_title(track):

    artists = track['artist']
    artists.sort()


    # if there are any featured artists in the track make sure they are not in the list of artist


    if 'featArtist' in track:
        for featArtist in track['featArtist']:
            if featArtist in artists:
                artists.remove(featArtist)

    # if there are any remix artists in the track make sure they are not in the list of artist

    if 'remixer' in track:
        for remixArtist in track['remixer']:
            if remixArtist in artists:
                artists.remove(remixArtist)

    if "title" in track:
        if 'remixer' in track:
            track['remixer'].sort()
            clean_name = f"{' '.join(track['title'])} {' '.join(list(set(track['remixer'])))} Remix".lower()
        else:
            clean_name = f"{' '.join(track['title'])}".lower()

        return clean_name

for track in tqdm(tracks):
    clean_name = produce_clean_title(track)
    track['clean_title'] = clean_name
    track['clean_artist'] = ' '.join(track['artist']).lower()

100%|██████████| 1674/1674 [00:00<00:00, 418430.57it/s]


In [7]:
tracks = pd.DataFrame(tracks).set_index('id')

In [8]:
def fuzzy_match(target_title, target_artist, threshold, title_process, artist_process):
    results = []
    for id in tracks.index:

        title_comparison = tracks.loc[id, 'clean_title']
        title_score = getattr(fuzz,title_process)(target_title, title_comparison)
        artist_comparison = tracks.loc[id, 'clean_artist']
        artist_score = getattr(fuzz,artist_process)(target_artist, artist_comparison)
        score = (title_score + artist_score) / 2
        #print(score)
        
        if score > threshold:
            results.append({"id":id, "title_comparison":title_comparison, "artist_comparison":artist_comparison, "score":score})
    if results:
        results = pd.DataFrame(results).sort_values('score', ascending=False)
        return results['id'].tolist()
    else:
        return []

In [208]:

results = []
for title_process in ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio']:
    for artist_process in ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio']:
        for THESHOLD in [80, 85, 90, 95]:
            test_data = pd.read_csv('test_data.csv').set_index('target_id')

            for row in tqdm(tracks.index):

                clean_title = tracks.loc[row, 'clean_title']
                clean_artist = tracks.loc[row, 'clean_artist']
                prediction = fuzzy_match(clean_title, clean_artist, THESHOLD, title_process, artist_process)    
                prediction = [x for x in prediction if x != row]

                answer = json.loads(test_data.loc[row,'match_id'])

                # true positive is the number of track names in the prediction that are also in the answer
                test_data.loc[row,'true_positive'] = len([i for i in prediction if i in answer])

                # false positive is the number of track names in the prediction that are not in the answer
                test_data.loc[row,'false_positive'] = len([i for i in prediction if i not in answer])

                # false negative is the number of tracks in the answer that are not in the prediction
                test_data.loc[row,'false_negative'] = len([i for i in answer if i not in prediction])

                test_data.loc[row,'total_predictions'] = len(prediction)
                    
                    
            # calculate the accuracy of the model
            acc = test_data['true_positive'].sum() / (test_data['total_predictions'].sum())

            print(f'Accuracy of the model: {acc:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

            # calculate the precision of the model
            precision = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_positive'].sum())

            print(f'Precision of the model: {precision:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

            # calculate the recall of the model
            recall = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_negative'].sum())

            print(f'Recall of the model: {recall:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

            # calculate the f1 score of the model
            f1 = 2 * (precision * recall) / (precision + recall)

            print(f'F1 score of the model: {f1:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

            results.append({"title_process":title_process, "artist_process":artist_process, "threshold":THESHOLD, "accuracy":acc, "precision":precision, "recall":recall, "f1":f1})

results = pd.DataFrame(results)
results.sort_values('f1', ascending=False)

100%|██████████| 1674/1674 [00:47<00:00, 35.57it/s]


Accuracy of the model: 84.04% for title process ratio and artist_process ratio with threshold 80
Precision of the model: 84.04% for title process ratio and artist_process ratio with threshold 80
Recall of the model: 93.13% for title process ratio and artist_process ratio with threshold 80
F1 score of the model: 88.35% for title process ratio and artist_process ratio with threshold 80


100%|██████████| 1674/1674 [00:47<00:00, 35.25it/s]


Accuracy of the model: 91.63% for title process ratio and artist_process ratio with threshold 85
Precision of the model: 91.63% for title process ratio and artist_process ratio with threshold 85
Recall of the model: 89.89% for title process ratio and artist_process ratio with threshold 85
F1 score of the model: 90.75% for title process ratio and artist_process ratio with threshold 85


100%|██████████| 1674/1674 [00:47<00:00, 35.29it/s]


Accuracy of the model: 93.71% for title process ratio and artist_process ratio with threshold 90
Precision of the model: 93.71% for title process ratio and artist_process ratio with threshold 90
Recall of the model: 87.15% for title process ratio and artist_process ratio with threshold 90
F1 score of the model: 90.31% for title process ratio and artist_process ratio with threshold 90


100%|██████████| 1674/1674 [00:47<00:00, 34.88it/s]


Accuracy of the model: 97.11% for title process ratio and artist_process ratio with threshold 95
Precision of the model: 97.11% for title process ratio and artist_process ratio with threshold 95
Recall of the model: 83.33% for title process ratio and artist_process ratio with threshold 95
F1 score of the model: 89.70% for title process ratio and artist_process ratio with threshold 95


100%|██████████| 1674/1674 [00:54<00:00, 30.54it/s]


Accuracy of the model: 78.47% for title process ratio and artist_process partial_ratio with threshold 80
Precision of the model: 78.47% for title process ratio and artist_process partial_ratio with threshold 80
Recall of the model: 98.79% for title process ratio and artist_process partial_ratio with threshold 80
F1 score of the model: 87.47% for title process ratio and artist_process partial_ratio with threshold 80


100%|██████████| 1674/1674 [00:54<00:00, 30.81it/s]


Accuracy of the model: 89.50% for title process ratio and artist_process partial_ratio with threshold 85
Precision of the model: 89.50% for title process ratio and artist_process partial_ratio with threshold 85
Recall of the model: 98.09% for title process ratio and artist_process partial_ratio with threshold 85
F1 score of the model: 93.60% for title process ratio and artist_process partial_ratio with threshold 85


100%|██████████| 1674/1674 [00:54<00:00, 30.63it/s]


Accuracy of the model: 93.91% for title process ratio and artist_process partial_ratio with threshold 90
Precision of the model: 93.91% for title process ratio and artist_process partial_ratio with threshold 90
Recall of the model: 96.18% for title process ratio and artist_process partial_ratio with threshold 90
F1 score of the model: 95.03% for title process ratio and artist_process partial_ratio with threshold 90


100%|██████████| 1674/1674 [00:53<00:00, 31.19it/s]


Accuracy of the model: 96.60% for title process ratio and artist_process partial_ratio with threshold 95
Precision of the model: 96.60% for title process ratio and artist_process partial_ratio with threshold 95
Recall of the model: 94.02% for title process ratio and artist_process partial_ratio with threshold 95
F1 score of the model: 95.29% for title process ratio and artist_process partial_ratio with threshold 95


100%|██████████| 1674/1674 [00:52<00:00, 31.82it/s]


Accuracy of the model: 83.20% for title process ratio and artist_process token_sort_ratio with threshold 80
Precision of the model: 83.20% for title process ratio and artist_process token_sort_ratio with threshold 80
Recall of the model: 93.26% for title process ratio and artist_process token_sort_ratio with threshold 80
F1 score of the model: 87.94% for title process ratio and artist_process token_sort_ratio with threshold 80


100%|██████████| 1674/1674 [00:54<00:00, 30.63it/s]


Accuracy of the model: 91.66% for title process ratio and artist_process token_sort_ratio with threshold 85
Precision of the model: 91.66% for title process ratio and artist_process token_sort_ratio with threshold 85
Recall of the model: 90.14% for title process ratio and artist_process token_sort_ratio with threshold 85
F1 score of the model: 90.89% for title process ratio and artist_process token_sort_ratio with threshold 85


100%|██████████| 1674/1674 [00:52<00:00, 31.81it/s]


Accuracy of the model: 93.73% for title process ratio and artist_process token_sort_ratio with threshold 90
Precision of the model: 93.73% for title process ratio and artist_process token_sort_ratio with threshold 90
Recall of the model: 87.47% for title process ratio and artist_process token_sort_ratio with threshold 90
F1 score of the model: 90.49% for title process ratio and artist_process token_sort_ratio with threshold 90


100%|██████████| 1674/1674 [00:51<00:00, 32.58it/s]


Accuracy of the model: 97.12% for title process ratio and artist_process token_sort_ratio with threshold 95
Precision of the model: 97.12% for title process ratio and artist_process token_sort_ratio with threshold 95
Recall of the model: 83.72% for title process ratio and artist_process token_sort_ratio with threshold 95
F1 score of the model: 89.92% for title process ratio and artist_process token_sort_ratio with threshold 95


100%|██████████| 1674/1674 [00:54<00:00, 30.71it/s]


Accuracy of the model: 78.52% for title process ratio and artist_process token_set_ratio with threshold 80
Precision of the model: 78.52% for title process ratio and artist_process token_set_ratio with threshold 80
Recall of the model: 98.60% for title process ratio and artist_process token_set_ratio with threshold 80
F1 score of the model: 87.42% for title process ratio and artist_process token_set_ratio with threshold 80


100%|██████████| 1674/1674 [00:54<00:00, 30.77it/s]


Accuracy of the model: 89.42% for title process ratio and artist_process token_set_ratio with threshold 85
Precision of the model: 89.42% for title process ratio and artist_process token_set_ratio with threshold 85
Recall of the model: 98.35% for title process ratio and artist_process token_set_ratio with threshold 85
F1 score of the model: 93.67% for title process ratio and artist_process token_set_ratio with threshold 85


100%|██████████| 1674/1674 [00:54<00:00, 30.71it/s]


Accuracy of the model: 93.50% for title process ratio and artist_process token_set_ratio with threshold 90
Precision of the model: 93.50% for title process ratio and artist_process token_set_ratio with threshold 90
Recall of the model: 97.84% for title process ratio and artist_process token_set_ratio with threshold 90
F1 score of the model: 95.62% for title process ratio and artist_process token_set_ratio with threshold 90


100%|██████████| 1674/1674 [00:55<00:00, 30.23it/s]


Accuracy of the model: 95.91% for title process ratio and artist_process token_set_ratio with threshold 95
Precision of the model: 95.91% for title process ratio and artist_process token_set_ratio with threshold 95
Recall of the model: 96.88% for title process ratio and artist_process token_set_ratio with threshold 95
F1 score of the model: 96.39% for title process ratio and artist_process token_set_ratio with threshold 95


100%|██████████| 1674/1674 [00:52<00:00, 31.90it/s]


Accuracy of the model: 48.54% for title process partial_ratio and artist_process ratio with threshold 80
Precision of the model: 48.54% for title process partial_ratio and artist_process ratio with threshold 80
Recall of the model: 94.02% for title process partial_ratio and artist_process ratio with threshold 80
F1 score of the model: 64.02% for title process partial_ratio and artist_process ratio with threshold 80


100%|██████████| 1674/1674 [00:52<00:00, 31.86it/s]


Accuracy of the model: 70.27% for title process partial_ratio and artist_process ratio with threshold 85
Precision of the model: 70.27% for title process partial_ratio and artist_process ratio with threshold 85
Recall of the model: 90.97% for title process partial_ratio and artist_process ratio with threshold 85
F1 score of the model: 79.29% for title process partial_ratio and artist_process ratio with threshold 85


100%|██████████| 1674/1674 [00:52<00:00, 31.77it/s]


Accuracy of the model: 81.01% for title process partial_ratio and artist_process ratio with threshold 90
Precision of the model: 81.01% for title process partial_ratio and artist_process ratio with threshold 90
Recall of the model: 88.17% for title process partial_ratio and artist_process ratio with threshold 90
F1 score of the model: 84.43% for title process partial_ratio and artist_process ratio with threshold 90


100%|██████████| 1674/1674 [00:52<00:00, 31.81it/s]


Accuracy of the model: 86.27% for title process partial_ratio and artist_process ratio with threshold 95
Precision of the model: 86.27% for title process partial_ratio and artist_process ratio with threshold 95
Recall of the model: 84.35% for title process partial_ratio and artist_process ratio with threshold 95
F1 score of the model: 85.30% for title process partial_ratio and artist_process ratio with threshold 95


100%|██████████| 1674/1674 [00:59<00:00, 28.35it/s]


Accuracy of the model: 31.74% for title process partial_ratio and artist_process partial_ratio with threshold 80
Precision of the model: 31.74% for title process partial_ratio and artist_process partial_ratio with threshold 80
Recall of the model: 99.81% for title process partial_ratio and artist_process partial_ratio with threshold 80
F1 score of the model: 48.17% for title process partial_ratio and artist_process partial_ratio with threshold 80


100%|██████████| 1674/1674 [00:57<00:00, 29.01it/s]


Accuracy of the model: 59.82% for title process partial_ratio and artist_process partial_ratio with threshold 85
Precision of the model: 59.82% for title process partial_ratio and artist_process partial_ratio with threshold 85
Recall of the model: 99.05% for title process partial_ratio and artist_process partial_ratio with threshold 85
F1 score of the model: 74.59% for title process partial_ratio and artist_process partial_ratio with threshold 85


100%|██████████| 1674/1674 [00:57<00:00, 29.02it/s]


Accuracy of the model: 77.11% for title process partial_ratio and artist_process partial_ratio with threshold 90
Precision of the model: 77.11% for title process partial_ratio and artist_process partial_ratio with threshold 90
Recall of the model: 97.52% for title process partial_ratio and artist_process partial_ratio with threshold 90
F1 score of the model: 86.12% for title process partial_ratio and artist_process partial_ratio with threshold 90


100%|██████████| 1674/1674 [00:58<00:00, 28.49it/s]


Accuracy of the model: 81.62% for title process partial_ratio and artist_process partial_ratio with threshold 95
Precision of the model: 81.62% for title process partial_ratio and artist_process partial_ratio with threshold 95
Recall of the model: 95.23% for title process partial_ratio and artist_process partial_ratio with threshold 95
F1 score of the model: 87.90% for title process partial_ratio and artist_process partial_ratio with threshold 95


100%|██████████| 1674/1674 [00:57<00:00, 29.22it/s]


Accuracy of the model: 48.26% for title process partial_ratio and artist_process token_sort_ratio with threshold 80
Precision of the model: 48.26% for title process partial_ratio and artist_process token_sort_ratio with threshold 80
Recall of the model: 94.15% for title process partial_ratio and artist_process token_sort_ratio with threshold 80
F1 score of the model: 63.81% for title process partial_ratio and artist_process token_sort_ratio with threshold 80


100%|██████████| 1674/1674 [00:58<00:00, 28.86it/s]


Accuracy of the model: 70.29% for title process partial_ratio and artist_process token_sort_ratio with threshold 85
Precision of the model: 70.29% for title process partial_ratio and artist_process token_sort_ratio with threshold 85
Recall of the model: 91.22% for title process partial_ratio and artist_process token_sort_ratio with threshold 85
F1 score of the model: 79.40% for title process partial_ratio and artist_process token_sort_ratio with threshold 85


100%|██████████| 1674/1674 [00:57<00:00, 29.30it/s]


Accuracy of the model: 81.01% for title process partial_ratio and artist_process token_sort_ratio with threshold 90
Precision of the model: 81.01% for title process partial_ratio and artist_process token_sort_ratio with threshold 90
Recall of the model: 88.49% for title process partial_ratio and artist_process token_sort_ratio with threshold 90
F1 score of the model: 84.58% for title process partial_ratio and artist_process token_sort_ratio with threshold 90


100%|██████████| 1674/1674 [00:57<00:00, 29.12it/s]


Accuracy of the model: 86.27% for title process partial_ratio and artist_process token_sort_ratio with threshold 95
Precision of the model: 86.27% for title process partial_ratio and artist_process token_sort_ratio with threshold 95
Recall of the model: 84.73% for title process partial_ratio and artist_process token_sort_ratio with threshold 95
F1 score of the model: 85.49% for title process partial_ratio and artist_process token_sort_ratio with threshold 95


100%|██████████| 1674/1674 [01:00<00:00, 27.47it/s]


Accuracy of the model: 31.83% for title process partial_ratio and artist_process token_set_ratio with threshold 80
Precision of the model: 31.83% for title process partial_ratio and artist_process token_set_ratio with threshold 80
Recall of the model: 99.62% for title process partial_ratio and artist_process token_set_ratio with threshold 80
F1 score of the model: 48.24% for title process partial_ratio and artist_process token_set_ratio with threshold 80


100%|██████████| 1674/1674 [01:00<00:00, 27.64it/s]


Accuracy of the model: 59.90% for title process partial_ratio and artist_process token_set_ratio with threshold 85
Precision of the model: 59.90% for title process partial_ratio and artist_process token_set_ratio with threshold 85
Recall of the model: 99.30% for title process partial_ratio and artist_process token_set_ratio with threshold 85
F1 score of the model: 74.72% for title process partial_ratio and artist_process token_set_ratio with threshold 85


100%|██████████| 1674/1674 [01:01<00:00, 27.34it/s]


Accuracy of the model: 77.10% for title process partial_ratio and artist_process token_set_ratio with threshold 90
Precision of the model: 77.10% for title process partial_ratio and artist_process token_set_ratio with threshold 90
Recall of the model: 99.17% for title process partial_ratio and artist_process token_set_ratio with threshold 90
F1 score of the model: 86.76% for title process partial_ratio and artist_process token_set_ratio with threshold 90


100%|██████████| 1674/1674 [01:00<00:00, 27.68it/s]


Accuracy of the model: 81.12% for title process partial_ratio and artist_process token_set_ratio with threshold 95
Precision of the model: 81.12% for title process partial_ratio and artist_process token_set_ratio with threshold 95
Recall of the model: 98.09% for title process partial_ratio and artist_process token_set_ratio with threshold 95
F1 score of the model: 88.80% for title process partial_ratio and artist_process token_set_ratio with threshold 95


100%|██████████| 1674/1674 [00:51<00:00, 32.32it/s]


Accuracy of the model: 85.38% for title process token_sort_ratio and artist_process ratio with threshold 80
Precision of the model: 85.38% for title process token_sort_ratio and artist_process ratio with threshold 80
Recall of the model: 93.26% for title process token_sort_ratio and artist_process ratio with threshold 80
F1 score of the model: 89.15% for title process token_sort_ratio and artist_process ratio with threshold 80


100%|██████████| 1674/1674 [00:50<00:00, 32.82it/s]


Accuracy of the model: 91.99% for title process token_sort_ratio and artist_process ratio with threshold 85
Precision of the model: 91.99% for title process token_sort_ratio and artist_process ratio with threshold 85
Recall of the model: 89.89% for title process token_sort_ratio and artist_process ratio with threshold 85
F1 score of the model: 90.93% for title process token_sort_ratio and artist_process ratio with threshold 85


100%|██████████| 1674/1674 [00:50<00:00, 32.86it/s]


Accuracy of the model: 93.71% for title process token_sort_ratio and artist_process ratio with threshold 90
Precision of the model: 93.71% for title process token_sort_ratio and artist_process ratio with threshold 90
Recall of the model: 87.15% for title process token_sort_ratio and artist_process ratio with threshold 90
F1 score of the model: 90.31% for title process token_sort_ratio and artist_process ratio with threshold 90


100%|██████████| 1674/1674 [00:52<00:00, 31.95it/s]


Accuracy of the model: 97.10% for title process token_sort_ratio and artist_process ratio with threshold 95
Precision of the model: 97.10% for title process token_sort_ratio and artist_process ratio with threshold 95
Recall of the model: 83.21% for title process token_sort_ratio and artist_process ratio with threshold 95
F1 score of the model: 89.62% for title process token_sort_ratio and artist_process ratio with threshold 95


100%|██████████| 1674/1674 [01:00<00:00, 27.78it/s]


Accuracy of the model: 79.62% for title process token_sort_ratio and artist_process partial_ratio with threshold 80
Precision of the model: 79.62% for title process token_sort_ratio and artist_process partial_ratio with threshold 80
Recall of the model: 98.92% for title process token_sort_ratio and artist_process partial_ratio with threshold 80
F1 score of the model: 88.23% for title process token_sort_ratio and artist_process partial_ratio with threshold 80


100%|██████████| 1674/1674 [00:57<00:00, 28.96it/s]


Accuracy of the model: 90.97% for title process token_sort_ratio and artist_process partial_ratio with threshold 85
Precision of the model: 90.97% for title process token_sort_ratio and artist_process partial_ratio with threshold 85
Recall of the model: 98.09% for title process token_sort_ratio and artist_process partial_ratio with threshold 85
F1 score of the model: 94.40% for title process token_sort_ratio and artist_process partial_ratio with threshold 85


100%|██████████| 1674/1674 [00:58<00:00, 28.75it/s]


Accuracy of the model: 93.91% for title process token_sort_ratio and artist_process partial_ratio with threshold 90
Precision of the model: 93.91% for title process token_sort_ratio and artist_process partial_ratio with threshold 90
Recall of the model: 96.18% for title process token_sort_ratio and artist_process partial_ratio with threshold 90
F1 score of the model: 95.03% for title process token_sort_ratio and artist_process partial_ratio with threshold 90


100%|██████████| 1674/1674 [00:59<00:00, 28.36it/s]


Accuracy of the model: 96.59% for title process token_sort_ratio and artist_process partial_ratio with threshold 95
Precision of the model: 96.59% for title process token_sort_ratio and artist_process partial_ratio with threshold 95
Recall of the model: 93.64% for title process token_sort_ratio and artist_process partial_ratio with threshold 95
F1 score of the model: 95.09% for title process token_sort_ratio and artist_process partial_ratio with threshold 95


100%|██████████| 1674/1674 [00:54<00:00, 30.44it/s]


Accuracy of the model: 85.40% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 80
Precision of the model: 85.40% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 80
Recall of the model: 93.38% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 80
F1 score of the model: 89.21% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 80


100%|██████████| 1674/1674 [00:52<00:00, 32.11it/s]


Accuracy of the model: 92.01% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 85
Precision of the model: 92.01% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 85
Recall of the model: 90.14% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 85
F1 score of the model: 91.07% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 85


100%|██████████| 1674/1674 [00:50<00:00, 32.87it/s]


Accuracy of the model: 93.73% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 90
Precision of the model: 93.73% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 90
Recall of the model: 87.47% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 90
F1 score of the model: 90.49% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 90


100%|██████████| 1674/1674 [00:50<00:00, 32.96it/s]


Accuracy of the model: 97.12% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 95
Precision of the model: 97.12% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 95
Recall of the model: 83.59% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 95
F1 score of the model: 89.85% for title process token_sort_ratio and artist_process token_sort_ratio with threshold 95


100%|██████████| 1674/1674 [00:51<00:00, 32.35it/s]


Accuracy of the model: 79.67% for title process token_sort_ratio and artist_process token_set_ratio with threshold 80
Precision of the model: 79.67% for title process token_sort_ratio and artist_process token_set_ratio with threshold 80
Recall of the model: 98.73% for title process token_sort_ratio and artist_process token_set_ratio with threshold 80
F1 score of the model: 88.18% for title process token_sort_ratio and artist_process token_set_ratio with threshold 80


100%|██████████| 1674/1674 [00:52<00:00, 31.79it/s]


Accuracy of the model: 90.89% for title process token_sort_ratio and artist_process token_set_ratio with threshold 85
Precision of the model: 90.89% for title process token_sort_ratio and artist_process token_set_ratio with threshold 85
Recall of the model: 98.35% for title process token_sort_ratio and artist_process token_set_ratio with threshold 85
F1 score of the model: 94.47% for title process token_sort_ratio and artist_process token_set_ratio with threshold 85


100%|██████████| 1674/1674 [01:00<00:00, 27.87it/s]


Accuracy of the model: 93.50% for title process token_sort_ratio and artist_process token_set_ratio with threshold 90
Precision of the model: 93.50% for title process token_sort_ratio and artist_process token_set_ratio with threshold 90
Recall of the model: 97.84% for title process token_sort_ratio and artist_process token_set_ratio with threshold 90
F1 score of the model: 95.62% for title process token_sort_ratio and artist_process token_set_ratio with threshold 90


100%|██████████| 1674/1674 [01:03<00:00, 26.19it/s]


Accuracy of the model: 95.89% for title process token_sort_ratio and artist_process token_set_ratio with threshold 95
Precision of the model: 95.89% for title process token_sort_ratio and artist_process token_set_ratio with threshold 95
Recall of the model: 96.50% for title process token_sort_ratio and artist_process token_set_ratio with threshold 95
F1 score of the model: 96.20% for title process token_sort_ratio and artist_process token_set_ratio with threshold 95


100%|██████████| 1674/1674 [00:53<00:00, 31.04it/s]


Accuracy of the model: 75.45% for title process token_set_ratio and artist_process ratio with threshold 80
Precision of the model: 75.45% for title process token_set_ratio and artist_process ratio with threshold 80
Recall of the model: 94.21% for title process token_set_ratio and artist_process ratio with threshold 80
F1 score of the model: 83.79% for title process token_set_ratio and artist_process ratio with threshold 80


100%|██████████| 1674/1674 [00:53<00:00, 31.51it/s]


Accuracy of the model: 81.59% for title process token_set_ratio and artist_process ratio with threshold 85
Precision of the model: 81.59% for title process token_set_ratio and artist_process ratio with threshold 85
Recall of the model: 91.35% for title process token_set_ratio and artist_process ratio with threshold 85
F1 score of the model: 86.19% for title process token_set_ratio and artist_process ratio with threshold 85


100%|██████████| 1674/1674 [00:53<00:00, 31.24it/s]


Accuracy of the model: 83.99% for title process token_set_ratio and artist_process ratio with threshold 90
Precision of the model: 83.99% for title process token_set_ratio and artist_process ratio with threshold 90
Recall of the model: 88.10% for title process token_set_ratio and artist_process ratio with threshold 90
F1 score of the model: 86.00% for title process token_set_ratio and artist_process ratio with threshold 90


100%|██████████| 1674/1674 [00:49<00:00, 33.50it/s]


Accuracy of the model: 86.39% for title process token_set_ratio and artist_process ratio with threshold 95
Precision of the model: 86.39% for title process token_set_ratio and artist_process ratio with threshold 95
Recall of the model: 84.41% for title process token_set_ratio and artist_process ratio with threshold 95
F1 score of the model: 85.39% for title process token_set_ratio and artist_process ratio with threshold 95


100%|██████████| 1674/1674 [01:03<00:00, 26.31it/s]


Accuracy of the model: 68.73% for title process token_set_ratio and artist_process partial_ratio with threshold 80
Precision of the model: 68.73% for title process token_set_ratio and artist_process partial_ratio with threshold 80
Recall of the model: 99.81% for title process token_set_ratio and artist_process partial_ratio with threshold 80
F1 score of the model: 81.40% for title process token_set_ratio and artist_process partial_ratio with threshold 80


100%|██████████| 1674/1674 [01:00<00:00, 27.58it/s]


Accuracy of the model: 76.35% for title process token_set_ratio and artist_process partial_ratio with threshold 85
Precision of the model: 76.35% for title process token_set_ratio and artist_process partial_ratio with threshold 85
Recall of the model: 99.17% for title process token_set_ratio and artist_process partial_ratio with threshold 85
F1 score of the model: 86.28% for title process token_set_ratio and artist_process partial_ratio with threshold 85


100%|██████████| 1674/1674 [00:59<00:00, 28.04it/s]


Accuracy of the model: 81.62% for title process token_set_ratio and artist_process partial_ratio with threshold 90
Precision of the model: 81.62% for title process token_set_ratio and artist_process partial_ratio with threshold 90
Recall of the model: 97.46% for title process token_set_ratio and artist_process partial_ratio with threshold 90
F1 score of the model: 88.84% for title process token_set_ratio and artist_process partial_ratio with threshold 90


100%|██████████| 1674/1674 [00:59<00:00, 28.08it/s]


Accuracy of the model: 83.38% for title process token_set_ratio and artist_process partial_ratio with threshold 95
Precision of the model: 83.38% for title process token_set_ratio and artist_process partial_ratio with threshold 95
Recall of the model: 95.42% for title process token_set_ratio and artist_process partial_ratio with threshold 95
F1 score of the model: 88.99% for title process token_set_ratio and artist_process partial_ratio with threshold 95


100%|██████████| 1674/1674 [00:59<00:00, 28.12it/s]


Accuracy of the model: 75.13% for title process token_set_ratio and artist_process token_sort_ratio with threshold 80
Precision of the model: 75.13% for title process token_set_ratio and artist_process token_sort_ratio with threshold 80
Recall of the model: 94.34% for title process token_set_ratio and artist_process token_sort_ratio with threshold 80
F1 score of the model: 83.64% for title process token_set_ratio and artist_process token_sort_ratio with threshold 80


100%|██████████| 1674/1674 [01:01<00:00, 27.07it/s]


Accuracy of the model: 80.76% for title process token_set_ratio and artist_process token_sort_ratio with threshold 85
Precision of the model: 80.76% for title process token_set_ratio and artist_process token_sort_ratio with threshold 85
Recall of the model: 91.60% for title process token_set_ratio and artist_process token_sort_ratio with threshold 85
F1 score of the model: 85.84% for title process token_set_ratio and artist_process token_sort_ratio with threshold 85


100%|██████████| 1674/1674 [00:59<00:00, 28.08it/s]


Accuracy of the model: 83.99% for title process token_set_ratio and artist_process token_sort_ratio with threshold 90
Precision of the model: 83.99% for title process token_set_ratio and artist_process token_sort_ratio with threshold 90
Recall of the model: 88.42% for title process token_set_ratio and artist_process token_sort_ratio with threshold 90
F1 score of the model: 86.15% for title process token_set_ratio and artist_process token_sort_ratio with threshold 90


100%|██████████| 1674/1674 [00:58<00:00, 28.67it/s]


Accuracy of the model: 86.39% for title process token_set_ratio and artist_process token_sort_ratio with threshold 95
Precision of the model: 86.39% for title process token_set_ratio and artist_process token_sort_ratio with threshold 95
Recall of the model: 84.80% for title process token_set_ratio and artist_process token_sort_ratio with threshold 95
F1 score of the model: 85.59% for title process token_set_ratio and artist_process token_sort_ratio with threshold 95


100%|██████████| 1674/1674 [01:01<00:00, 27.36it/s]


Accuracy of the model: 68.93% for title process token_set_ratio and artist_process token_set_ratio with threshold 80
Precision of the model: 68.93% for title process token_set_ratio and artist_process token_set_ratio with threshold 80
Recall of the model: 99.62% for title process token_set_ratio and artist_process token_set_ratio with threshold 80
F1 score of the model: 81.48% for title process token_set_ratio and artist_process token_set_ratio with threshold 80


100%|██████████| 1674/1674 [01:00<00:00, 27.58it/s]


Accuracy of the model: 76.21% for title process token_set_ratio and artist_process token_set_ratio with threshold 85
Precision of the model: 76.21% for title process token_set_ratio and artist_process token_set_ratio with threshold 85
Recall of the model: 99.43% for title process token_set_ratio and artist_process token_set_ratio with threshold 85
F1 score of the model: 86.28% for title process token_set_ratio and artist_process token_set_ratio with threshold 85


100%|██████████| 1674/1674 [01:03<00:00, 26.50it/s]


Accuracy of the model: 81.36% for title process token_set_ratio and artist_process token_set_ratio with threshold 90
Precision of the model: 81.36% for title process token_set_ratio and artist_process token_set_ratio with threshold 90
Recall of the model: 99.11% for title process token_set_ratio and artist_process token_set_ratio with threshold 90
F1 score of the model: 89.36% for title process token_set_ratio and artist_process token_set_ratio with threshold 90


100%|██████████| 1674/1674 [01:02<00:00, 26.60it/s]

Accuracy of the model: 82.89% for title process token_set_ratio and artist_process token_set_ratio with threshold 95
Precision of the model: 82.89% for title process token_set_ratio and artist_process token_set_ratio with threshold 95
Recall of the model: 98.28% for title process token_set_ratio and artist_process token_set_ratio with threshold 95
F1 score of the model: 89.93% for title process token_set_ratio and artist_process token_set_ratio with threshold 95





Unnamed: 0,title_process,artist_process,threshold,accuracy,precision,recall,f1
15,ratio,token_set_ratio,95,0.959068,0.959068,0.968830,0.963924
47,token_sort_ratio,token_set_ratio,95,0.958913,0.958913,0.965013,0.961953
14,ratio,token_set_ratio,90,0.934954,0.934954,0.978372,0.956170
46,token_sort_ratio,token_set_ratio,90,0.934954,0.934954,0.978372,0.956170
7,ratio,partial_ratio,95,0.966013,0.966013,0.940204,0.952934
...,...,...,...,...,...,...,...
21,partial_ratio,partial_ratio,85,0.598156,0.598156,0.990458,0.745868
16,partial_ratio,ratio,80,0.485386,0.485386,0.940204,0.640243
24,partial_ratio,token_sort_ratio,80,0.482556,0.482556,0.941476,0.638069
28,partial_ratio,token_set_ratio,80,0.318293,0.318293,0.996183,0.482440


In [11]:
# the model with the highest f1 score of 96% is the one with the following parameters
title_process = 'ratio'
artist_process = 'token_set_ratio'
THESHOLD = 95

test_data = pd.read_csv('test_data.csv').set_index('target_id')

for row in tqdm(tracks.index):

    clean_title = tracks.loc[row, 'clean_title']
    clean_artist = tracks.loc[row, 'clean_artist']
    prediction = fuzzy_match(clean_title, clean_artist, THESHOLD, title_process, artist_process)    
    prediction = [x for x in prediction if x != row]

    answer = json.loads(test_data.loc[row,'match_id'])

    # true positive is the number of track names in the prediction that are also in the answer
    test_data.loc[row,'true_positive'] = len([i for i in prediction if i in answer])

    # false positive is the number of track names in the prediction that are not in the answer
    test_data.loc[row,'false_positive'] = len([i for i in prediction if i not in answer])

    # false negative is the number of tracks in the answer that are not in the prediction
    test_data.loc[row,'false_negative'] = len([i for i in answer if i not in prediction])

    test_data.loc[row,'total_predictions'] = len(prediction)
        
        
# calculate the accuracy of the model
acc = test_data['true_positive'].sum() / (test_data['total_predictions'].sum())

print(f'Accuracy of the model: {acc:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

# calculate the precision of the model
precision = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_positive'].sum())

print(f'Precision of the model: {precision:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

# calculate the recall of the model
recall = test_data['true_positive'].sum() / (test_data['true_positive'].sum() + test_data['false_negative'].sum())

print(f'Recall of the model: {recall:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')

# calculate the f1 score of the model
f1 = 2 * (precision * recall) / (precision + recall)

print(f'F1 score of the model: {f1:.2%} for title process {title_process} and artist_process {artist_process} with threshold {THESHOLD}')



100%|██████████| 1674/1674 [00:54<00:00, 30.92it/s]


Accuracy of the model: 93.30% for title process ratio and artist_process token_set_ratio with threshold 95
Precision of the model: 93.30% for title process ratio and artist_process token_set_ratio with threshold 95
Recall of the model: 97.39% for title process ratio and artist_process token_set_ratio with threshold 95
F1 score of the model: 95.30% for title process ratio and artist_process token_set_ratio with threshold 95


In [12]:
for row in test_data.index:
    precision = test_data.loc[row, 'true_positive'] / (test_data.loc[row, 'true_positive'] + test_data.loc[row, 'false_positive'])
    recall = test_data.loc[row, 'true_positive'] / (test_data.loc[row, 'true_positive'] + test_data.loc[row, 'false_negative'])
    f1 = 2 * (precision * recall) / (precision + recall)
    test_data.loc[row, 'precision'] = precision
    test_data.loc[row, 'recall'] = recall
    test_data.loc[row, 'f1'] = f1

  precision = test_data.loc[row, 'true_positive'] / (test_data.loc[row, 'true_positive'] + test_data.loc[row, 'false_positive'])
  f1 = 2 * (precision * recall) / (precision + recall)


In [13]:
test_data

Unnamed: 0_level_0,match_id,true_positive,false_positive,false_negative,total_predictions,precision,recall,f1
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
70001,[4166386],1.0,0.0,0.0,1.0,1.0,1.0,1.0
70009,[1982018],1.0,0.0,0.0,1.0,1.0,1.0,1.0
70010,[1364087],1.0,0.0,0.0,1.0,1.0,1.0,1.0
70035,[16878979],1.0,0.0,0.0,1.0,1.0,1.0,1.0
70040,[4165828],1.0,0.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
1732671,[73053],1.0,0.0,0.0,1.0,1.0,1.0,1.0
11946049,[73061],0.0,0.0,1.0,0.0,,0.0,
3369917,[73071],0.0,0.0,1.0,0.0,,0.0,
8386151,[73086],1.0,0.0,0.0,1.0,1.0,1.0,1.0


In [14]:
test_data.sort_values('f1', ascending=True).head(20)

Unnamed: 0_level_0,match_id,true_positive,false_positive,false_negative,total_predictions,precision,recall,f1
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6815911,"[70519, 3865574]",2.0,9.0,0.0,11.0,0.181818,1.0,0.307692
70519,"[6815911, 3865574]",2.0,9.0,0.0,11.0,0.181818,1.0,0.307692
3865574,"[70519, 6815911]",2.0,9.0,0.0,11.0,0.181818,1.0,0.307692
5644219,"[72592, 10799206, 8126306, 1482528]",1.0,0.0,3.0,1.0,1.0,0.25,0.4
8126306,"[72592, 10799206, 5644219, 1482528]",1.0,0.0,3.0,1.0,1.0,0.25,0.4
72592,"[10799206, 8126306, 5644219, 1482528]",1.0,0.0,3.0,1.0,1.0,0.25,0.4
72142,[8459677],1.0,3.0,0.0,4.0,0.25,1.0,0.4
8459677,[72142],1.0,3.0,0.0,4.0,0.25,1.0,0.4
1482528,"[72592, 10799206, 8126306, 5644219]",1.0,0.0,3.0,1.0,1.0,0.25,0.4
1704648,"[70447, 1818170]",1.0,2.0,1.0,3.0,0.333333,0.5,0.4


In [15]:
row = 5644219

pd.read_csv('track_name_list.csv').set_index('id').loc[row]

name        Elektro - Nicola Fasano South Beach Radio Mix
artist                    Mr. Gee, Nicola Fasano, Outwork
platform                                     SpotifyTrack
Name: 5644219, dtype: object

In [16]:
tracks.loc[json.loads(test_data.loc[row,"match_id"])]

Unnamed: 0_level_0,name,artist,platform,title,mood,clean_title,clean_artist,remixer,version,featuredArtist,subTitle,quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
72592,Elektro (Nicola Fasano South Beach Radio Mix) ...,"[Mister Gee, Mr. Gee, Outwork]",Track,[Elektro],,elektro nicola fasano south beach nicola fasan...,mister gee mr. gee outwork,"[Nicola Fasano, Nicola Fasano South Beach]",[Radio],,,
10799206,Elektro - Feat. Mr Gee,"[Nicola Fasano, Outwork]",SpotifyTrack,[Elektro],,elektro,nicola fasano outwork,,,[Mr Gee],,
8126306,Elektro - Nicola Fasano South Beach Radio Mix,"[Mr. Gee, Nicola Fasano, Outwork]",SpotifyTrack,[Elektro],,elektro nicola fasano south beach remix,mr. gee nicola fasano outwork,[Nicola Fasano South Beach],[Radio],,,
1482528,Elektro (Nicola Fasano South Beach Radio Mix) ...,"[Mr. Gee, Outwork]",SpotifyTrack,[Elektro],,elektro nicola fasano south beach nicola fasan...,mr. gee outwork,"[Nicola Fasano, Nicola Fasano South Beach]",[Radio],,,


In [278]:
title_process = 'ratio'
artist_process = 'token_set_ratio'

clean_title = tracks.loc[row, 'clean_title']
clean_artist = tracks.loc[row, 'clean_artist']
prediction = fuzzy_match(clean_title, clean_artist, THESHOLD, title_process, artist_process)    
prediction = [x for x in prediction if x != row]
tracks.loc[prediction]


Unnamed: 0_level_0,name,artist,platform,title,mood,clean_title,clean_artist,remixer,version,featuredArtist,subTitle,quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
70447,Tonight,"[Deaf'n Dumb Crew, Sébastien Léger]",Track,[Tonight],,tonight,deaf'n dumb crew sébastien léger,,,,,
1704648,Tonight,[Sébastien Léger],SpotifyTrack,[Tonight],,tonight,sébastien léger,,,,,
3798201,Tonight,[Sébastien Léger],SpotifyTrack,[Tonight],,tonight,sébastien léger,,,,,


The third pipeline was the best as it finished with an f1 score of 95%. This could potentially be improved upon by improving the Name Entity Recognition model.