In [127]:
from fuzzywuzzy import fuzz
import pandas as pd
import re
import unicodedata

In [128]:
df = pd.read_parquet("artists_with_infos.parquet")

In [129]:
df.head()

Unnamed: 0,Artist,Spotify Name,Genres,Popularity
0,Bono,Bonobo,"[downtempo, electronica, jazztronica, nu jazz,...",67.0
1,Motörhead,Motörhead,"[album rock, hard rock, metal, rock, speed metal]",68.0
2,Nils Petter Molvær,Nils Petter Molvær,"[contemporary jazz, ecm-style jazz, fourth wor...",39.0
3,Color Me Badd,Color Me Badd,"[boy band, contemporary r&b, new jack swing, r...",53.0
4,Trumans Water,Trumans Water,[noise rock],7.0


In [130]:
def compute_similarity(s1, s2):
    if s1 is None or s2 is None:
        return 0
    else:
        return fuzz.ratio(s1.lower(), s2.lower())

def preprocess_string(s):

    if isinstance(s, str):
        s = s.replace("$", "s")
        s = s.replace("!", "i")
        s = s.replace("/", " ")

        s = re.sub(r'[^\w\s]', '', s)
        s = s.replace("&", "and")
        s = s.lower()
        return s
    else:

        return ""

In [131]:
df["Similarity"] = df.apply(lambda row: compute_similarity(row["Artist"], row["Spotify Name"]), axis=1)

df_filtered = df[df["Similarity"] >= 95]

df_leftover = df[df["Similarity"] < 95]

print(len(df), len(df_filtered), len(df_leftover))

13661 9890 3771


In [132]:
df_leftover2 = df_leftover.copy(deep=True)
df_leftover2.loc[:, "Artist2"] = df_leftover2.loc[:, "Artist"].apply(preprocess_string)
df_leftover2.loc[:, "Spotify Name2"] = df_leftover2.loc[:, "Spotify Name"].apply(preprocess_string)
df_leftover2["Similarity2"] = df_leftover2.apply(lambda row: compute_similarity(row["Artist2"], row["Spotify Name2"]), axis=1)

In [133]:
#df_leftover2[df_leftover2["Similarity"] < df_leftover2["Similarity2"]]

df_leftover2_pickup = df_leftover2[df_leftover2["Similarity"] < df_leftover2["Similarity2"]]
df_leftover3a = df_leftover2[df_leftover2["Similarity"] >= df_leftover2["Similarity2"]]
df_filtered2 = df_leftover2_pickup[df_leftover2_pickup["Similarity2"] >= 95]
df_leftover3b = df_leftover2_pickup[df_leftover2_pickup["Similarity2"] < 95]

df_leftover3 = pd.concat([df_leftover3a, df_leftover3b])

print(len(df_leftover2), len(df_filtered2), len(df_leftover3))

df_filtered2

3771 102 3669


Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2
486,Lil' Keke,Lil’ Keke,"[chopped and screwed, crunk, dirty south rap, ...",55.0,89,lil keke,lil keke,100
546,N*E*R*D,N.E.R.D,"[hip hop, virginia hip hop]",60.0,57,nerd,nerd,100
570,Pan?American,Pan-American,"[ambient, compositional ambient, dark jazz, dr...",32.0,92,panamerican,panamerican,100
718,UK Subs,U.K. Subs,"[oi, pub rock, punk, uk82]",37.0,88,uk subs,uk subs,100
752,Joan Jett and the Blackhearts,Joan Jett & the Blackhearts,"[glam punk, rock]",67.0,93,joan jett and the blackhearts,joan jett the blackhearts,95
...,...,...,...,...,...,...,...,...
13121,Mitch Ryder & The Detroit Wheels,Mitch Ryder and The Detroit Wheels,[detroit rock],31.0,94,mitch ryder the detroit wheels,mitch ryder and the detroit wheels,95
13167,Ke$ha,Kesha,"[dance pop, pop, post-teen pop]",78.0,80,kesha,kesha,100
13232,Meade Lux Lewis,"Meade ""Lux"" Lewis","[boogie-woogie, piano blues, stride]",24.0,94,meade lux lewis,meade lux lewis,100
13547,Lil Troy,Lil' Troy,"[dirty south rap, gangster rap, houston rap, s...",52.0,94,lil troy,lil troy,100


In [134]:
df_leftover3

Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2
0,Bono,Bonobo,"[downtempo, electronica, jazztronica, nu jazz,...",67.0,80,bono,bonobo,80
6,Can,Cannons,,63.0,60,can,cannons,60
9,Justin Timberlake duet with Beyonce,Who TF Is Justin Time?,"[country rap, redneck]",49.0,39,justin timberlake duet with beyonce,who tf is justin time,39
19,Hana and the Goose,HANA,[dark pop],47.0,36,hana and the goose,hana,36
28,Paul Revere & Mark Lindsay formerly of Paul Re...,Paul Revere & The Raiders,"[bubblegum pop, classic garage rock, classic r...",42.0,56,paul revere mark lindsay formerly of paul rev...,paul revere the raiders,56
...,...,...,...,...,...,...,...,...
13561,Stones & feathers,The Rolling Stones,"[british invasion, classic rock, rock]",81.0,40,stones feathers,the rolling stones,41
13571,J Nash,J-Nasty,,30.0,62,j nash,jnasty,67
13586,CORNELIUS,Cornelius Brothers & Sister Rose,[classic soul],44.0,44,cornelius,cornelius brothers sister rose,45
13608,Johnny Hammond,"Johnny ""Hammond"" Smith","[hammond organ, jazz funk, jazz organ, soul jazz]",30.0,78,johnny hammond,johnny hammond smith,82


In [135]:
print(df_leftover3.isnull().sum())
df_leftover3[df_leftover3.isnull().any(axis=1)]

df_leftover3_pickup = df_leftover3.dropna()

print(len(df_leftover3), len(df_leftover3_pickup))

Artist              0
Spotify Name      683
Genres           1133
Popularity        722
Similarity          0
Artist2             0
Spotify Name2       0
Similarity2         0
dtype: int64
3669 2530


In [136]:
df_leftover3_pickup

Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2
0,Bono,Bonobo,"[downtempo, electronica, jazztronica, nu jazz,...",67.0,80,bono,bonobo,80
9,Justin Timberlake duet with Beyonce,Who TF Is Justin Time?,"[country rap, redneck]",49.0,39,justin timberlake duet with beyonce,who tf is justin time,39
19,Hana and the Goose,HANA,[dark pop],47.0,36,hana and the goose,hana,36
28,Paul Revere & Mark Lindsay formerly of Paul Re...,Paul Revere & The Raiders,"[bubblegum pop, classic garage rock, classic r...",42.0,56,paul revere mark lindsay formerly of paul rev...,paul revere the raiders,56
38,Jeff Beck Group,Jeff Beck,"[album rock, art rock, blues rock, british blu...",62.0,75,jeff beck group,jeff beck,75
...,...,...,...,...,...,...,...,...
13555,Of Mice And Men,Of Mice & Men,"[melodic metalcore, metalcore, post-screamo, s...",63.0,86,of mice and men,of mice men,89
13561,Stones & feathers,The Rolling Stones,"[british invasion, classic rock, rock]",81.0,40,stones feathers,the rolling stones,41
13586,CORNELIUS,Cornelius Brothers & Sister Rose,[classic soul],44.0,44,cornelius,cornelius brothers sister rose,45
13608,Johnny Hammond,"Johnny ""Hammond"" Smith","[hammond organ, jazz funk, jazz organ, soul jazz]",30.0,78,johnny hammond,johnny hammond smith,82


In [137]:
def split_artist_name(name):
    words = name.split()
    normalized_words = []
    for word in words:
        normalized_word = ""
        for char in word:
            if unicodedata.category(char)[0] != 'M':
                normalized_word += unicodedata.normalize('NFKD', char).encode('ASCII', 'ignore').decode('utf-8')
        normalized_words.append(normalized_word)
    return normalized_words
    
# def find_longest_consecutive_match(words1, words2):

#     i, j = 0, 0
#     longest_match, current_match = 0, 0
    
#     while i < len(words1) and j < len(words2):
#         if words1[i] == words2[j]:
#             current_match += 1
#             longest_match = max(longest_match, current_match)
#         else:
#             current_match = 0
#         i += 1
#         j += 1
    
#     return longest_match

def find_longest_consecutive_match(words1, words2):
    longest_match = 0
    for i in range(len(words1)):
        for j in range(len(words2)):
            if words1[i] == words2[j]:
                current_match = 1
                while (i + current_match < len(words1)
                        and j + current_match < len(words2)
                        and words1[i + current_match] == words2[j + current_match]):
                    current_match += 1
                longest_match = max(longest_match, current_match)
    return longest_match

In [138]:
df_leftover3_pickup_test = df_leftover3_pickup.copy(deep=True)

df_leftover3_pickup_test["Artist2_Key"] = df_leftover3_pickup_test["Artist2"].apply(split_artist_name)
df_leftover3_pickup_test["Spotify2_Key"] = df_leftover3_pickup_test["Spotify Name2"].apply(split_artist_name)

df_leftover3_pickup_test

df_leftover3_pickup_test["Longest_Consecutive_Words"] = df_leftover3_pickup_test.apply(
    lambda x: find_longest_consecutive_match(x["Artist2_Key"], x["Spotify2_Key"]), axis=1)

df_leftover3_pickup_test[df_leftover3_pickup_test["Longest_Consecutive_Words"] >= 2]

Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2,Artist2_Key,Spotify2_Key,Longest_Consecutive_Words
28,Paul Revere & Mark Lindsay formerly of Paul Re...,Paul Revere & The Raiders,"[bubblegum pop, classic garage rock, classic r...",42.0,56,paul revere mark lindsay formerly of paul rev...,paul revere the raiders,56,"[paul, revere, mark, lindsay, formerly, of, pa...","[paul, revere, the, raiders]",4
38,Jeff Beck Group,Jeff Beck,"[album rock, art rock, blues rock, british blu...",62.0,75,jeff beck group,jeff beck,75,"[jeff, beck, group]","[jeff, beck]",2
42,Frankie Trumbauer,Frankie Trumbauer And His Orchestra,[man's orchestra],15.0,65,frankie trumbauer,frankie trumbauer and his orchestra,65,"[frankie, trumbauer]","[frankie, trumbauer, and, his, orchestra]",2
63,DJ Magic,DJ Magic Mike,"[electro, miami bass, trival]",33.0,76,dj magic,dj magic mike,76,"[dj, magic]","[dj, magic, mike]",2
66,Esme Patterson,Esmé Patterson,[new americana],33.0,93,esme patterson,esmé patterson,93,"[esme, patterson]","[esme, patterson]",2
...,...,...,...,...,...,...,...,...,...,...,...
13505,Jah Wobble & The Invaders of the Heart,Jah Wobble,"[experimental dub, uk dub]",32.0,42,jah wobble the invaders of the heart,jah wobble,43,"[jah, wobble, the, invaders, of, the, heart]","[jah, wobble]",2
13541,Ronnie Earl & the Broadcasters,Ronnie Earl,"[blues, blues rock, electric blues, modern blues]",38.0,54,ronnie earl the broadcasters,ronnie earl,55,"[ronnie, earl, the, broadcasters]","[ronnie, earl]",2
13555,Of Mice And Men,Of Mice & Men,"[melodic metalcore, metalcore, post-screamo, s...",63.0,86,of mice and men,of mice men,89,"[of, mice, and, men]","[of, mice, men]",2
13608,Johnny Hammond,"Johnny ""Hammond"" Smith","[hammond organ, jazz funk, jazz organ, soul jazz]",30.0,78,johnny hammond,johnny hammond smith,82,"[johnny, hammond]","[johnny, hammond, smith]",2


In [139]:
df_filtered3 = df_leftover3_pickup_test[df_leftover3_pickup_test["Longest_Consecutive_Words"] >= 2]
df_leftover4 = df_leftover3_pickup_test[df_leftover3_pickup_test["Longest_Consecutive_Words"] < 2]

print(len(df_filtered3), len(df_leftover4))

1121 1409


In [140]:
df_leftover4

Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2,Artist2_Key,Spotify2_Key,Longest_Consecutive_Words
0,Bono,Bonobo,"[downtempo, electronica, jazztronica, nu jazz,...",67.0,80,bono,bonobo,80,[bono],[bonobo],0
9,Justin Timberlake duet with Beyonce,Who TF Is Justin Time?,"[country rap, redneck]",49.0,39,justin timberlake duet with beyonce,who tf is justin time,39,"[justin, timberlake, duet, with, beyonce]","[who, tf, is, justin, time]",1
19,Hana and the Goose,HANA,[dark pop],47.0,36,hana and the goose,hana,36,"[hana, and, the, goose]",[hana],1
64,Come,Comethazine,"[meme rap, rap, trap, underground hip hop, vap...",67.0,53,come,comethazine,53,[come],[comethazine],0
74,Sister C,Sister Crayon,[sacramento indie],24.0,76,sister c,sister crayon,76,"[sister, c]","[sister, crayon]",1
...,...,...,...,...,...,...,...,...,...,...,...
13287,Brick & Lace,Brick + Mortar,"[asbury park indie, modern alternative rock, m...",55.0,62,brick lace,brick mortar,67,"[brick, lace]","[brick, mortar]",1
13296,Biosphere & Higher Intelligence Agency,biosphere,"[lo-fi beats, lo-fi rap]",52.0,38,biosphere higher intelligence agency,biosphere,39,"[biosphere, higher, intelligence, agency]",[biosphere],1
13299,Wendy & Lisa,WENDY,"[k-pop, korean pop]",58.0,59,wendy lisa,wendy,62,"[wendy, lisa]",[wendy],1
13561,Stones & feathers,The Rolling Stones,"[british invasion, classic rock, rock]",81.0,40,stones feathers,the rolling stones,41,"[stones, feathers]","[the, rolling, stones]",1


In [141]:
df_leftover4

Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2,Artist2_Key,Spotify2_Key,Longest_Consecutive_Words
0,Bono,Bonobo,"[downtempo, electronica, jazztronica, nu jazz,...",67.0,80,bono,bonobo,80,[bono],[bonobo],0
9,Justin Timberlake duet with Beyonce,Who TF Is Justin Time?,"[country rap, redneck]",49.0,39,justin timberlake duet with beyonce,who tf is justin time,39,"[justin, timberlake, duet, with, beyonce]","[who, tf, is, justin, time]",1
19,Hana and the Goose,HANA,[dark pop],47.0,36,hana and the goose,hana,36,"[hana, and, the, goose]",[hana],1
64,Come,Comethazine,"[meme rap, rap, trap, underground hip hop, vap...",67.0,53,come,comethazine,53,[come],[comethazine],0
74,Sister C,Sister Crayon,[sacramento indie],24.0,76,sister c,sister crayon,76,"[sister, c]","[sister, crayon]",1
...,...,...,...,...,...,...,...,...,...,...,...
13287,Brick & Lace,Brick + Mortar,"[asbury park indie, modern alternative rock, m...",55.0,62,brick lace,brick mortar,67,"[brick, lace]","[brick, mortar]",1
13296,Biosphere & Higher Intelligence Agency,biosphere,"[lo-fi beats, lo-fi rap]",52.0,38,biosphere higher intelligence agency,biosphere,39,"[biosphere, higher, intelligence, agency]",[biosphere],1
13299,Wendy & Lisa,WENDY,"[k-pop, korean pop]",58.0,59,wendy lisa,wendy,62,"[wendy, lisa]",[wendy],1
13561,Stones & feathers,The Rolling Stones,"[british invasion, classic rock, rock]",81.0,40,stones feathers,the rolling stones,41,"[stones, feathers]","[the, rolling, stones]",1


In [158]:
strings_to_search = ['and', 'the']
df_leftover4_test = df_leftover4[df_leftover4['Artist2_Key'].apply(lambda x: any(elem.lower() in strings_to_search for elem in x)) | 
                 df_leftover4['Spotify2_Key'].apply(lambda x: any(elem.lower() in strings_to_search for elem in x))]
df_leftover4_test


Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2,Artist2_Key,Spotify2_Key,Longest_Consecutive_Words
19,Hana and the Goose,HANA,[dark pop],47.0,36,hana and the goose,hana,36,"[hana, and, the, goose]",[hana],1
76,The Faces,Faces,"[album rock, art rock, blues rock, classic roc...",60.0,71,the faces,faces,71,"[the, faces]",[faces],1
92,The Trash Can Sinatras,Trashcan Sinatras,"[britpop, c86, chamber pop, power pop, scottis...",33.0,87,the trash can sinatras,trashcan sinatras,87,"[the, trash, can, sinatras]","[trashcan, sinatras]",1
115,The Stevens,The Stevenson Ranch Davidians,[psych gaze],18.0,55,the stevens,the stevenson ranch davidians,55,"[the, stevens]","[the, stevenson, ranch, davidians]",1
154,Vibrators,The Vibrators,"[hardcore punk, oi, pub rock, punk]",34.0,82,vibrators,the vibrators,82,[vibrators],"[the, vibrators]",1
...,...,...,...,...,...,...,...,...,...,...,...
12629,Seals & Crofts,Seals and Crofts,"[adult standards, album rock, bubblegum pop, c...",58.0,87,seals crofts,seals and crofts,90,"[seals, crofts]","[seals, and, crofts]",1
12799,Sly And Robbie,Sly & Robbie,"[dub, reggae, roots reggae]",46.0,85,sly and robbie,sly robbie,88,"[sly, and, robbie]","[sly, robbie]",1
12880,Angels and Airwaves,Angels & Airwaves,"[pop punk, socal pop punk]",59.0,89,angels and airwaves,angels airwaves,91,"[angels, and, airwaves]","[angels, airwaves]",1
13000,Renaldo & the Loaf,Renaldo Domino,[traditional soul],15.0,56,renaldo the loaf,renaldo domino,58,"[renaldo, the, loaf]","[renaldo, domino]",1


In [159]:
pattern = r'\b(?:{})\b'.format('|'.join(strings_to_search))
replace_func = lambda lst: ''.join([re.sub(pattern, '', s, flags=re.IGNORECASE).strip() for s in lst])
df_leftover4_test['Artist2_Key_NoStopWords'] = df_leftover4_test['Artist2_Key'].apply(replace_func).astype(str)
df_leftover4_test['Spotify2_Key_NoStopWords'] = df_leftover4_test['Spotify2_Key'].apply(replace_func).astype(str)


df_leftover4_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leftover4_test['Artist2_Key_NoStopWords'] = df_leftover4_test['Artist2_Key'].apply(replace_func).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leftover4_test['Spotify2_Key_NoStopWords'] = df_leftover4_test['Spotify2_Key'].apply(replace_func).astype(str)


Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2,Artist2_Key,Spotify2_Key,Longest_Consecutive_Words,Artist2_Key_NoStopWords,Spotify2_Key_NoStopWords
19,Hana and the Goose,HANA,[dark pop],47.0,36,hana and the goose,hana,36,"[hana, and, the, goose]",[hana],1,hanagoose,hana
76,The Faces,Faces,"[album rock, art rock, blues rock, classic roc...",60.0,71,the faces,faces,71,"[the, faces]",[faces],1,faces,faces
92,The Trash Can Sinatras,Trashcan Sinatras,"[britpop, c86, chamber pop, power pop, scottis...",33.0,87,the trash can sinatras,trashcan sinatras,87,"[the, trash, can, sinatras]","[trashcan, sinatras]",1,trashcansinatras,trashcansinatras
115,The Stevens,The Stevenson Ranch Davidians,[psych gaze],18.0,55,the stevens,the stevenson ranch davidians,55,"[the, stevens]","[the, stevenson, ranch, davidians]",1,stevens,stevensonranchdavidians
154,Vibrators,The Vibrators,"[hardcore punk, oi, pub rock, punk]",34.0,82,vibrators,the vibrators,82,[vibrators],"[the, vibrators]",1,vibrators,vibrators
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12629,Seals & Crofts,Seals and Crofts,"[adult standards, album rock, bubblegum pop, c...",58.0,87,seals crofts,seals and crofts,90,"[seals, crofts]","[seals, and, crofts]",1,sealscrofts,sealscrofts
12799,Sly And Robbie,Sly & Robbie,"[dub, reggae, roots reggae]",46.0,85,sly and robbie,sly robbie,88,"[sly, and, robbie]","[sly, robbie]",1,slyrobbie,slyrobbie
12880,Angels and Airwaves,Angels & Airwaves,"[pop punk, socal pop punk]",59.0,89,angels and airwaves,angels airwaves,91,"[angels, and, airwaves]","[angels, airwaves]",1,angelsairwaves,angelsairwaves
13000,Renaldo & the Loaf,Renaldo Domino,[traditional soul],15.0,56,renaldo the loaf,renaldo domino,58,"[renaldo, the, loaf]","[renaldo, domino]",1,renaldoloaf,renaldodomino


In [162]:
df_filtered4 = df_leftover4_test[df_leftover4_test['Artist2_Key_NoStopWords'] == df_leftover4_test['Spotify2_Key_NoStopWords']]
df_filtered4


Unnamed: 0,Artist,Spotify Name,Genres,Popularity,Similarity,Artist2,Spotify Name2,Similarity2,Artist2_Key,Spotify2_Key,Longest_Consecutive_Words,Artist2_Key_NoStopWords,Spotify2_Key_NoStopWords
76,The Faces,Faces,"[album rock, art rock, blues rock, classic roc...",60.0,71,the faces,faces,71,"[the, faces]",[faces],1,faces,faces
92,The Trash Can Sinatras,Trashcan Sinatras,"[britpop, c86, chamber pop, power pop, scottis...",33.0,87,the trash can sinatras,trashcan sinatras,87,"[the, trash, can, sinatras]","[trashcan, sinatras]",1,trashcansinatras,trashcansinatras
154,Vibrators,The Vibrators,"[hardcore punk, oi, pub rock, punk]",34.0,82,vibrators,the vibrators,82,[vibrators],"[the, vibrators]",1,vibrators,vibrators
379,Spaniels,The Spaniels,"[doo-wop, rhythm and blues]",38.0,80,spaniels,the spaniels,80,[spaniels],"[the, spaniels]",1,spaniels,spaniels
665,Honeydogs,The Honeydogs,[minneapolis indie],12.0,82,honeydogs,the honeydogs,82,[honeydogs],"[the, honeydogs]",1,honeydogs,honeydogs
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9551,Chris And Cosey,Chris & Cosey,"[british experimental, british industrial, ebm...",29.0,86,chris and cosey,chris cosey,89,"[chris, and, cosey]","[chris, cosey]",1,chriscosey,chriscosey
10807,Barkays,The Bar-Kays,"[classic soul, disco, electro, funk, memphis s...",46.0,74,barkays,the barkays,78,[barkays],"[the, barkays]",1,barkays,barkays
12629,Seals & Crofts,Seals and Crofts,"[adult standards, album rock, bubblegum pop, c...",58.0,87,seals crofts,seals and crofts,90,"[seals, crofts]","[seals, and, crofts]",1,sealscrofts,sealscrofts
12799,Sly And Robbie,Sly & Robbie,"[dub, reggae, roots reggae]",46.0,85,sly and robbie,sly robbie,88,"[sly, and, robbie]","[sly, robbie]",1,slyrobbie,slyrobbie


In [167]:
df_leftover4_r = df_leftover4[~df_leftover4.isin(df_filtered4)].dropna()
print(len(df_leftover4), len(df_leftover4_r), len(df_filtered4))

1409 1328 81


In [176]:
df_result = pd.concat([df_filtered, df_filtered2, df_filtered3, df_filtered4])
print(len(df), len(df_result))
df_result = df_result.iloc[:, :4]
df_result



13661 11194


Unnamed: 0,Artist,Spotify Name,Genres,Popularity
1,Motörhead,Motörhead,"[album rock, hard rock, metal, rock, speed metal]",68.0
2,Nils Petter Molvær,Nils Petter Molvær,"[contemporary jazz, ecm-style jazz, fourth wor...",39.0
3,Color Me Badd,Color Me Badd,"[boy band, contemporary r&b, new jack swing, r...",53.0
4,Trumans Water,Trumans Water,[noise rock],7.0
5,Cliff Eberhardt,Cliff Eberhardt,[contemporary folk],10.0
...,...,...,...,...
9551,Chris And Cosey,Chris & Cosey,"[british experimental, british industrial, ebm...",29.0
10807,Barkays,The Bar-Kays,"[classic soul, disco, electro, funk, memphis s...",46.0
12629,Seals & Crofts,Seals and Crofts,"[adult standards, album rock, bubblegum pop, c...",58.0
12799,Sly And Robbie,Sly & Robbie,"[dub, reggae, roots reggae]",46.0


In [177]:
df_result.to_parquet("filtered_artists_with_infos.parquet")
