In [1]:
import numpy as np
import pandas as pd
import re

data = pd.read_csv('spotify-compressed.csv')
data = data.drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,artistname,user,artist-value
0,Elvis Costello,0,3
1,Elvis Costello & The Attractions,0,3
2,Tiffany Page,0,4
3,Lissie,0,8
4,Paul McCartney,0,4


In [2]:
print(data.shape)

# Removing blanks
data['artistname'] = data['artistname'].replace(' ', np.nan)
data = data.dropna()

# Regex time, dropping any observations with only blank spaces
data['artistname'] = data['artistname'].replace(r'^\s*$', np.nan, regex=True)
data = data.dropna()

print(data.shape)
print(data.dtypes)

(3285631, 3)
(3285588, 3)
artistname      object
user             int64
artist-value     int64
dtype: object


In [3]:
# Changing '&' to 'and' so later on, the character will not be removed
data['artistname'] = data['artistname'].str.replace('&', 'and')

# Changing ` to '
# Removing the '
data['artistname'] = data['artistname'].str.replace('`', "'")
data['artistname'] = data['artistname'].str.replace("'", '')
data.head()

Unnamed: 0,artistname,user,artist-value
0,Elvis Costello,0,3
1,Elvis Costello and The Attractions,0,3
2,Tiffany Page,0,4
3,Lissie,0,8
4,Paul McCartney,0,4


In [4]:
import unicodedata
import re

def remove_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

pattern = re.compile(r'[^\w$!@|,\']+', re.UNICODE)

data['artistname'] = data['artistname'].apply(remove_accents)
data['artistname'] = data['artistname'].str.replace(pattern, ' ', regex=True)


In [5]:
data['artistname']= data['artistname'].str.replace('Ø', 'o')
data['artistname']= data['artistname'].str.replace('ø', 'o')
data['artistname'] = data['artistname'].str.replace(r'(?<!A\$AP\s)Rocky', 'A$AP Rocky', regex=True)
data['artistname'] = data['artistname'].str.replace(r'(?<!A\$AP\s)Rockie', 'A$AP Rocky', regex=True)
data['artistname'] = data['artistname'].str.replace(r'(?<!A\$AP\s)Ferg', 'A$AP Ferg', regex=True)
data['artistname']= data['artistname'].str.replace('ASAP ', '')
data['artistname']= data['artistname'].str.upper()
data['artistname']= data['artistname'].str.replace('2PAC', '2 PAC')
data['artistname']= data['artistname'].str.replace(' FEATURE ', 'FEAT')
data['artistname']= data['artistname'].str.replace(' FEATURING ', 'FEAT')
data['artistname']= data['artistname'].str.replace(' FT ', 'FEAT')


In [6]:
# Splitting all observations on feat or Feat or feat. or Feat. to split and create a new observation.
# Holding user and artist-value the same for the dupe.
# Define the pattern
pattern = r'\s+(FEAT\.?|\&)\s+'

# Split the artistname column based on the pattern
data['artistname'] = data['artistname'].str.split(pattern)

# Repeat the other columns for each split
data = data.explode('artistname').reset_index(drop=True)
data = data.reindex(data.index.repeat(data['user'].apply(lambda x: len(x) if isinstance(x, list) else 1)))
data['artistname'] = data['artistname'].str.strip()
data['user'] = data['user'].astype(int)

In [7]:
# Preview what will be dropped
dropped = data[data['artistname'].str.contains(r'[^\x20-\x7E]')]
sample = dropped['artistname'].sample(n=25)
print(sample)

2369663                                    葉麗儀
1962541          محمد السالم MOHAMMAD AL SALEM
2856085                               TANK 呂建中
2964476                        СЕРГЕИ ТРОФИМОВ
249662     HIJOS DEL 5² PATIO MALDITA VECINDAD
1307887                                     王菲
245823                              WANNSKRÆKK
1657102                           MICHAŁ BAJOR
2940161                 СМЫСЛОВЫЕ ГАЛЛЮЦИНАЦИИ
1974040                                    王若琳
2430545                          CÆCILIE NORBY
1626548              소녀시대 GIRLS GENERATION
2158684                               소녀시대
2546840                         CŒUR DE PIRATE
1197021                                     白光
1169018                                คาราบาว
709848                                      李玟
233532                               АНИ ЛОРАК
2451373                                    莫文蔚
2233812                     WITOLD LUTOSŁAWSKI
1690460                                  HÆLOS
1187960      

In [8]:
print(data.shape)

# Deleting anything that has a weird notation or emojis, such as any character that is not in the English alphabet, a number, or a printable character.
# create a boolean mask for the observations that match the pattern
mask = data['artistname'].str.contains(r'[^\x20-\x7E]')
data = data[~mask]
data = data.dropna()

# Regex time, dropping any observations with only blank spaces
data['artistname'] = data['artistname'].replace(r'^\s*$', np.nan, regex=True)

# Drop observations that contain only FEAT, this is from the split I did earlier
data['artistname'] = data['artistname'].replace('FEAT', np.nan, regex=True)

data = data.dropna()

# Dropping all observations that contain a , as that is obscure/rare. 
# If an artist was a feature it should have been listed as so.
data = data[~data['artistname'].str.contains(',')]

data = data.dropna()

print(data.shape)

(3333132, 3)
(3274305, 3)


In [9]:
# Dropping observations that only contain numbers in the artist name column
data = data[~data['artistname'].str.isdigit()]

print(data.shape)

(3272298, 3)


In [10]:
# Drops
data = data[data['artistname'] != 'VARIOUS ARTIST']
data = data[data['artistname'] != 'VARIOUS ARTISTS']
data = data[data['artistname'] != 'ARTIST']


In [11]:
grouped = data.groupby('artistname')['artist-value'].sum().reset_index()

# drop all artist below 50 plays
grouped = grouped[grouped['artist-value'] >= 50]

# Sorting 
grouped = grouped.sort_values('artist-value', ascending=True)

# mask of names to keep
artist_keep = grouped['artistname']

# applying the mask
data = data[data['artistname'].isin(artist_keep)]

In [12]:
data['artistname'] = data['artistname'].str.title()

In [13]:
# Creating the sparse matrix
sparse_df = data.pivot_table(index='artistname', columns='user', values='artist-value', fill_value=0)

In [14]:
# checking to make sure the columns contain at least one value.
zero_sum_columns = sparse_df.columns[sparse_df.sum() == 0].to_list()

In [15]:
print(sparse_df.iloc[:5,:5])

user              0  1  2  3  4
artistname                     
!!!               0  0  0  0  0
!Dela Dap         0  0  0  0  0
007 Collective    0  0  0  0  0
009 Sound System  0  0  0  0  0
1 Dads            0  0  0  0  0


In [16]:
sparse_df = sparse_df.reset_index()

In [17]:
print(sparse_df['artistname'].head(20))

0                                  !!!
1                            !Dela Dap
2                       007 Collective
3                     009 Sound System
4                               1 Dads
5                           1 Hits Now
6                             10 Years
7                                 100S
8     101 Classical Music Masterpieces
9                101 Strings Orchestra
10                                10Cc
11                          116 Clique
12                           12 Rounds
13                           12 Stones
14                            120 Days
15                     1200 Micrograms
16                         12Th Planet
17                             13 Cats
18                           140 Farts
19                                14Kt
Name: artistname, dtype: object


In [18]:
sparse_df.to_csv('sparse-artist-cleaning.csv', index= False)

load the df and continue from this point

In [19]:
import pandas as pd
sparse_df = pd.read_csv('sparse-artist-cleaning.csv')

In [20]:
sparse_df = sparse_df.groupby(['artistname']).sum()

- observations like 2020SOUNDSYSTEM and 009 SOUND SYSTEM need to be fixed, and concat
- observations like 2PAC and 2 PAC need to be fixed, and concat
- observations that only contain numbers need to be removed
- observations like DEF LEOPARD and DEF LEPPARD, FEMKE and FEMME, MOCKI and MOCKINGBYRDS, PURSUIT and PURSUIT OF HAPPINESS need to be spell corrected


In [21]:
import numpy as np

i = np.random.random_integers(23372)
j = i + 26

print(sparse_df.iloc[i:j,:5])

                                            0  1  2  3  4
artistname                                               
Chicks On Speed                             0  0  0  0  0
Chico And Rita New York Band And Orchestra  0  0  0  0  0
Chico Buarque                               0  0  0  0  0
Chico Debarge                               0  0  0  0  0
Chico Hamilton                              0  0  0  0  0
Chico Mann                                  0  0  0  0  0
Chico Science                               0  0  0  0  0
Chico Trujillo                              0  0  0  0  0
Chico Y Chica                               0  0  0  0  0
Chiddy Bang                                 0  0  0  0  0
Chief                                       0  0  0  0  0
Chief Commander Ebenezer Obey               0  0  0  0  0
Chief Keef                                  0  0  0  0  0
Chiens De Paille                            0  0  0  0  0
Chihei Hatakeyama                           0  0  0  0  0
Chikita Violen

  i = np.random.random_integers(23372)
