Import Libraries that will likely be used

In [None]:
import pandas as pd
import numpy as np
import spotipy as sp
import os

import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

Take a look at the first file to see what cleaning needs to be done

In [None]:
# Read the BI artist file and preview its first rows to judge what cleaning is needed
bi_csv_path = '../spotify_artist_data/BI.csv'
pd_BI = pd.read_csv(bi_csv_path)
pd_BI.head()

Let's check whether there are duplicates.
<br>
The popularity feature is the important one for later analysis, so let's sort on that

In [None]:
# Order rows from most to least popular so that the top entries
# can be inspected for repeated artists
pd_BI = pd_BI.sort_values(by='popularity', ascending=False)
pd_BI.head()

The 'year' column can be dropped so that we get true duplicates of entries
<br>All of the features are the same, as they are based on the id feature

In [None]:
# Drop 'year' and collapse repeated artist names into a single row each.
# The frame is rebound through a method chain instead of mutated in place, and
# errors='ignore' lets this cell be re-run after 'year' has already been
# removed (the in-place version raised a KeyError on a second run).
# keep='first' retains the first row per name in the frame's current order.
pd_BI = (
    pd_BI
    .drop(columns=['year'], errors='ignore')
    .drop_duplicates(subset="name", keep='first')
)

pd_BI.head()

Let's take a look at the distribution of artists at each popularity
<br>That way, we can decide if we should leave any out

In [None]:
# DataFrame.size is rows * columns, so it overstates the count by a factor of
# the number of columns. len() gives the number of rows, and each row is
# treated as one artist here.
artist_count = len(pd_BI)
print(f"We have {artist_count} unique artists")

# Cumulative counts: number of artists at or above each popularity threshold
for n in range(0, 100, 10):
    artists_at_or_above = int((pd_BI['popularity'] >= n).sum())
    print(f"There are {artists_at_or_above} artists with popularity of {n} or above")

It looks like about half of the artists have a popularity in the range [0, 9]
<br> Let's visualize with a histogram so we can take a look from another perspective

In [None]:
# Create a simple histogram with one bar per popularity value
binsize = 1
# The bin edges must extend one full bin past the maximum value. The last bin
# of a histogram is closed on both sides, so stopping the edges at max would
# merge the two highest popularity values into a single bar.
# NOTE(review): assumes popularity is integer-valued - confirm against the data.
bins = np.arange(0, pd_BI['popularity'].max() + 2 * binsize, binsize)
plt.figure(figsize=[14, 8])

plt.hist(data = pd_BI, x = 'popularity', bins = bins)
plt.title('Distribution of Artists by Popularity in the BI Market')
plt.xlabel('Popularity (range of 0 to 100)')
plt.ylabel('Number of artists')
plt.show()

Looks like about half the artists have popularity = 0
<br>We will keep this in mind, but will not filter them out here.
<br>We may want to use this information in future analyses.
<br>
<br>For now, we will create clean sets of data for each market.
<br>First, we will get all the filenames and market data as dataframes

In [None]:
# List the contents of the raw data directory (top level only)
path, dirs, files = next(os.walk("../spotify_artist_data/"))

# Filename of each market file (extension included); the entries are reused
# as output filenames when the cleaned data is written
artist_market = []

# Cleaned dataframe for each market file, in the same order as artist_market
artist_df_list = []

# For each market file, remove duplicates and store the cleaned dataframe.
# The dataframes are written to new files in a separate loop so that the data
# can be further examined and cleaned first if necessary.
# Files are visited in sorted order so the two lists are built
# deterministically, and anything that is not a .csv file (hidden or stray
# files) is skipped instead of being passed to read_csv.
for filename in sorted(files):
    if not filename.lower().endswith(".csv"):
        continue

    market_df = pd.read_csv(os.path.join(path, filename),
                            index_col=None, header=0)

    # Drop the year column so that duplicates can be dropped.
    # NOTE(review): unlike the BI exploration earlier in the notebook, rows are
    # not sorted by popularity first, so keep='first' retains the first row per
    # name in file order - confirm this is intended.
    market_df = (
        market_df
        .drop(columns=['year'])
        .drop_duplicates(subset="name", keep='first')
    )

    artist_df_list.append(market_df)
    artist_market.append(str(filename))

Next, we store each dataframe as a .csv file in a new directory

In [None]:
# Sanity check: every cleaned dataframe must have a matching market filename,
# otherwise zip() below would silently drop the unmatched entries
assert len(artist_df_list) == len(artist_market), "List lengths do not match"

# Store the cleaned data in a new directory, using the market filename
# as the name of each output file
output_dir = "../artist_market_data/"

# to_csv does not create missing directories, so make sure the target exists;
# exist_ok=True keeps the cell safe to re-run
os.makedirs(output_dir, exist_ok=True)

for market_file, market_df in zip(artist_market, artist_df_list):
    market_df.to_csv(output_dir + str(market_file), index=False)