In [175]:
import pandas as pd 
import numpy as np

# Read the three Spotify CSV exports in a fixed order (albums, artists, tracks).
# index_col=0 makes the first column of each file the row index instead of a data column.
albums, artists, tracks = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/spotify_spotify_albums.csv",
        "data/spotify_spotify_artists.csv",
        "data/spotify_spotify_tracks.csv",
    )
)

#### Profiling Data

In [176]:
# Profile the albums table.
# A notebook cell only renders its LAST expression automatically, so every intermediate
# preview is passed to display() explicitly; otherwise its value is computed and thrown away.
display(albums.head())  # First 5 rows across all columns. This looks to be a wide dataset with many columns containing categorical variables.
# Without 'index_col=0' on read, the saved index shows up as an extra first column named 'Unnamed: 0'.
# That is why the previous block was re-run with 'index_col = 0'.
# If we preferred not to re-read a large file like this, we could have typed:
#   albums.drop("Unnamed: 0", axis=1, inplace=True)
# Or:
#   albums = albums.iloc[:,1:]

display(albums.tail())  # Last 5 rows; lets us determine the number of rows in the dataframe (75511)
display(albums.shape)   # Dimensionality of the dataframe

# Label-based slice of two columns; being the final expression, it is rendered as the cell output.
albums.loc[10:20,['name', 'release_date']]

Unnamed: 0,name,release_date
10,Beethoven: 6 Bagatelles & Piano Sonatas Nos. 3...,2019-03-01
11,Au Long de la Loire,2019-03-01
12,I Still Miss U,2019-03-15
13,Kolmekymppinen,1980
14,Bruja,2018-04-24
15,Sonatas for two violins,2019-03-01
16,Sirènes,2019-03-01
17,Light of Day,2019-03-01
18,Madame del Campo,2015-08-03
19,Beethoven: Piano Concerto No. 2 & Triple Conce...,2019-03-01


In [177]:
# Profile the artists table. Intermediate previews go through display() because a cell
# only renders its last expression on its own.
display(artists.head())  # This has two quantitative variable columns in 'artist_popularity' and 'followers'.
display(artists.tail())
artists.shape  # Last expression: rendered as the cell output

(56129, 8)

In [178]:
# Profile the tracks table. Intermediate previews go through display() because a cell
# only renders its last expression on its own.
display(tracks.head())  # Another wide dataset with multiple quantitative variables. Joining with the other two sets could allow for interesting statistical analysis.
display(tracks.tail())
tracks.shape  # Logically, there are more tracks than albums, and there are more albums than artists

(101939, 31)

#### Cleaning and Normalizing Data

In [179]:
# Remove fully duplicated rows from each table (first occurrence is kept).
albums = albums.drop_duplicates()
artists = artists.drop_duplicates()
tracks = tracks.drop_duplicates()

In [180]:
def listFill(genre_list):
  """Map the empty-list placeholder string to a missing value.

  The genres column was read without explicit dtypes, so every entry is a
  plain string; an artist with no genres therefore appears as the text '[]'.

  Parameters
  ----------
  genre_list : the raw cell value taken from the genres column.

  Returns
  -------
  np.nan when the value equals the string '[]', otherwise the value unchanged.
  """
  return np.nan if genre_list == '[]' else genre_list

# Run every genres entry through listFill so that '[]' placeholders become NaN.
artists["genres"] = artists["genres"].map(listFill)

In [181]:
# Show the column list before and after removing the free-text 'lyrics' column.
print(tracks.columns)
# These lyrics seem to render just fine in LibreOffice Calc. I imagine all the escape characters
# leading at the front of each line could cause issues for some programs.
if 'lyrics' in tracks.columns:  # guard: lets the cell be re-run after the column is already gone
  display(tracks.lyrics.head())  # a bare expression mid-cell is never rendered, so display a sample explicitly
tracks = tracks.drop(columns='lyrics', errors='ignore')  # errors='ignore' makes a repeated run a no-op
print(tracks.columns)

Index(['acousticness', 'album_id', 'analysis_url', 'artists_id',
       'available_markets', 'country', 'danceability', 'disc_number',
       'duration_ms', 'energy', 'href', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'lyrics', 'mode', 'name', 'playlist',
       'popularity', 'preview_url', 'speechiness', 'tempo', 'time_signature',
       'track_href', 'track_name_prev', 'track_number', 'uri', 'valence',
       'type'],
      dtype='object')
Index(['acousticness', 'album_id', 'analysis_url', 'artists_id',
       'available_markets', 'country', 'danceability', 'disc_number',
       'duration_ms', 'energy', 'href', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'playlist', 'popularity',
       'preview_url', 'speechiness', 'tempo', 'time_signature', 'track_href',
       'track_name_prev', 'track_number', 'uri', 'valence', 'type'],
      dtype='object')


#### Joining Data

In [182]:
# Peek at a single artists row to see which columns are available for the join.
artists.iloc[:1]

Unnamed: 0,artist_popularity,followers,genres,id,name,track_id,track_name_prev,type
0,44,23230,"['sertanejo', 'sertanejo pop', 'sertanejo trad...",4mGnpjhqgx4RUdsIJiURdo,Juliano Cezar,0wmDmAILuW9e2aRttkl4aC,track_9,artist


In [183]:
# Peek at a single albums row to see which columns are available for the join.
albums.iloc[:1]

Unnamed: 0,album_type,artist_id,available_markets,external_urls,href,id,images,name,release_date,release_date_precision,total_tracks,track_id,track_name_prev,uri,type
0,single,3DiDSECUqqY1AuBP8qtaIa,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/1g...,https://api.spotify.com/v1/albums/1gAM7M4rBwEb...,1gAM7M4rBwEbSPeAQR2nx1,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",If I Ain't Got You EP,2019-02-08,day,6,2iejTMy9XZ8Gaae0aQ2yl0,track_32,spotify:album:1gAM7M4rBwEbSPeAQR2nx1,album


In [184]:
# Left join keyed on albums, because albums are the primary subject of this analysis.
# An inner join, or a right join keyed on artists, might lose albums that have no artist listed with them.
# An outer join is not wanted either: it might pull in artists that only have singles on Spotify and no albums.
# Those artists could be of interest if the analysis were not focused on albums, as stated at the outset.
albums_artists = albums.merge(
    artists,
    how='left',
    left_on="artist_id",
    right_on="id",
)

albums_artists.head(3)

Unnamed: 0,album_type,artist_id,available_markets,external_urls,href,id_x,images,name_x,release_date,release_date_precision,...,uri,type_x,artist_popularity,followers,genres,id_y,name_y,track_id_y,track_name_prev_y,type_y
0,single,3DiDSECUqqY1AuBP8qtaIa,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/1g...,https://api.spotify.com/v1/albums/1gAM7M4rBwEb...,1gAM7M4rBwEbSPeAQR2nx1,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",If I Ain't Got You EP,2019-02-08,day,...,spotify:album:1gAM7M4rBwEbSPeAQR2nx1,album,79.0,5946074.0,"['hip pop', 'neo soul', 'pop', 'r&b', 'urban c...",3DiDSECUqqY1AuBP8qtaIa,Alicia Keys,6LGwYMXXgURfaequXipzHx,track_12,artist
1,album,6s1pCNXcbdtQJlsnM1hRIA,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/4K...,https://api.spotify.com/v1/albums/4KfJZV7WfolY...,4KfJZV7WfolYlxBzOTo66s,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Shostakovich Symphony No.5 - Four Romances on ...,2019-03-01,day,...,spotify:album:4KfJZV7WfolYlxBzOTo66s,album,57.0,135831.0,"['classical', 'compositional ambient', 'russia...",6s1pCNXcbdtQJlsnM1hRIA,Dmitri Shostakovich,5mJt2hHubIoqQj8Q0zSwWF,track_76,artist
2,single,5YjfNaHq05WrwldRe1QSBc,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/7n...,https://api.spotify.com/v1/albums/7nLYY7uAVUb5...,7nLYY7uAVUb57kpd7tZxnS,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Take My Bass,2019-03-14,day,...,spotify:album:7nLYY7uAVUb57kpd7tZxnS,album,18.0,118.0,,5YjfNaHq05WrwldRe1QSBc,Brandon Mints,3jJKj4QTK3v18ZSwpk7AcV,track_30,artist


The join matched the rows, but the result is messy: every column name that exists in both dataframes was given an `_x` or `_y` suffix. Let's look at those columns.

In [185]:
# Keep only the columns whose names end in _x or _y, i.e. the names both tables had in common.
(
    albums_artists
    .filter(regex='_[xy]$', axis='columns')
    .head(3)
)

Unnamed: 0,id_x,name_x,track_id_x,track_name_prev_x,type_x,id_y,name_y,track_id_y,track_name_prev_y,type_y
0,1gAM7M4rBwEbSPeAQR2nx1,If I Ain't Got You EP,2iejTMy9XZ8Gaae0aQ2yl0,track_32,album,3DiDSECUqqY1AuBP8qtaIa,Alicia Keys,6LGwYMXXgURfaequXipzHx,track_12,artist
1,4KfJZV7WfolYlxBzOTo66s,Shostakovich Symphony No.5 - Four Romances on ...,1WQfghEjszJJ4H8MAWrQ2C,track_11,album,6s1pCNXcbdtQJlsnM1hRIA,Dmitri Shostakovich,5mJt2hHubIoqQj8Q0zSwWF,track_76,artist
2,7nLYY7uAVUb57kpd7tZxnS,Take My Bass,3jJKj4QTK3v18ZSwpk7AcV,track_15,album,5YjfNaHq05WrwldRe1QSBc,Brandon Mints,3jJKj4QTK3v18ZSwpk7AcV,track_30,artist


Notice that `albums` and `artists` have multiple columns with the same names.

* `name`: name of the album vs. the name of the artist.
* `id`: id of the album vs. the id of the artist. We joined on `albums.artist_id` = `artists.id`, but because `albums` also has its own `id` column, the result contains both `id_x` (the album's id) and `id_y` (the artist's id).
* `type`: 'album' vs. 'artist' (a static, dummy categorical variable we can drop from both dataframes)
* `track_id`: id of every track in each album vs id of one sample song by each artist. We can drop `artists.track_id` when performing the join
* `track_name_prev`: Frankly I'm not sure what this column is. I'm going to drop it in both.

In [186]:
# Give the shared columns table-specific names so a later merge needs no _x/_y suffixes,
# and remove 'type' and 'track_name_prev' from both tables.
# The frames are rebuilt by reassignment rather than mutated in place, and errors='ignore'
# makes the drop a no-op when the columns are already gone, so re-running this cell
# no longer raises KeyError. (rename already skips labels that are absent.)
albums = (
    albums
    .rename(columns={'name': 'album_name', 'id': 'album_id'})
    .drop(columns=['type', 'track_name_prev'], errors='ignore')
)
artists = (
    artists
    .rename(columns={'name': 'artist_name', 'id': 'artist_id'})
    .drop(columns=['type', 'track_name_prev'], errors='ignore')
)

In [187]:
# Second attempt at the join, now on the shared 'artist_id' key.
# artists.track_id is left out so that it does not collide with albums.track_id.
# NOTE(review): a left join repeats an album row whenever its artist_id appears more than once
# on the artists side - confirm artist_id is unique there.
albums_artists2 = albums.merge(
    artists.drop(columns=['track_id']),
    how='left',
    on='artist_id',
)
albums_artists2

Unnamed: 0,album_type,artist_id,available_markets,external_urls,href,album_id,images,album_name,release_date,release_date_precision,total_tracks,track_id,uri,artist_popularity,followers,genres,artist_name
0,single,3DiDSECUqqY1AuBP8qtaIa,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/1g...,https://api.spotify.com/v1/albums/1gAM7M4rBwEb...,1gAM7M4rBwEbSPeAQR2nx1,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",If I Ain't Got You EP,2019-02-08,day,6,2iejTMy9XZ8Gaae0aQ2yl0,spotify:album:1gAM7M4rBwEbSPeAQR2nx1,79.0,5946074.0,"['hip pop', 'neo soul', 'pop', 'r&b', 'urban c...",Alicia Keys
1,album,6s1pCNXcbdtQJlsnM1hRIA,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/4K...,https://api.spotify.com/v1/albums/4KfJZV7WfolY...,4KfJZV7WfolYlxBzOTo66s,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Shostakovich Symphony No.5 - Four Romances on ...,2019-03-01,day,8,1WQfghEjszJJ4H8MAWrQ2C,spotify:album:4KfJZV7WfolYlxBzOTo66s,57.0,135831.0,"['classical', 'compositional ambient', 'russia...",Dmitri Shostakovich
2,single,5YjfNaHq05WrwldRe1QSBc,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/7n...,https://api.spotify.com/v1/albums/7nLYY7uAVUb5...,7nLYY7uAVUb57kpd7tZxnS,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Take My Bass,2019-03-14,day,1,3jJKj4QTK3v18ZSwpk7AcV,spotify:album:7nLYY7uAVUb57kpd7tZxnS,18.0,118.0,,Brandon Mints
3,single,2G9Vc16JCpnZmK4uGH46Fa,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/6p...,https://api.spotify.com/v1/albums/6p20Rt4x2Qn5...,6p20Rt4x2Qn5mUMRi1s6pj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Hypnotizing (Are U),2016-11-16,day,1,1xGtDafUZbHyYC3Xarcbrj,spotify:album:6p20Rt4x2Qn5mUMRi1s6pj,24.0,309.0,,DØ CHEF DØ
4,single,2dwM9OcE4c3Ph1UBINSodx,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/1X...,https://api.spotify.com/v1/albums/1XeoOqC1q7U2...,1XeoOqC1q7U2iyLEQJ64cu,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Sunshine,2018-07-20,day,1,0gWtsXvXOzAT6FtM3ur8in,spotify:album:1XeoOqC1q7U2iyLEQJ64cu,24.0,1394.0,,Alejo García
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75506,album,0S7CCsKpXJ8e1dIstizHw5,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/4Z...,https://api.spotify.com/v1/albums/4ZbkQn6amf1t...,4ZbkQn6amf1t3a202zen2u,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Latino For Life,2014-01-24,day,13,3CZY7jD1Zf30zIdbwS5212,spotify:album:4ZbkQn6amf1t3a202zen2u,42.0,23280.0,['reggaeton chileno'],DJ Mendez
75507,album,73Vk4dL8kYkbRJxWJDq1wL,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/4D...,https://api.spotify.com/v1/albums/4DifRQy4Dyey...,4DifRQy4Dyey9NSS7ywnEx,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Dime Qué Pasa,2011-03-31,day,11,7GOBsKucPG5Bh8qzStmrQ6,spotify:album:4DifRQy4Dyey9NSS7ywnEx,40.0,22403.0,"['andean', 'chilean indie', 'chilean rock', 'n...",La Floripondio
75508,album,3yW6jTzGjHUUkLvLkjLOVn,"['AU', 'NZ']",{'spotify': 'https://open.spotify.com/album/4P...,https://api.spotify.com/v1/albums/4PXy3cBCNeY0...,4PXy3cBCNeY0ZVKTOGi9Cw,"[{'height': 640, 'url': 'https://i.scdn.co/ima...","Black Fingernails, Red Wine",2006-06-10,day,13,0ujklxrVM2jwpLMgbTwTd1,spotify:album:4PXy3cBCNeY0ZVKTOGi9Cw,47.0,59326.0,"['australian alternative rock', 'australian in...",Eskimo Joe
75509,album,4iudEcmuPlYNdbP3e1bdn1,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",{'spotify': 'https://open.spotify.com/album/51...,https://api.spotify.com/v1/albums/511p6iaCuK8S...,511p6iaCuK8Sr0BYdpcfkq,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Freak Show,1997,year,13,58nHFSWj5N5JxNtWgS85TL,spotify:album:511p6iaCuK8Sr0BYdpcfkq,63.0,552438.0,"['alternative metal', 'alternative rock', 'aus...",Silverchair
