### Exploratory analysis on the spotify_tracks dataset

In [4]:
# import required libraries
import pandas as pd
import seaborn as sns
import os

In [7]:
# import the spotify tracks dataset
tracks_df = pd.read_csv("dataset.csv")

In [8]:
print(tracks_df.head())

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [None]:
# The file has 114k records, so create a sample of the dataset, that I can more easily look at and understand it in excel within VSCode

# take a sample of 5,000 records
tracks_sample_df = spotify_tracks_df.sample(n = 5000, random_state = 42) # set the random state for reproducibility

# save to excel in the working folder
output_path = os.path.join(os.getcwd(), "dataset_5ksample.xlsx")
tracks_sample_df.to_excel(output_path, index = False)

In [15]:
# understand dataset structure
print(tracks_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [None]:
# Make sure that the field track_id is a unique primary index

# how many records in the dataset?
print(tracks_df.shape) # 114,000

# how many unique track_ids are there
print(tracks_df['track_id'].nunique())  # 89,741 (so a lot of duplication at track_id). investigate this next


(114000, 21)
89741


In [25]:

# create a dataframe showing the number of occurrences of each track_id
track_id_agg_df = tracks_df['track_id'].value_counts().reset_index()
track_id_agg_df.columns = ['track_id', 'num_occ']
print(track_id_agg_df.head())

                 track_id  num_occ
0  6S3JlDAGk3uu3NtZbPnuhS        9
1  2Ey6v4Sekh3Z0RUSISRosD        8
2  2kkvB3RNRzwjFdGhaUA0tz        8
3  2vU6bm5hVF2idVknGzqyPL        7
4  5ZsAhuQ24mWHiduaxJqnhW        7


In [22]:
# aggregate this further to show how many track_ids have how many occurrences
track_id_agg_df_agg  = track_id_agg_df['num_occ'].value_counts().reset_index()
track_id_agg_df_agg.columns = ['num_occ', 'num_track_id']
track_id_agg_df_agg.sort_values('num_occ', ascending = True)
track_id_agg_df_agg['cumsum_num_track_id'] = track_id_agg_df_agg['num_track_id'].cumsum()
track_id_agg_df_agg['prop'] = track_id_agg_df_agg['num_track_id'] / track_id_agg_df_agg['num_track_id'].sum()
track_id_agg_df_agg['cumsum_prop'] = track_id_agg_df_agg['prop'].cumsum()
print(track_id_agg_df_agg)

   num_occ  num_track_id  cumsum_num_track_id      prop  cumsum_prop
0        1         73100                73100  0.814566     0.814566
1        2         11712                84812  0.130509     0.945075
2        3          2984                87796  0.033251     0.978327
3        4          1372                89168  0.015288     0.993615
4        5           431                89599  0.004803     0.998418
5        6           117                89716  0.001304     0.999721
6        7            22                89738  0.000245     0.999967
7        8             2                89740  0.000022     0.999989
8        9             1                89741  0.000011     1.000000


In [None]:
# only 81% of the track ids are unique, but 99% have 4 or less occurences
# take a sample of the duplicates to begin figuring out some strategies for handling this issue

In [None]:
# create a random sample  of duplicate track_ids
track_id_dup = track_id_agg_df[track_id_agg_df['num_occ'] > 1]
print(track_id_dup.head(10))
# sort it to make sure there's no 1s
track_id_dup.sort_values('num_occ', ascending = True)
# all good

                 track_id  num_occ
0  6S3JlDAGk3uu3NtZbPnuhS        9
1  2Ey6v4Sekh3Z0RUSISRosD        8
2  2kkvB3RNRzwjFdGhaUA0tz        8
3  2vU6bm5hVF2idVknGzqyPL        7
4  5ZsAhuQ24mWHiduaxJqnhW        7
5  4XYieGKSlJlHpzB3bl6WMP        7
6  3dJjsWCy6ZwMfw5NbD226G        7
7  0YLSjVxSb5FT1Bo8Tnxr8j        7
8  4aqS25F3ywJ9TGnNkOqilC        7
9  0e5LcankE0UyJUuCoq1uH2        7


Unnamed: 0,track_id,num_occ
16624,27nGU2v3syK7aU3AVY2vUO,2
16625,2TgTGJyiWf1ptW5g3QG938,2
16626,63vjnB6EeQuVf64zLxIo90,2
16627,0sSjIvTvd6fUSZZ5rnTPDW,2
16628,2zg3iJW4fK7KZgHOvJU67z,2
...,...,...
17,08kTa3SL9sV6Iy8KLKtGql,7
22,4GPQDyw9hC1DiZVh0ouDVL,7
1,2Ey6v4Sekh3Z0RUSISRosD,8
2,2kkvB3RNRzwjFdGhaUA0tz,8


In [29]:
# take 20 at random
dup_tracks_sample = track_id_dup.sample(n = 20, random_state = 58) # set the random state for reproducibility
print(dup_tracks_sample)

                     track_id  num_occ
1976   3McXs3tqyP5VwBg4FdiILD        3
1844   2GqmufsIytFAXE9sxz4m4S        4
12003  3rGol3Ivk8X9fvzKL0U8ih        2
350    3AedlLCGKNhQdhtLzObIbR        5
61     1ycq2btx7kf6bXdWILZhXa        6
5952   22sQUmLhT8umlEhQzDrzfJ        2
15270  4XSpyx2fV1c9Iq9TYY2zXk        2
16067  4SeZpWSgq1c8a0MY3iVy1f        2
8637   6FOZMPsV4o3dMj7juFrXMT        2
10468  0zqBZqm5czQ3A4EoSdKFHj        2
4378   1ErVAnzjNvyXSodWbOTYNu        3
7208   46jni4B6gnRB5EbjIEnsf3        2
10030  7lUB18zW8flnzCpHmIA0lL        2
6656   46RvocxUkMUhCtDxk1BUyT        2
7947   2YNgcIiD73XsXFNM3UuxlM        2
666    5ln5yQdUywVbf8HhFsOcd6        4
15379  1DIGB5kZKiHnEb2uWxzjzY        2
3079   0E9ZjEAyAwOXZ7wJC0PD33        3
13953  3qNERYCjkBFyhYUNh4Hl9K        2
920    4jJ3rcbPevOdXr6hFIVW45        4


In [35]:
tracks_df_dup = tracks_df[tracks_df['track_id'].isin(dup_tracks_sample['track_id'])] 
tracks_df_dup.sort_values(by = 'track_id', ascending = True)

print(tracks_df_dup)


        Unnamed: 0                track_id                        artists  \
5867          5867  22sQUmLhT8umlEhQzDrzfJ                           Reol   
9331          9331  6FOZMPsV4o3dMj7juFrXMT                    Juliano Son   
20259        20259  5ln5yQdUywVbf8HhFsOcd6                         Halsey   
21143        21143  3McXs3tqyP5VwBg4FdiILD        Vybz Kartel;Arif Cooper   
23143        23143  46jni4B6gnRB5EbjIEnsf3                      BLOND:ISH   
25277        25277  46jni4B6gnRB5EbjIEnsf3                      BLOND:ISH   
26165        26165  7lUB18zW8flnzCpHmIA0lL                      Jon Sarta   
28105        28105  46RvocxUkMUhCtDxk1BUyT            ILLENIUM;Dana Salah   
29060        29060  46RvocxUkMUhCtDxk1BUyT            ILLENIUM;Dana Salah   
30518        30518  0E9ZjEAyAwOXZ7wJC0PD33         Dynoro;Gigi D'Agostino   
31025        31025  5ln5yQdUywVbf8HhFsOcd6                         Halsey   
31118        31118  0zqBZqm5czQ3A4EoSdKFHj  Alan Walker;Benjamin Ingrosso   

In [36]:
# that sorting didn't work, since they're alphanumeric. try again
import re

# Extract leading number
tracks_df_dup['numeric_prefix'] = tracks_df_dup['track_id'].str.extract(r'^(\d+)').astype(int)

# Sort by numeric prefix first, then full string
tracks_df_dup.sort_values(by=['numeric_prefix', 'track_id'], inplace=True)

# Optional: drop helper column
tracks_df_dup.drop(columns='numeric_prefix', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df_dup['numeric_prefix'] = tracks_df_dup['track_id'].str.extract(r'^(\d+)').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df_dup.sort_values(by=['numeric_prefix', 'track_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df_dup.drop(columns='numeric_prefix', inplace=True)


In [38]:
print(tracks_df_dup)

        Unnamed: 0                track_id                        artists  \
30518        30518  0E9ZjEAyAwOXZ7wJC0PD33         Dynoro;Gigi D'Agostino   
31572        31572  0E9ZjEAyAwOXZ7wJC0PD33         Dynoro;Gigi D'Agostino   
53361        53361  0E9ZjEAyAwOXZ7wJC0PD33         Dynoro;Gigi D'Agostino   
31118        31118  0zqBZqm5czQ3A4EoSdKFHj  Alan Walker;Benjamin Ingrosso   
53114        53114  0zqBZqm5czQ3A4EoSdKFHj  Alan Walker;Benjamin Ingrosso   
86919        86919  1DIGB5kZKiHnEb2uWxzjzY       Big D and the Kids Table   
100373      100373  1DIGB5kZKiHnEb2uWxzjzY       Big D and the Kids Table   
67784        67784  1ErVAnzjNvyXSodWbOTYNu                 Greeicy;Anitta   
88542        88542  1ErVAnzjNvyXSodWbOTYNu                 Greeicy;Anitta   
89493        89493  1ErVAnzjNvyXSodWbOTYNu                 Greeicy;Anitta   
55435        55435  1ycq2btx7kf6bXdWILZhXa                     Anupam Roy   
56134        56134  1ycq2btx7kf6bXdWILZhXa                     Anupam Roy   

In [None]:
# save to excel so that I can see it properly
output_path2 = os.path.join(os.getcwd(), "dup_sample.xlsx")
tracks_df_dup.to_excel(output_path2, index = False)

In [45]:
# ok the issue is that there aren't duplicate tracks as such
# if a track has multiple genres it has a record for the first genre, a record for the second genre etc.
# one solution to fix this would be to make the genre column 'wide' 
# i.e. a column for pop that has a 1 or 0 if the track has the pop 'genre', a column 'rock' (1 or 0 if track is rock) etc.

# what are the distinct values on the track_genre field?
genres = tracks_df['track_genre'].value_counts().reset_index()

In [46]:
output_path3 = os.path.join(os.getcwd(), "genres.xlsx")
genres.to_excel(output_path3, index = False)

In [None]:
# ok, it appears there are 114 different genres, with 1000 tracks for each, this means that there will be a bias to mitigate against later
# next make the wide column from genre