# Exploratory Data Analysis

## Libraries

In [1]:
# Add the parent directory (relative to the kernel's working directory) to the
# import search path; presumably this is where the project-local `common` and
# `etl` modules imported in the next cell live - confirm.
import sys

sys.path.append("..")

In [2]:
# common functions
from common import missing_data_check, plot_histograms
# fetch functions
from etl.fetch import *
# environment variables
from dotenv import load_dotenv
import os
# data visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
# working with data
import numpy as np
import pandas as pd
from collections import Counter
from itertools import chain
from sklearn.preprocessing import MinMaxScaler

## Consolidating Data

In [3]:
# Load environment variables (playlist ids, credentials) from the local .env file.
load_dotenv()

like_playlist_id = os.getenv("LIKE_PLAYLIST_ID")
dislike_playlist_id = os.getenv("DISLIKE_PLAYLIST_ID")

# os.getenv returns None for an unset variable. Stop here with a clear message
# rather than handing an empty playlist id to the fetch helpers.
missing_env_vars = [
    name
    for name, value in (
        ("LIKE_PLAYLIST_ID", like_playlist_id),
        ("DISLIKE_PLAYLIST_ID", dislike_playlist_id),
    )
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

token = get_token()

# Personal playlist columns:
# - artist_id, track_id, track_name, popularity, user_like
like_playlist = get_playlist_tracks(token, like_playlist_id)
like_playlist['user_like'] = True
dislike_playlist = get_playlist_tracks(token, dislike_playlist_id)
dislike_playlist['user_like'] = False
songs = pd.concat([like_playlist, dislike_playlist])

# Track feature columns:
# - 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
#     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
#     'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
#     'time_signature'
# The URL/type columns are dropped because they are not used in the analysis.
track_ids = songs.track_id.to_list()
track_features = get_track_features(token, track_ids).drop(columns=['analysis_url', 'track_href', 'type', 'uri'])

# Artist feature columns:
# - 'artist_id', 'genres', 'name', 'popularity'
# Only artist_id and genres are kept.
# NOTE(review): artist_id values are split on ',' further down, so a single
# entry here may hold several ids and the list may contain repeats - confirm
# that get_artist_attribute copes with both.
artist_ids = songs.artist_id.to_list()
artist_features = get_artist_attribute(token, artist_ids).drop(columns=['name', 'popularity'])

2024-05-13 14:11:57 INFO Fetching tracks...
2024-05-13 14:11:59 INFO Fetching tracks...
2024-05-13 14:12:06 INFO Fetching track features...
2024-05-13 14:12:09 INFO Fetching artist(s)...


## Exploratory Analysis

In [None]:
# Check the fetched audio features for missing values (helper imported from `common`).
missing_data_check(track_features)

In [None]:
# Summary statistics of the audio features before any rescaling.
track_features.describe()

Checked to see if for any reason any of the tracks have missing audio features. This does not seem to be the case.

We note that:
- key is a categorical variable representing the key the track is in and can range from -1 to 11 (0 to 11 in our case)
- mode is binary and indicates the modality of a track (major or minor) where 1 is major and 0 is minor
- time_signature is an ordinal variable specifying the number of beats in each bar and can range from 3 to 7 (3 to 5 in our case)
- loudness, tempo, and duration_ms are continuous numerical variables that we can rescale to range from 0 to 1 (key, although categorical, is rescaled the same way below)
- the rest of the variables are continuous numerical and are all already scaled between 0 and 1

In [None]:
# Normalization: X_new = (X - X_min) / (X_max - X_min)
def normalize(df, features):
    """Min-max scale the given columns of ``df`` in place to the range [0, 1].

    Args:
        df: DataFrame whose columns are rescaled; it is modified in place.
        features: Iterable of column names to rescale.

    Returns:
        None.

    A constant column (max == min) is mapped to 0.0 instead of dividing by
    zero. Missing values are ignored when finding the minimum and maximum and
    stay missing in the result.
    """
    for feature in features:
        column = df[feature]
        lowest = column.min()
        value_range = column.max() - lowest
        # A zero range would turn every value into NaN (0 / 0); divide by 1 instead.
        scale = value_range if value_range != 0 else 1.0
        df[feature] = (column - lowest) / scale

# normalize the continuous numerical features as mentioned
# (normalize assigns the rescaled columns back onto track_features itself, so
# every later view of the frame shows the scaled values)
normalize(track_features, ['loudness', 'key', 'tempo', 'duration_ms'])

In [None]:
# Summary statistics again, to check the rescaled columns after normalization.
track_features.describe()

In [None]:
# Summary of the like/dislike label across all songs from both playlists.
songs.user_like.describe()

Of the 481 songs in the dataset, 284 are labelled dislike (59%) and 197 are labelled like (41%). This imbalance should be noted for later when the dataset is split into training/testing and during modelling as well.

In [None]:
def missing_genres_check(dataframe):
    """Count the rows of ``dataframe`` whose ``genres`` entry is empty.

    Args:
        dataframe: DataFrame with a ``genres`` column. An entry counts as
            missing when it is falsy (for example an empty list or an empty
            string). The input is not modified.

    Returns:
        A one-row DataFrame indexed by ``'is_missing_genre'`` with the columns
        ``'Total'`` (number of rows without genres) and ``'Percent'`` (that
        number as a percentage of all rows; 0.0 when the input has no rows).
    """
    # True where the row has at least one genre; its negation marks the missing ones.
    has_genre = dataframe['genres'].astype(bool)
    total = (~has_genre).sum()
    row_count = len(has_genre)
    # Guard the empty frame so the percentage is not computed as 0 / 0.
    pct = (total / row_count) * 100 if row_count else 0.0
    result = pd.DataFrame({'Total': total, 'Percent': pct}, index=['is_missing_genre'])
    return result

# Number and percentage of fetched artists that have no genres assigned.
missing_genres_check(artist_features)

We have that ~13% of the artists in my personal playlist are missing genres. This might be because these artists are not as well known and so Spotify has yet to assign them a genre.

An interesting observation is that genres are associated with an artist rather than a song. If I were to associate a song's genre(s) by its artist(s), this would mean that the song would belong to all of the genres of all of the participating artists. This assumption may cause problems when a particular artist has multiple genres that are vastly different but I will proceed this way for simplicity's sake.

In [None]:
# Keep only the artists that have at least one genre, then renumber the rows
has_genres = artist_features['genres'].map(len) > 0
artist_features = artist_features.loc[has_genres].reset_index(drop=True)

In [None]:
# Build final dataframe: attach the audio features to each playlist song
# (playlist track_id matched against the feature table's id, which is then dropped)
df = pd.merge(songs, track_features, how='inner', left_on='track_id', right_on='id')
df = df.drop(columns=['id'])
# Split up artist(s) for each song: one column per comma-separated artist id
df_artists = df['artist_id'].str.split(',', expand=True)

In [None]:
# One helper column per artist slot (as many slots as df_artists has columns)
artist_slots = range(len(df_artists.columns))
for slot in artist_slots:
    df[f'artist_id{slot}'] = df_artists[slot]
df = df.drop(columns=['artist_id'])

# Look up the genres of each slot's artist; the id columns used for the join
# are dropped again and the joined column becomes genres<slot>
# NOTE(review): a left join repeats a song row if artist_features holds the
# same artist_id more than once - confirm the artist table is unique per id.
for slot in artist_slots:
    slot_column = f'artist_id{slot}'
    with_genres = df.merge(artist_features, how='left', left_on=slot_column, right_on='artist_id')
    with_genres = with_genres.drop(columns=['artist_id', slot_column])
    df = with_genres.rename(columns={'genres': f'genres{slot}'})

# Unmatched slots come back as NaN; swap them for "" (this applies to every
# column of df, not only the genre columns)
df = df.replace(np.nan, "")

Check the genres associated with each song...

In [None]:
# Show the track name next to the per-artist genre columns (genres0, genres1, ...)
genre_columns = [f'genres{i}' for i in range(len(df_artists.columns))]
columns = ['track_name', *genre_columns]
df[columns]

In [None]:
# Merge lists of genres across columns
def combine_lists(row):
    """Merge the genre lists found in ``row`` into one de-duplicated list.

    Args:
        row: Iterable of cells, one per artist slot. A cell is either a
            collection of genre names or a placeholder for "no genres"
            (an empty string, NaN or None).

    Returns:
        The distinct genre names from all cells, sorted so that the result is
        the same on every run (plain set order can change between Python
        sessions, which would make the exported data non-reproducible).
    """
    merged = set()
    for cell in row:
        # Skip the placeholders: strings (the "" left by the NaN replacement)
        # and scalars such as NaN/None that have no length.
        if isinstance(cell, str) or not hasattr(cell, '__len__'):
            continue
        merged.update(cell)
    return sorted(merged)

# Collapse the per-artist genre columns into a single 'genres' list per song,
# then discard the per-artist columns
song_genres = df[genre_columns].apply(combine_lists, axis=1)
df = df.assign(genres=song_genres).drop(columns=genre_columns)


In [None]:
# How many liked vs. disliked songs ended up with an empty genre list
no_genres = df.genres.apply(len) == 0
missing_genres_like_songs = len(df[(df.user_like == True) & no_genres])
missing_genres_dislike_songs = len(df[(df.user_like == False) & no_genres])
print(f"Like songs: {missing_genres_like_songs}, Dislike songs: {missing_genres_dislike_songs}")

Considering only 7% of my like songs are missing genres and around 17.5% of my dislike songs are missing genres, I decide to remove songs with no genres from the dataset. Although I end up with a smaller dataset, this deletion lessens the effects of my imbalanced data class distribution.

In [None]:
# Keep only the songs that have at least one genre, then renumber the rows
genre_counts = df['genres'].map(len)
df = df.loc[genre_counts > 0].reset_index(drop=True)

Now visualizing the audio features of songs that I like vs. songs that I do not like...

In [None]:
# TODO: data visualization on distribution of audio features
# TODO: data visualization on distribution of genres

# Histograms of the audio features for the songs labelled as liked
liked_rows = df.user_like == True
features = df.loc[liked_rows, ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]

plot_histograms(features)

In [None]:
# Histograms of the same audio features for the songs labelled as disliked
disliked_rows = df.user_like == False
features = df.loc[disliked_rows, ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]

plot_histograms(features)

There are 481 songs in total, 197 liked songs and 284 disliked songs. 

Summary of exploratory analysis on audio features:
- Songs that I dislike are on average: 
    - higher in danceability and energy
    - similar in loudness but skewed left heavier (i.e. some songs are a lot louder)
- Other audio features, based on visual inspection, are either similar or have negligible differences


In [None]:
def _genre_frequencies(song_rows):
    """Return a Genre/Frequency table counting each genre in ``song_rows['genres']``."""
    counts = song_rows['genres'].explode().value_counts().reset_index()
    counts.columns = ['Genre', 'Frequency']
    return counts

# Genre counts computed separately for liked and disliked songs
like_genres = _genre_frequencies(df[df.user_like == True])
dislike_genres = _genre_frequencies(df[df.user_like == False])


In [None]:
# Genres tied for the highest and for the lowest frequency within each group
like_freq = like_genres['Frequency']
dislike_freq = dislike_genres['Frequency']
like_most_freq_genre = like_genres.loc[like_freq == max(like_freq)]
like_least_freq_genre = like_genres.loc[like_freq == min(like_freq)]
dislike_most_freq_genre = dislike_genres.loc[dislike_freq == max(dislike_freq)]
dislike_least_freq_genre = dislike_genres.loc[dislike_freq == min(dislike_freq)]

In [None]:
def _singleton_share(genre_table):
    """Percentage (rounded to 2 decimals) of genres in ``genre_table`` that occur exactly once."""
    singletons = genre_table[genre_table.Frequency == 1]
    return round(singletons.shape[0] / genre_table.shape[0] * 100, 2)

like_songs_sparse_genre = _singleton_share(like_genres)
dislike_songs_sparse_genre = _singleton_share(dislike_genres)

# Side-by-side view of how sparse the genres are in each group
res = pd.DataFrame(
    {'Sparsity_Percent': [like_songs_sparse_genre, dislike_songs_sparse_genre]},
    index=['Like_Songs', 'Dislike_Songs'],
)
res

Considering around half of the genres of both like and dislike songs are sparse, it might be beneficial to group similar genres together.

In [None]:
# One row per (song, genre) pair; 'idx' carries the song's row label from df
exploded_genres = df['genres'].explode().reset_index()
exploded_genres.columns = ['idx', 'genre']

In [None]:
# Distinct genre names that contain the substring 'pop'
is_pop = exploded_genres['genre'].apply(lambda name: 'pop' in name)
set(exploded_genres.loc[is_pop, 'genre'])

In [None]:
# Most frequent genre(s) among liked and among disliked songs.
like_most_freq_genre, dislike_most_freq_genre

In [None]:
# Least frequent genre(s) among liked and among disliked songs.
like_least_freq_genre, dislike_least_freq_genre

In [None]:
# Missing-genre counts for liked and for disliked songs.
# NOTE(review): df was already reduced to songs with at least one genre in the
# "Drop songs with no genres" cell, so on a top-to-bottom run both tables
# should report 0 missing; the counts quoted in the text below presumably come
# from running this before that filter - confirm.
missing_genres_check(df[df.user_like == True]), missing_genres_check(df[df.user_like == False])

Of the 481 songs in total, 34 are missing genres: 11 liked songs and 23 disliked songs.

Of the liked songs, the pop genre is the most frequent. In fact, 60 songs are considered pop songs, making up ~30% of all liked songs. On the other hand, 71 genres amongst liked songs are only assigned to one song.

Of the disliked songs the pop genre is also the most frequent. 69 songs are considered pop songs making up ~24% of all disliked songs and 184 genres only have one song associated with them.

Judging from the sparsity of genres, this may indicate that genres, at least in how I attributed them to each song, may not be a very useful feature to include. I'll keep this in mind as I build my models.

## Write To CSV

In [None]:
# relabelling for model building
df.user_like.replace({True: 1, False: 0}, inplace=True)
df.to_csv("songs.csv", index=False)