### Cleaning and processing the data fetched from the Spotify API

In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the tracks CSV; the path is relative to the current working directory.
spotify_data = pd.read_csv('../data/local/spotify_million_tracks.csv')

# Trailing expression of the cell: previews the first rows of the loaded frame.
spotify_data.head()

#### Overview

In [None]:
def display_basic_info(df):
    """
    Print a quick structural overview of a DataFrame.

    Three sections are written to stdout, in this order: the (rows, columns)
    shape, the dtype of every column, and the number of missing values per
    column. Nothing is returned.
    """
    column_types = df.dtypes
    missing_per_column = df.isnull().sum()

    print('Dataset Shape:', df.shape)
    # sep='\n' puts each heading on its own line directly above its table.
    print('\nData Types:', column_types, sep='\n')
    print('\nMissing Values:', missing_per_column, sep='\n')

def display_numerical_summary(df):
    """
    Print the output of ``df.describe()`` under a heading.

    Nothing is returned; the summary table is only written to stdout.
    """
    summary = df.describe()
    # One print call: heading line first, then the summary table below it.
    print('Numerical Columns Summary:', summary, sep='\n')

def check_duplicates(df):
    """
    Print how many rows are exact repeats of an earlier row.

    ``DataFrame.duplicated()`` flags every repeat after the first occurrence,
    so summing the flags gives the number of redundant rows.
    """
    repeated_rows = df.duplicated()
    print(f'Number of duplicate entries: {repeated_rows.sum()}')
    
def display_unique_values(df, columns):
    """
    Display number of unique values for specified columns, with special handling for the genres column

    For every column other than 'genres' the count is ``Series.nunique()``.
    For 'genres', each non-null cell is treated as a collection of genre
    names, given either as an actual list or as the string representation of
    one (e.g. "['rock', 'pop']"). The reported number is the count of
    distinct genre names across all rows.

    Parameters:
        df: DataFrame holding the columns to inspect.
        columns: iterable of column names to report on.

    Raises:
        ValueError or SyntaxError: if a 'genres' string is not a valid
            Python literal.
    """
    # Function-scope import keeps this cell runnable on its own.
    import ast

    print('Unique Values Count:')
    for col in columns:
        if col != 'genres':
            print(f'{col}: {df[col].nunique()} unique values')
            continue

        # Collect straight into a set; no intermediate flat list is needed.
        unique_genres = set()
        for genre_list in df[col].dropna():
            if isinstance(genre_list, str):
                # literal_eval parses literals only, so text read from the CSV
                # cannot execute code the way eval() would.
                genre_list = ast.literal_eval(genre_list)
            unique_genres.update(genre_list)
        print(f'{col}: {len(unique_genres)} unique genres')

# Run the overview helpers on the freshly loaded frame: structure, numeric
# summary, duplicate-row count, then unique counts for artist and genres.
display_basic_info(spotify_data)
display_numerical_summary(spotify_data)
check_duplicates(spotify_data)
display_unique_values(spotify_data, ['original_artist', 'genres'])

#### Drop duplicates and empty values, reset index

In [None]:
spotify_data = (
    spotify_data
    .dropna()                  # remove rows with a missing value in any column
    .drop_duplicates()         # remove rows that exactly repeat an earlier row
    .reset_index(drop=True)    # renumber from 0 and discard the old index
)
# Print the overview again for the cleaned frame.
display_basic_info(spotify_data)

#### Standardize release year

In [81]:
def standardize_release_year(df):
    """
    Return a copy of ``df`` whose release_date column holds only the year.

    Each value is converted to text and cut to its first four characters,
    which covers both YYYY and YYYY-MM-DD inputs. The result is then parsed
    as a number; anything that cannot be parsed becomes NaN. The input frame
    is not modified.
    """
    result = df.copy()

    # The first four characters of the textual value are the year.
    year_text = result['release_date'].map(lambda value: str(value)[:4])

    # errors='coerce' turns unparseable prefixes into NaN instead of raising.
    result['release_date'] = pd.to_numeric(year_text, errors='coerce')

    return result

# Rebind the working frame to the copy whose release_date holds numeric years.
spotify_data = standardize_release_year(spotify_data)

In [None]:
# Preview the first rows after the release-year conversion.
spotify_data.head()

#### Track duration to seconds

In [None]:
def add_duration_seconds(df):
    """
    Return a copy of ``df`` with an extra integer column 'duration_sec'.

    The new column is 'duration_ms' divided by 1000 and rounded to a whole
    number of seconds. The input frame is not modified.
    """
    seconds = df['duration_ms'] / 1000
    whole_seconds = seconds.round().astype(int)  # round first, then drop the .0

    result = df.copy()
    result['duration_sec'] = whole_seconds
    return result

# Rebind the working frame to the copy that carries the duration_sec column.
spotify_data = add_duration_seconds(spotify_data)

# Print both duration columns side by side for the first rows as a quick check.
print(spotify_data[['duration_ms', 'duration_sec']].head())

#### Drop and rename columns

In [None]:
def drop_columns(df):
    """
    Return ``df`` without the original_title, original_artist and
    duration_ms columns.

    Columns from that set that are absent from ``df`` are skipped silently.
    """
    unwanted = ('original_title', 'original_artist', 'duration_ms')

    # errors='ignore' makes the call safe to repeat on an already-trimmed frame.
    return df.drop(columns=list(unwanted), errors='ignore')

# Rebind the working frame to the version without the dropped columns.
spotify_data = drop_columns(spotify_data)

# List the columns that are left.
print('Remaining columns:')
print(spotify_data.columns.tolist())

In [None]:
def rename_columns(df):
    """
    Return ``df`` with the Spotify-specific column names replaced by shorter,
    more descriptive ones.

    Columns not mentioned in the mapping keep their current names.
    """
    friendly_names = dict(
        spotify_title='title',
        spotify_artist='artist',
        release_date='release_year',
        explicit='is_explicit',
        duration_sec='duration_seconds',
    )
    return df.rename(columns=friendly_names)

# Rebind the working frame to the version with the renamed columns.
spotify_data = rename_columns(spotify_data)

# List the resulting column names.
print('New column names:')
print(spotify_data.columns.tolist())

#### Reorder columns

In [86]:
# Columns in their final order; any column not listed here is dropped.
column_order = [
    'title',
    'artist',
    'album',
    'release_year',
    'popularity',
    'genres',
    'is_explicit',
    'duration_seconds',
    'album_cover',
]
spotify_data = spotify_data[column_order]

In [None]:
# Render the cleaned frame. NOTE(review): display is not imported in this
# file - presumably the IPython/Jupyter built-in; confirm before running as a script.
display(spotify_data)