### Cleaning and processing the data fetched from the Spotify API

In [1]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the tracks CSV from the local data folder into a DataFrame
spotify_data = pd.read_csv('../data/local/spotify_million_tracks.csv')

# Preview the first rows to check that the file was parsed as expected
spotify_data.head()

#### Data overview

In [None]:
def display_basic_info(df):
    """
    Print a quick structural overview of a DataFrame.

    Three sections are written to stdout, in this order: the (rows, columns)
    shape, the dtype of every column, and the number of null values per column.
    """
    sections = (
        ('\nData Types:', df.dtypes),
        ('\nMissing Values:', df.isnull().sum()),
    )

    print('Dataset Shape:', df.shape)
    for heading, table in sections:
        print(heading)
        print(table)

def display_numerical_summary(df):
    """
    Print the output of DataFrame.describe() under a heading line.
    """
    print('Numerical Columns Summary:')
    stats = df.describe()
    print(stats)

def check_duplicates(df):
    """
    Print how many rows of the dataset are flagged by DataFrame.duplicated()
    """
    repeated_rows = df.duplicated()
    print('Number of duplicate entries: {}'.format(repeated_rows.sum()))
    
def display_unique_values(df, columns):
    """
    Print the number of distinct values found in each requested column.

    Parameters:
    df : DataFrame to inspect
    columns : iterable of column names to report on

    The 'genres' column is handled specially: every non-null cell is expected
    to hold a collection of genre names, either as a real list or as the
    string form of one (e.g. "['rock', 'pop']"). The number reported for it is
    the count of distinct genre names across all rows. Every other column is
    reported with Series.nunique().

    Raises:
    ValueError or SyntaxError if a string cell in 'genres' is not a valid
    Python literal.
    """
    print('Unique Values Count:')
    for col in columns:
        if col != 'genres':
            print(f'{col}: {df[col].nunique()} unique values')
            continue

        unique_genres = set()
        for genre_list in df[col].dropna():
            if isinstance(genre_list, str):
                # The text comes from a CSV file, so parse it as a literal;
                # eval() would execute whatever expression the cell contains.
                genre_list = ast.literal_eval(genre_list)
            unique_genres.update(genre_list)
        print(f'{col}: {len(unique_genres)} unique genres')

# Print the overview of the freshly loaded data: structure, summary statistics, duplicates
display_basic_info(spotify_data)
display_numerical_summary(spotify_data)
check_duplicates(spotify_data)
# 'genres' is counted per individual genre name; 'original_artist' uses a plain nunique()
display_unique_values(spotify_data, ['original_artist', 'genres'])

#### Drop duplicates and empty values, reset index

In [None]:
# Remove rows containing any missing value, then duplicate rows, and renumber the index
spotify_data = spotify_data.dropna().drop_duplicates().reset_index(drop=True)
# Print shape, dtypes and missing-value counts again to check the result of the cleanup
display_basic_info(spotify_data)

#### Standardize release year

In [5]:
def standardize_release_year(df):
    """
    Return a copy of df whose release_date column holds only the year.

    The first four characters of each value's string form are kept, which
    covers both YYYY and YYYY-MM-DD inputs, and the result is converted to a
    number. Values that cannot be converted become NaN.
    """
    result = df.copy()

    # keep only the leading four characters, i.e. the year part
    year_text = result['release_date'].map(lambda value: str(value)[:4])

    result['release_date'] = pd.to_numeric(year_text, errors='coerce')
    return result

# Replace release_date with its numeric year (the column is renamed to release_year further down)
spotify_data = standardize_release_year(spotify_data)

In [None]:
# Preview the first rows after the release-year conversion
spotify_data.head()

#### Track duration to seconds

In [None]:
def add_duration_seconds(df):
    """
    Return a copy of df with an additional integer column 'duration_sec'.

    The new column is 'duration_ms' divided by 1000 and rounded to whole
    seconds. The input frame is not modified.
    """
    result = df.copy()

    whole_seconds = result['duration_ms'].div(1000).round()
    result['duration_sec'] = whole_seconds.astype(int)

    return result

# Add the duration_sec column
spotify_data = add_duration_seconds(spotify_data)

# Show the millisecond values next to the derived seconds for a quick sanity check
print(spotify_data[['duration_ms', 'duration_sec']].head())

#### Drop and rename columns

In [None]:
def drop_columns(df):
    """
    Return df without the 'original_title', 'original_artist' and
    'duration_ms' columns. Columns that are not present are skipped silently.
    """
    unwanted = ['original_title', 'original_artist', 'duration_ms']
    return df.drop(columns=unwanted, errors='ignore')

# Remove the columns listed in drop_columns
spotify_data = drop_columns(spotify_data)

# List what is left to confirm the drop
print('Remaining columns:')
print(spotify_data.columns.tolist())

In [None]:
def rename_columns(df):
    """
    Return df with the Spotify-specific column names replaced by shorter,
    more descriptive ones. Columns that are not in the mapping keep their name.
    """
    return df.rename(
        columns=dict(
            spotify_title='title',
            spotify_artist='artist',
            release_date='release_year',
            explicit='is_explicit',
            duration_sec='duration_seconds',
        )
    )

# Apply the new column names
spotify_data = rename_columns(spotify_data)

# List the column names to confirm the rename
print('New column names:')
print(spotify_data.columns.tolist())

Reorder columns

In [10]:
# Select the columns in their final order; any column not listed here is left out of the result
spotify_data = spotify_data[['title', 'artist', 'album', 'release_year', 'popularity', 'genres', 'is_explicit',  'duration_seconds', 'album_cover']]

In [None]:
# NOTE(review): `display` is not imported in this file; presumably the IPython/Jupyter built-in - confirm
display(spotify_data)

#### Check the most common words in the genres column

In [None]:
def analyze_genre_words(df, n_words=25):
    """
    Analyze the most common words in the genres column.
    Genre names are lower-cased and split on both spaces and hyphens, so
    "Hip-Hop" and "hip hop" both contribute the words "hip" and "hop".
    The top words are drawn as a bar chart and printed to stdout.

    Parameters:
    df : DataFrame containing the genres column. Each non-null cell is either
         a list of genre names or the string form of such a list.
    n_words : Number of top words to display (default=25)

    Returns:
    Series with the count of every word found (not only the top n_words),
    as produced by Series.value_counts().

    Raises:
    ValueError or SyntaxError if a string cell is not a valid Python literal.
    """
    all_words = []

    for genre_list in df['genres'].dropna():    # iterate through every non-null row
        if isinstance(genre_list, str):
            # The text comes from a CSV file, so parse it as a literal;
            # eval() would execute whatever expression the cell contains.
            genre_list = ast.literal_eval(genre_list)

        for genre in genre_list:    # split by spaces and hyphens
            for term in genre.split():
                all_words.extend(part.lower() for part in term.split('-'))

    word_freq = pd.Series(all_words).value_counts() # count frequencies
    top_words = word_freq.head(n_words)

    if top_words.empty:
        # There is nothing to draw, and pandas refuses to plot empty data.
        print('No genre words found; nothing to plot.')
        return word_freq

    # tab20 only holds 20 colours and every index past the end maps to the
    # same colour, so wrap the indices to keep neighbouring bars distinct.
    palette = plt.cm.tab20
    bar_colors = palette(np.arange(len(top_words)) % palette.N)

    plt.figure(figsize=(12, 6))
    ax = top_words.plot(
        kind='bar',
        color=bar_colors,
        width=0.8
    )

    plt.title(f'Top {n_words} Most Common Words in Genres', pad=20)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')

    # write each count just above its bar
    for position, count in enumerate(top_words):
        ax.text(
            position, count, str(count),
            ha='center', va='bottom'
        )

    plt.tight_layout()

    print(f'\nTop {n_words} most common words in genres:')
    for word, count in top_words.items():
        print(f"{word}: {count}")

    return word_freq

# Plot and print the most frequent genre words; the full word counts are kept in word_frequencies
word_frequencies = analyze_genre_words(spotify_data)

#### One-hot encode genres

In [None]:
def _genre_word_set(genre_list):
    """
    Return the set of lower-cased words that make up one 'genres' cell.

    The cell may be a list of genre names or the string form of such a list;
    a missing value yields an empty set. Names are split on whitespace and
    hyphens. When both 'hip' and 'hop' occur, 'hip-hop' is added as well so
    that it can be matched as a single genre.
    """
    if isinstance(genre_list, str):
        # The text comes from a CSV file, so parse it as a literal;
        # eval() would execute whatever expression the cell contains.
        genre_list = ast.literal_eval(genre_list)
    elif pd.api.types.is_scalar(genre_list) and pd.isna(genre_list):
        return set()

    words = set()
    for genre in genre_list:
        for term in genre.split():
            words.update(part.lower() for part in term.split('-'))

    if 'hip' in words and 'hop' in words:   # special case for hip-hop
        words.add('hip-hop')

    return words


def hot_encode_genre_words(df, word_list):
    """
    Create binary columns for each genre word in the word_list.

    Parameters:
    df : DataFrame with a 'genres' column whose cells are lists of genre
         names or the string form of such lists
    word_list : words to encode; one integer column named after each word is
                added (or overwritten if a column with that name exists)

    Returns:
    A copy of df in which each new column is 1 where the word appears in the
    row's genres and 0 otherwise. The input frame is not modified.

    Raises:
    ValueError or SyntaxError if a string cell is not a valid Python literal.
    """
    df = df.copy()

    # Tokenise every row once; sets make the membership checks below cheap.
    row_words = [_genre_word_set(cell) for cell in df['genres']]

    for word in word_list:
        # Assigning a plain array is positional, so the result is correct even
        # when the index contains repeated labels.
        df[word] = np.array(
            [int(word in words) for words in row_words],
            dtype=np.int64,
        )

    return df

# define most common genres to encode
# (each entry becomes one 0/1 column; 'hip-hop' is matched through the special case in hot_encode_genre_words)
most_common_genres = [
    'rock',
    'pop',
    'blues',
    'metal',
    'hip-hop',
    'country',
    'punk',
    'jazz',
    'rap',
    'reggae',
    'folk',
    'soul',
    'latin',
    'dance',
    'indie',
    'classical'
]

# The encoded result is stored under a new name, so spotify_data keeps its current columns
spotify_data_encoded = hot_encode_genre_words(spotify_data, most_common_genres)

print('\nSample of encoded genres:')
print(spotify_data_encoded[most_common_genres].head())

# Column sums give the number of rows flagged for each genre word, largest first
print('\nTotal songs per genre:')
print(spotify_data_encoded[most_common_genres].sum().sort_values(ascending=False))

Reorder columns

In [None]:
# Final column layout: track metadata first, then the genre indicator columns
# (same names and order as most_common_genres), then album_cover and the raw genres value
new_column_order = [
    'title', 'artist', 'album', 'release_year', 'popularity', 'is_explicit', 'duration_seconds',
    'rock', 'pop', 'blues', 'metal', 'hip-hop', 'country', 'punk', 'jazz', 
    'rap', 'reggae', 'folk', 'soul', 'latin', 'dance', 'indie', 'classical',
    'album_cover', 'genres'
]

# Indexing with the list both reorders the frame and restricts it to these columns
spotify_data_encoded = spotify_data_encoded[new_column_order]
# NOTE(review): `display` is not imported in this file; presumably the IPython/Jupyter built-in - confirm
display(spotify_data_encoded)
spotify_data_encoded.info()

#### Save to CSV

In [15]:
# Export is disabled: uncomment the line below to write the encoded frame to disk without the index column
# spotify_data_encoded.to_csv('../data/clean/spotify_data_encoded.csv', index=False)