In [89]:
import ast
import re

import numpy as np
import pandas as pd

Run the code cell under "Cleaning functions" (further down) first so the functions are defined, then run the cell below to load and clean the data.

In [None]:
# Load the raw file and pass it through each cleaning step, in this order.
dat = (
    pd.read_csv('yourdatapath.csv')
    .pipe(drop_dupes_and_cols)
    .pipe(bin_album_label)
    .pipe(transform_date)
    .pipe(convert_duration_to_min)
    .pipe(extract_section_vars)
    .pipe(convert_to_numeric)
)

# this last step is optional
# dat = dummy_code_categoricals(dat)

### Cleaning functions

In [88]:
def drop_dupes_and_cols(df):
    '''
    Takes a dataframe and returns a new dataframe in which:
      - only the first row for each 'uri' value is kept,
      - the index is renumbered from 0,
      - the confidence columns and the unused per-section columns listed below are removed.
    The dataframe passed in is not modified.
    '''
    unwanted_cols = [
        'tempo_confidence',
        'overall_key_confidence',
        'mode_confidence',
        'time_signature_confidence',
        'section_tempo_confidences',
        'section_keys',
        'section_key_confidences',
        'section_modes',
        'section_mode_confidences',
        'section_time_signatures',
        'section_time_signature_confidences',
    ]

    deduped = df.drop_duplicates(subset = ["uri"]).reset_index(drop = True)

    return deduped.drop(columns = unwanted_cols)

def bin_album_label(df, drop = True, big_threshold = 80, medium_threshold = 11):
    '''
    Takes a dataframe and returns a new dataframe with a new column 'album_label_category'.
    The dataframe passed in is not modified.

    Each row is categorized by how many rows in df share its album_label (counted as the
    number of non-null 'album_uri' values for that label):
      'big_label'    : count >= big_threshold
      'medium_label' : medium_threshold <= count < big_threshold
      'small_label'  : everything else, including rows whose album_label is missing

    If drop = False, then the album_label and album_label_count (a feature created in this
    function) columns are not dropped.

    big_threshold and medium_threshold default to 80 and 11.
    Raises ValueError if medium_threshold is larger than big_threshold.
    '''
    if medium_threshold > big_threshold:
        raise ValueError('medium_threshold must not be larger than big_threshold')

    # Count a single column per label instead of counting every column and keeping one.
    counts_by_label = df.groupby('album_label')['album_uri'].count()
    # Rows with a missing album_label have no entry in counts_by_label and map to NaN.
    album_label_count = df['album_label'].map(counts_by_label)

    # The first matching condition wins. NaN compares False to both, so it lands in the default.
    counts = album_label_count.to_numpy()
    album_label_category = np.select(
        [counts >= big_threshold, counts >= medium_threshold],
        ['big_label', 'medium_label'],
        default = 'small_label',
    )

    df = df.assign(album_label_count = album_label_count,
                   album_label_category = album_label_category)

    if drop:
        df = df.drop(columns = ['album_label', 'album_label_count'])

    return df

def transform_date(df):
    '''
    Takes dataframe, returns a new dataframe with new col 'release_year_from_2020'.
    Drops 'album_release_date'. The dataframe passed in is not modified.
    'release_year_from_2020' = 2020 - year album was released, where the year is the first
    run of four digits found in 'album_release_date'.
    Rows whose date is missing or contains no four-digit year get NaN instead of raising.
    '''
    # Cast to str so missing or non-string cells go through the same regex path
    # (they simply fail to match and come back as NaN).
    year_text = df['album_release_date'].astype(str).str.extract(r'(\d{4})', expand = False)

    # to_numeric keeps NaN where no year was found; calling int() on those rows would raise.
    release_year = pd.to_numeric(year_text)

    return (df.drop(columns = 'album_release_date')
              .assign(release_year_from_2020 = 2020 - release_year))

def convert_duration_to_min(df):
    '''
    Takes dataframe, returns the same dataframe with 'duration_ms' replaced by 'duration_minutes'.
    'duration_minutes' = duration_ms / 60000
    The dataframe passed in is modified in place.
    '''
    ms_per_minute = 60000
    minutes = df["duration_ms"] / ms_per_minute

    df["duration_minutes"] = minutes
    df.drop(labels = ["duration_ms"], axis = 1, inplace = True)

    return df

def _section_stats(raw):
    '''
    Returns (variance, min, max) for one cell of a section column.
    Returns three NaNs when the cell holds no usable numbers: None, 'None', an empty
    list, or anything that cannot be read as a list of numbers.
    '''
    missing = (np.nan, np.nan, np.nan)

    if isinstance(raw, str):
        # Cells may hold the text form of a list (e.g. after a round trip through CSV),
        # with the text 'None' standing in for a missing value. Parse them back first.
        try:
            raw = ast.literal_eval(raw.strip())
        except (ValueError, SyntaxError):
            return missing

    if raw is None:
        return missing

    try:
        # atleast_1d lets a bare number behave like a one-element list.
        values = np.atleast_1d(np.asarray(raw, dtype = float))
    except (TypeError, ValueError):
        return missing

    if values.size == 0:
        return missing

    return np.var(values), np.min(values), np.max(values)


def extract_section_vars(df):
    '''
    Takes dataframe, returns a new dataframe with section variables turned into corresponding
    variance, min, and max columns. The dataframe passed in is not modified.
    Each section variable is dropped, and 3 new columns ('<var>_variance', '<var>_min',
    '<var>_max') are created in its stead.
    Section cells may be lists/arrays of numbers or the text form of such a list.
    Cells that are None, 'None', empty, or unreadable produce NaN in the 3 new columns.

    Finally drops every row that has a missing value in ANY column (which includes the rows
    whose section data was missing or empty) and renumbers the index from 0.
    '''
    section_vars = ['section_durations', 'section_loudnesses', 'section_tempos']

    new_columns = {}
    for var in section_vars:
        # One pass per column; reshape keeps the (n, 3) shape even when df has no rows.
        stats = np.array([_section_stats(cell) for cell in df[var].values],
                         dtype = float).reshape(-1, 3)
        new_columns[var + '_variance'] = stats[:, 0]
        new_columns[var + '_min'] = stats[:, 1]
        new_columns[var + '_max'] = stats[:, 2]

    cleaned = df.drop(columns = section_vars).assign(**new_columns)

    return cleaned.dropna(how = "any").reset_index(drop = True)

def convert_to_numeric(df):
    '''
    Takes dataframe, returns the same dataframe with every column named in numeric_cols
    converted with pd.to_numeric. Values that cannot be parsed become NaN.
    The dataframe passed in is modified in place.
    '''
    numeric_cols = [
        "acousticness", "danceability", "energy", "instrumentalness",
        "liveness", "loudness", "speechiness", "valence",
        "tempo", "overall_key", "mode", "time_signature",
        "num_of_sections", "num_of_keys", "num_of_modes", "num_of_time_signatures",
    ]

    converted = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
    df[numeric_cols] = converted

    return df

def dummy_code_categoricals(df):
    '''
    Takes df and returns a new df with dummy coded categorical variables.
    The df passed in is not modified; the returned df has its index renumbered from 0.
    Explicit with 1 = explicit, 0 = not.
    Album label category with 'small_label' as reference class: 'album_label_category' is
    replaced by one 0/1 column per other category present in the data ('album_<category>').
    '''
    df = df.reset_index(drop = True)

    # Pin the dtype so the dummies are 0/1 integers whatever default pandas uses.
    df_dummies = pd.get_dummies(df['album_label_category'], prefix = 'album', dtype = 'uint8')
    # The reference column only exists when some row is 'small_label'; skip it quietly
    # otherwise rather than raising KeyError.
    df_dummies = df_dummies.drop(columns = 'album_small_label', errors = 'ignore')

    df = pd.concat([df.drop(columns = 'album_label_category'), df_dummies], axis = 1)

    # Multiplying by 1 turns True/False into 1/0.
    df['explicit'] = df['explicit']*1

    return df