# Library Imports

In [70]:
# import necessary libraries
import pandas as pd
from collections import Counter

# Parsing and Cleaning Data

In [71]:
file_name = "SpotifyFeatures.csv"
spotify_data = pd.read_csv(file_name)
spotify_data.dropna()
spotify_data.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [72]:
def get_counter_from_column(column):
    unique = Counter(spotify_data[column])
    print(f"Unique values for {column}: {dict(unique)}")
    return unique

# getting an idea of what non-numerical values we might have for data so we can map them to numbers later
# excluding things such as artist_name, track_name, and track_id due to the sheer size and number of unique elements
column_names = ["genre", "key", "mode", "time_signature"]
unique_genres, unique_keys, unique_modes, unique_time_signatures = [get_counter_from_column(column) for column in column_names]

# 0/4 time signature seems to be an outlier value for time signature, so we'll likely want to remove them
zero_four_time_signature = spotify_data[spotify_data["time_signature"] == "0/4"]
print(f"Number of tracks with 0/4 time signature: {len(zero_four_time_signature)}")

Unique values for genre: {'Movie': 7806, 'R&B': 8992, 'A Capella': 119, 'Alternative': 9263, 'Country': 8664, 'Dance': 8701, 'Electronic': 9377, 'Anime': 8936, 'Folk': 9299, 'Blues': 9023, 'Opera': 8280, 'Hip-Hop': 9295, "Children's Music": 5403, 'Children’s Music': 9353, 'Rap': 9232, 'Indie': 9543, 'Classical': 9256, 'Pop': 9386, 'Reggae': 8771, 'Reggaeton': 8927, 'Jazz': 9441, 'Rock': 9272, 'Ska': 8874, 'Comedy': 9681, 'Soul': 9089, 'Soundtrack': 9646, 'World': 9096}
Unique values for key: {'C#': 23201, 'F#': 15222, 'C': 27583, 'F': 20279, 'G': 26390, 'E': 17390, 'D#': 7566, 'G#': 15159, 'D': 24077, 'A#': 15526, 'A': 22671, 'B': 17661}
Unique values for mode: {'Major': 151744, 'Minor': 80981}
Unique values for time_signature: {'4/4': 200760, '5/4': 5238, '3/4': 24111, '1/4': 2608, '0/4': 8}
Number of tracks with 0/4 time signature: 8


In [73]:
def remove_time_signature(dataframe, time_signature):
    return dataframe[dataframe["time_signature"] != time_signature]

def create_integer_mapping(items):
    return {item: i for i, item in enumerate(items)}

# remove rows that have a 0/4 time signature 
spotify_data = remove_time_signature(spotify_data, "0/4")
zero_four_time_signature = spotify_data[spotify_data["time_signature"] == "0/4"]
print(f"Number of tracks with 0/4 time signature: {len(zero_four_time_signature)}")

genre_integer_mapping = create_integer_mapping(list(unique_genres))
print(f"Genre to integer mapping: {genre_integer_mapping}")
key_integer_mapping = create_integer_mapping(list(unique_keys))
print(f"Key to integer mapping: {key_integer_mapping}")
mode_integer_mapping = create_integer_mapping(list(unique_modes))
print(f"Mode to integer mapping: {mode_integer_mapping}")
time_signature_mapping = create_integer_mapping(list(unique_time_signatures))
print(f"Time signature to integer mapping: {time_signature_mapping}")

Number of tracks with 0/4 time signature: 0
Genre to integer mapping: {'Movie': 0, 'R&B': 1, 'A Capella': 2, 'Alternative': 3, 'Country': 4, 'Dance': 5, 'Electronic': 6, 'Anime': 7, 'Folk': 8, 'Blues': 9, 'Opera': 10, 'Hip-Hop': 11, "Children's Music": 12, 'Children’s Music': 13, 'Rap': 14, 'Indie': 15, 'Classical': 16, 'Pop': 17, 'Reggae': 18, 'Reggaeton': 19, 'Jazz': 20, 'Rock': 21, 'Ska': 22, 'Comedy': 23, 'Soul': 24, 'Soundtrack': 25, 'World': 26}
Key to integer mapping: {'C#': 0, 'F#': 1, 'C': 2, 'F': 3, 'G': 4, 'E': 5, 'D#': 6, 'G#': 7, 'D': 8, 'A#': 9, 'A': 10, 'B': 11}
Mode to integer mapping: {'Major': 0, 'Minor': 1}
Time signature to integer mapping: {'4/4': 0, '5/4': 1, '3/4': 2, '1/4': 3, '0/4': 4}


# 