In [1]:
# Imports
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Intro
The relationship between musica  characteristics of each song and the popularity of each song is important for music production and song artists companies to create the next bit songs hits. The Spotify dataset contains songs that were on the Top 200 Weekly Global charts for Spotify in 2020 & 2021 along with each song’s genre, song artist id, and various musical attributes. The popularity score of each song is a number ranging from 0 to 100 that is used by Spotify to rank each song’s popularity. In this project, we will attempt to predict song popularity based on both song metadata as well as musical features generated by Spotify in-house algorithms. Our classification algorithm can be used by musical production comapnies and song artists to gauge how successful each candidate song would be.

# Data Prep

This dataset contains songs that were on the Top 200 Weekly Global charts for Spotify in 2020 & 2021. For each song, we have the following features.

| Feature                   | Feature Type |
|---------------------------|--------------|
| Highest Charting Position | Numeric      |
| Number of Times Charted   | Numeric      |
| Song Name                 | Categorical  |
| Song ID                   | Categorical  |
| Artist                    | Categorical  |
| Streams                   | Numeric      |
| Artist Followers          | Numeric      |
| Genre                     | Categorical  |
| Release Date              | Numeric      |
| Weeks Charted             | Numeric      |
| Popularity                | Numeric      |
| Danceability              | Numeric      |
| Acousticness              | Numeric      |
| Energy                    | Numeric      |
| Instrumentalness          | Numeric      |
| Loudness                  | Numeric      |
| Speechiness               | Numeric      |
| Tempo                     | Numeric      |
| Valence                   | Numeric      |
| Chord                     | Numeric      |



In [2]:
# Entire Dataset
path = 'spotify_dataset.csv'
data = pd.DataFrame(pd.read_csv(path))
data.head()

Unnamed: 0,Index,Highest Charting Position,Number of Times Charted,Week of Highest Charting,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
0,1,1,8,2021-07-23--2021-07-30,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",...,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
1,2,2,3,2021-07-23--2021-07-30,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],...,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
2,3,1,11,2021-06-25--2021-07-02,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],...,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A
3,4,3,5,2021-07-02--2021-07-09,Bad Habits,37799456,Ed Sheeran,83293380,6PQ88X9TkUIAUIZJHW2upE,"['pop', 'uk pop']",...,0.808,0.897,-3.712,0.0348,0.0469,0.364,126.026,231041,0.591,B
4,5,5,1,2021-07-23--2021-07-30,INDUSTRY BABY (feat. Jack Harlow),33948454,Lil Nas X,5473565,27NovPIUIRrOZoCHxABJwK,"['lgbtq+ hip hop', 'pop rap']",...,0.736,0.704,-7.409,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb


## Cleaning the dataset

We will exclude Song Name and Song ID from our feature set. 

In [3]:
# columns corresponding to: index, song name, song id
columns_to_exclude = [0, 4, 8]
data.drop(columns=data.columns[columns_to_exclude], inplace=True)

### Mapping Categorical Features to Classes

In [4]:
# Converting each artist to a class
artists = data['Artist'].tolist()
artists_set = set()
for artist in artists:
    multiple_artists = artist.split(", ")
    for each_artist in multiple_artists:
        artists_set.add(each_artist)

classes = range(len(artists_set))
artists_to_class = dict(zip(artists_set, classes))

# Adding classes as a column
artist_column = [[artists_to_class.get(each_artist) for each_artist in artist.split(", ")] for artist in artists]
data.insert(5, 'Artist_Class', artist_column)

In [5]:
# Binning genres
def cat_onehot(feat_name):
    cat_feats = data[feat_name].tolist()
    genres = set()
    for feat_list in cat_feats:
        temp = feat_list.replace("'", "")
        temp = temp.replace("[", "")
        temp = temp.replace("]", "")
        temp = temp.replace(" ", "")
        feats  = temp.split(",")
        for feat in feats:
            if "pop" in feat:
                genres.add("pop")
            elif "rock" in feat:
                genres.add("rock")
            elif "rap" in feat:
                genres.add("rap")
            elif "hiphop" in feat:
                genres.add("hiphop")
            else:
                genres.add(feat)
    print(genres)
    print(len(genres))

cat_onehot("Genre")

{'', 'edm', 'ranchera', 'funkcarioca', 'girlgroup', 'newwave', 'mariachi', 'indier&b', 'soul', 'dreamo', 'soundtrack', 'tekk', 'nuevoregionalmexicano', 'adultstandards', 'pop', 'acappella', 'sheffieldindie', 'trance', 'sunnlensktonlist', 'newromantic', 'escaperoom', 'talentshow', 'norteno', 'australianpsych', 'funk', 'classicsoul', 'canadiancontemporaryr&b', 'irishsinger-songwriter', 'house', 'boyband', 'brooklyndrill', 'dreamsmp', 'punk', 'funkbh', 'hiphop', 'francoton', 'indiesurf', 'motown', 'progressiveelectrohouse', 'beatlesque', 'britishsoul', 'eurodance', 'electrohouse', 'germandrill', 'disco', 'progressivehouse', 'forro', 'folktronica', 'madchester', 'plugg', 'vancouverindie', 'ravefunk', 'alternativemetal', 'permanentwave', 'hollywood', 'metalcore', 'post-grunge', 'deepeurohouse', 'easylistening', 'singer-songwriter', 'swing', 'r&b', 'surfpunk', 'comic', 'oulumetal', 'perreo', 'basshall', 'sertanejouniversitario', 'bregafunk', 'cubaton', 'torchsong', 'dutchedm', 'reggaeton', '

In [6]:
data.head()

Unnamed: 0,Highest Charting Position,Number of Times Charted,Week of Highest Charting,Streams,Artist,Artist_Class,Artist Followers,Genre,Release Date,Weeks Charted,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
0,1,8,2021-07-23--2021-07-30,48633449,Måneskin,[264],3377762,"['indie rock italiano', 'italian pop']",2017-12-08,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
1,2,3,2021-07-23--2021-07-30,47248719,The Kid LAROI,[686],2230022,['australian hip hop'],2021-07-09,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
2,1,11,2021-06-25--2021-07-02,40162559,Olivia Rodrigo,[458],6266514,['pop'],2021-05-21,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A
3,3,5,2021-07-02--2021-07-09,37799456,Ed Sheeran,[111],83293380,"['pop', 'uk pop']",2021-06-25,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,0.808,0.897,-3.712,0.0348,0.0469,0.364,126.026,231041,0.591,B
4,5,1,2021-07-23--2021-07-30,33948454,Lil Nas X,[57],5473565,"['lgbtq+ hip hop', 'pop rap']",2021-07-23,2021-07-23--2021-07-30,...,0.736,0.704,-7.409,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb


# Data Exploration

# Feature Engineering

# Data Analysis

# Results