In [None]:
# Imports
import csv
import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer

# Intro
The relationship between a song's musical characteristics and its popularity is important for music production companies and artists who want to create the next hit songs. This Spotify dataset contains songs that were on the Top 200 Weekly Global charts for Spotify in 2020 & 2021 along with each song’s genre, song artist ID, and various musical attributes. The highest charting position of each song is a number that indicates the highest position the song ever ranked. In this project, we will attempt to predict song genre based on both song metadata and musical features generated by Spotify's in-house algorithms. We will also try to predict the highest charting position of each song. Our classification algorithms can be used by music production companies and artists to gauge how certain metrics relate to genre, how genre can be predicted based on these features, and how these features relate to the ability of a song to make the top charts.

# Data Prep

This dataset contains songs that were on the Top 200 Weekly Global charts for Spotify in 2020 & 2021. For each song, we have the following features.

| Feature                   | Feature Type |
|---------------------------|--------------|
| Highest Charting Position | Numeric      |
| Number of Times Charted   | Numeric      |
| Song Name                 | Categorical  |
| Song ID                   | Categorical  |
| Artist                    | Categorical  |
| Streams                   | Numeric      |
| Artist Followers          | Numeric      |
| Genre                     | Categorical  |
| Release Date              | Numeric      |
| Weeks Charted             | Numeric      |
| Popularity                | Numeric      |
| Danceability              | Numeric      |
| Acousticness              | Numeric      |
| Energy                    | Numeric      |
| Instrumentalness          | Numeric      |
| Loudness                  | Numeric      |
| Speechiness               | Numeric      |
| Tempo                     | Numeric      |
| Valence                   | Numeric      |
| Chord                     | Categorical  |



In [None]:
# Entire Dataset
# read_csv already returns a DataFrame, so it is used directly.
path = 'spotify_dataset.csv'
data = pd.read_csv(path)
data.head()

## Cleaning the dataset

### Removing irrelevant columns and missing values
For our analysis, we will exclude song name and song ID from our feature set. These features do not give any information that would be pertinent to classifying the genre of each song. We will also drop any records that do not have a classified song genre, chord, or duration recorded, as these are considered incomplete records. We chose to drop these records as opposed to replacing values with the mean for that feature because each song varies significantly in terms of its features and there is no "true average" for each feature.

In [None]:
# Drop columns corresponding to: index, song name, song id
# (selected by position, so this assumes the column order of the raw CSV)
columns_to_exclude = [0, 4, 8]
data.drop(columns=data.columns[columns_to_exclude], inplace=True)

# Missing values are encoded as placeholder strings rather than NaN:
# ' ' for Chord and Duration, '[]' for Genre. Report how many there are.
# some songs have no defined Chord
print('Songs with no defined Chord', (data['Chord'] == ' ').sum())
# some song have not defined genre
print("Song with no defined genre", (data['Genre'] == '[]').sum())
# some songs have no defined duration
print('Songs with no defined Duration', (data["Duration (ms)"] == ' ').sum())

# Drop rows with missing values.
# The replaced column is assigned back instead of using
# `data[col].replace(..., inplace=True)`: an in-place call on a column
# selection does not update `data` under pandas Copy-on-Write.
data['Chord'] = data['Chord'].replace(' ', np.nan)
data['Genre'] = data['Genre'].replace('[]', np.nan)
data['Duration (ms)'] = data['Duration (ms)'].replace(' ', np.nan)
data.dropna(inplace=True)

### Fixing Data Types

This was done to make the calculations and comparisons during exploration easier since each feature would have the same data type.


In [None]:
# Fix dtypes so every numeric feature is stored as a number, not text.
# Streams is text with thousands separators (e.g. "1,234,567"): strip the
# commas and parse each value as an integer.
data['Streams'] = data['Streams'].map(lambda raw: int(raw.replace(',', '')))

# Remaining numeric columns: counts become int64, measurements become float.
types = {
    'Artist Followers': 'int64',
    'Duration (ms)': 'int64',
    'Popularity': 'float',
    'Danceability': 'float',
    'Acousticness': 'float',
    'Energy': 'float',
    'Liveness': 'float',
    'Loudness': 'float',
    'Speechiness': 'float',
    'Tempo': 'float',
    'Valence': 'float',
}
data = data.astype(types)

### Mapping Categorical Features to Classes

We mapped the artist feature to a new feature "Artist Class". This avoids dealing with string names in classification and instead, each artist corresponds to a unique identifier for that artist. The chord feature and genre feature were also engineered into new classes "Chord Class" and "Genre Class" for the same reason.

In [None]:
# Converting each artist to a class
# A song's Artist field may list several artists separated by ", ";
# every individual artist gets its own integer id.
artists = data['Artist'].tolist()
artists_set = {each_artist for artist in artists for each_artist in artist.split(", ")}

# Ids are assigned in sorted order. Numbering in raw set iteration order
# would give different ids on every interpreter run (string hashes are
# randomised), making downstream results irreproducible.
classes = range(len(artists_set))
artists_to_class = dict(zip(sorted(artists_set), classes))

# Adding classes as a column: one list of artist ids per song
artist_column = [[artists_to_class[each_artist] for each_artist in artist.split(", ")] for artist in artists]
data.insert(5, 'Artist_Class', artist_column)

In [None]:
# Converting each chord to a class
chords = set(data['Chord'].tolist())

# Number the chords in sorted order so the chord -> id mapping is the same
# on every run (raw set iteration order is not stable between runs).
classes = range(len(chords))
chords_to_class = dict(zip(sorted(chords), classes))

# Adding classes as a column
chord_column = [chords_to_class[chord_name] for chord_name in data['Chord'].tolist()]
data['Chord_Class'] = chord_column

In [None]:
# General genres every song is mapped onto
popular = ["pop", "rock", "rap", "hiphop", "country", "latin", "other"]
# Substrings that mark a fine-grained genre as a latin genre
latin = ["mexican", "sertanejo", "forro", "piseiro", "latin", "espanol"]
# Filled in below with one entry per song
genres = []


def is_latin(genre):
    """Return True if any keyword from `latin` occurs as a substring of `genre`."""
    return any(latin_genre in genre for latin_genre in latin)

# Each Genre value is the text of a Python-style list, e.g. "['dance pop', 'pop']".
# Removing quotes, brackets and spaces leaves a comma-separated string.
genre_strip_table = str.maketrans("", "", "'[] ")

genre_feat = data["Genre"].tolist()
song_genre_sets = []
for feat_list in genre_feat:
    # Parse genres
    feats = feat_list.translate(genre_strip_table).split(",")
    song_genre = set()
    # Collapse fine-grained genres into the general genres by substring match.
    # The order of the checks matters: the first matching keyword wins.
    # NOTE: these are substring tests, so e.g. "rap" also matches "trap".
    for feat in feats:
        if "pop" in feat:
            song_genre.add("pop")
        elif "rock" in feat:
            song_genre.add("rock")
        elif "rap" in feat or "brooklyndrill" in feat:
            song_genre.add("rap")
        elif "hiphop" in feat:
            song_genre.add("hiphop")
        elif "country" in feat:
            song_genre.add("country")
        elif is_latin(feat):
            song_genre.add("latin")
    # Songs matching none of the general genres fall into "other"
    if not song_genre:
        song_genre.add("other")
    song_genre_sets.append(song_genre)

# Find genre frequencies (a song counts once for each general genre it has)
genre_freq = Counter(genre for song_genres in song_genre_sets for genre in song_genres)

# Replace each song's genre set with its single most frequent genre.
# Candidates are sorted first so that ties are always broken the same way;
# iterating the raw set would make the winner vary between runs.
genres = [max(sorted(song_genres), key=genre_freq.__getitem__) for song_genres in song_genre_sets]

# Converting each genre to a class
unique_genres = set(genres)

# Sorted numbering keeps the genre -> id mapping stable across runs
classes = range(len(unique_genres))
genres_to_class = dict(zip(sorted(unique_genres), classes))

# Adding classes as a column
genre_column = [genres_to_class[genre_name] for genre_name in genres]
data['Genre_Class'] = genre_column


In [None]:
# Preview the first rows of the frame after cleaning and class encoding
data.head()

# Data Exploration

After some data preparation, we explored the features and plotted their ranges to visualize any outliers in the dataset.

In [None]:
# Exploring and Checking for Noise and Outliers

# --- Share of songs per stream-count bucket (pie chart) ---
# Buckets are 1M wide from 0 to 10M, plus one catch-all bucket above 10M.
# The catch-all edge is raised if needed so the largest value is still counted.
million = 1000000
top_edge = max(99999999, int(data['Streams'].max()) + 1)
stream_bins = list(range(0, 10 * million + 1, million)) + [top_edge]
hist, bin_edges = np.histogram(data['Streams'], bins=stream_bins)
percentages = hist/np.sum(hist)
# Labels (in millions) are built from the bin edges so they always match
# the buckets: '0-1', '1-2', ..., '9-10', '10+'.
labels = np.array(
    ['{}-{}'.format(lo // million, hi // million)
     for lo, hi in zip(stream_bins[:-2], stream_bins[1:-1])]
    + ['10+']
)
# Hide empty buckets so the pie has no zero-width slices
non_empty = percentages != 0
labels = labels[non_empty]
percentages = percentages[non_empty]
fig, ax = plt.subplots(1, 1)
ax.pie(percentages, labels=labels)
ax.set_title('Number of Streams (millions)')
plt.show()

# --- Chord frequencies ---
# should sort chords from low to high?
ax = data['Chord'].value_counts().plot.bar()
ax.set_title('Chords')
plt.show()

# --- Song duration, 30-second buckets ---
duration_bins = list(range(0, 30000*2*10, 30000))
fig, ax = plt.subplots(1, 1)
ax.hist(pd.to_numeric(data["Duration (ms)"]), bins=duration_bins)
ax.ticklabel_format(style='plain')
ax.set_title('Histogram of duration(ms)')
plt.show()

# --- Features plotted on a 0..1 scale ---
# Ten equal bins covering the full [0, 1] range; starting the bins at 0.1
# would silently drop every value below 0.1.
zero_to_one_bins = np.linspace(0.0, 1.0, 11)
fig, ax = plt.subplots(2, 3, constrained_layout=True)
zero_to_one_columns = ['Danceability', 'Acousticness', 'Energy', 'Liveness', 'Speechiness', 'Valence']
for i, column in enumerate(zero_to_one_columns):
    idx = np.unravel_index(i, (2, 3))
    ax[idx].hist(pd.to_numeric(data[column]), bins=zero_to_one_bins)
    ax[idx].set_title(column)
fig.suptitle('Histograms')
plt.show()

# --- Stream counts, 1M buckets ---
# The upper bound is max + step so that the last edge is >= the maximum;
# range(0, max, step) stops short of it and leaves the top values out.
stream_step = 1000000
stream_hist_bins = list(range(0, int(data['Streams'].max()) + stream_step, stream_step))
fig, ax = plt.subplots(1, 1)
ax.hist(pd.to_numeric(data["Streams"]), bins=stream_hist_bins)
ax.ticklabel_format(style='plain')
ax.set_title('Histogram of Stream Count')
plt.show()

# --- Highest charting position, buckets of 10 ---
# Same upper-bound handling as above so the worst positions are included.
position_step = 10
position_bins = list(range(0, int(data['Highest Charting Position'].max()) + position_step, position_step))
fig, ax = plt.subplots(1, 1)
ax.hist(pd.to_numeric(data['Highest Charting Position']), bins=position_bins)
ax.ticklabel_format(style='plain')
ax.set_title('Histogram of Highest Charting Positions')
plt.show()

In [None]:
# Exploring
# Summary statistics of the highest charting position, grouped by how many
# times a song charted.
times_charted_groups = data.groupby('Number of Times Charted')
times_charted_groups['Highest Charting Position'].describe()

In [None]:
# Exploring
# Pearson correlation between the numeric features.
# The encoded class columns are left out because their ids are arbitrary
# labels, and only numeric columns are kept because correlation is undefined
# for the remaining text/list columns. The heatmap is drawn from this
# filtered frame (it was previously built and then ignored).
numeric_data = data.drop(["Chord_Class", "Genre_Class"], axis=1).select_dtypes(include='number')
sns.heatmap(numeric_data.corr(method='pearson'))

# Feature Engineering

In [None]:
# Clean data a bit more
# drop non-number data (including data we've converted to numbers)
cols_to_drop = ['Week of Highest Charting', 'Artist', 'Genre', 'Release Date', 'Weeks Charted', 'Chord']
data_only_nums = data.drop(cols_to_drop, axis=1)

# only consider first artist for ease
# Artist_Class holds a list of artist ids per song; keep the first id.
# This must index into the Artist_Class value itself: a row-wise
# `apply(lambda x: x[0], axis=1)` returns the row's FIRST COLUMN instead,
# which would overwrite Artist_Class with an unrelated feature.
data_only_nums['Artist_Class'] = data_only_nums['Artist_Class'].map(
    lambda artist_classes: artist_classes[0]
)

# Coarsen streams so we have a chance at predicting it well:
# streams are truncated down to a multiple of 10M, followers to a multiple of 100k.
data_only_nums['Streams'] = (data_only_nums['Streams'] / 10000000).astype(int) * 10000000
data_only_nums['Artist Followers'] = (data_only_nums['Artist Followers'] / 100000).astype(int) * 100000
data_only_nums.head()

In [None]:
# Create data/labels frames
# Genre task: target is Genre_Class, features are every other column.
genre_labels = data_only_nums['Genre_Class'].values.ravel()
genre_data = data_only_nums.drop(columns=['Genre_Class'])

# Charting task: target is Highest Charting Position, features are every other column.
charting_data = data_only_nums.drop(columns=['Highest Charting Position'])

# Bin the highest charting positions into 20 equal-width ordinal bins.
# The discretizer expects a 2-D column, hence the reshape.
discretize = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
raw_positions = np.array(data_only_nums['Highest Charting Position'].values).reshape(-1, 1)
binned_positions = discretize.fit_transform(raw_positions)
charting_labels = pd.Series(binned_positions.flatten())


print('Genre Data Shape:', genre_data.shape, '\nGenre Labels Shape:', len(genre_labels))
print('Charting Data Shape:', charting_data.shape, '\nCharting Labels Shape:', len(charting_labels))

## Class Imbalances 
From our visualization of the song genres, it is apparent that there are many more pop and rap records represented in the Spotify dataset than records of the other song genres. To address this class imbalance, we will use the TomekLinks method implemented by the *imblearn* package, which undersamples the majority classes to balance the dataset. Note that imblearn's pipeline applies TomekLinks only while fitting; the test set is not resampled.

In [None]:
# Bar chart of how many songs fall in each genre class.
# value_counts() orders the bars by the count of each Genre_Class id, so the
# tick labels are looked up from those ids. Taking names from
# genre_freq.most_common() would follow a different ordering (genre_freq is
# counted before each song is reduced to one genre) and could mislabel bars.
class_to_genre = {genre_class: name for name, genre_class in genres_to_class.items()}
genre_counts = data['Genre_Class'].value_counts()
ax = genre_counts.plot.bar()
ax.set_title('Song Genre')
genre_names = [class_to_genre[genre_class] for genre_class in genre_counts.index]
ax.set_xticklabels(labels=genre_names)
plt.show()

# Data Analysis

For our data analysis, we will train four classifiers to predict both song genre and highest charting position. The four classifiers we will use are:

1. Decision Tree
2. K-Nearest Neighbors
3. Adaboost
4. Neural Network

## Decision Tree

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import GridSearchCV

def do_decision_tree(data, labels, label_name):
    """Tune and evaluate a decision tree with nested cross-validation.

    An inner 5-fold grid search picks the split criterion and maximum depth
    of a TomekLinks + DecisionTreeClassifier pipeline; an outer 10-fold
    cross_val_predict produces the predictions that are scored.
    Prints the accuracy (in percent) and a classification report.

    data: feature table passed to the pipeline.
    labels: target class for each row of `data`.
    label_name: name of the target; not used by this function.
    """
    # Resample with TomekLinks, then classify
    model = make_pipeline_with_sampler(
        TomekLinks(),
        DecisionTreeClassifier(criterion='entropy'),
    )

    # Hyper-parameters explored by the inner grid search
    search_space = {
        'decisiontreeclassifier__criterion': ["gini", "entropy"],
        'decisiontreeclassifier__max_depth': list(range(1, 10, 2)),
    }
    tuner = GridSearchCV(model, search_space, cv=5, scoring='accuracy')

    # Outer cross-validation: every row is predicted by a model that did not see it
    predictions = cross_val_predict(tuner, data, labels, cv=10)
    print('Accuracy:', (accuracy_score(labels, predictions) * 100))
    print(classification_report(labels, predictions, zero_division=0))
    
# Run the decision tree on both prediction tasks
for heading, task_data, task_labels, task_name in (
    ("Genre Classification", genre_data, genre_labels, 'Genre'),
    ("Charting Position Classification", charting_data, charting_labels, 'Highest Charting Position'),
):
    print(heading)
    do_decision_tree(task_data, task_labels, task_name)

### Results
A decision tree working this well with our data is surprising, especially with how well it can predict charting position! This is likely due to the fact that there are a lot of numerical features such as danceability and energy. This allows the tree to split on these features and make more accurate predictions.

## KNN

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer

def do_knn(data, labels, label_name):
    """Tune and evaluate a K-nearest-neighbours classifier with nested cross-validation.

    Numeric columns are standardized and passed through PCA, categorical
    columns are one-hot encoded, and TomekLinks resampling is the first step
    of the pipeline. An inner 5-fold grid search picks n_neighbors; an outer
    10-fold cross_val_predict produces the predictions that are scored.
    Prints the accuracy (in percent) and a classification report.

    data: feature table; only the known feature columns present in it are used.
    labels: target class for each row of `data`.
    label_name: "Genre" adds the charting position as a numeric feature;
        any other value adds the genre class as a categorical feature.
    """
    numeric_candidates = [
        "Danceability", "Energy", "Loudness", "Speechiness", "Acousticness",
        "Liveness", "Tempo", "Duration (ms)", "Valence", "Artist Followers",
        "Popularity", "Number of Times Charted", "Streams",
    ]
    categorical_candidates = ["Artist_Class", "Chord_Class"]

    if label_name == "Genre":
        # Add chart pos as feat for genre classifier
        numeric_candidates.append("Highest Charting Position")
    else:
        # Add genre as feat for chart pos classifier
        categorical_candidates.append("Genre_Class")

    # Don't process features that we dropped for testing
    numeric_feat = [name for name in numeric_candidates if name in data.columns]
    cat_feat = [name for name in categorical_candidates if name in data.columns]

    # Per-column-group preprocessing:
    #   numeric     -> standardize, then PCA
    #   categorical -> one-hot encode (categories unseen during fit are ignored)
    feat_transform = ColumnTransformer(
        transformers=[
            ('num_transform', make_pipeline_with_sampler(StandardScaler(), PCA()), numeric_feat),
            ('cat_transform', make_pipeline_with_sampler(OneHotEncoder(handle_unknown='ignore')), cat_feat),
        ]
    )

    # Full pipeline: resample, transform features, classify.
    # n_neighbors=7 is only a starting value; the grid search overrides it.
    pipe = make_pipeline_with_sampler(
        TomekLinks(),
        feat_transform,
        KNeighborsClassifier(n_neighbors=7),
    )

    # Inner grid search over odd neighbour counts 1..9
    search_space = {'kneighborsclassifier__n_neighbors': list(range(1, 10, 2))}
    tuner = GridSearchCV(pipe, search_space, cv=5, scoring='accuracy')

    # Outer cross-validation
    predictions = cross_val_predict(tuner, data, labels, cv=10)
    print('Accuracy:', (accuracy_score(labels, predictions) * 100))
    print(classification_report(labels, predictions, zero_division=0))

# Run KNN on both prediction tasks
for heading, task_data, task_labels, task_name in (
    ("Genre Classification", genre_data, genre_labels, "Genre"),
    ("Charting Position Classification", charting_data, charting_labels, "Highest Charting Position"),
):
    print(heading)
    do_knn(task_data, task_labels, task_name)

### Results
While KNN performed similarly to the decision tree for predicting genre, KNN did much worse with predicting highest charting position. In order to try and improve this classification, we will reduce the dimensionality of our data by pruning extraneous features and only including columns like "danceability", "tempo", etc which are directly related to the song.

In [None]:
# Re-run KNN on a reduced feature set: chart, stream, artist and popularity
# columns are removed from both tasks, along with each task's cross-task column.
shared_drop_cols = ['Number of Times Charted', 'Streams', 'Artist_Class', 'Artist Followers', 'Popularity']
genre_drop_cols = ['Highest Charting Position'] + shared_drop_cols
charting_drop_cols = ['Genre_Class'] + shared_drop_cols
new_genre_data = genre_data.drop(columns=genre_drop_cols)
new_charting_data = charting_data.drop(columns=charting_drop_cols)

do_knn(new_genre_data, genre_labels, 'New Genre')
do_knn(new_charting_data, charting_labels, 'New Highest Charting Position')

This general dimensionality reduction actually reduced the accuracy of KNN in predicting highest charting position. This can be explained by the information we lost during the reduction. Artists with more followers will probably tend to reach higher charting positions with their songs than other artists whose songs happen to have a similar "danceability" rating. Because of this, features like artist followers that are not directly related to the song may allow for better prediction of charting position than features more closely tied to the song, like danceability and tempo, since the latter are more generalized across the music industry.

## Ensemble (Adaboost)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

def do_adaboost(data, labels, label_name):
    """Evaluate an AdaBoost ensemble of decision trees with 10-fold cross-validation.

    The pipeline applies TomekLinks resampling and then fits 25 boosted
    entropy-criterion decision trees. Prints the accuracy (in percent) and a
    classification report for the cross-validated predictions.

    data: feature DataFrame; it is converted to a plain array before fitting.
    labels: target class for each row of `data`.
    label_name: name of the target; not used by this function.
    """
    data = data.values
    tree = DecisionTreeClassifier(criterion='entropy')
    # The weak learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn, and the old keyword was
    # later removed. The first positional argument works on both.
    clf = AdaBoostClassifier(tree, n_estimators=25)

    pipe = make_pipeline_with_sampler(
        TomekLinks(),
        clf
    )

    preds = cross_val_predict(pipe, data, labels, cv=10)
    print('Accuracy:', (accuracy_score(labels, preds) * 100))
    print(classification_report(labels, preds, zero_division=0))

# Run AdaBoost on both prediction tasks
for task_data, task_labels, task_name in (
    (genre_data, genre_labels, 'Genre'),
    (charting_data, charting_labels, 'Highest Charting Position'),
):
    do_adaboost(task_data, task_labels, task_name)

### Results
The ensemble method of classification does a similar job at classifying genre as decision trees and KNN. However, it is very accurate at predicting highest charting position. This is likely because AdaBoost boosts the records that are more difficult to classify, giving them a higher weight in the next model instance.

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



# Hyper-parameters explored for the MLP: one hidden layer of 100..500 units,
# two activation functions and two initial learning rates.
param_grid = {
    'nn__hidden_layer_sizes': [(width,) for width in range(100, 501, 100)],
    'nn__activation': ['relu', 'logistic'],
    'nn__learning_rate_init': [0.001, 0.0001],
}

# Standardize the features, then classify with the neural network
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('nn', MLPClassifier(learning_rate='adaptive')),
])
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

# To suppress warning
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# Nested cross-validation (outer 5 folds around the inner grid search),
# first for the genre task, then for the charting-position task.
for task_data, task_labels in (
    (genre_data, genre_labels),
    (charting_data, charting_labels),
):
    accuracies = cross_val_score(grid_search, task_data, task_labels, cv=5)
    print('Accuracies of Folds:', accuracies)
    print('Average Accuracy: %.4f%%' % (accuracies.mean() * 100))

## Overall Conclusions
In order to predict genre and highest charting position, we used three classifiers: decision tree, KNN, and an ensemble method with boosting. Prior to classification, in data preparation, we removed extraneous features such as song ID and mapped categorical features to numeric ones. Then, in data exploration, we visualized the range of each feature to easily spot any noise in the data or outliers for a particular feature. For feature engineering, we simplified some features for easier classification, such as only considering the first artist on the song and rounding streams for better predictions. We also one-hot encoded some of the categorical features. In the data exploration phase, we found that a majority of the records were from the pop and rap genres, resulting in a class imbalance. In order to compensate for this, we used TomekLinks, which undersamples the majority class to offset the imbalance. Then, the data was ready for classification. For genre, the prediction accuracies were around 63% (decision tree), 59% (KNN), and 57% (ensemble boosting). These accuracies are all pretty similar, with the decision tree doing the best. For highest charting position, the prediction accuracies were 100% (decision tree), 12.58% (KNN), and 99.18% (ensemble boosting). We attempted to improve the accuracy of the KNN classifier through dimensionality reduction, but this actually diminished the accuracy. This is likely due to the fact that when dimensionality is reduced, information is lost from the dataset that may be useful to the classifier. Overall, the decision tree classifier performed the best at predicting the genre and highest charting position of the songs in this Spotify dataset.