In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

# spotify_data

This dataset was extracted from the Spotify platform using the Python library "Spotipy", which allows users to access music data provided via APIs. The dataset collected includes about 1 Million tracks with 19 features between 2000 and 2023. Also, there is a total of 61,445 unique artists and 82 genres in the data.

This clean data has been prepared and utilized for research purposes. Its significance lies in its potential to unravel patterns and predict song popularity prior to its release. This dataset could be used to create various predictive models with machine-learning/deep-learning techniques.

https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks

In [20]:
pd.set_option('display.max_columns', None)

In [21]:
spotify = pd.read_csv('reduced_spotify_2022.csv')
spotify.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,530310,Gen Hoshino,Comedy,5SuOikwiRyPMVoIQDJUgSV,68,2022,acoustic,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,230667,4
1,530311,Steve Moakler,Make A Little Room,62yXckrKNy9Sylka6oifst,55,2022,acoustic,0.758,0.722,1,-6.252,1,0.0511,0.165,6e-06,0.106,0.66,146.031,180493,4
2,530312,Ben Rector,Steady Love,6UOkF086zUgYXnqFxqUvEI,57,2022,acoustic,0.678,0.277,11,-10.537,1,0.0602,0.843,4e-06,0.104,0.238,76.935,224333,4
3,530313,Ben Rector,Living My Best Life,0R7EWhquaAICmyE5MZqt3q,56,2022,acoustic,0.566,0.733,4,-5.302,1,0.0699,0.083,2e-06,0.362,0.636,178.828,215213,4
4,530314,Drew Holcomb,Coming Home,2C3gh14oarzTtv3aY3HJ0m,54,2022,acoustic,0.625,0.81,10,-5.995,1,0.033,0.146,2.7e-05,0.135,0.741,104.066,146360,4


In [22]:
spotify.shape

(53692, 20)

In [23]:
spotify.isna().sum()

Unnamed: 0          0
artist_name         0
track_name          0
track_id            0
popularity          0
year                0
genre               0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64

# 1. Preprocessing 

In [24]:
grouped_df = spotify.groupby('artist_name')['track_name'].apply(list).reset_index()
grouped_df.shape

(12779, 2)

In [25]:
#created a new encoded column that identifies each singer 
#motive, maybe if a singer is know it'll have more popularity
spotify['artist_label'] = pd.factorize(spotify['artist_name'])[0]
spotify['artist_label'].describe()

count    53692.000000
mean      6201.992755
std       3747.952493
min          0.000000
25%       2739.000000
50%       6378.000000
75%       9597.250000
max      12778.000000
Name: artist_label, dtype: float64

In [26]:
#Maybe the shorter the name better popularity
# Create a new column 'word_count' with the total word count for each track name

def count_word(text):
    return len(text.split())


spotify['word_count_track123'] = spotify['track_name'].apply(count_word)
spotify['word_count_track123'].describe()

count    53692.000000
mean         3.465749
std          2.617360
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         40.000000
Name: word_count_track123, dtype: float64

In [27]:
#Maybe the shorter the name better popularity
# Create a new column 'word_count_track' with the total word count for each track name

def word_count(data, text_column_name):
    data['word_count_track'] = data[text_column_name].apply(lambda text: len(text.split()))
    return data

word_count(spotify, 'track_name')


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_label,word_count_track123,word_count_track
0,530310,Gen Hoshino,Comedy,5SuOikwiRyPMVoIQDJUgSV,68,2022,acoustic,0.676,0.461,1,-6.746,0,0.1430,0.03220,0.000001,0.3580,0.715,87.917,230667,4,0,1,1
1,530311,Steve Moakler,Make A Little Room,62yXckrKNy9Sylka6oifst,55,2022,acoustic,0.758,0.722,1,-6.252,1,0.0511,0.16500,0.000006,0.1060,0.660,146.031,180493,4,1,4,4
2,530312,Ben Rector,Steady Love,6UOkF086zUgYXnqFxqUvEI,57,2022,acoustic,0.678,0.277,11,-10.537,1,0.0602,0.84300,0.000004,0.1040,0.238,76.935,224333,4,2,2,2
3,530313,Ben Rector,Living My Best Life,0R7EWhquaAICmyE5MZqt3q,56,2022,acoustic,0.566,0.733,4,-5.302,1,0.0699,0.08300,0.000002,0.3620,0.636,178.828,215213,4,2,4,4
4,530314,Drew Holcomb,Coming Home,2C3gh14oarzTtv3aY3HJ0m,54,2022,acoustic,0.625,0.810,10,-5.995,1,0.0330,0.14600,0.000027,0.1350,0.741,104.066,146360,4,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53687,583997,Ursula 1000,Don't Cheat The Beat,4pmoZnemYFd8r0Hoet7bTe,1,2022,trip-hop,0.696,0.806,4,-6.447,0,0.0347,0.01190,0.013100,0.0708,0.963,120.015,251803,4,12762,4,4
53688,583998,Morcheeba,What New York Couples Fight About - Instrumental,2z73p5IpvIxuLkyRCo7inP,0,2022,trip-hop,0.667,0.634,2,-10.974,1,0.0275,0.37600,0.648000,0.1020,0.412,155.991,375667,4,12711,8,8
53689,583999,Morcheeba,Way Beyond - Instrumental,3oOztziCNDx8fPmkZePB0u,0,2022,trip-hop,0.630,0.472,7,-9.448,1,0.0310,0.00779,0.802000,0.1300,0.379,81.001,216507,4,12711,4,4
53690,584000,Jay-Jay Johanson,"Both sides now - From ""Love Actually""",3jyWKT91FkQYijWGCMAF0h,9,2022,trip-hop,0.331,0.280,11,-10.733,1,0.0360,0.69800,0.004090,0.1910,0.263,73.156,270187,5,12737,7,7


In [28]:
def labelencoder(df: pd.DataFrame, column:str) -> pd.DataFrame:
    if isinstance(column, str):  # If a single column name is provided
        column = [column]  # Convert it to a list

        missing_columns = [col for col in column if col not in df]
        if missing_columns:
            raise ValueError(f"Columns {missing_columns} not found in the DataFrame.")
        
    encoder = LabelEncoder()
    spotify[f'{column}_encoded'] = encoder.fit_transform(df[column])
    return df

labelencoder(spotify, 'genre')

  y = column_or_1d(y, warn=True)


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_label,word_count_track123,word_count_track,['genre']_encoded
0,530310,Gen Hoshino,Comedy,5SuOikwiRyPMVoIQDJUgSV,68,2022,acoustic,0.676,0.461,1,-6.746,0,0.1430,0.03220,0.000001,0.3580,0.715,87.917,230667,4,0,1,1,0
1,530311,Steve Moakler,Make A Little Room,62yXckrKNy9Sylka6oifst,55,2022,acoustic,0.758,0.722,1,-6.252,1,0.0511,0.16500,0.000006,0.1060,0.660,146.031,180493,4,1,4,4,0
2,530312,Ben Rector,Steady Love,6UOkF086zUgYXnqFxqUvEI,57,2022,acoustic,0.678,0.277,11,-10.537,1,0.0602,0.84300,0.000004,0.1040,0.238,76.935,224333,4,2,2,2,0
3,530313,Ben Rector,Living My Best Life,0R7EWhquaAICmyE5MZqt3q,56,2022,acoustic,0.566,0.733,4,-5.302,1,0.0699,0.08300,0.000002,0.3620,0.636,178.828,215213,4,2,4,4,0
4,530314,Drew Holcomb,Coming Home,2C3gh14oarzTtv3aY3HJ0m,54,2022,acoustic,0.625,0.810,10,-5.995,1,0.0330,0.14600,0.000027,0.1350,0.741,104.066,146360,4,3,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53687,583997,Ursula 1000,Don't Cheat The Beat,4pmoZnemYFd8r0Hoet7bTe,1,2022,trip-hop,0.696,0.806,4,-6.447,0,0.0347,0.01190,0.013100,0.0708,0.963,120.015,251803,4,12762,4,4,81
53688,583998,Morcheeba,What New York Couples Fight About - Instrumental,2z73p5IpvIxuLkyRCo7inP,0,2022,trip-hop,0.667,0.634,2,-10.974,1,0.0275,0.37600,0.648000,0.1020,0.412,155.991,375667,4,12711,8,8,81
53689,583999,Morcheeba,Way Beyond - Instrumental,3oOztziCNDx8fPmkZePB0u,0,2022,trip-hop,0.630,0.472,7,-9.448,1,0.0310,0.00779,0.802000,0.1300,0.379,81.001,216507,4,12711,4,4,81
53690,584000,Jay-Jay Johanson,"Both sides now - From ""Love Actually""",3jyWKT91FkQYijWGCMAF0h,9,2022,trip-hop,0.331,0.280,11,-10.733,1,0.0360,0.69800,0.004090,0.1910,0.263,73.156,270187,5,12737,7,7,81


In [29]:
print(spotify['genre'].nunique())
encoder = LabelEncoder()
spotify['genre_encoded'] = encoder.fit_transform(spotify['genre'])

82


In [30]:
spotify['years_since_out'] = datetime.today().year - spotify.year 
# Print the result
print(spotify[['year', 'years_since_out']].head())

   year  years_since_out
0  2022                1
1  2022                1
2  2022                1
3  2022                1
4  2022                1


In [31]:
#Also drop the ones I don't think apport any value
columns1 = ['track_id', 'Unnamed: 0']
spotify = spotify.drop(columns1, axis=1)

In [32]:
spotify.describe()

Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_label,word_count_track123,word_count_track,['genre']_encoded,genre_encoded,years_since_out
count,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0,53692.0
mean,31.08897,2022.0,0.548411,0.641324,5.26937,-8.962979,0.617839,0.098261,0.31657,0.250889,0.217718,0.425979,122.101232,216580.7,3.893727,6201.992755,3.465749,3.465749,38.001062,38.001062,1.0
std,17.848236,0.0,0.181508,0.274144,3.56453,6.23788,0.48592,0.135566,0.35344,0.366219,0.192361,0.25806,30.03513,105009.8,0.454595,3747.952493,2.61736,2.61736,23.780676,23.780676,0.0
min,0.0,2022.0,0.0542,5.6e-05,0.0,-47.322,0.0,0.0222,0.0,0.0,0.00995,0.0,31.237,15559.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
25%,17.0,2022.0,0.43075,0.462,2.0,-10.458,0.0,0.0384,0.00734,0.0,0.1,0.207,99.60575,161666.8,4.0,2739.0,2.0,2.0,16.0,16.0,1.0
50%,31.0,2022.0,0.564,0.702,5.0,-7.185,1.0,0.0521,0.143,0.00133,0.131,0.401,123.002,200000.0,4.0,6378.0,3.0,3.0,38.0,38.0,1.0
75%,44.0,2022.0,0.685,0.873,8.0,-5.131,1.0,0.0928,0.618,0.609,0.284,0.626,140.038,249364.5,4.0,9597.25,4.0,4.0,58.0,58.0,1.0
max,94.0,2022.0,0.985,1.0,11.0,2.82,1.0,0.962,0.996,1.0,0.998,0.995,229.949,4318932.0,5.0,12778.0,40.0,40.0,81.0,81.0,1.0


# 2. Feature creation

In [40]:
def create_binary_energy_column(data, column_name, threshold=0.5):
    data['is_' + column_name + '_high'] = (data[column_name] > threshold).astype(int)
    return data


# Call the function to create a categorical bin column for 'energy'
spotify = create_binary_energy_column(spotify, 'energy', threshold=0.5)
spotify = create_binary_energy_column(spotify, 'loudness', threshold=-10) 
spotify.head()

Unnamed: 0,artist_name,track_name,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_label,word_count_track123,word_count_track,['genre']_encoded,genre_encoded,years_since_out,energy_category,loudness_category,party_song,energy_binary,loudness_binary
0,Gen Hoshino,Comedy,68,2022,acoustic,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,230667,4,0,1,1,0,0,1,low_energy,loud,0,0,1
1,Steve Moakler,Make A Little Room,55,2022,acoustic,0.758,0.722,1,-6.252,1,0.0511,0.165,6e-06,0.106,0.66,146.031,180493,4,1,4,4,0,0,1,high_energy,loud,0,1,1
2,Ben Rector,Steady Love,57,2022,acoustic,0.678,0.277,11,-10.537,1,0.0602,0.843,4e-06,0.104,0.238,76.935,224333,4,2,2,2,0,0,1,low_energy,soft,0,0,0
3,Ben Rector,Living My Best Life,56,2022,acoustic,0.566,0.733,4,-5.302,1,0.0699,0.083,2e-06,0.362,0.636,178.828,215213,4,2,4,4,0,0,1,high_energy,loud,0,1,1
4,Drew Holcomb,Coming Home,54,2022,acoustic,0.625,0.81,10,-5.995,1,0.033,0.146,2.7e-05,0.135,0.741,104.066,146360,4,3,2,2,0,0,1,high_energy,loud,0,1,1


In [None]:
def create_categorical_bin(data, column_name, bins, labels):
    data[column_name + '_category'] = pd.cut(data[column_name], bins=bins, labels=labels)
    return data

spotify = create_categorical_bin(spotify, 'energy', bins=[0, 0.5, 1], labels=['low', 'high'])
spotify = create_categorical_bin(spotify, 'loudness', bins=[-60, -10, 3], labels=['soft', 'loud'])

In [34]:
spotify['energy_category'] = pd.cut(spotify['energy'], bins=[0, 0.5, 1], labels=['low_energy', 'high_energy'])
spotify['loudness_category'] = pd.cut(spotify['loudness'], bins=[-60, -10, 3], labels=['soft', 'loud'])

In [35]:
spotify['party_song'] = ((spotify['danceability'] > 0.5) & (spotify['energy'] > 0.5) & (spotify['loudness'] < -10) & (spotify['liveness'] > 0.5))
spotify['party_song'] = encoder.fit_transform(spotify['party_song'])
spotify['party_song'].head()

0    0
1    0
2    0
3    0
4    0
Name: party_song, dtype: int64

In [36]:
def add_sleep_music_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a binary variable column 'sleep_music' to a DataFrame based on specified conditions

    Args:
        df (pd.DataFrame): The DataFrame containing columns 'instrumentalness', 'duration_minutes', and 'energy'

    Returns:
        pd.DataFrame: The DataFrame with an added 'sleep_music' column
    """
    conditions = (df['instrumentalness'] > 0.6) & (df['duration_minutes'] > 5) & (df['energy'] < 0.5)

    # Add a binary variable column 'party_music' based on the specified conditions
    df['sleep_music'] = conditions.astype(int)

    return df

In [37]:
print(spotify['party_song'].nunique())

2


# 3. Split or cross validation

# 4. Train model and generate predictions

# 5. Hyperparameter tuning and choose the best set of hyperparameters.

# 6.Model’s performance on a set of metrics.