# 1. Setup and Dependencies

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

# 2. Data Loading and Initial Analysis

In [None]:
expanded_df = pd.read_csv('Popular_Spotify_Songs.csv', encoding='latin-1')
print("Expanded Dataset Details:")
print("-" * 50)
print(f"Number of rows: {expanded_df.shape[0]}")
print(f"Number of columns: {expanded_df.shape[1]}")
print("\nColumns in the dataset:")
print(expanded_df.columns.tolist())
print("\nFirst few rows of the dataset:")
print(expanded_df.head())
print("\nDataset information:")
print(expanded_df.info())
print("\nNumerical columns description:")
print(expanded_df.describe())
print("\nMissing values count:")
print(expanded_df.isnull().sum())

Expanded Dataset Details:
--------------------------------------------------
Number of rows: 953
Number of columns: 24

Columns in the dataset:
['track_name', 'artist(s)_name', 'artist_count', 'released_year', 'released_month', 'released_day', 'in_spotify_playlists', 'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm', 'key', 'mode', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']

First few rows of the dataset:
                            track_name    artist(s)_name  artist_count  \
0  Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook             2   
1                                 LALA       Myke Towers             1   
2                              vampire    Olivia Rodrigo             1   
3                         Cruel Summer      Taylor Swift             1   
4                       WHERE SHE GOES         Bad Bunny  

# 3. Data Cleaning - Handling Missing Values

In [5]:
df = expanded_df.copy()

df['in_shazam_charts'] = df['in_shazam_charts'].fillna(0)

most_common_key = df['key'].mode()[0]
df['key'] = df['key'].fillna(most_common_key)

print("\nMissing values after handling:")
print(df.isnull().sum())

print("\nSample of rows where 'key' was imputed:")
print(df[df['key'] == most_common_key].head())


Missing values after handling:
track_name              0
artist(s)_name          0
artist_count            0
released_year           0
released_month          0
released_day            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
in_deezer_playlists     0
in_deezer_charts        0
in_shazam_charts        0
bpm                     0
key                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
dtype: int64

Sample of rows where 'key' was imputed:
                                           track_name     artist(s)_name  \
1                                                LALA        Myke Towers   
5                                            Sprinter  Dave, Central Cee   
8                                            fukumean         

# 4. Feature Engineering - Encoding Categorical Variables

In [6]:
df = expanded_df.copy()

key_encoded = pd.get_dummies(df['key'], prefix='key')
mode_encoded = pd.get_dummies(df['mode'], prefix='mode')

df = df.drop(['key', 'mode'], axis=1)
df = pd.concat([df, key_encoded, mode_encoded], axis=1)

track_encoder = LabelEncoder()
artist_encoder = LabelEncoder()

df['track_name_encoded'] = track_encoder.fit_transform(df['track_name'])
df['artist_encoded'] = artist_encoder.fit_transform(df['artist(s)_name'])

print("\nFirst few rows of encoded dataframe:")
print(df.head())

print("\nNew columns after encoding:")
print(df.columns.tolist())

encoders = {
    'track_encoder': track_encoder,
    'artist_encoder': artist_encoder
}
joblib.dump(encoders, 'label_encoders.joblib')


First few rows of encoded dataframe:
                            track_name    artist(s)_name  artist_count  \
0  Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook             2   
1                                 LALA       Myke Towers             1   
2                              vampire    Olivia Rodrigo             1   
3                         Cruel Summer      Taylor Swift             1   
4                       WHERE SHE GOES         Bad Bunny             1   

   released_year  released_month  released_day  in_spotify_playlists  \
0           2023               7            14                   553   
1           2023               3            23                  1474   
2           2023               6            30                  1397   
3           2019               8            23                  7858   
4           2023               5            18                  3133   

   in_spotify_charts    streams  in_apple_playlists  ...  key_D#  key_E  \
0        

['label_encoders.joblib']

# 5. Feature Scaling:  

In [17]:
print("Sample of streams column:")
print(expanded_df['streams'].head())

scaler = MinMaxScaler()
expanded_df['streams_scaled'] = scaler.fit_transform(expanded_df[['streams']])

print("\nOriginal vs Scaled Streams:")
print(pd.DataFrame({
    'Original Streams': expanded_df['streams'],
    'Scaled Streams': expanded_df['streams_scaled']
}).head())


Sample of streams column:
0    141381703
1    133716286
2    140003974
3    800840817
4    303236322
Name: streams, dtype: int64

Original vs Scaled Streams:
   Original Streams  Scaled Streams
0         141381703        0.038170
1         133716286        0.036101
2         140003974        0.037798
3         800840817        0.216215
4         303236322        0.081869


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expanded_df['streams_scaled'] = scaler.fit_transform(expanded_df[['streams']])


# 5. Data Splitting - Training and Testing Sets

In [22]:
features_to_drop = ['track_name', 'artist(s)_name', 'streams', 'streams_scaled']
X = expanded_df.drop(features_to_drop, axis=1)
y = expanded_df['streams_scaled']  

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

print("Dataset shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")


Dataset shapes:
X_train: (761, 21)
X_test: (191, 21)
y_train: (761,)
y_test: (191,)
