In [2]:
#Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

In [4]:
file_path = 'Spotify Most Streamed Songs.csv'
data = pd.read_csv(file_path)

In [5]:
data['streams'] = pd.to_numeric(data['streams'], errors='coerce')
columns_to_convert = ['in_deezer_playlists', 'in_shazam_charts']
for col in columns_to_convert:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [7]:
data_cleaned = data.dropna(subset=['streams'])
numeric_cols = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
data_cleaned.loc[:, numeric_cols] = data_cleaned[numeric_cols].fillna(data_cleaned[numeric_cols].median())
data_cleaned = data_cleaned.drop(['cover_url', 'track_name', 'artist(s)_name'], axis=1)

In [8]:
data_cleaned['hit_potential'] = np.where(data_cleaned['in_spotify_charts'] > 0, 1, 0)

In [9]:
X = data_cleaned.drop(['in_spotify_charts', 'hit_potential'], axis=1)
y = data_cleaned['hit_potential']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
numeric_features = ['artist_count', 'released_year', 'released_month', 'released_day', 'streams',
                    'in_spotify_playlists', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists',
                    'in_deezer_charts', 'in_shazam_charts', 'bpm', 'danceability_%', 'valence_%',
                    'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']
categorical_features = ['key', 'mode']

In [12]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [14]:
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

In [15]:
results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    results[name] = {
        'accuracy': accuracy,
        'classification_report': report
    }

for name, result in results.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"Classification Report:")
    print(pd.DataFrame(result['classification_report']).transpose())



Model: Random Forest
Accuracy: 0.7749
Classification Report:
              precision    recall  f1-score     support
0              0.742857  0.675325  0.707483   77.000000
1              0.793388  0.842105  0.817021  114.000000
accuracy       0.774869  0.774869  0.774869    0.774869
macro avg      0.768123  0.758715  0.762252  191.000000
weighted avg   0.773017  0.774869  0.772862  191.000000

Model: Logistic Regression
Accuracy: 0.7644
Classification Report:
              precision    recall  f1-score     support
0              0.681818  0.779221  0.727273   77.000000
1              0.834951  0.754386  0.792627  114.000000
accuracy       0.764398  0.764398  0.764398    0.764398
macro avg      0.758385  0.766803  0.759950  191.000000
weighted avg   0.773217  0.764398  0.766280  191.000000

Model: SVM
Accuracy: 0.8115
Classification Report:
              precision    recall  f1-score     support
0              0.746988  0.805195  0.775000   77.000000
1              0.861111  0.815789 

In [16]:
print("\nCross-Validation Scores:")
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    cv_scores = cross_val_score(pipeline, X, y, cv=5)
    print(f"{name}: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Cross-Validation Scores:
Random Forest: 0.7299 ± 0.0677
Logistic Regression: 0.7403 ± 0.0898
SVM: 0.7014 ± 0.0928
Gradient Boosting: 0.7215 ± 0.0747
K-Nearest Neighbors: 0.6395 ± 0.1011
Decision Tree: 0.6742 ± 0.0579
