In [None]:
import kagglehub
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer, fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Constants
# Percentile (0-100) of per-year popularity; tracks at or above it get verdict = 1.
POPULARITY_THRESHOLD = 85
# Seed passed as random_state to the train/test split and to every model.
SEED = 42
# NOTE(review): presumably the beta of an F-beta score (fbeta_score is imported);
# not referenced in the cells visible here — confirm where it is used.
BETA = 1.5

# NOTE(review): presumably minimum acceptable recall / precision for a model;
# neither is referenced in the cells visible here — confirm where they are used.
RECALL_THRESHOLD = 0.7
PRECISION_THRESHOLD = 0.2

In [3]:
# Download the latest version of the Kaggle dataset through kagglehub.
# The returned value is used below as the directory that contains spotify_data.csv.
data_path = kagglehub.dataset_download("amitanshjoshi/spotify-1million-tracks")
# Load the track table into a DataFrame; all later cells operate on `data`.
data = pd.read_csv(f"{data_path}/spotify_data.csv")

In [4]:
# Remove rows with missing values. This mutates `data` in place, so the raw
# frame loaded above is no longer available after this cell runs.
data.dropna(inplace=True)

In [5]:
# Number of tracks each artist has in the dataset.
# NOTE(review): computed on the full frame, i.e. before the train/test split
# performed later in the notebook, so test rows contribute to it — confirm
# this is acceptable.
data['artist_song_count'] = data.groupby('artist_name')['track_id'].transform('count')
data['year'] = data['year'].astype(int)

# Popularity cut-off per year: the POPULARITY_THRESHOLD-th percentile of that year.
yearly_thresholds = data.groupby('year')['popularity'].quantile(POPULARITY_THRESHOLD / 100).to_dict()

# Target: 1 when a track's popularity is at or above its year's cut-off, else 0.
# Each row's year is mapped to its cut-off and compared column-wise; this
# replaces a row-by-row DataFrame.apply that is very slow on a frame this size.
data['verdict'] = (data['popularity'] >= data['year'].map(yearly_thresholds)).astype(int)

# Duration cut-offs. Q1 is the 25th percentile; Q4 is, despite its name, the
# 95th percentile (names kept because later cells may reference them).
Q1 = data['duration_ms'].quantile(0.25)
Q4 = data['duration_ms'].quantile(0.95)
# Flag: duration strictly above the 95th percentile.
data['long_duration'] = (data['duration_ms'] > Q4).astype(int)
# Flag: duration strictly below the 25th percentile.
data['short_duration'] = (data['duration_ms'] < Q1).astype(int)

In [9]:
# Columns removed before modelling: the CSV index column, identifiers, and the
# raw columns (popularity, year, duration_ms) that the target and the derived
# flags were built from.
drop_cols = [
    'Unnamed: 0',
    'artist_name',
    'track_name',
    'track_id',
    'popularity',
    'year',
    'duration_ms',
]
# errors='ignore' makes the cell safe to re-run once the columns are already gone.
data.drop(
    columns=drop_cols,
    errors='ignore',
    inplace=True,
)

In [None]:
TARGET = 'verdict'

# Label vector and feature matrix (everything except the target column).
y = data[TARGET]
X = data.drop(columns=TARGET, errors='ignore')

# 'genre' is the only categorical column, and only if it is actually present;
# every other column is treated as numerical.
categorical_features = ['genre'] if 'genre' in X.columns else []
numerical_features = [name for name in X.columns if name not in categorical_features]

# Column order used everywhere downstream: numerical first, then categorical.
FEATURES = [*numerical_features, *categorical_features]
features_target = data[[*FEATURES, TARGET]]

In [None]:
# 80/20 split, stratified on the target so both parts keep the same class ratio.
train_data, test_data = train_test_split(
    features_target,
    test_size=0.2,
    stratify=features_target[TARGET],
    random_state=SEED,
)

# Separate model inputs from labels for each part.
train_input, train_output = train_data[FEATURES], train_data[TARGET]
test_input, test_output = test_data[FEATURES], test_data[TARGET]

In [None]:
# Numerical columns are standardised.
numerical_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are one-hot encoded into a dense array; categories not
# seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_transformer, numerical_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

def build_pipeline(estimator):
    """Return a Pipeline that preprocesses the features and then applies `estimator`.

    Each call gets its own unfitted copy of the module-level `preprocessor`.
    Without the copy every pipeline would hold the same transformer object,
    so fitting one pipeline would silently refit the preprocessing step of
    all the others.

    Args:
        estimator: the final step of the pipeline (used as passed, not copied).

    Returns:
        A Pipeline with steps "preprocessor" and "estimator".
    """
    return Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("estimator", estimator),
    ])

# MLflow

In [None]:
# Point the MLflow client at the tracking server on this machine and select
# the experiment that all runs in this notebook are recorded under.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="spotify_popularity_classification")

In [None]:
def evaluate_mlflow(y_true, y_pred, run_name="model"):
    """Compute binary-classification metrics, log them to MLflow and print a summary.

    Logged metrics: precision, recall, f1, the four confusion-matrix counts,
    and the false-positive / false-negative rates.

    Args:
        y_true: true labels (0/1).
        y_pred: predicted labels (0/1).
        run_name: label used only in the printed summary header.

    Returns:
        None.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Log core metrics
    mlflow.log_metric("precision", float(precision))
    mlflow.log_metric("recall", float(recall))
    mlflow.log_metric("f1", float(f1))

    # Confusion matrix. labels=[0, 1] forces a 2x2 result even when only one
    # class occurs in y_true/y_pred; otherwise the 4-way unpack below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Cast NumPy integers to plain ints before handing them to MLflow.
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)
    mlflow.log_metric("true_positives", tp)

    # Rates; defined as 0 when the corresponding class has no samples.
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    mlflow.log_metric("false_positive_rate", fpr)
    mlflow.log_metric("false_negative_rate", fnr)

    print(f"\n{run_name} Results:")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")

# III. Modeling

In [None]:


# Baseline tree models. Both use balanced class weights and a depth limit of 5.
dt_model = DecisionTreeClassifier(
    max_depth=5,
    class_weight='balanced',
    random_state=SEED,
)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    max_features='sqrt',
    min_samples_split=10,
    class_weight='balanced',
    random_state=SEED,
)



In [None]:
# Decision tree baseline: fit on the training part, predict the test part,
# then record parameters, metrics and the fitted pipeline in one MLflow run.
print("Training Decision Tree")
dt_pipeline = build_pipeline(dt_model)
dt_pipeline.fit(X=train_input, y=train_output)
dt_predictions = dt_pipeline.predict(test_input)

with mlflow.start_run(run_name="decision_tree"):
    dt_params = dt_model.get_params()
    mlflow.log_params(dt_params)
    evaluate_mlflow(
        y_true=test_output,
        y_pred=dt_predictions,
        run_name="Decision Tree",
    )
    mlflow.sklearn.log_model(dt_pipeline, "decision_tree")

In [None]:
# Random forest baseline: fit on the training part, predict the test part,
# then record parameters, metrics and the fitted pipeline in one MLflow run.
print("Training Random Forest")
rf_pipeline = build_pipeline(rf_model)
rf_pipeline.fit(X=train_input, y=train_output)
rf_predictions = rf_pipeline.predict(test_input)

with mlflow.start_run(run_name="random_forest"):
    rf_params = rf_model.get_params()
    mlflow.log_params(rf_params)
    evaluate_mlflow(
        y_true=test_output,
        y_pred=rf_predictions,
        run_name="Random Forest",
    )
    mlflow.sklearn.log_model(rf_pipeline, "random_forest")

## GridSearchCV

In [None]:
# XGBoost classifier whose hyper-parameters are tuned by the grid search below.
xgb_model = XGBClassifier(random_state=SEED,
                          objective='binary:logistic',
                          n_jobs=-1)

# Candidate values per XGBoost hyper-parameter
# (4 * 3 * 2 * 2 * 2 * 2 = 192 combinations, i.e. 576 fits with cv=3).
xgb_param_grid = {
    'n_estimators': [200, 600, 800, 1000],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.6, 0.8],
    'scale_pos_weight': [6, 12]
}
# The object being searched is a Pipeline, so each parameter must be addressed
# as "<step name>__<parameter>". The classifier is the "estimator" step; bare
# names such as 'max_depth' make GridSearchCV raise "Invalid parameter".
param_grid = {f"estimator__{name}": values for name, values in xgb_param_grid.items()}

xgb_pipeline = build_pipeline(xgb_model)
# NOTE(review): both the classifier and the search request n_jobs=-1, which
# may oversubscribe CPU cores — consider limiting one of them.
grid_search = GridSearchCV(xgb_pipeline,
                           param_grid,
                           scoring='recall_macro',
                           cv=3,
                           n_jobs=-1,
                           verbose=2)
grid_search.fit(train_input, train_output)
# Best pipeline found by the search and the parameter combination that
# produced it (keys carry the "estimator__" prefix).
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
# Score the held-out set with the tuned pipeline: hard labels plus the
# predicted probability of the positive class.
xgb_predictions = best_estimator.predict(test_input)
xgb_predictions_proba = best_estimator.predict_proba(test_input)[:, 1]

# Record the tuned model, its search results and its test metrics in one MLflow run.
with mlflow.start_run(run_name="xgboost") as run:
    # Winning hyper-parameters, one MLflow param each.
    for param, value in best_params.items():
        mlflow.log_param(param, value)

    # Mean cross-validation score of the winning candidate and its spread over the folds.
    mlflow.log_metric("best_cv_score", grid_search.best_score_)
    mlflow.log_metric(
        "cv_score_std",
        grid_search.cv_results_['std_test_score'][grid_search.best_index_],
    )

    # Test-set metrics.
    evaluate_mlflow(test_output, xgb_predictions, run_name="XGBoost")

    # Store the fitted pipeline under the artifact path "model" and register it
    # in the model registry.
    mlflow.sklearn.log_model(
        sk_model=best_estimator,
        artifact_path="model",
        registered_model_name="spotify_popularity_predictor2",
    )

    # Descriptive tags for the run.
    for tag_name, tag_value in {
        "model_type": "XGBoost",
        "pipeline": "True",
        "feature_count": str(test_input.shape[1]),
    }.items():
        mlflow.set_tag(tag_name, tag_value)

    run_id = run.info.run_id
    print(f"Run ID: {run_id}")
    print(f"Model logged successfully! Artifact path: model")

    # Round-trip check: load the logged model back and predict a single row.
    # A failure is reported but does not abort the cell.
    try:
        loaded_model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
        test_pred = loaded_model.predict(test_input[:1])
        print(f"✓ Model verification: Successfully loaded and made prediction: {test_pred[0]}")
    except Exception as exc:
        print(f"✗ Model verification failed: {exc}")