# Baseline #1
* Always predict the majority class

In [1]:
import pandas as pd
pd.set_option("display.max_columns", 200)

# Load the dataset and clean it in one chain, so re-running the cell always
# rebuilds `songs` from the file instead of mutating an existing frame.
songs = (
    pd.read_csv('data/spotify_simplified.csv', index_col=[0])
    # Express the duration in minutes for better interpretability; the column
    # is converted first and renamed right after, so it keeps its position.
    .assign(duration_ms=lambda frame: frame['duration_ms'] / (1000 * 60))
    .rename(columns={'duration_ms': 'duration_min'})
    # Remove rows with missing values, then keep one row per track id
    .dropna()
    .drop_duplicates(['track_id'])
)

In [2]:
# Find majority class
# Number of songs per genre, indexed by genre name
grouped_by_genre = songs.groupby(['track_genre']).size()
# idxmax returns the index label (the genre name) of the first maximum, which
# is what the manual max()/list.index() lookup computed before.
majority_class = grouped_by_genre.idxmax()
print(f"Majority class is: {majority_class}")

Majority class is: edm


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

def evaluate_baseline(y_test, predictions):
    """Print the accuracy and weighted F1-score of a set of predictions.

    Args:
        y_test: True labels, passed unchanged to the sklearn metric functions.
        predictions: Predicted labels, aligned with ``y_test``.

    Returns:
        The weighted F1-score (the value printed on the second line).
    """
    accuracy = accuracy_score(y_test, predictions)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')
    print(f"Accuracy: {accuracy:.6f}")
    print(f"F1-score: {weighted_f1:.6f}")
    return weighted_f1

songs_data = songs.drop(columns=["track_genre"])
genres = songs["track_genre"]
# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)
# Perform data split (stratified, so the genre proportions match in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    songs_data, encoded_genres, test_size=0.3,
    stratify=encoded_genres, shuffle=True, random_state=100,
)
# Compute predictions (always predict majority class).
# The majority class is taken from the training labels only, so the baseline
# never looks at the test set. sort_index() + idxmax() picks the smallest
# encoded label when several classes tie for the highest count.
majority_class_label = pd.Series(y_train).value_counts().sort_index().idxmax()
predictions = [majority_class_label] * len(y_test)
# Evaluate baseline model
print("Baseline performance predicting always the majority class")
majority_baseline_f1 = evaluate_baseline(y_test, predictions)

Baseline performance predicting always the majority class
Accuracy: 0.121388
F1-score: 0.026280


# Baseline #2
* Predict using a set of hand-crafted rules derived from exploring the data

In [60]:
import random

def predict_using_rules(training_record:pd.Series, genre_mapping:dict) -> int:
    """Predict a genre for one record with an ordered list of threshold rules.

    The rules are checked top to bottom and the first one whose condition
    holds decides the prediction. A rule that lists several genres picks one
    of them with ``random.choice``. If no rule fires, 'acoustic' is predicted.

    NOTE(review): a later cell defines ``predict_using_rules`` again; when the
    notebook is run top to bottom that later definition replaces this one.

    Args:
        training_record: Record indexed by feature name ('speechiness',
            'liveness', 'valence', 'acousticness', 'danceability',
            'popularity', 'duration_min', 'explicit', 'loudness').
        genre_mapping: Maps a genre name to the value that is returned.

    Returns:
        ``genre_mapping`` entry of the predicted genre.

    Raises:
        KeyError: If the predicted genre is not a key of ``genre_mapping``.
    """
    # (condition, candidate genres), strongest rules first (rules with high purity).
    # Conditions are lambdas so a feature is only read when its rule is reached.
    ordered_rules = (
        (lambda rec: rec['speechiness'] > 0.75, ['comedy']),
        (lambda rec: rec['liveness'] > 0.6, ['pagode']),
        (lambda rec: rec['valence'] < 0.03, ['sleep']),
        (lambda rec: rec['acousticness'] > 0.99, ['piano', 'classical']),
        (lambda rec: rec['danceability'] > 0.90, ['children', 'kids']),
        (lambda rec: rec['popularity'] > 75, ['latino', 'dance']),
        (lambda rec: rec['duration_min'] > 6,
         ['chicago-house', 'minimal-techno', 'detroit-techno']),
        (lambda rec: rec['explicit'], ['emo']),
        (lambda rec: rec['loudness'] > -3,
         ['drum-and-bass', 'j-idol', 'hardstyle', 'happy', 'party']),
    )
    # Fallback when no rule fires (labelled as the majority class by the
    # author of the rules -- TODO confirm for the dataset these rules target).
    candidates = ['acoustic']
    for condition, rule_genres in ordered_rules:
        if condition(training_record):
            candidates = rule_genres
            break
    # Only touch the random generator when the rule is ambiguous, so single-genre
    # rules do not advance the RNG state.
    if len(candidates) == 1:
        chosen = candidates[0]
    else:
        chosen = random.choice(candidates)
    return genre_mapping[chosen]

### Baseline for simplified dataset

In [4]:
import random

def predict_using_rules(training_record:pd.Series, genre_mapping:dict) -> int:
    """Predict a genre for one record of the simplified dataset.

    Hand-written threshold rules are checked in order, strongest (purest)
    first; the first rule that holds decides the prediction. If none holds,
    'edm' (the majority class) is predicted.

    Args:
        training_record: Record indexed by feature name ('speechiness',
            'valence', 'tempo', 'acousticness', 'duration_min',
            'danceability').
        genre_mapping: Maps a genre name to the value that is returned.

    Returns:
        ``genre_mapping`` entry of the predicted genre.

    Raises:
        KeyError: If the predicted genre is not a key of ``genre_mapping``.
    """
    record = training_record
    if record['speechiness'] > 0.75:
        return genre_mapping['comedy']
    if record['valence'] < 0.03 or record['tempo'] == 0:
        return genre_mapping['sleep']
    if record['acousticness'] > 0.99:
        # Two genres match this rule; pick one of them at random
        return genre_mapping[random.choice(['ambient', 'classical'])]
    if record['duration_min'] > 6:
        return genre_mapping['techno']
    if record['danceability'] > 0.90:
        return genre_mapping['children']
    # No rule fired: fall back to the majority class
    return genre_mapping['edm']

In [5]:
# Baseline using rules
# Genre name -> encoded label, in the order used by the label encoder
label_map = {genre_name: i for i, genre_name in enumerate(label_encoder.classes_)}
# Some rules pick a genre with random.choice; seed the generator so the
# reported scores are the same every time this cell is run.
random.seed(100)
# Apply the rules row by row to the held-out records
predictions = [predict_using_rules(record, label_map) for _, record in X_test.iterrows()]
print("Baseline performance using hand-crafted rules")
rule_baseline_f1 = evaluate_baseline(y_test, predictions)

Baseline performance using hand-crafted rules
Accuracy: 0.164661
F1-score: 0.087731


In [6]:
# Compare baselines
# Relative F1 gain of the rule baseline over the majority baseline, in percent
relative_gain_pct = (rule_baseline_f1 - majority_baseline_f1) / majority_baseline_f1 * 100
print(f"Rule baseline has {relative_gain_pct:.2f}% better performance than majority baseline")

Rule baseline has 233.83% better performance than majority baseline
