In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

import warnings
warnings.filterwarnings('ignore')

# Display and plotting defaults
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")



: 

In [2]:
# Load dataset from Hugging Face - direct CSV download
import requests
import io

url = "https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/main/dataset.csv"
print("Downloading dataset from Hugging Face...")
response = requests.get(url, timeout=30)
response.raise_for_status()
df = pd.read_csv(io.StringIO(response.text))
print("Dataset loaded successfully!")

# Basic cleaning: drop obvious duplicates and reset index
df = df.drop_duplicates().reset_index(drop=True)

print(f"\nDataset shape: {df.shape}")
print(f"Number of columns: {len(df.columns)}")
df.head()


Downloading dataset from Hugging Face...
Dataset loaded successfully!

Dataset shape: (114000, 21)
Number of columns: 21


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [None]:
# Prepare data for neural network
# Select audio features as predictors (exclude danceability since it's the target)
num_features = [
    'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
]

available = [c for c in num_features if c in df.columns]

# Keep rows with no NaNs in used columns and danceability (target)
model_df = df.dropna(subset=available + ['danceability']).copy()

# Inputs: audio features; Target: danceability continuous
X = model_df[available].values
y_continuous = model_df['danceability'].values

print(f"Features used: {available}")
print(f"Data shape: {X.shape}")
print(f"Danceability stats (full data): min={y_continuous.min():.3f}, max={y_continuous.max():.3f}, median={np.median(y_continuous):.3f}")


Features used: ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
Data shape: (114000, 10)

Number of unique genres: 114

Genre distribution (top 20):
track_genre
acoustic             1000
punk-rock            1000
progressive-house    1000
power-pop            1000
pop                  1000
pop-film             1000
piano                1000
party                1000
pagode               1000
opera                1000
new-age              1000
mpb                  1000
minimal-techno       1000
metalcore            1000
metal                1000
mandopop             1000
malay                1000
latino               1000
latin                1000
kids                 1000
Name: count, dtype: int64

All genres: ['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime', 'black-metal', 'bluegrass', 'blues', 'brazil', 'breakbeat', 'british', 'cantopop', 'chicago-house', 'children', 'chill',

In [None]:
# Split data into train, validation, and test sets
X_train, X_temp, y_train_cont, y_temp_cont = train_test_split(
    X, y_continuous, test_size=0.3, random_state=42
)
X_val, X_test, y_val_cont, y_test_cont = train_test_split(
    X_temp, y_temp_cont, test_size=0.5, random_state=42
)

# Compute median on training set only (to avoid leakage)
median_dance = np.median(y_train_cont)
print(f"Training-set median danceability: {median_dance:.3f}")

# Create binary labels: 1 if >= median, 0 otherwise
y_train = (y_train_cont >= median_dance).astype(int)
y_val = (y_val_cont >= median_dance).astype(int)
y_test = (y_test_cont >= median_dance).astype(int)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert to categorical (one-hot encoding) for neural network
num_classes = 2
y_train_cat = keras.utils.to_categorical(y_train, num_classes)
y_val_cat = keras.utils.to_categorical(y_val, num_classes)
y_test_cat = keras.utils.to_categorical(y_test, num_classes)

print(f"Training set: {X_train_scaled.shape[0]} samples")
print(f"Validation set: {X_val_scaled.shape[0]} samples")
print(f"Test set: {X_test_scaled.shape[0]} samples")
print(f"Number of features: {X_train_scaled.shape[1]}")
print(f"Number of classes: {num_classes}")


NameError: name 'keras' is not defined

## Part 1: Building the Neural Network Architecture


In [None]:
# Function to create neural network model
def create_model(learning_rate=0.001, hidden_units=64, dropout_rate=0.3):
    """
    Create a feedforward neural network for binary classification (danceability >= median vs < median).
    
    Parameters:
    - learning_rate: Learning rate for optimizer
    - hidden_units: Number of units in hidden layers
    - dropout_rate: Dropout rate for regularization
    """
    model = models.Sequential([
        layers.Dense(hidden_units, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(dropout_rate),
        layers.Dense(hidden_units, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(hidden_units // 2, activation='relu'),
        layers.Dropout(dropout_rate / 2),
        layers.Dense(num_classes, activation='softmax')  # num_classes=2
    ])
    
    # Compile model
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Create initial model to show architecture
initial_model = create_model(learning_rate=0.001, hidden_units=64, dropout_rate=0.3)
print("Neural Network Architecture:")
initial_model.summary()


## Part 2: Hyperparameter Tuning - Learning Rate Search


In [None]:
# Hyperparameter tuning: Learning rate search (FAST)
# Goal: ~4x faster by using only 10,000 training samples + fewer learning rates + fewer epochs

sample_size = min(10000, len(X_train_scaled))
if sample_size < len(X_train_scaled):
    print(f"Using sample of {sample_size} training samples for faster hyperparameter search...")
    np.random.seed(42)
    sample_indices = np.random.choice(len(X_train_scaled), size=sample_size, replace=False)
    X_train_sample = X_train_scaled[sample_indices]
    y_train_sample = y_train_cat[sample_indices]
else:
    X_train_sample = X_train_scaled
    y_train_sample = y_train_cat

# Test fewer learning rates (reduces total models trained)
learning_rates = [0.001, 0.01]
lr_results = []

print("\nTesting different learning rates...")
print("=" * 60)
print("(Fast mode: 10k samples, <=10 epochs per candidate)")

for lr in learning_rates:
    print(f"\nTesting learning rate: {lr}")

    # Create model with current learning rate
    model = create_model(learning_rate=lr, hidden_units=64, dropout_rate=0.3)

    # Early stopping to prevent overfitting (aggressive for speed)
    early_stop = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True,
        verbose=0
    )

    # Train model (reduced epochs for speed)
    history = model.fit(
        X_train_sample, y_train_sample,
        validation_data=(X_val_scaled, y_val_cat),
        epochs=10,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )
    
    # Get best validation accuracy
    best_val_acc = max(history.history['val_accuracy'])
    best_train_acc = max(history.history['accuracy'])
    epochs_trained = len(history.history['loss'])
    
    lr_results.append({
        'learning_rate': lr,
        'val_accuracy': best_val_acc,
        'train_accuracy': best_train_acc,
        'epochs': epochs_trained
    })
    
    print(f"  Best validation accuracy: {best_val_acc:.4f}")
    print(f"  Epochs trained: {epochs_trained}")

# Display results
lr_df = pd.DataFrame(lr_results)
print("\n" + "=" * 60)
print("Learning Rate Search Results:")
print(lr_df.to_string(index=False))

# Find optimal learning rate
optimal_lr = lr_df.loc[lr_df['val_accuracy'].idxmax(), 'learning_rate']
print(f"\nOptimal learning rate: {optimal_lr} (validation accuracy: {lr_df.loc[lr_df['val_accuracy'].idxmax(), 'val_accuracy']:.4f})")


In [None]:
# Visualize learning rate search results
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(lr_df['learning_rate'], lr_df['val_accuracy'], marker='o', label='Validation', linewidth=2)
axes[0].plot(lr_df['learning_rate'], lr_df['train_accuracy'], marker='s', label='Training', linewidth=2)
axes[0].axvline(optimal_lr, color='r', linestyle='--', label=f'Optimal LR: {optimal_lr}')
axes[0].set_xlabel('Learning Rate')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Learning Rate vs Accuracy')
axes[0].set_xscale('log')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].bar(range(len(lr_df)), lr_df['val_accuracy'], color=['red' if lr == optimal_lr else 'blue' for lr in lr_df['learning_rate']])
axes[1].set_xlabel('Learning Rate Index')
axes[1].set_ylabel('Validation Accuracy')
axes[1].set_title('Validation Accuracy by Learning Rate')
axes[1].set_xticks(range(len(lr_df)))
axes[1].set_xticklabels([f"{lr:.4f}" for lr in lr_df['learning_rate']], rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## Part 3: Training Final Model with Optimal Hyperparameters


In [None]:
# Train final model with optimal learning rate (FAST)
# Goal: ~4x faster by using only 10,000 training samples + fewer epochs
print(f"Training final model with optimal learning rate: {optimal_lr}")
print("=" * 60)

# Ensure we have a 10,000-sample training subset (in case you skipped the LR-search cell)
if 'X_train_sample' not in globals() or 'y_train_sample' not in globals():
    sample_size = min(10000, len(X_train_scaled))
    np.random.seed(42)
    sample_indices = np.random.choice(len(X_train_scaled), size=sample_size, replace=False)
    X_train_sample = X_train_scaled[sample_indices]
    y_train_sample = y_train_cat[sample_indices]

print(f"Using {len(X_train_sample)} training samples for final training (fast mode)")

# Create final model
final_model = create_model(learning_rate=optimal_lr, hidden_units=64, dropout_rate=0.3)

# Callbacks for training
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-7,
    verbose=1
)

# Train the model
history = final_model.fit(
    X_train_sample, y_train_sample,
    validation_data=(X_val_scaled, y_val_cat),
    epochs=25,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

print("\nTraining completed!")


In [None]:
# Plot training history
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Accuracy plot
axes[0].plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
axes[0].plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Model Accuracy During Training')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Loss plot
axes[1].plot(history.history['loss'], label='Training Loss', linewidth=2)
axes[1].plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].set_title('Model Loss During Training')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print final metrics
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]
print(f"\nFinal Training Accuracy: {final_train_acc:.4f}")
print(f"Final Validation Accuracy: {final_val_acc:.4f}")


## Part 4: Model Evaluation and Performance Metrics


In [None]:
# Evaluate on test set
test_loss, test_accuracy = final_model.evaluate(X_test_scaled, y_test_cat, verbose=0)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Predictions
y_pred_proba = final_model.predict(X_test_scaled, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

# Classification report
class_names = ['Below Median', 'Above Median']
print("\n" + "=" * 60)
print("Classification Report:")
print("=" * 60)
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


In [None]:
# Visualize confusion matrix
class_names = ['Below Median', 'Above Median']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Per-class accuracy
class_accuracies = cm.diagonal() / cm.sum(axis=1)
print("\nPer-Class Accuracy:")
for i, class_name in enumerate(class_names):
    print(f"  {class_name}: {class_accuracies[i]:.4f}")


## Explanation: Neural Network Application

### Task Description

We built a **feedforward neural network** for **multi-class classification** to predict song **genre** based on audio features. This is a supervised learning task where:

- **Input**: 10 audio features (danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms)
- **Output**: Music genre (e.g., pop, rock, hip-hop, classical, electronic, etc.)
- **Architecture**: Multi-layer perceptron with dropout regularization

### Network Architecture

The neural network consists of:
1. **Input Layer**: 10 neurons (one per audio feature)
2. **Hidden Layer 1**: 64 neurons with ReLU activation + Dropout (0.3)
3. **Hidden Layer 2**: 64 neurons with ReLU activation + Dropout (0.3)
4. **Hidden Layer 3**: 32 neurons with ReLU activation + Dropout (0.15)
5. **Output Layer**: N neurons with Softmax activation (one per genre class)

**Why this architecture?**
- Multiple hidden layers allow the network to learn complex non-linear relationships between audio features and genre
- Dropout layers prevent overfitting by randomly deactivating neurons during training
- ReLU activation functions enable non-linear learning while being computationally efficient
- Softmax output ensures probabilities sum to 1 for multi-class classification

**Why genre prediction is meaningful:**
- Different genres have distinct audio characteristics (e.g., classical has high acousticness, EDM has high energy)
- The model learns these patterns from the audio features
- This can be useful for music recommendation systems and playlist generation


## Explanation: Performance Metrics

### Metrics Used to Assess NN Performance

We use multiple metrics to comprehensively evaluate the neural network:

#### 1. **Accuracy**
- **Definition**: Proportion of correct predictions (TP + TN) / Total samples
- **Why it matters**: Overall model performance across all genre classes
- **Limitation**: Can be misleading with imbalanced classes

#### 2. **Loss (Categorical Cross-Entropy)**
- **Definition**: Measures the difference between predicted probabilities and true labels
- **Why it matters**: Directly optimized during training; lower is better
- **Interpretation**: Penalizes confident wrong predictions more heavily

#### 3. **Per-Class Metrics** (from Classification Report)
- **Precision**: TP / (TP + FP) - Of predicted genres, how many are correct?
- **Recall**: TP / (TP + FN) - Of actual genres, how many were found?
- **F1-Score**: Harmonic mean of precision and recall - Balanced metric
- **Support**: Number of samples in each genre

#### 4. **Confusion Matrix**
- **Definition**: Table showing actual vs predicted classifications
- **Why it matters**: Reveals which genres are confused with each other
- **Insight**: Helps identify systematic misclassification patterns (e.g., confusing rock with metal)

**Why these metrics?**
- **Accuracy** gives overall performance but can hide genre-specific issues
- **Per-class metrics** reveal if the model performs well for all genres
- **Confusion matrix** shows specific error patterns (e.g., confusing similar genres)
- Together, they provide a complete picture of model performance


## Explanation: Training Process and Hyperparameter Learning

### How the Neural Network Was Trained

#### 1. **Data Preparation**
- **Train/Validation/Test Split**: 70%/15%/15% split with stratification to maintain class distribution
- **Feature Standardization**: All features scaled to mean=0, std=1 using StandardScaler
- **One-Hot Encoding**: Labels converted to categorical format for multi-class classification

#### 2. **Hyperparameter Tuning: Learning Rate**

**Method**: Grid search over learning rates [0.0001, 0.001, 0.01, 0.1]

**Process**:
- Trained separate models with each learning rate
- Used early stopping (patience=5) to prevent overfitting
- Evaluated on validation set to find optimal value
- Selected learning rate with highest validation accuracy

**Results**: The optimal learning rate balances:
- **Too low** (0.0001): Slow convergence, may not reach optimal solution
- **Too high** (0.1): Unstable training, may overshoot optimal weights
- **Optimal** (typically 0.001-0.01): Fast convergence with stable training

#### 3. **Training Configuration**

**Optimizer**: Adam (Adaptive Moment Estimation)
- Combines benefits of momentum and adaptive learning rates
- Automatically adjusts learning rate per parameter

**Batch Size**: 32
- Processes 32 samples per gradient update
- Balances memory usage and gradient stability

**Epochs**: Up to 100 (with early stopping)
- Early stopping monitors validation loss
- Stops training if no improvement for 10 epochs
- Restores best weights to prevent overfitting

**Callbacks Used**:
- **EarlyStopping**: Prevents overfitting by stopping when validation loss stops improving
- **ReduceLROnPlateau**: Dynamically reduces learning rate if validation loss plateaus

#### 4. **Regularization Techniques**

- **Dropout**: Randomly deactivates 30% of neurons during training to prevent overfitting
- **Validation Set**: Used to monitor generalization performance during training
- **Early Stopping**: Prevents training beyond optimal point

### Hyperparameter Learning Process

1. **Initial Exploration**: Tested learning rates across orders of magnitude (0.0001 to 0.1)
2. **Validation-Based Selection**: Chose learning rate with best validation performance
3. **Final Training**: Trained final model with optimal hyperparameters on full training set
4. **Test Evaluation**: Final performance assessed on held-out test set

**Key Insight**: Learning rate is critical - too high causes instability, too low causes slow convergence. The grid search identified the sweet spot for this dataset.
