In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_processing import load_features
from src.models import load_model

# Set style
# Global plot appearance for every figure drawn in this notebook: apply the
# matplotlib style sheet first, then the seaborn colour palette.
# NOTE(review): both calls appear to touch the default colour cycle, so the
# order is presumably significant — confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Load the feature table. Candidate files are tried in priority order:
# the custom-extracted features first, then the pre-extracted CSV.
candidate_paths = (
    "../data/processed/features.csv",
    "../data/raw/Data/features_30_sec.csv",
)

for features_path in candidate_paths:
    try:
        X, y = load_features(features_path)
    except FileNotFoundError:
        # This candidate is missing; fall through to the next one.
        continue
    break
else:
    # Nothing could be loaded: fail with every path that was attempted so the
    # reader does not have to guess which file is missing.
    raise FileNotFoundError(
        "No feature file found. Tried: " + ", ".join(candidate_paths)
    )

# Record which source was actually used, since the two files may differ.
print(f"Loaded features from: {features_path}")

# Features and labels side by side; later cells group feature columns by label.
df = pd.concat([X, y], axis=1)

print(f"Dataset shape: {df.shape}")
print(f"Features: {X.shape[1]}")
print(f"Samples: {X.shape[0]}")

# Show genre distribution (genre_counts is reused by the plotting cell below)
print("\nGenre distribution:")
genre_counts = y.value_counts()
print(genre_counts)

In [None]:
if df is not None:
    # Descriptive statistics for every feature column
    print("Feature Statistics:")
    print(X.describe())

    # Features of interest for the correlation heatmap; only those that are
    # actually present in the loaded table are used.
    key_features = [
        'tempo',
        'spectral_centroid_mean',
        'spectral_rolloff_mean',
        'zero_crossing_rate_mean',
        'rms_mean',
    ]
    available_features = list(filter(lambda name: name in X.columns, key_features))

    if available_features:
        fig, ax = plt.subplots(figsize=(10, 8))
        correlation_matrix = X[available_features].corr()
        sns.heatmap(
            correlation_matrix,
            annot=True,
            cmap='coolwarm',
            center=0,
            ax=ax,
        )
        ax.set_title('Feature Correlation Matrix')
        fig.tight_layout()
        plt.show()

In [None]:
if df is not None:
    # --- Genre distribution: bar chart and pie chart side by side ---
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 6))

    genre_counts.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title('Genre Distribution')
    bar_ax.set_xlabel('Genre')
    bar_ax.set_ylabel('Count')
    bar_ax.tick_params(axis='x', labelrotation=45)

    pie_ax.pie(genre_counts.values, labels=genre_counts.index, autopct='%1.1f%%')
    pie_ax.set_title('Genre Distribution (Pie Chart)')

    fig.tight_layout()
    plt.show()

    # --- Per-genre box plots ---
    # Each feature is plotted independently: one panel per feature that is
    # present, so a missing column neither leaves an empty panel nor hides the
    # other feature's plot.
    label_col = 'label' if 'label' in df.columns else 'genre'
    boxplot_specs = [
        ('tempo', 'Tempo by Genre'),
        ('spectral_centroid_mean', 'Spectral Centroid by Genre'),
    ]
    present_specs = [(col, title) for col, title in boxplot_specs if col in df.columns]

    if present_specs:
        # squeeze=False keeps `axes` two-dimensional even for a single panel.
        fig, axes = plt.subplots(1, len(present_specs), figsize=(12, 6), squeeze=False)
        for ax, (col, title) in zip(axes[0], present_specs):
            df.boxplot(column=col, by=label_col, ax=ax)
            ax.set_title(title)
        # Clear the figure-level title left over from the grouped box plots.
        fig.suptitle('')
        fig.tight_layout()
        plt.show()

In [None]:
# Load trained model
# NOTE(review): the .pkl extension suggests a pickle-based format — only load
# model files from a trusted source; confirm how load_model deserialises.
model_path = "../models/random_forest_model.pkl"

# Loading and predicting are handled separately so that a failure during the
# sample prediction is not misreported as a failure to load the model.
# On any load failure `model` is reset to None so that a stale object from an
# earlier run of this cell cannot be picked up by later cells.
try:
    model = load_model(model_path)
    print("Model loaded successfully!")
    print(f"Model type: {model.model_name}")
except FileNotFoundError:
    model = None
    print("Model file not found. Please run training first.")
except Exception as e:
    model = None
    print(f"Error loading model: {str(e)}")

# Example prediction (using the first sample from the data)
if model is not None and df is not None:
    try:
        sample_features = X.iloc[0:1].values

        # Scale features using the model's scaler
        sample_features_scaled = model.scaler.transform(sample_features)

        # Predict the encoded class, then map it back to the genre name
        y_pred_num = model.model.predict(sample_features_scaled)
        prediction = model.label_encoder.inverse_transform(y_pred_num)[0]

        true_label = y.iloc[0]

        print(f"\nSample prediction:")
        print(f"True genre: {true_label}")
        print(f"Predicted genre: {prediction}")
        print(f"Correct: {prediction == true_label}")
    except Exception as e:
        # Best-effort demo: report the problem without aborting the notebook.
        print(f"Error during sample prediction: {str(e)}")

In [None]:
if df is not None:
    # Fit a throwaway Random Forest on this notebook's own train split and rank
    # the features by the importances it reports.
    # X_test and y_test created here are reused by the evaluation cell further down.
    try:
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler

        # 80/20 split with a fixed seed
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
        )

        # Standardise using statistics from the training part only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train_scaled, y_train)

        # One row per feature, most important first
        feature_importance = pd.DataFrame(
            {'feature': X.columns, 'importance': rf.feature_importances_}
        )
        feature_importance = feature_importance.sort_values('importance', ascending=False)

        # Horizontal bar chart of the 20 highest-ranked features
        fig, ax = plt.subplots(figsize=(12, 8))
        top_features = feature_importance.head(20)
        bar_positions = range(len(top_features))
        ax.barh(bar_positions, top_features['importance'])
        ax.set_yticks(bar_positions)
        ax.set_yticklabels(top_features['feature'])
        ax.set_xlabel('Feature Importance')
        ax.set_title('Top 20 Most Important Features')
        ax.invert_yaxis()  # highest importance at the top
        fig.tight_layout()
        plt.show()

        print("\nTop 10 Most Important Features:")
        print(feature_importance.head(10))

    except Exception as e:
        print(f"Error analyzing feature importance: {str(e)}")

Deep Learning:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import pandas as pd

# Scale the held-out features with the scaler stored on the loaded model, the
# same way the sample-prediction cell does. A StandardScaler fitted separately
# on this notebook's own split would feed the model inputs normalised with
# different statistics than the ones it is paired with.
# NOTE(review): assumes model.scaler is the scaler the model was trained with — confirm.
# NOTE(review): X_test comes from a split made in this notebook, so it may overlap
# the data the loaded model was trained on; treat the metrics as optimistic
# unless the splits are known to match.
X_test_scaled = model.scaler.transform(X_test.values)

# Predict with the loaded model
# NOTE(review): the titles below say "MLP", but the loading cell points
# model_path at random_forest_model.pkl — confirm which model is intended.
preds = model.model.predict(X_test_scaled)
print("Predictions shape:", preds.shape)

if preds.ndim == 1 or (preds.ndim == 2 and preds.shape[1] == 1):
    # A single value per sample is taken to be the encoded class index.
    # NOTE(review): astype(int) truncates, so a single-column probability
    # output would need thresholding instead — confirm the model's output.
    y_pred_num = preds.astype(int).flatten()
else:
    # One score per class: pick the highest-scoring class.
    y_pred_num = np.argmax(preds, axis=1)
y_pred = model.label_encoder.inverse_transform(y_pred_num)

# If y_test is integer-encoded, decode it for display; otherwise use it as is.
# is_integer_dtype also copes with pandas extension dtypes.
if pd.api.types.is_integer_dtype(y_test):
    y_test_decoded = model.label_encoder.inverse_transform(y_test)
else:
    y_test_decoded = y_test

# Confusion matrix
# Use every class seen in either the truth or the predictions, so predictions
# of a class that is absent from the test split are not silently dropped.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test_decoded), np.asarray(y_pred)])
)
cm = confusion_matrix(y_test_decoded, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(ax=ax, cmap='Blues')
ax.set_title('Confusion Matrix (MLP)')
plt.show()

In [None]:
# Classification report
# Text summary comparing the decoded test labels with the predictions made in
# the evaluation cell.

from sklearn.metrics import classification_report

print("Classification Report (MLP):")
mlp_report = classification_report(y_test_decoded, y_pred)
print(mlp_report)

In [None]:
# Bar chart of how often each genre was predicted
results_df = pd.DataFrame({'Actual': y_test_decoded, 'Predicted': y_pred})
pred_fig, pred_ax = plt.subplots(figsize=(12, 5))
predicted_tally = results_df['Predicted'].value_counts().sort_index()
predicted_tally.plot(kind='bar', ax=pred_ax)
pred_ax.set_title('Predicted Genre Distribution (MLP)')
pred_ax.set_xlabel('Genre')
pred_ax.set_ylabel('Count')
plt.setp(pred_ax.get_xticklabels(), rotation=45)
pred_fig.tight_layout()
plt.show()

# Grouped bars: actual vs. predicted count per genre.
# Genres missing from one side are shown with a count of 0.
actual_counts = pd.Series(y_test_decoded).value_counts().sort_index()
pred_counts = pd.Series(y_pred).value_counts().sort_index()
df_compare = pd.DataFrame(
    {'Actual': actual_counts, 'Predicted': pred_counts}
).fillna(0)
cmp_ax = df_compare.plot(kind='bar', figsize=(12, 5))
cmp_ax.set_title('Actual vs. Predicted Genre Distribution (MLP)')
cmp_ax.set_xlabel('Genre')
cmp_ax.set_ylabel('Count')
plt.setp(cmp_ax.get_xticklabels(), rotation=45)
cmp_ax.figure.tight_layout()
plt.show()