In [2]:
import torch
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import average_precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
import joblib
import time

# Function to calculate MAP
def calculate_map(targets, predictions):
    print("Calculating Mean Average Precision (MAP)...")
    map_score = average_precision_score(targets, predictions, average="macro")
    print(f"MAP calculation complete: {map_score:.4f}")
    return map_score

# Load data
print("Loading data...")
start_time = time.time()
train_data = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/train_rep.pt')
test_data = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/test_rep.pt')
train_labels = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/train/labels.pt')
test_labels = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/test/labels.pt')
print(f"Data loaded in {time.time() - start_time:.2f} seconds.")

# Apply max pooling along the time dimension (496)
print("Applying max pooling along the time dimension (496)...")
start_time = time.time()
train_data = torch.mean(train_data, dim=1)  # Max pooling across dimension 1 (time)
test_data = torch.mean(test_data, dim=1)    # Max pooling across dimension 1 (time)

train_data = train_data.numpy()
test_data = test_data.numpy()
# train_data = train_data.reshape(train_data.shape[0], -1)
# test_data = test_data.reshape(test_data.shape[0], -1)
# train_data = torch.max(train_data, dim=1).values.numpy()  # Max pooling across dimension 1 (time)
# test_data = torch.max(test_data, dim=1).values.numpy()    # Max pooling across dimension 1 (time)
print(train_data.shape)
train_labels = train_labels.numpy()  # [20550, 527]
test_labels = test_labels.numpy()    # [18886, 527]
print(f"Max pooled train data shape: {train_data.shape}")  # [20550, 768]
print(f"Max pooled test data shape: {test_data.shape}")    # [18886, 768]
print(f"Max pooling completed in {time.time() - start_time:.2f} seconds.")


Loading data...


  train_data = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/train_rep.pt')
  test_data = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/test_rep.pt')
  train_labels = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/train/labels.pt')
  test_labels = torch.load('/scratch/rd3629/ml-project/AudioSet-classification/Data/test/labels.pt')


Data loaded in 32.16 seconds.
Applying max pooling along the time dimension (496)...
(20550, 768)
Max pooled train data shape: (20550, 768)
Max pooled test data shape: (18886, 768)
Max pooling completed in 2.96 seconds.


In [1]:
import torch
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import average_precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
import joblib
import time
# Dimensionality reduction using PCA
pca_path = "/scratch/rd3629/ml-project/AudioSet-classification/Data/pca_model_100.pkl"
transformed_data_path = "/scratch/rd3629/ml-project/AudioSet-classification/Data/transformed_data_100.npz"

try:
    # Load saved PCA model and transformed data
    print("Loading saved PCA model and transformed data...")
    pca = joblib.load(pca_path)
    with np.load(transformed_data_path) as data:
        train_data = data['train']
        test_data = data['test']
    print("PCA model and transformed data loaded successfully.")
except FileNotFoundError:
    print("Saved PCA model not found. Applying PCA...")
    pca = PCA(n_components=100)
    train_data = pca.fit_transform(train_data)
    test_data = pca.transform(test_data)
    print(f"Dimensionality reduced to 512 dimensions. Train shape: {train_data.shape}, Test shape: {test_data.shape}")

    # Save PCA model and transformed data
    joblib.dump(pca, pca_path)
    np.savez(transformed_data_path, train=train_data, test=test_data)
    print("PCA model and transformed data saved.")

Loading saved PCA model and transformed data...
PCA model and transformed data loaded successfully.


In [5]:

# Train a Decision Tree Model with OneVsRestClassifier
print("Initializing Decision Tree model with OneVsRestClassifier...")
start_time = time.time()
model = OneVsRestClassifier(
    DecisionTreeClassifier(
        criterion='gini',  # Use 'entropy' if you prefer information gain
        max_depth=5,    # Adjust if you want to limit tree depth
        random_state=42    # For reproducibility
    ),
    n_jobs=-1
)
print("Training Decision Tree model...")
model.fit(train_data, train_labels)
print(f"Model training complete in {time.time() - start_time:.2f} seconds.")

# Predict probabilities on the test set
print("Generating predictions on test data...")
start_time = time.time()
test_predictions = model.predict_proba(test_data)  # [18886, 527]
print(f"Predictions generated in {time.time() - start_time:.2f} seconds.")

# Metrics
print("Calculating evaluation metrics...")
start_time = time.time()
test_map = calculate_map(test_labels, test_predictions)
test_f1 = f1_score(test_labels, (test_predictions > 0.5).astype(float), average="micro")
print(f"F1-Score calculation complete: {test_f1:.4f}")
print(f"Metrics calculated in {time.time() - start_time:.2f} seconds.")

# Print results
print("\n--- Evaluation Results ---")
print(f"Test MAP: {test_map:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")


Initializing Decision Tree model with OneVsRestClassifier...
Training Decision Tree model...


KeyboardInterrupt: 

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

# Train a Decision Tree Model with OneVsRestClassifier
print("Initializing Decision Tree model with OneVsRestClassifier...")
start_time = time.time()
# model = OneVsRestClassifier(
#     DecisionTreeClassifier(
#         criterion='gini',  # Use 'entropy' if you prefer information gain
#         max_depth=5,    # Adjust if you want to limit tree depth
#         random_state=42    # For reproducibility
#     ),
#     n_jobs=-1
# )

# model = MultiOutputClassifier(
#     RandomForestClassifier(
#         n_estimators=100,      # Number of trees in the forest
#         criterion='gini',      # Use 'entropy' if you prefer information gain
#         max_depth=5,        # None means nodes are expanded until all leaves are pure or contain less than min_samples_split samples
#         random_state=42,       # For reproducibility
#         n_jobs=-1              # Use all processors for parallelization
#     ),
#     n_jobs=-1  # Parallelize OneVsRestClassifier as well
# )

# model = MultiOutputClassifier(
#     ExtraTreeClassifier(
#         criterion='gini',      # Use 'entropy' if you prefer information gain
#         max_depth=None,           # Limit tree depth if required
#         random_state=42,        # For reproducibility
#     ),
#     n_jobs=-1  # Parallelize the MultiOutputClassifier
# )

model = MultiOutputClassifier(
    KNeighborsClassifier(
        n_neighbors=5,      # Number of neighbors to use
        weights='uniform',  # Use 'distance' for weighted neighbors
        algorithm='auto',   # Auto-select the best algorithm
        n_jobs=-1           # Use all processors for parallelization
    )
)

# model = MultiOutputClassifier(
#     RadiusNeighborsClassifier(
#         radius=5.0,         # Increased radius to accommodate more neighbors
#         weights='uniform',  # Use 'distance' for weighted neighbors
#         algorithm='auto',   # Auto-select the best algorithm
#         outlier_label=-1,   # Label for outliers (you can set any value, here it's -1)
#         n_jobs=-1           # Use all processors for parallelization
#     )
# )

print("Training Decision Tree model...")
model.fit(train_data, train_labels)
print(f"Model training complete in {time.time() - start_time:.2f} seconds.")

# Predict probabilities on the test set
print("Generating predictions on test data...")
start_time = time.time()
test_predictions = model.predict_proba(test_data)  # [18886, 527]


Initializing Decision Tree model with OneVsRestClassifier...
Training Decision Tree model...
Model training complete in 3.72 seconds.
Generating predictions on test data...


In [4]:
import numpy as np

# Convert predictions to a NumPy array
test_predictions = np.array(test_predictions)

# Extract probabilities for the positive class (class 1)
print("Extracting probabilities for the positive class (class 1)...")
test_predictions = test_predictions[:, :, 1]  # Shape becomes (527, 18886)

# Transpose predictions to match the labels' shape (18886, 527)
test_predictions = test_predictions.T  # Shape becomes (18886, 527)

# Metrics
print("Calculating evaluation metrics...")
start_time = time.time()
test_map = calculate_map(test_labels, test_predictions)
test_f1 = f1_score(test_labels, (test_predictions > 0.5).astype(float), average="micro")
print(f"F1-Score calculation complete: {test_f1:.4f}")
print(f"Metrics calculated in {time.time() - start_time:.2f} seconds.")

# Print results
print("\n--- Evaluation Results ---")
print(f"Test MAP: {test_map:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")


Extracting probabilities for the positive class (class 1)...
Calculating evaluation metrics...
Calculating Mean Average Precision (MAP)...
MAP calculation complete: 0.2488
F1-Score calculation complete: 0.4414
Metrics calculated in 25.88 seconds.

--- Evaluation Results ---
Test MAP: 0.2488
Test F1-Score: 0.4414


In [17]:
print(len(test_predictions[0]))

18886
