In [1]:
import os
import librosa
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import concurrent.futures
import warnings

# Suppress PySoundFile and Audioread warnings
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")

def extract_features(audio_file, sr=22050, n_mfcc=13):
    try:
        # Load audio file
        y, sr = librosa.load(audio_file, sr=sr)
        
        # Extract MFCC features
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        
        # Compute mean of each MFCC coefficient
        mfcc_mean = np.mean(mfcc, axis=1)
        
        return mfcc_mean
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return None

# Function to process audio files in parallel
def process_audio_files(audio_files):
    features = []
    for audio_file in audio_files:
        mfcc_features = extract_features(audio_file)
        if mfcc_features is not None:
            file_name = os.path.basename(audio_file)  # Extract file name without path
            features.append([file_name, *mfcc_features])
    return features

# Function to normalize features
def normalize_features(features):
    # Min-max normalization
    normalized_features = (features - features.min()) / (features.max() - features.min())
    return normalized_features

# Function to standardize features
def standardize_features(features):
    # Standardization
    scaler = StandardScaler()
    standardized_features = scaler.fit_transform(features)
    return standardized_features

# Path to the folder containing subfolders of audio files
fma_large_folder = "audio"

# Initialize an empty list to store features
features_list = []

# Define the number of threads
num_threads = os.cpu_count()

# Loop through subfolders in fma_large
for folder_name in sorted(os.listdir(fma_large_folder)):
    folder_path = os.path.join(fma_large_folder, folder_name)
    if os.path.isdir(folder_path):
        print("Processing folder:", folder_name)
        
        # List all MP3 files in the current subfolder
        mp3_files = [os.path.join(folder_path, file_name) for file_name in sorted(os.listdir(folder_path)) if file_name.endswith(".mp3")]
        
        # Process audio files in parallel using multithreading
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            results = executor.map(process_audio_files, [mp3_files[i:i + num_threads] for i in range(0, len(mp3_files), num_threads)])
        
        # Flatten the list of features
        for batch in results:
            features_list.extend(batch)

# Convert features list to DataFrame
columns = ['filename'] + [f"mfcc_{i}" for i in range(len(features_list[0]) - 1)]
features_df = pd.DataFrame(features_list, columns=columns)

# Drop any rows with missing values (if any)
features_df.dropna(inplace=True)

# Separate the filename column
file_names = features_df['filename']
features_df.drop(columns=['filename'], inplace=True)

# Apply normalization and standardization to features (excluding filename column)
normalized_features = normalize_features(features_df)
standardized_features = standardize_features(normalized_features)

# Perform dimensionality reduction using PCA
pca = PCA(n_components=10)  # Specify the number of components to keep
reduced_features = pca.fit_transform(standardized_features)

# Convert reduced features to DataFrame
reduced_features_df = pd.DataFrame(reduced_features, columns=[f"mfcc_{i}" for i in range(reduced_features.shape[1])])

# Concatenate filename column to reduced features DataFrame
reduced_features_df.insert(0, 'filename', file_names)

# Save the reduced features DataFrame to a new CSV file
reduced_features_df.to_csv("audio_features1.csv", index=False)


Processing folder: 000
Processing folder: 001


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing audio\001\001486.mp3: 


In [3]:
import pandas as pd

# Replace 'path_to_your_csv_file' with the actual path to your CSV file
csv_file_path = 'audio_features1.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Display the DataFrame
print(df)


        filename    mfcc_0    mfcc_1    mfcc_2    mfcc_3    mfcc_4    mfcc_5  \
0     000002.mp3 -0.213493 -0.889261  0.300114 -1.396363 -0.202126 -0.782602   
1     000003.mp3  0.381634 -0.571500 -0.498392 -0.911056  0.144298 -0.135530   
2     000005.mp3  1.516205 -0.187141  0.846744 -1.151170  0.056885 -1.181041   
3     000010.mp3 -0.875783 -1.428132  0.076498  0.801129 -0.037355 -0.455636   
4     000020.mp3  1.863987  0.421322 -0.032940 -0.049970  0.339167 -0.044251   
...          ...       ...       ...       ...       ...       ...       ...   
1612  001995.mp3 -0.415149 -0.397714 -0.088524  0.741241 -2.236280  4.584156   
1613  001996.mp3 -1.006368 -3.281262  0.643808  0.356302 -0.995869 -0.420052   
1614  001997.mp3  0.666180  0.840533  0.375378 -0.177087 -0.941006 -0.638794   
1615  001998.mp3 -0.478807  1.101714  0.993000 -0.420856 -0.027246  0.200849   
1616  001999.mp3 -1.197814  2.253567  0.047800 -0.086981  0.215302 -0.164474   

        mfcc_6    mfcc_7    mfcc_8    m

In [4]:
import pymongo
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    # Load the reduced features from the CSV file
    reduced_features_df = pd.read_csv('audio_features1.csv')

    # MongoDB connection details
    mongodb_url = "mongodb://localhost:27017/"
    database_name = "BDA"
    collection_name = "audio_features"

    # Connect to MongoDB
    client = pymongo.MongoClient(mongodb_url)
    db = client[database_name]

    # Check if the connection to MongoDB is successful
    if db is not None:
        # Convert DataFrame to dictionary
        transformed_data_dict = reduced_features_df.to_dict(orient='records')

        # Insert data into MongoDB collection
        collection = db[collection_name]
        collection.insert_many(transformed_data_dict)

        # Log success message
        logger.info("Transformed data has been successfully stored in MongoDB.")
    else:
        # Log error if connection to MongoDB fails
        logger.error("Failed to connect to MongoDB.")

except Exception as e:
    # Log error if any exception occurs during processing
    logger.error(f"An error occurred during processing: {e}")

finally:
    # Close MongoDB connection
    client.close()


INFO:__main__:Transformed data has been successfully stored in MongoDB.


In [33]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Load the reduced features from the CSV file
reduced_features_df = pd.read_csv('audio_features1.csv')

# Separate the filename column
filenames = reduced_features_df['filename']
features = reduced_features_df.drop(columns=['filename'])

# Initialize ANN model
n_neighbors = 10  # Number of neighbors to consider
ann_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='euclidean')

# Fit the model with the features
ann_model.fit(features)

# Example usage: find nearest neighbors for a sample audio file
sample_index = 0  # Index of the sample audio file
sample_features = features.iloc[[sample_index]]
distances, indices = ann_model.kneighbors(sample_features)

# Get the filenames of nearest neighbors
nearest_neighbor_filenames = filenames.iloc[indices[0]].tolist()
print(f"Nearest neighbors for {filenames.iloc[sample_index]}:")
for i, (filename, distance) in enumerate(zip(nearest_neighbor_filenames, distances[0]), 1):
    similarity = 1 / (1 + distance)  # Calculate similarity
    print(f"{i}. {filename} - Similarity: {similarity:.4f}")


Nearest neighbors for 000002.mp3:
1. 000002.mp3 - Similarity: 1.0000
2. 001613.mp3 - Similarity: 0.4589
3. 001329.mp3 - Similarity: 0.4474
4. 001642.mp3 - Similarity: 0.4442
5. 000587.mp3 - Similarity: 0.4318
6. 001602.mp3 - Similarity: 0.4278
7. 000744.mp3 - Similarity: 0.4225
8. 001811.mp3 - Similarity: 0.4210
9. 001334.mp3 - Similarity: 0.4200
10. 001591.mp3 - Similarity: 0.4191


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd
import jax
import jax.numpy as jnp
from jax import grad, jit

# Load the reduced features from the CSV file
features_df = pd.read_csv('audio_features1.csv')

# Separate the filename column
filenames = features_df['filename']
features = features_df.drop(columns=['filename'])

# Normalize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Split data into train, validation, and test sets
X_train, X_test = train_test_split(scaled_features, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=42)

# Define parameters
input_size = X_train.shape[1]
hidden_size = 128
output_size = input_size  # Output size should match input size for autoencoder

# Initialize model parameters
key = jax.random.PRNGKey(0)
params = init_model_params(input_size, hidden_size, output_size, key)

# Training loop with early stopping based on validation loss
best_val_loss = float('inf')
best_params = None
patience = 5
for epoch in range(num_epochs):
    grads = grad_loss(params, X_train, X_train)
    params = [w - learning_rate * dw for w, dw in zip(params, grads)]
    
    # Calculate training loss
    train_loss = loss(params, X_train, X_train)
    
    # Calculate validation loss
    val_loss = loss(params, X_val, X_val)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_params = params
        patience = 5
    else:
        patience -= 1
        if patience == 0:
            print("Early stopping!")
            break

# Evaluate model performance on the test set
test_loss = loss(best_params, X_test, X_test)
print(f'Test Loss: {test_loss:.4f}')

# Calculate similarities with the selected audio
selected_features = scaled_features[selected_index]
similarities = cosine_similarity([selected_features], scaled_features)[0]

# Get indices of top 10 similar audio files
top_indices = np.argsort(similarities)[-11:-1][::-1]  # Top 10 indices excluding the selected audio itself

# Print recommendations
print("Recommendations:")
for idx in top_indices:
    print(f"{filenames.iloc[idx]}: Similarity - {similarities[idx]}")


Epoch [1/10], Train Loss: 517.4972, Val Loss: 481.5277
Epoch [2/10], Train Loss: 497.0131, Val Loss: 462.9224
Epoch [3/10], Train Loss: 478.0166, Val Loss: 445.6771
Epoch [4/10], Train Loss: 460.3732, Val Loss: 429.6721
Epoch [5/10], Train Loss: 443.9369, Val Loss: 414.7772
Epoch [6/10], Train Loss: 428.5960, Val Loss: 400.8800
Epoch [7/10], Train Loss: 414.2455, Val Loss: 387.8878
Epoch [8/10], Train Loss: 400.7985, Val Loss: 375.7272
Epoch [9/10], Train Loss: 388.1788, Val Loss: 364.3147
Epoch [10/10], Train Loss: 376.3098, Val Loss: 353.5852
Test Loss: 403.0534
Recommendations:
001290.mp3: Similarity - 0.8596255019266058
000758.mp3: Similarity - 0.8478988081782778
000744.mp3: Similarity - 0.8424415221250918
001602.mp3: Similarity - 0.8380635628983775
001613.mp3: Similarity - 0.7899577043613131
001591.mp3: Similarity - 0.7808168897203976
001642.mp3: Similarity - 0.7701039698132147
001329.mp3: Similarity - 0.768673785739138
001435.mp3: Similarity - 0.7507669721033767
000587.mp3: Simil

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data from CSV file
df = pd.read_csv('audio_features1.csv')

# Calculate the average value of MFCC features for each file
df['target'] = df.drop(['filename'], axis=1).mean(axis=1)

# Split the data into features (X) and target (y)
X = df.drop(['filename', 'target'], axis=1)  # features
y = df['target']  # target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a random forest regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Save the trained model to a pickle file
with open('recommendation_model.pkl', 'wb') as f:
    pickle.dump(model, f)

Mean Squared Error: 0.02


Using scikit-learn's classification metrics

In [74]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Load the data from the CSV file
data = pd.read_csv('audio_features1.csv')

# Extract true labels from filenames
true_labels = [1 if filename.startswith('positive') else 0 for filename in data['filename']]

# Separate the filename column
features = data.drop(columns=['filename'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, true_labels, test_size=0.2, random_state=42)

# Train recommendation model using scikit-learn (Nearest Neighbors)
n_neighbors = 10
model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='euclidean')
model.fit(X_train)

# Make predictions on the test set
distances, indices = model.kneighbors(X_test)

# Generate predicted labels based on distances to nearest neighbors
predicted_labels = [1 if any(y_train[neighbor] == 1 for neighbor in neighbors) else 0 for neighbors in indices]

# Calculate precision
precision = precision_score(y_test, predicted_labels)

# Calculate recall
recall = recall_score(y_test, predicted_labels)

# Calculate F1-score
f1 = f1_score(y_test, predicted_labels)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Precision: 0.0
Recall: 0.0
F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Load the data from the CSV file
data = pd.read_csv('audio_features1.csv')

# Separate the filename column
filenames = data['filename']
features = data.drop(columns=['filename'])

# Split the data into training and test sets
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)

# Train recommendation model using scikit-learn (Nearest Neighbors)
n_neighbors = 10
model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='euclidean')
model.fit(X_train)

# Make predictions on the test set
distances, indices = model.kneighbors(X_test)

# Assuming you have true labels and predicted labels for your recommendation model
# Here, we'll generate random true labels for demonstration purposes
true_labels = [1 if filename.startswith('positive') else 0 for filename in filenames]

# Generate predicted labels based on distances to nearest neighbors
predicted_labels = [1 if any(true_labels[neighbor] for neighbor in neighbors) else 0 for neighbors in indices]

# Calculate true positives, false positives, and false negatives
tp = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == 1 and pred == 1)
fp = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == 0 and pred == 1)
fn = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == 1 and pred == 0)

# Calculate precision only if there are positive predictions
precision = tp / (tp + fp) if (tp + fp) > 0 else 0

# Calculate recall only if there are positive instances in the data
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

# Calculate F1-score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Precision: 0
Recall: 0
F1-score: 0


In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Load the data from the CSV file
data = pd.read_csv('audio_features1.csv')

# Separate the filename column
filenames = data['filename']
features = data.drop(columns=['filename'])

# Split the data into training and test sets
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

# Train recommendation model using scikit-learn (Nearest Neighbors)
n_neighbors = 10
model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='euclidean')
model.fit(X_train.drop(columns=['filename']))  # Dropping filename for training

# Make predictions on the test set
distances, indices = model.kneighbors(X_test.drop(columns=['filename']))  # Dropping filename for prediction

# Generate true labels based on filenames
true_labels = [1 if 'positive' in filename else 0 for filename in X_test['filename']]

# Generate predicted labels based on distances to nearest neighbors
predicted_labels = []

for neighbors in indices:
    positive_neighbor = False
    for neighbor in neighbors:
        if 'positive' in X_train.iloc[neighbor]['filename']:
            positive_neighbor = True
            break
    if positive_neighbor:
        predicted_labels.append(1)
    else:
        predicted_labels.append(0)

# Calculate true positives, false positives, and false negatives
tp = np.sum(np.logical_and(true_labels == 1, np.array(predicted_labels) == 1))
fp = np.sum(np.logical_and(true_labels == 0, np.array(predicted_labels) == 1))
fn = np.sum(np.logical_and(true_labels == 1, np.array(predicted_labels) == 0))

# Calculate precision
if tp + fp == 0:
    precision = 0  # Handle the case where there are no positive predictions
else:
    precision = tp / (tp + fp)

# Calculate recall
if tp + fn == 0:
    recall = 0  # Handle the case where there are no positive instances
else:
    recall = tp / (tp + fn)

# Calculate F1-score
if precision + recall == 0:
    f1 = 0  # Handle the case where both precision and recall are zero
else:
    f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Precision: 0
Recall: 0
F1-score: 0
