In [1]:
import os
import librosa
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import concurrent.futures
import warnings

# Suppress PySoundFile and Audioread warnings
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")

def extract_features(audio_file, sr=22050, n_mfcc=13):
    """Load one audio file and return its time-averaged MFCC vector.

    Parameters
    ----------
    audio_file : str or path-like
        File handed to ``librosa.load``.
    sr : int, default 22050
        Target sampling rate passed to ``librosa.load``.
    n_mfcc : int, default 13
        Number of MFCC coefficients requested from ``librosa.feature.mfcc``.

    Returns
    -------
    numpy.ndarray or None
        Mean of the MFCC matrix along axis 1 (presumably one value per
        coefficient, i.e. length ``n_mfcc``), or ``None`` when loading or
        feature extraction raised. Failures are reported on stdout and never
        propagated, so a single unreadable file does not abort a long run.
    """
    try:
        # Load audio file, resampled to the requested rate
        y, sr = librosa.load(audio_file, sr=sr)

        # Extract MFCC features
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

        # Collapse the second axis so every file yields a fixed-length vector
        return np.mean(mfcc, axis=1)
    except Exception as e:
        # Some exceptions carry an empty message, which made the log line end
        # right after the colon; the class name keeps the report actionable.
        print(f"Error processing {audio_file}: {type(e).__name__}: {e}")
        return None

# Function to process one batch of audio files sequentially (the caller runs batches in parallel)
def process_audio_files(audio_files):
    """Extract features for every path in ``audio_files``, one after another.

    Each file is passed to ``extract_features``; files for which it returns
    ``None`` are skipped. Every remaining file contributes one row of the form
    ``[basename, coeff_0, coeff_1, ...]``.

    Returns the list of rows (empty when no file produced features).
    """
    rows = []
    for path in audio_files:
        coefficients = extract_features(path)
        if coefficients is None:
            # Extraction failed for this file; leave it out of the result.
            continue
        # Store only the file name, not the directory part of the path.
        rows.append([os.path.basename(path), *coefficients])
    return rows

# Function to normalize features
def normalize_features(features):
    """Min-max scale ``features`` into the range [0, 1].

    ``features.min()`` / ``features.max()`` are used as-is, so a pandas
    DataFrame is scaled column by column while a NumPy array is scaled with
    its overall minimum and maximum.

    A zero range (constant column, or constant array) would make the division
    0/0 and fill the result with NaN; such inputs are mapped to 0 instead.

    Returns an object of the same kind as ``features`` holding the scaled
    values.
    """
    lowest = features.min()
    value_range = features.max() - lowest
    # Swap a zero range for 1 so that (x - min) / range evaluates to 0, not NaN.
    safe_range = np.where(value_range == 0, 1, value_range)
    normalized_features = (features - lowest) / safe_range
    return normalized_features

# Function to standardize features
def standardize_features(features):
    """Fit a fresh ``StandardScaler`` on ``features`` and return the transformed data.

    The scaler is created inside the function and discarded afterwards, so the
    fitted parameters are not available to the caller.
    """
    return StandardScaler().fit_transform(features)

# Path to the folder containing subfolders of audio files
fma_large_folder = "fma_large"

# Collects one [filename, coeff_0, coeff_1, ...] row per successfully processed file
features_list = []

# Number of worker threads; the same value is used as the batch size below.
# os.cpu_count() may return None, which would break the slicing and range step.
num_threads = os.cpu_count() or 1

# Loop through subfolders in fma_large
for folder_name in sorted(os.listdir(fma_large_folder)):
    folder_path = os.path.join(fma_large_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue
    print("Processing folder:", folder_name)

    # List all MP3 files in the current subfolder
    mp3_files = [
        os.path.join(folder_path, file_name)
        for file_name in sorted(os.listdir(folder_path))
        if file_name.endswith(".mp3")
    ]

    # Split the files into batches of num_threads paths each
    batches = [mp3_files[i:i + num_threads] for i in range(0, len(mp3_files), num_threads)]

    # Process the batches concurrently; results keep the order of `batches`
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(process_audio_files, batches))

    # Flatten the per-batch lists into the global list
    for batch in results:
        features_list.extend(batch)

# Fail with a clear message instead of an IndexError on features_list[0]
if not features_list:
    raise RuntimeError(
        f"No features could be extracted from any .mp3 file under {fma_large_folder!r}"
    )

# Convert features list to DataFrame
columns = ['filename'] + [f"mfcc_{i}" for i in range(len(features_list[0]) - 1)]
features_df = pd.DataFrame(features_list, columns=columns)

# Drop any rows with missing values and renumber the rows, so the index is
# contiguous and matches the positional output of the scaler / PCA below
features_df = features_df.dropna().reset_index(drop=True)

# Separate the filename column
file_names = features_df['filename']
features_df = features_df.drop(columns=['filename'])

# Apply normalization and standardization to features (excluding filename column)
normalized_features = normalize_features(features_df)
standardized_features = standardize_features(normalized_features)

# Perform dimensionality reduction using PCA
pca = PCA(n_components=10)  # Specify the number of components to keep
reduced_features = pca.fit_transform(standardized_features)

# Convert reduced features to DataFrame.
# NOTE: the columns keep the "mfcc_" prefix, but they hold PCA components.
reduced_features_df = pd.DataFrame(reduced_features, columns=[f"mfcc_{i}" for i in range(reduced_features.shape[1])])

# Attach the filenames by position; inserting the Series itself would align on
# index labels and misplace names whenever the two indexes differ
reduced_features_df.insert(0, 'filename', file_names.to_numpy())

# Save the reduced features DataFrame to a new CSV file
reduced_features_df.to_csv("ALL_FEATUES.csv", index=False)


Processing folder: 000
Processing folder: 001


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing fma_large\001\001486.mp3: 
Processing folder: 002
Error processing fma_large\002\002624.mp3: 
Processing folder: 003
Error processing fma_large\003\003284.mp3: 
Processing folder: 004
Processing folder: 005
Error processing fma_large\005\005574.mp3: 
Processing folder: 006
Processing folder: 007
Processing folder: 008
Error processing fma_large\008\008669.mp3: 
Processing folder: 009
Processing folder: 010
Error processing fma_large\010\010116.mp3: 
Processing folder: 011
Error processing fma_large\011\011583.mp3: 
Processing folder: 012
Error processing fma_large\012\012838.mp3: 
Processing folder: 013
Error processing fma_large\013\013529.mp3: 
Processing folder: 014
Error processing fma_large\014\014116.mp3: 
Error processing fma_large\014\014180.mp3: 
Processing folder: 015
Processing folder: 016
Processing folder: 017
Processing folder: 018
Error processing fma_large\018\018924.mp3: 
Processing folder: 019
Processing folder: 020
Error processing fma_large\020\0208



Processing folder: 022
Error processing fma_large\022\022554.mp3: 
Processing folder: 023
Error processing fma_large\023\023429.mp3: 
Error processing fma_large\023\023430.mp3: 
Error processing fma_large\023\023431.mp3: 
Processing folder: 024
Processing folder: 025
Error processing fma_large\025\025180.mp3: 
Error processing fma_large\025\025173.mp3: 
Error processing fma_large\025\025174.mp3: 
Error processing fma_large\025\025175.mp3: 
Error processing fma_large\025\025176.mp3: 
Processing folder: 026
Processing folder: 027
Processing folder: 028
Processing folder: 029
Error processing fma_large\029\029345.mp3: 
Error processing fma_large\029\029346.mp3: 
Error processing fma_large\029\029356.mp3: 
Error processing fma_large\029\029352.mp3: 
Processing folder: 030
Processing folder: 031
Processing folder: 032
Processing folder: 033
Error processing fma_large\033\033413.mp3: 
Error processing fma_large\033\033414.mp3: 
Error processing fma_large\033\033391.mp3: 
Error processing fma



Processing folder: 086
Error processing fma_large\086\086659.mp3: 
Error processing fma_large\086\086661.mp3: 
Error processing fma_large\086\086664.mp3: 
Error processing fma_large\086\086656.mp3: 
Processing folder: 087
Error processing fma_large\087\087057.mp3: 
Processing folder: 088
Processing folder: 089
Processing folder: 090
Error processing fma_large\090\090244.mp3: 
Error processing fma_large\090\090245.mp3: 
Error processing fma_large\090\090247.mp3: 
Error processing fma_large\090\090248.mp3: 
Error processing fma_large\090\090250.mp3: 
Error processing fma_large\090\090252.mp3: 
Error processing fma_large\090\090253.mp3: 
Error processing fma_large\090\090442.mp3: 
Error processing fma_large\090\090445.mp3: 
Processing folder: 091
Error processing fma_large\091\091206.mp3: 
Processing folder: 092
Error processing fma_large\092\092479.mp3: 
Processing folder: 093
Processing folder: 094
Error processing fma_large\094\094052.mp3: 
Error processing fma_large\094\094234.mp3: 
P

In [2]:
import pandas as pd

# Location of the CSV file to inspect; adjust if the file lives elsewhere
csv_file_path = 'ALL_FEATUES.csv'

# Load the file contents into a DataFrame
df = pd.read_csv(csv_file_path)

# Print the DataFrame's text representation
print(df)


          filename    mfcc_0    mfcc_1    mfcc_2    mfcc_3    mfcc_4  \
0       000002.mp3  0.202889 -1.136643 -0.038230  1.011978 -0.325367   
1       000003.mp3 -0.344717 -0.723664 -0.663541  0.648470 -0.046218   
2       000005.mp3 -1.304337 -0.594325  0.525045  0.856414  0.502651   
3       000010.mp3  0.753628 -1.671990 -0.402608 -0.840115 -0.178642   
4       000020.mp3 -1.639621  0.117239 -0.339650 -0.089534 -0.126288   
...            ...       ...       ...       ...       ...       ...   
106396  155316.mp3  0.741158 -0.145727 -0.629481 -0.794489 -0.238705   
106397  155317.mp3 -0.496234 -0.483669 -0.417215 -0.491575 -0.007724   
106398  155318.mp3 -0.816835 -0.171524 -0.021474 -0.776442 -0.296347   
106399  155319.mp3 -0.679904 -0.447171 -0.268591 -0.342063 -0.164815   
106400  155320.mp3 -1.061576 -0.333261 -0.883150  0.177372  0.332768   

          mfcc_5    mfcc_6    mfcc_7    mfcc_8    mfcc_9  
0       0.161717  0.410021 -1.009176 -0.022266 -0.051470  
1       0.544103 

In [3]:
import pymongo
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Bound before the try block so the finally clause can always refer to it,
# even when reading the CSV fails before a client has been created
client = None

try:
    # Load the reduced features from the CSV file
    reduced_features_df = pd.read_csv('ALL_FEATUES.csv')

    # MongoDB connection details
    mongodb_url = "mongodb://localhost:27017/"
    database_name = "BDA_FULL_DATA"
    collection_name = "audio_features_FULL"

    # Create the client and database handle
    client = pymongo.MongoClient(mongodb_url)
    db = client[database_name]

    try:
        # A database handle is never None, so checking it proves nothing;
        # a ping round-trip is what actually verifies the server is reachable
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure:
        # Log error if connection to MongoDB fails
        logger.error("Failed to connect to MongoDB.")
    else:
        # Convert DataFrame to a list of one dict per row
        transformed_data_dict = reduced_features_df.to_dict(orient='records')

        collection = db[collection_name]
        if transformed_data_dict:
            # NOTE: re-running this cell inserts the same rows again
            collection.insert_many(transformed_data_dict)

            # Log success message
            logger.info("Transformed data has been successfully stored in MongoDB.")
        else:
            # insert_many rejects an empty list, so skip the call entirely
            logger.warning("The CSV file contains no rows; nothing was inserted.")

except Exception as e:
    # Log error if any exception occurs during processing
    logger.error(f"An error occurred during processing: {e}")

finally:
    # Close MongoDB connection, if one was opened
    if client is not None:
        client.close()


INFO:__main__:Transformed data has been successfully stored in MongoDB.
