In [3]:
import os
import librosa
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pymongo
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to extract audio features
def extract_features(audio_file, sr=22050, n_mfcc=13):
    try:
        y, sr = librosa.load(audio_file, sr=sr)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc, axis=1)
        return mfcc_mean
    except Exception as e:
        logger.error(f"Error extracting features from {audio_file}: {e}")
        return None

# Function to normalize features
def normalize_features(features):
    try:
        normalized_features = (features - features.min()) / (features.max() - features.min())
        return normalized_features
    except Exception as e:
        logger.error(f"Error normalizing features: {e}")
        return None

# Function to standardize features
def standardize_features(features):
    try:
        scaler = StandardScaler()
        standardized_features = scaler.fit_transform(features)
        return standardized_features
    except Exception as e:
        logger.error(f"Error standardizing features: {e}")
        return None

# Function to connect to MongoDB
def connect_to_mongodb(uri, db_name):
    try:
        client = pymongo.MongoClient(uri)
        db = client[db_name]
        return db
    except Exception as e:
        logger.error(f"Error connecting to MongoDB: {e}")
        return None

# Main function to process audio files and store data in MongoDB
def process_audio_files(audio_folder, mongodb_uri, db_name, output_csv):
    try:
        features_df = pd.DataFrame()
        for file in os.listdir(audio_folder):
            if file.endswith(".mp3"):
                audio_file = os.path.join(audio_folder, file)
                mfcc_features = extract_features(audio_file)
                if mfcc_features is not None:
                    df = pd.DataFrame([mfcc_features], columns=[f"mfcc_{i}" for i in range(len(mfcc_features))])
                    features_df = pd.concat([features_df, df], ignore_index=True)

        if features_df.empty:
            logger.error("No audio features extracted.")
            return

        normalized_features = normalize_features(features_df)
        if normalized_features is None:
            return

        standardized_features = standardize_features(normalized_features)
        if standardized_features is None:
            return

        pca = PCA(n_components=10)
        reduced_features = pca.fit_transform(standardized_features)
        reduced_features_df = pd.DataFrame(reduced_features, columns=[f"pc_{i}" for i in range(reduced_features.shape[1])])
        reduced_features_df.to_csv(output_csv, index=False)

        db = connect_to_mongodb(mongodb_uri, db_name)
        if db is not None:
            transformed_data_dict = reduced_features_df.to_dict(orient="records")
            collection_name = "audio_features_collectionn"
            collection = db[collection_name]
            collection.insert_many(transformed_data_dict)
            logger.info("Transformed data has been successfully stored in MongoDB.")
        else:
            logger.error("Failed to connect to MongoDB.")

    except Exception as e:
        logger.error(f"An error occurred during processing: {e}")

# Set parameters
audio_folder = "audio"
mongodb_uri = "mongodb://localhost:27017/"
db_name = "music_features_databasee"
output_csv = "reduced_audio_features111.csv"

# Process audio files and store data in MongoDB
process_audio_files(audio_folder, mongodb_uri, db_name, output_csv)


INFO:pymongo.serverSelection:{"message": "Waiting for suitable server to become available", "selector": "<function writable_server_selector at 0x000001FE205B4EA0>", "operation": "insert", "operationId": -609045504, "topologyDescription": "<TopologyDescription id: 663f8652ac51b34e32a1b09f, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None>]>", "clientId": {"$oid": "663f8652ac51b34e32a1b09f"}, "remainingTimeMS": 30}
INFO:__main__:Transformed data has been successfully stored in MongoDB.


In [4]:
import pandas as pd

# Replace 'path_to_your_csv_file' with the actual path to your CSV file
csv_file_path = 'reduced_audio_features111.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Display the DataFrame
print(df)


        pc_0      pc_1      pc_2      pc_3      pc_4      pc_5      pc_6  \
0   2.775385 -0.512451 -1.398603  0.766013  1.904395  2.087560 -0.067536   
1   2.742665 -0.478032  2.643861  0.371387  0.475300 -0.785631 -0.154473   
2  -0.779586 -0.885235 -0.654513 -0.185930 -0.462820 -0.079726  0.478409   
3  -0.239705 -1.245250 -0.929713  0.350711  0.067621 -0.576786  0.258883   
4   2.438098 -0.014551 -0.324895  1.198067 -1.613589 -0.289867 -1.392326   
5  -1.123491  5.063447  0.209638  0.614883 -0.085365  0.092426  0.558527   
6   0.692517 -0.387207  1.585810 -0.821483  0.327681  0.095521  0.386481   
7  -2.141782 -1.617599 -0.902443 -1.414972  0.404663 -0.152856 -1.000738   
8   5.316360 -0.022638 -0.722914 -1.403744 -0.617986  0.709271  0.295180   
9  -1.291747 -1.268354 -1.352191 -1.268271 -0.115264 -0.344346 -0.037749   
10  0.021783 -0.273324 -1.056561 -0.080321 -1.154121  0.236616  1.530644   
11 -1.394024  3.277966 -0.292009 -0.647414 -0.621027  0.441710  0.013599   
12 -3.456337

In [5]:
import pymongo

# MongoDB connection URI
mongo_uri = "mongodb://localhost:27017/"

try:
    # Connect to MongoDB
    client = pymongo.MongoClient(mongo_uri)
    
    # List available databases
    print("Available databases:")
    print(client.list_database_names())
    
    # List collections in a specific database
    db = client["music_features_database"]
    print("Collections in music_features_database:")
    print(db.list_collection_names())
    
    # Close connection
    client.close()
    
    print("Connection successful!")
except Exception as e:
    print("Connection error:", e)


INFO:pymongo.serverSelection:{"message": "Waiting for suitable server to become available", "selector": "Primary()", "operation": "listDatabases", "topologyDescription": "<TopologyDescription id: 663f86bcac51b34e32a1b0b5, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None>]>", "clientId": {"$oid": "663f86bcac51b34e32a1b0b5"}, "remainingTimeMS": 30}


Available databases:
['admin', 'amitdb', 'config', 'local', 'music_features_databasee']
Collections in music_features_database:
[]
Connection successful!


In [5]:
import dask.dataframe as dd
from implicit.als import AlternatingLeastSquares
from sklearn.neighbors import NearestNeighbors
from dask_ml.metrics import mean_squared_error
from dask_ml.model_selection import train_test_split  # Import train_test_split function
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Load Data from CSV using Dask
df = dd.read_csv('reduced_audio_features111.csv')

# Step 2: Split Data into Training and Testing Sets
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Train Collaborative Filtering Model using NearestNeighbors
collaborative_filtering_model = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='cosine')
collaborative_filtering_model.fit(X_train.compute())

# Step 4: Evaluate Collaborative Filtering Model
nearest_neighbors_indices = collaborative_filtering_model.kneighbors(X_test.compute())[1]
original_data_points = X_test.compute()
nearest_neighbors = original_data_points.iloc[nearest_neighbors_indices]
cosine_similarity = (nearest_neighbors.dot(original_data_points.T) /
                     (nearest_neighbors.norm() * original_data_points.norm(axis=1)))
print("Average Cosine Similarity for Collaborative Filtering:", cosine_similarity.mean().compute())

# Step 5: Train Neural Network Model
# Define the neural network model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, input_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Convert Dask DataFrame to NumPy arrays
X_train_array = X_train.compute().to_numpy()
X_test_array = X_test.compute().to_numpy()

# Define and train the neural network model
input_size = X_train_array.shape[1]
hidden_size = 64
model = NeuralNetwork(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    inputs = torch.autograd.Variable(torch.Tensor(X_train_array).float())
    targets = torch.autograd.Variable(torch.Tensor(X_train_array).float())
    optimizer.zero_grad()
    out = model(inputs)
    loss = criterion(out, targets)
    loss.backward()
    optimizer.step()

# Step 6: Evaluate Neural Network Model
inputs = torch.autograd.Variable(torch.Tensor(X_test_array).float())
predicted = model(inputs).detach().numpy()
mse_nn = mean_squared_error(X_test_array, predicted)
print("Mean Squared Error for Neural Network Model:", mse_nn)




IndexError: positional indexers are out-of-bounds

In [None]:
# For collaborative filtering approach
from sklearn.neighbors import NearestNeighbors

# For neural network-based approach
import torch
import torch.nn as nn
import torch.optim as optim


In [None]:
# Assuming you have a CSV file 'normalized_audio_features.csv'
# Load the data
data = pd.read_csv('normalized_audio_features.csv')

# Split data into features and labels
X = data.drop(columns=['target_column'])  # Features
y = data['target_column']  # Labels

# Initialize and train the model
# Example for collaborative filtering
cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
cf_model.fit(X)


In [None]:
# Example hyperparameter tuning for collaborative filtering
params = {
    'n_neighbors': [5, 10, 15],
    'algorithm': ['ball_tree', 'kd_tree'],
    'leaf_size': [20, 30, 40],
    'metric': ['cosine', 'euclidean']
}

# Perform hyperparameter tuning
grid_search = GridSearchCV(cf_model, params, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train)
best_params = grid_search.best_params_
print("Best parameters:", best_params)


In [None]:
# Assuming you have test data X_test
# Predict recommendations for test data
distances, indices = cf_model.kneighbors(X_test)

# Evaluate model performance using appropriate metrics
# Example:
from sklearn.metrics import mean_squared_error

# Calculate mean squared error
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)
