In [1]:
import zipfile
import os
import librosa
import soundfile as sf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pymongo

# Path to the ZIP file
zip_file_path = "C:\\Users\\gilani\\OneDrive\\Desktop\\data.zip"
# Destination folder where the files will be extracted
extract_folder = "C:\\Users\\gilani\\OneDrive\\Desktop\\extracted_data4"

# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# List the contents of the extracted folder
extracted_folders = os.listdir(extract_folder)
print("Extracted folders:", extracted_folders)

# Check the contents of the first folder
first_folder_path = os.path.join(extract_folder, extracted_folders[0])
first_folder_contents = os.listdir(first_folder_path)
print("Contents of the first folder:", first_folder_contents)

# Function to convert MP3 audio file to WAV format
def convert_to_wav(input_file, output_file):
    try:
        y, sr = librosa.load(input_file, sr=None)
        sf.write(output_file, y, sr)
        print(f"Converted {input_file} to {output_file}")
    except Exception as e:
        print(f"Error converting {input_file} to WAV: {e}")

# Function to extract MFCC features from an audio file
def extract_features(audio_file_path, num_mfcc=13):
    try:
        y, sr = librosa.load(audio_file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=num_mfcc)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
        return mfccs, spectral_centroid, zero_crossing_rate
    except Exception as e:
        print(f"Error extracting features from {audio_file_path}: {e}")
        return None

# Dictionary to store features for each audio file
feature_data = {}

# Loop through each folder and audio file
for folder in extracted_folders:
    folder_path = os.path.join(extract_folder, folder)
    for audio_file in os.listdir(folder_path):
        audio_file_path = os.path.join(folder_path, audio_file)
        # Check if the file is an MP3
        if audio_file.lower().endswith('.mp3'):
            # Convert MP3 audio file to WAV format
            wav_file_path = os.path.splitext(audio_file_path)[0] + '.wav'
            convert_to_wav(audio_file_path, wav_file_path)
            # Extract features from the WAV file
            features = extract_features(wav_file_path)
            if features is not None:
                # Store features in the dictionary
                feature_data[audio_file] = features

# Normalization
scaler = StandardScaler()
for key, value in feature_data.items():
    mfccs, spectral_centroid, zero_crossing_rate = value
    mfccs_scaled = scaler.fit_transform(mfccs.T).T
    feature_data[key] = mfccs_scaled, spectral_centroid, zero_crossing_rate

# Dimensionality reduction
pca = PCA(n_components=10)
for key, value in feature_data.items():
    mfccs_scaled, spectral_centroid, zero_crossing_rate = value
    mfccs_pca = pca.fit_transform(mfccs_scaled.T).T
    feature_data[key] = mfccs_pca, spectral_centroid, zero_crossing_rate




Extracted folders: ['000', '001', '002', '003', '004', '005']
Contents of the first folder: ['000002.mp3', '000002.wav', '000003.mp3', '000003.wav', '000005.mp3', '000005.wav', '000010.mp3', '000010.wav', '000020.mp3', '000020.wav', '000026.mp3', '000026.wav', '000030.mp3', '000030.wav', '000046.mp3', '000046.wav', '000048.mp3', '000048.wav', '000134.mp3', '000134.wav', '000135.mp3', '000135.wav', '000136.mp3', '000136.wav', '000137.mp3', '000137.wav', '000138.mp3', '000138.wav', '000139.mp3', '000139.wav', '000140.mp3', '000140.wav', '000141.mp3', '000141.wav', '000142.mp3', '000142.wav', '000144.mp3', '000144.wav', '000145.mp3', '000145.wav', '000146.mp3', '000146.wav', '000147.mp3', '000147.wav', '000148.mp3', '000148.wav', '000149.mp3', '000149.wav', '000150.mp3', '000150.wav', '000151.mp3', '000151.wav', '000152.mp3', '000152.wav', '000153.mp3', '000153.wav', '000154.mp3', '000154.wav', '000155.mp3', '000155.wav', '000156.mp3', '000156.wav', '000157.mp3', '000157.wav', '000158.mp3

  y, sr = librosa.load(input_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_file_path, sr=None)


Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001488.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001488.wav
Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001489.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001489.wav
Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001490.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001490.wav
Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001491.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001491.wav
Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001492.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001492.wav
Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001494.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001494.wav
Converted C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001495.mp3 to C:\Users\gilani\OneDrive\Desktop\extracted_data4\001\001495.wav
Converted C:\

  self.explained_variance_ratio_ = self.explained_variance_ / total_var


In [2]:
# Print the extracted features for each audio file
for audio_file, features in feature_data.items():
    mfccs_scaled, spectral_centroid, zero_crossing_rate = features
    print("Audio file:", audio_file)
    print("MFCC features:", mfccs_scaled)
    print("Spectral Centroid:", spectral_centroid)
    print("Zero-crossing Rate:", zero_crossing_rate)


Audio file: 000002.mp3
MFCC features: [[-3.274968   -3.8674688  -2.9481564  ...  0.7883491   0.6090717
   0.75976115]
 [ 1.0577857   1.5994588   1.0598623  ... -0.2887757   0.19151936
   0.73807794]
 [ 5.137329    4.532052    1.7713186  ... -0.59713495 -1.3180628
  -1.7782689 ]
 ...
 [ 1.5204476  -1.569403   -1.3327595  ... -0.37613744 -0.11640938
  -0.13830362]
 [ 3.1875353   1.921275    0.9765824  ... -0.84104866 -0.5754613
  -0.20607777]
 [ 2.6567028   0.4984661  -0.4381548  ...  0.04238532  0.3181361
   0.40853924]]
Spectral Centroid: [[2247.04821961 2061.68940219 2419.9808226  ... 3325.17894253
  3126.09536594 3224.31292889]]
Zero-crossing Rate: [[0.01367188 0.02783203 0.04150391 ... 0.12695312 0.09570312 0.07519531]]
Audio file: 000003.mp3
MFCC features: [[-2.5754836  -2.3080218  -0.5602377  ...  0.5528066   0.2544436
  -0.6870983 ]
 [ 1.3830394   3.932398    2.382572   ...  3.4182212   4.2945704
   4.5008855 ]
 [-2.3193164  -2.7826743  -0.9308779  ... -0.37743306 -0.02826129
  -

In [7]:
# Store in MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["BDAproject"]
collection = db["sound files"]
for key, value in feature_data.items():
    mfccs_pca, spectral_centroid, zero_crossing_rate = value
    document = {
        "filename": key,
        "mfccs_pca": mfccs_pca.tolist(),
        "spectral_centroid": spectral_centroid.tolist(),
        "zero_crossing_rate": zero_crossing_rate.tolist()
    }
    collection.insert_one(document)

print("Data stored in MongoDB.")

Data stored in MongoDB.


#Phase2


In [4]:
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["BDAproject"]
collection = db["sound files"]

# Retrieve audio features from MongoDB
audio_features = []

for document in collection.find():
    audio_feature = {
        "filename": document["filename"],
        "mfccs_pca": document["mfccs_pca"],
        "spectral_centroid": document["spectral_centroid"],
        "zero_crossing_rate": document["zero_crossing_rate"]
    }
    audio_features.append(audio_feature)

# Print the first few audio features
for feature in audio_features[:5]:
    print(feature)


{'filename': '000002.mp3', 'mfccs_pca': [[-3.27497935295105, -3.8674681186676025, -2.9481582641601562, -2.4890289306640625, -2.7840123176574707, -1.4760849475860596, 0.716415286064148, 2.0972094535827637, 2.745143413543701, 3.2990102767944336, 3.1216461658477783, 2.8918464183807373, 2.125875473022461, 0.9368449449539185, -0.5905840992927551, -1.3222670555114746, -1.740854263305664, -2.069952964782715, -2.317200183868408, -2.717221736907959, -3.2269339561462402, -3.371469259262085, -3.4152159690856934, -3.2557592391967773, -2.858841896057129, -2.792358875274658, -2.6435632705688477, -2.140982151031494, -1.3857742547988892, -1.240607500076294, -1.2435035705566406, -1.7959368228912354, -1.7867380380630493, -2.1763083934783936, -3.0143394470214844, -2.9111526012420654, -3.0075244903564453, -2.3002686500549316, -0.13416539132595062, 2.2518270015716553, 3.1859848499298096, 2.8232288360595703, 2.3158206939697266, 2.2972593307495117, 2.205960512161255, 1.8471579551696777, 1.4526935815811157, 1

In [31]:
from pyspark.sql import SparkSession

# Initialize SparkSession with MongoDB Spark Connector JAR
spark = SparkSession.builder \
    .appName("MongoDBConnectorExample") \
    .config("spark.jars", "C:\\Users\\gilani\\OneDrive\\Desktop\\connector.jar") \
    .getOrCreate()

# Define the MongoDB URI
mongodb_uri = "mongodb://localhost:27017//BDAproject.sound files"




In [37]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("MusicRecommendationModel") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/db.connection") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/db.connection") \
    .config("spark.jars", "C:\\Users\\gilani\\OneDrive\\Desktop\\connector.jar")\
    .getOrCreate()



In [34]:
spark.conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/db.collection")


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Read data from MongoDB into DataFrame
df = spark.read.format("mongo") \
    .option("uri", mongodb_uri) \
    .load()

# Prepare features for training
assembler = VectorAssembler(
    inputCols=["mfccs_pca", "spectral_centroid", "zero_crossing_rate"],
    outputCol="features")

# Define the classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Define the pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Split the data into training and testing sets (70% training, 30% testing)
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Train the model
model = pipeline.fit(trainingData)

# Make predictions on the testing data
predictions = model.transform(testData)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metric
)