In [None]:
import pandas as pd
import numpy as np
import dask
import dask.bag as db


In [None]:
# Mount Google Drive so files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

# Location of the full track table on the mounted drive.
file_path = '/content/drive/My Drive/ds340/total.csv'

# Load the full track table; the next cell samples from `total` by 'track_genre'.
total = pd.read_csv(file_path)

In [None]:

# Take exactly 100 tracks from every genre when the smallest genre has at least
# 100 rows; otherwise cap each genre's sample at that genre's own size.
tracks_by_genre = total.groupby('track_genre')
if tracks_by_genre.size().min() >= 100:
    sampled_df = tracks_by_genre.sample(n=100, random_state=42)
else:
    print("Some genres do not have enough tracks. Adjusting sample size.")
    sampled_df = tracks_by_genre.apply(
        lambda genre_rows: genre_rows.sample(n=min(len(genre_rows), 100), random_state=42)
    ).reset_index(drop=True)

In [None]:
# Persist the per-genre sample. index=False keeps the DataFrame index out of the
# file, so reading it back does not add a stray "Unnamed: 0" column.
sampled_df.to_csv('/content/drive/My Drive/ds340/sampled_df.csv', index=False)

In [None]:
import pandas as pd
import os
import requests
import dask.bag as db

# Path and directory setup
dataset_path = '/content/drive/MyDrive/ds340/sampled_files'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(dataset_path, exist_ok=True)

def download_song(url, trackid, dataset_path):
    """Download one track and store it as ``<trackid>.mp3`` inside ``dataset_path``.

    Args:
        url: Address of the audio file to fetch.
        trackid: Identifier used as the file name (without extension).
        dataset_path: Existing directory that receives the file.

    Returns:
        The path of the saved file, or ``None`` when the URL is missing, the
        request fails, the server answers with a non-200 status, or the file
        cannot be written.
    """
    # A non-string or blank URL cannot be fetched; skip it without a request.
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping {trackid}: no download URL")
        return None

    file_path = os.path.join(dataset_path, f"{trackid}.mp3")
    temp_path = file_path + ".part"
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to download {url}: HTTP {response.status_code}")
            return None
        # Write to a side file and rename it afterwards, so an interrupted write
        # never leaves a truncated "<trackid>.mp3" for directory listings to find.
        with open(temp_path, 'wb') as f:
            f.write(response.content)
        os.replace(temp_path, file_path)
        return file_path
    except requests.RequestException as e:
        # Must precede the OSError handler: RequestException derives from it.
        print(f"Request failed for {url}: {str(e)}")
    except OSError as e:
        # A disk problem on one track should not abort the whole batch.
        print(f"Could not save {file_path}: {e}")
        try:
            os.remove(temp_path)
        except OSError:
            # Best-effort cleanup; the side file may never have been created.
            pass
    return None

def download_batch(batch):
    """Download every track described in ``batch``.

    ``batch`` is an iterable of mappings with 'url' and 'track_id' keys. Files
    are written to the module-level ``dataset_path``. The result has one entry
    per row, in order: whatever ``download_song`` returned for that row.
    """
    results = []
    for record in batch:
        results.append(download_song(record['url'], record['track_id'], dataset_path))
    return results


# Cut the sampled table into chunks of up to 600 rows, each a list of row dicts
batch_size = 600
batches = [
    sampled_df.iloc[start:start + batch_size].to_dict(orient='records')
    for start in range(0, len(sampled_df), batch_size)
]

# Run download_batch over every chunk through a Dask bag
downloaded_files = db.from_sequence(batches).map(download_batch)
nested_results = downloaded_files.compute()

# Flatten the per-batch lists and drop the entries for failed downloads (None)
downloaded_file_paths = [
    path
    for batch_result in nested_results
    for path in batch_result
    if path is not None
]

In [None]:
import librosa
import os

def process_audio(file_path):
    """Load one audio file and compute its MFCC features.

    Args:
        file_path: Path to the audio file; the file name without its extension
            is used as the track id.

    Returns:
        A ``(track_id, mfcc)`` tuple, where ``mfcc`` is the transposed MFCC
        matrix converted to nested Python lists.
    """
    # Strip only the final extension; str.replace would also delete any
    # '.mp3' occurring in the middle of the name.
    track_id = os.path.splitext(os.path.basename(file_path))[0]

    # Load the audio at a sampling rate of 22050 Hz
    y, sr = librosa.load(file_path, sr=22050)

    # 13 coefficients, 2048-sample FFT window, 512-sample hop
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)

    # Swap the two axes of the MFCC matrix
    mfcc = mfcc.T

    # Plain lists so the result does not carry a NumPy array
    return (track_id, mfcc.tolist())


In [None]:
# Directory holding the downloaded audio files.
folder_path = '/content/drive/MyDrive/ds340/sampled_files'
# Reload the sampled metadata from disk (this rebinds `sampled_df`).
sampled_df = pd.read_csv('/content/drive/My Drive/ds340/sampled_df.csv')
sampled_files = os.listdir(folder_path)

# File names minus their extension are treated as track ids.
track_ids = [os.path.splitext(filename)[0] for filename in sampled_files]

# Keep only the metadata rows whose track_id has a file in the folder.
filtered_df = sampled_df[sampled_df['track_id'].isin(track_ids)]

filtered_df

In [None]:
import pandas as pd
import os
import requests
import dask.bag as db

# Path and directory setup
dataset_path = '/content/drive/MyDrive/ds340/sampled_files'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(dataset_path, exist_ok=True)

def download_song(url, trackid, dataset_path):
    """Download one track and store it as ``<trackid>.mp3`` inside ``dataset_path``.

    Args:
        url: Address of the audio file to fetch.
        trackid: Identifier used as the file name (without extension).
        dataset_path: Existing directory that receives the file.

    Returns:
        The path of the saved file, or ``None`` when the URL is missing, the
        request fails, the server answers with a non-200 status, or the file
        cannot be written.
    """
    # A non-string or blank URL cannot be fetched; skip it without a request.
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping {trackid}: no download URL")
        return None

    file_path = os.path.join(dataset_path, f"{trackid}.mp3")
    temp_path = file_path + ".part"
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to download {url}: HTTP {response.status_code}")
            return None
        # Write to a side file and rename it afterwards, so an interrupted write
        # never leaves a truncated "<trackid>.mp3" for directory listings to find.
        with open(temp_path, 'wb') as f:
            f.write(response.content)
        os.replace(temp_path, file_path)
        return file_path
    except requests.RequestException as e:
        # Must precede the OSError handler: RequestException derives from it.
        print(f"Request failed for {url}: {str(e)}")
    except OSError as e:
        # A disk problem on one track should not abort the whole batch.
        print(f"Could not save {file_path}: {e}")
        try:
            os.remove(temp_path)
        except OSError:
            # Best-effort cleanup; the side file may never have been created.
            pass
    return None

def download_batch(batch):
    """Download every track described in ``batch``.

    ``batch`` is an iterable of mappings with 'url' and 'track_id' keys. Files
    are written to the module-level ``dataset_path``. The result has one entry
    per row, in order: whatever ``download_song`` returned for that row.
    """
    results = []
    for record in batch:
        results.append(download_song(record['url'], record['track_id'], dataset_path))
    return results


# Cut the sampled table into chunks of up to 600 rows, each a list of row dicts
batch_size = 600
batches = [
    sampled_df.iloc[start:start + batch_size].to_dict(orient='records')
    for start in range(0, len(sampled_df), batch_size)
]

# Run download_batch over every chunk through a Dask bag
downloaded_files = db.from_sequence(batches).map(download_batch)
nested_results = downloaded_files.compute()

# Flatten the per-batch lists and drop the entries for failed downloads (None)
downloaded_file_paths = [
    path
    for batch_result in nested_results
    for path in batch_result
    if path is not None
]

In [None]:
import librosa
import os

def process_audio(file_path):
    """Load one audio file and compute its MFCC features.

    Args:
        file_path: Path to the audio file; the file name without its extension
            is used as the track id.

    Returns:
        A ``(track_id, mfcc)`` tuple, where ``mfcc`` is the transposed MFCC
        matrix converted to nested Python lists.
    """
    # Strip only the final extension; str.replace would also delete any
    # '.mp3' occurring in the middle of the name.
    track_id = os.path.splitext(os.path.basename(file_path))[0]

    # Load the audio at a sampling rate of 22050 Hz
    y, sr = librosa.load(file_path, sr=22050)

    # 13 coefficients, 2048-sample FFT window, 512-sample hop
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)

    # Swap the two axes of the MFCC matrix
    mfcc = mfcc.T

    # Plain lists so the result does not carry a NumPy array
    return (track_id, mfcc.tolist())


In [None]:
# List the MP3 files in folder_path. os.listdir returns names in arbitrary order,
# so sort them first: any later positional slice then picks the same files on every run.
downloaded_file_paths = [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path)) if f.endswith('.mp3')]
downloaded_file_paths

In [None]:
# Work on the first 10 paths only. Despite its name, `final_df_new` is a plain
# list of file paths, not a DataFrame.
final_df_new = downloaded_file_paths[:10]
final_df_new

In [None]:
import dask.bag as db

# Build a Dask bag over the selected paths and map process_audio across it.
audio_files = db.from_sequence(final_df_new)
features_with_ids = audio_files.map(process_audio)
# compute() runs the bag; each element is what process_audio returned,
# i.e. a (track_id, mfcc) tuple.
mfcc_features_with_ids = features_with_ids.compute()

In [None]:
import pandas as pd


# One row per processed file: the track id and its MFCC matrix as nested lists.
features_df = pd.DataFrame(mfcc_features_with_ids, columns=['track_id', 'mfcc'])

# The 'mfcc' column holds a whole nested list per row; it is not expanded into
# one column per coefficient here.
features_df


In [None]:
# Attach the sampled metadata to each feature row. The left join keeps every row
# of features_df even when no metadata matches its track_id.
# NOTE(review): if sampled_df contains repeated track_id values, the matching
# feature rows are repeated too — confirm this is intended.
final_df = pd.merge(features_df, sampled_df, on='track_id', how='left')

In [None]:
# Save the feature table and the merged table. index=False keeps the DataFrame
# index out of the files, so reading them back does not add an "Unnamed: 0" column.
# Note: CSV stores each nested 'mfcc' list as its text representation.
features_df.to_csv('/content/drive/My Drive/ds340/features_df.csv', index=False)
final_df.to_csv('/content/drive/My Drive/ds340/final_df.csv', index=False)

In [None]:
# Reload the saved features from disk (this rebinds `features_df`).
# NOTE(review): the 'mfcc' column comes back as text rather than nested lists,
# because the CSV holds the lists' string form — parse it before numeric use.
features_df = pd.read_csv('/content/drive/My Drive/ds340/features_df.csv')

features_df

In [None]:
# Show every row of filtered_df whose track_id occurs more than once
# (keep=False marks all occurrences, not only the repeats).
duplicates = filtered_df[filtered_df.duplicated('track_id', keep=False)]

duplicates


In [None]:
import pandas as pd

def sort_dataframe_by_column(df, column_name):
    """Return a copy of ``df`` ordered by ``column_name``, ignoring letter case.

    The column is compared through its lower-cased string values, and the
    result gets a fresh 0..n-1 index. ``df`` itself is left untouched.
    """
    def _lowercased(values):
        return values.str.lower()

    ordered = df.sort_values(by=column_name, key=_lowercased)
    return ordered.reset_index(drop=True)


# Order the filtered metadata by track_id, ignoring case, and display it.
sorted_metadata_df = sort_dataframe_by_column(filtered_df, 'track_id')
sorted_metadata_df


In [None]:
import os

def get_sorted_files(directory_path):
    """Return the '.mp3' file names in ``directory_path``, sorted ignoring case."""
    # Keep only the entries whose name ends in '.mp3'
    mp3_names = [name for name in os.listdir(directory_path) if name.endswith('.mp3')]

    # Order them by their lower-cased form
    mp3_names.sort(key=str.lower)
    return mp3_names

# Specify the path to the directory that holds the downloaded MP3 files
directory_path = '/content/drive/MyDrive/ds340/sampled_files'
sorted_files = get_sorted_files(directory_path)
sorted_files

In [None]:
import os
import pandas as pd
import librosa

def load_and_extract_mfcc(file_path, sr=22050, n_mfcc=13, n_fft=2048, hop_length=512):
    """Load the audio at ``file_path`` and return its transposed MFCC matrix.

    Args:
        file_path: Path of the audio file to load.
        sr: Sampling rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        n_fft: FFT window length passed to ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.

    Returns:
        The transpose of what ``librosa.feature.mfcc`` produced.
    """
    waveform, loaded_rate = librosa.load(file_path, sr=sr)
    coefficients = librosa.feature.mfcc(
        y=waveform,
        sr=loaded_rate,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    return coefficients.T

def process_directory(directory_path, metadata_df):
    """Extract MFCC features for every '.mp3' in a directory that has a known genre.

    Args:
        directory_path: Directory scanned for files ending in '.mp3'.
        metadata_df: DataFrame with 'track_id' and 'track_genre' columns.

    Returns:
        A list of ``(track_id, genre, mfcc_features)`` tuples, in case-insensitive
        file-name order. Files without a genre entry are skipped, and files whose
        processing raises are reported and left out.
    """
    # Order the metadata by track_id, ignoring case, before building the lookup
    ordered_meta = metadata_df.sort_values(
        'track_id', key=lambda ids: ids.str.lower()
    ).reset_index(drop=True)

    # track_id -> genre lookup
    genre_by_track = dict(zip(ordered_meta['track_id'], ordered_meta['track_genre']))

    # '.mp3' entries of the directory, ordered by lower-cased name
    mp3_names = [name for name in os.listdir(directory_path) if name.endswith('.mp3')]
    mp3_names.sort(key=str.lower)

    results = []
    for mp3_name in mp3_names:
        audio_path = os.path.join(directory_path, mp3_name)
        # Drop the trailing four characters ('.mp3') to obtain the track id
        track_id = mp3_name[:-4]
        genre = genre_by_track.get(track_id)

        if genre is None:
            print(f"Genre not found for {mp3_name}, skipping...")
            continue

        try:
            features = load_and_extract_mfcc(audio_path)
            results.append((track_id, genre, features))
            print(f"Processed {mp3_name} with genre {genre}")
        except Exception as err:
            # One unreadable file is reported and skipped, not fatal
            print(f"Error processing {mp3_name}: {err}")

    return results

# Extract MFCCs for every MP3 in the download folder that has a genre in filtered_df
directory_path = '/content/drive/MyDrive/ds340/sampled_files'
processed_data = process_directory(directory_path, filtered_df)
