In [4]:
import os
import librosa

def read_audio_files(directory):
    audio_data = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.mp3'):
                audio_path = os.path.join(root, file)
                # Load audio file
                y, sr = librosa.load(audio_path, sr=None)
                audio_data.append((audio_path, y, sr))
    return audio_data

# Example usage
audio_directory = '/home/cake/Documents/sample_data'
audio_data = read_audio_files(audio_directory)


In [5]:
import numpy as np
import librosa

def extract_features(audio_data):
    features = []
    for audio_path, y, sr in audio_data:
        # Extract MFCC
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        
        # Calculate spectral centroid
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        if len(spectral_centroid) == 0:
            spectral_centroid_mean = 0
        else:
            spectral_centroid_mean = np.mean(spectral_centroid)
        
        # Calculate zero-crossing rate
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
        if len(zero_crossing_rate) == 0:
            zero_crossing_rate_mean = 0
        else:
            zero_crossing_rate_mean = np.mean(zero_crossing_rate)
        
        # Combine all features into a single feature vector
        combined_features = np.concatenate([np.mean(mfcc, axis=1), [spectral_centroid_mean], [zero_crossing_rate_mean]])
        features.append(combined_features)
    return features

# Example usage
audio_features = extract_features(audio_data)
print("Number of audio files processed:", len(audio_features))
print("Shape of extracted features for each audio file:", audio_features[0].shape)
print(audio_features)

Number of audio files processed: 145
Shape of extracted features for each audio file: (15,)
[array([-3.41125412e+01,  2.24745056e+02, -8.46631165e+01,  4.37065697e+01,
       -2.03661251e+01,  1.23625708e+01,  1.58951864e+01, -9.36776829e+00,
        1.57327566e+01, -1.27166862e+01,  5.42451334e+00, -1.97885418e+00,
       -2.96163440e+00,  1.76839277e+03,  4.90645121e-02]), array([-7.86479416e+01,  1.16544708e+02, -4.74036140e+01,  6.18391685e+01,
       -6.91822195e+00,  2.82916107e+01, -9.17225075e+00,  2.24581337e+01,
       -4.15477961e-01,  1.35363016e+01, -6.86161995e-01,  3.70313358e+00,
       -7.03708601e+00,  3.66530115e+03,  8.71664199e-02]), array([-5.32980766e+01,  1.03132980e+02, -4.48414116e+01,  3.65558395e+01,
       -4.30382824e+00,  2.95413628e+01,  1.65872800e+00,  3.08405457e+01,
       -2.11746430e+00,  1.79404984e+01, -4.43568993e+00,  1.02042513e+01,
       -6.07835770e+00,  4.14887140e+03,  1.09217466e-01]), array([-1.01144798e+02,  8.95737457e+01,  6.01913166

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def normalize_features(features, n_components=None):
    # Convert list to numpy array if necessary
    if isinstance(features, list):
        features = np.array(features)
    
    # Standardize features
    scaler = StandardScaler()
    standardized_features = scaler.fit_transform(features)
    
    # Perform PCA for dimensionality reduction
    pca = PCA(n_components=min(n_components, features.shape[1]) if n_components is not None else None)
    reduced_features = pca.fit_transform(standardized_features)
    
    return reduced_features


In [7]:
# Example usage with PCA for dimensionality reduction
reduced_features = normalize_features(audio_features, n_components=50)
print("Shape of reduced features:", reduced_features.shape)


Shape of reduced features: (145, 15)


In [24]:
import os
import pandas as pd

# Convert audio_features list to DataFrame
audio_features_df = pd.DataFrame(reduced_features)

# Assuming audio_directory is the directory containing your audio files
audio_directory = '/home/cake/Documents/sample_data/sampled_audio_1gb'

# Extract track IDs from audio filenames
track_ids = []
for audio_file in os.listdir(audio_directory):
    if audio_file.endswith('.mp3'):
        track_id = os.path.splitext(os.path.basename(audio_file))[0]
        track_ids.append(track_id)

# Add the extracted track IDs to the DataFrame
audio_features_df['Track ID'] = track_ids

# Save DataFrame to CSV
audio_features_df.to_csv('/home/cake/Documents/audio_features.csv', index=False)


In [25]:
audio_features_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Track ID
0,-2.241849,0.240507,4.185459,-2.068293,-0.361721,0.653584,-0.644961,1.67307,1.681871,0.525039,0.118433,-0.304002,0.174751,-0.172222,0.287474,829
1,2.307942,-0.964606,0.659748,0.479211,1.307669,0.485742,-0.188432,0.006328,0.073459,0.220986,-0.946773,0.378638,0.164361,0.08787,0.064063,860
2,3.16991,-1.752401,-0.647319,-0.533918,0.506334,-1.056925,-0.420476,0.295742,0.358476,-0.03479,-0.544844,-0.003994,0.158731,-0.116696,-0.022655,743
3,3.099264,1.727059,-2.007091,-1.613057,-0.680775,0.219316,-0.225537,-0.312124,0.296391,-0.158177,-0.61772,-0.178687,1.092409,-0.353689,0.274727,758
4,-4.420052,-4.085529,-2.458303,-0.881916,-0.886687,0.795532,-0.348055,-0.11823,-0.349034,0.611877,-1.073566,1.267636,-0.08079,0.018861,-0.240411,798
5,1.370064,-1.558776,1.974545,-0.821759,-0.560752,-1.059574,0.187146,0.126796,0.250752,0.621932,-0.13039,0.310975,0.229021,-0.008744,-0.092718,755
6,2.886711,-0.924921,-0.522332,-0.499652,0.57087,0.818463,0.251089,0.426649,0.309161,-0.442905,-0.002749,-0.140809,-0.077552,-0.279356,0.002576,757
7,-2.23622,-0.7626,-0.370815,-0.010591,-0.893411,0.709801,-0.012283,0.226698,-0.075334,0.438887,0.222399,0.529416,0.064907,0.16945,0.046439,835
8,1.856311,-0.89681,-0.355227,-1.616032,0.689226,-0.17785,-0.000177,-0.650654,-0.179962,-0.478908,-0.248489,-0.24677,-0.56005,-0.213794,-0.029301,768
9,-1.96486,-3.7321,-1.630307,0.480113,-1.426248,0.387502,0.334175,-0.434775,1.075793,-0.442795,0.471763,0.385245,0.024752,-0.051395,-0.093684,823


In [26]:
import pandas as pd

# Load statistical data from audio files CSV
audio_stats_df = pd.read_csv('/home/cake/Documents/audio_features.csv')

# Load other CSV file with readable columns
other_csv_df = pd.read_csv('/home/cake/Documents/cleaned_dataset.csv')

# Merge data based on track ID
merged_df = pd.merge(other_csv_df, audio_stats_df, on='Track ID', how='inner')

# Now merged_df contains data from both CSV files merged based on track ID

# Example of analyzing the merged data
print(merged_df.head())  


   Track ID            Album                         Artist         Genre  \
0       140   the blind spot  alec k redfearn  the eyesores          folk   
1       282               ii                      black pus  experimental   
2       511    lingua ignota                   celesteville          rock   
3       526  sing like birds                   celesteville          rock   
4       527  sing like birds                   celesteville          rock   

                    Title         0         1         2         3         4  \
0      queen of the wires -2.057604 -1.806357  0.004615  0.795823  0.057019   
1          new atlantis a  0.649962 -3.496860  3.713545  0.813869 -0.179248   
2     sharper amp clearer  1.227683  1.615365  0.071110  2.599417  0.357880   
3             do not talk -2.166364 -0.223055  0.417915 -0.022631 -0.249497   
4  for jake erdmann et al  0.016782  2.724415  1.608663  1.417748  0.120208   

          5         6         7         8         9        10 