In [1]:
import os

In [2]:
%pwd

'c:\\Users\\yasba\\OneDrive\\Documents\\Projects\\Audio_Classification_UrbanSound8K\\research'

In [3]:
# Step one directory up from the notebook's folder so the working directory
# becomes its parent (the two %pwd outputs show `research` -> project folder).
# NOTE(review): the path is relative, so re-running this cell moves up one
# more level each time; run it once per kernel session.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\yasba\\OneDrive\\Documents\\Projects\\Audio_Classification_UrbanSound8K'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the filesystem locations used by data ingestion.

    Instances are frozen, so attributes cannot be reassigned after creation.
    The annotations are not enforced at runtime; values are stored exactly as
    they are passed in.
    """
    # Directory created by ConfigurationManager.get_data_ingestion_config.
    root_dir: Path
    # Metadata CSV file, read with pandas.read_csv.
    csv_path: Path
    # Directory created by ConfigurationManager.get_data_ingestion_config.
    preprocess_dir: Path
    # Folder that contains the `fold<N>` sub-folders with the audio files.
    audio_dataset_path: Path
    # Directory created by ConfigurationManager.get_data_ingestion_config.
    split_dir: Path
    # Despite the `_dir` suffix these three are CSV *file* paths: they are
    # passed directly to DataFrame.to_csv as the output target.
    preprocess_data_dir: Path
    train_data_dir: Path
    test_data_dir: Path

In [6]:
from Audio_Classification.constants import *
from Audio_Classification.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directories and return the ingestion settings.

        Returns:
            A DataIngestionConfig filled from the `data_ingestion` section of
            the loaded configuration.
        """
        section = self.config.data_ingestion

        # One create_directories call per directory, in this fixed order.
        for dir_key in ("root_dir", "preprocess_dir", "split_dir"):
            create_directories([getattr(section, dir_key)])

        # Copy every setting across under the same name.
        field_names = (
            "root_dir",
            "csv_path",
            "preprocess_dir",
            "audio_dataset_path",
            "split_dir",
            "preprocess_data_dir",
            "train_data_dir",
            "test_data_dir",
        )
        settings = {name: getattr(section, name) for name in field_names}
        return DataIngestionConfig(**settings)

In [8]:
import os
import pandas as pd 
import urllib.request as request
import zipfile
from Audio_Classification import logger
from Audio_Classification.utils.common import *
import numpy as np
from tqdm import tqdm
import librosa
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [9]:
class DataIngestion:
    """Builds MFCC feature tables from the audio dataset and saves them as CSV.

    Each method stores its result on the instance for the next one, so they
    are meant to be called in this order: get_csv_file, get_extracted_feature,
    convert_mfcc_to_dataframe, encoding_class_variables, save_mfcc_dataframe,
    splitting_data_to_train_test.
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings; nothing is read or written here."""
        self.config = config


    def get_csv_file(self):
        """Read the metadata CSV at `config.csv_path` into `self.metadata`."""
        self.metadata = pd.read_csv(self.config.csv_path)

        logger.info("Reading CSV File Completed")


    def feature_extractor(self, file_name, n_mfcc=40):
        """Compute one time-averaged MFCC vector for an audio file.

        Args:
            file_name: Path of the audio file to load with librosa.
            n_mfcc: Number of MFCC coefficients to compute. Defaults to 40.

        Returns:
            The MFCC matrix averaged over its time frames, i.e. one value per
            coefficient.
        """
        audio, sample_rate = librosa.load(file_name)
        mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
        # Transpose, then average over axis 0 to collapse the frame dimension.
        mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

        return mfccs_scaled_features


    def get_extracted_feature(self):
        """Extract features for every metadata row into `self.extracted_features`.

        Each entry is a `[mfcc_vector, class_label]` pair. The audio file for a
        row is `<audio_dataset_path>/fold<fold>/<slice_file_name>`.
        """
        # Resolved once: the dataset root does not change between rows.
        audio_root = os.path.abspath(self.config.audio_dataset_path)

        self.extracted_features = []
        # Passing `total` lets tqdm draw a real progress bar with an ETA.
        for _, row in tqdm(self.metadata.iterrows(), total=len(self.metadata)):
            file_name = os.path.join(audio_root, f'fold{row["fold"]}', row["slice_file_name"])
            final_class_labels = row["class"]
            data = self.feature_extractor(file_name)
            self.extracted_features.append([data, final_class_labels])

        logger.info("Extraction of MFCC Features Completed")


    def convert_mfcc_to_dataframe(self):
        """Wrap the extracted pairs in a DataFrame with columns `mfccs` and `class`."""
        self.mfcc_df = pd.DataFrame(self.extracted_features, columns=["mfccs","class"])

        logger.info("Converted The Extracted Feature and Classes to DataFrame")


    def encoding_class_variables(self):
        """Replace the `class` column with integer codes from a LabelEncoder.

        The fitted encoder is kept on `self.label_encoder` so the integer codes
        can be mapped back to the original class names later.
        """
        self.label_encoder = LabelEncoder()
        self.mfcc_df['class'] = self.label_encoder.fit_transform(self.mfcc_df['class'])

        logger.info("Encoded the Objective Data of Classes variable Using LabelEncoder")


    def save_mfcc_dataframe(self):
        """Write the full feature table to the CSV file `config.preprocess_data_dir`.

        Only saves; the train/test split is done by splitting_data_to_train_test.
        """
        # NOTE(review): each `mfccs` cell holds an array object, so the CSV
        # presumably stores its text form; confirm downstream readers parse it.
        self.mfcc_df.to_csv(self.config.preprocess_data_dir, index=False)

        logger.info("Saved the DataFrame as CSV File")


    def splitting_data_to_train_test(self, test_size=0.2, random_state=42):
        """Split the feature table and write the train and test CSV files.

        Args:
            test_size: Share of rows assigned to the test set. Defaults to 0.2.
            random_state: Seed for the shuffle, for a repeatable split.
                Defaults to 42.

        The two parts are kept on `self.train_df` / `self.test_df` and written
        to `config.train_data_dir` / `config.test_data_dir` (both file paths).
        """
        self.train_df, self.test_df = train_test_split(
            self.mfcc_df, test_size=test_size, random_state=random_state
        )
        self.train_df.to_csv(self.config.train_data_dir, index=False)
        self.test_df.to_csv(self.config.test_data_dir, index=False)
        logger.info("Splitted the data into train and test")

    
    

In [10]:
# Run the ingestion stages in order; each call stores its result on
# `data_ingestion` for the one that follows. Any error propagates unchanged,
# so no try/except wrapper is needed around the sequence.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.get_csv_file()                  # load the metadata CSV
data_ingestion.get_extracted_feature()         # MFCC vector per audio file (slow step)
data_ingestion.convert_mfcc_to_dataframe()     # features + labels -> DataFrame
data_ingestion.encoding_class_variables()      # class names -> integer codes
data_ingestion.save_mfcc_dataframe()           # write the full table
data_ingestion.splitting_data_to_train_test()  # write train / test tables

[2024-12-20 19:50:29,638: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-12-20 19:50:29,641: INFO: common: yaml file: params.yaml loaded successfully]
[2024-12-20 19:50:29,644: INFO: common: created directory at: artifacts]
[2024-12-20 19:50:29,646: INFO: common: created directory at: artifacts/data_ingestion]
[2024-12-20 19:50:29,647: INFO: common: created directory at: artifacts/data_ingestion/preprocess]
[2024-12-20 19:50:29,649: INFO: common: created directory at: artifacts/data_ingestion/splits]
[2024-12-20 19:50:29,681: INFO: 784385225: Reading CSV File Completed]


8732it [01:58, 73.65it/s] 

[2024-12-20 19:52:28,259: INFO: 784385225: Extraction of MFCC Features Completed]
[2024-12-20 19:52:28,263: INFO: 784385225: Converted The Extracted Feature and Classes to DataFrame]
[2024-12-20 19:52:28,267: INFO: 784385225: Encoded the Objective Data of Classes variable Using LabelEncoder]





[2024-12-20 19:52:30,233: INFO: 784385225: Saved the DataFrame as CSV File]
[2024-12-20 19:52:32,175: INFO: 784385225: Splitted the data into train and test]
