# Problem Statement 

Build a general-purpose automatic audio tagging system using a dataset of audio files covering a wide range of real-world environments. Sounds in the dataset include things like musical instruments, human sounds, domestic sounds, and animals from Freesound’s library, annotated using a vocabulary of more than 40 labels from Google’s AudioSet ontology. 

- Here, we built an SVM classifier using MFCC features from the audio files.
- PCA was also used for dimension reduction.
- Actually, SVM achieved a pretty good classification accuracy.

# Patbas

## Data preprocessing ##

In [None]:
import numpy as np
import pandas as pd

import os
import librosa
from glob import glob

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
import torchaudio
from sklearn.svm import SVC

### Read train and test dataframe

In [None]:
# Load the filtered train / public-test metadata tables in one step.
train_df, test_df = (
    pd.read_csv('./full_dataset/train_filtered.csv'),
    pd.read_csv('./full_dataset/test_filtered_public.csv'),
)
# Swap in the private split by loading this file instead:
# test_df = pd.read_csv('./full_dataset/test_filtered_private.csv')


### Sort out your data and labels

In [None]:
# Load data: plain Python lists of the audio file names in each split

audio_train_files = train_df['fname'].to_list()
audio_test_files = test_df['fname'].to_list()

In [None]:
# Converts list of the top-k class predictions to top-k corresponding labels
def convert_to_labels(preds, index_to_class, k=3):
    """Map per-sample class scores to their k highest-scoring labels.

    Args:
        preds: iterable of 1-D score vectors, one per sample.
        index_to_class: mapping from class index to label string.
        k: number of top classes to keep per sample.

    Returns:
        A tuple ``(ans, ids)`` where ``ans[j]`` is the space-joined top-k
        labels of sample ``j`` (best first) and ``ids[j]`` is the list of the
        corresponding class indices.
    """
    # Indices sorted by descending score, truncated to the k best.
    ranked = [np.argsort(scores)[::-1][:k] for scores in preds]
    ids = [list(row) for row in ranked]
    ans = [' '.join(index_to_class[i] for i in row) for row in ranked]
    return ans, ids

### Get Features

In [None]:
# Function from EDA kernel: https://www.kaggle.com/codename007/a-very-extensive-freesound-exploratory-analysis
SAMPLE_RATE = 16000


def clean_filename(fname, string):
    """Return the second '/'-separated component of ``fname``.

    If that component starts with a double underscore, ``string`` is
    prepended to it; otherwise it is returned unchanged.
    """
    component = fname.split('/')[1]
    return string + component if component.startswith('__') else component

# Generate mfcc features summarised by mean, std, skew, max, median and min
def get_features(name, path):
    """Return a fixed-length MFCC summary vector for one audio clip.

    The clip at ``path + name`` is loaded at ``SAMPLE_RATE``, 30 MFCCs are
    computed, and each coefficient is summarised over time by six statistics
    (mean, standard deviation, skewness, max, median, min), giving
    30 * 6 = 180 values.

    Args:
        name: audio file name.
        path: directory prefix, concatenated directly with ``name``.

    Returns:
        pd.Series of 180 floats. When feature extraction fails, a Series of
        180 zeros is returned so that every row keeps the same width.
    """
    n_mfcc = 30
    n_stats = 6
    data, _ = librosa.core.load(path + name, sr=SAMPLE_RATE)
    try:
        ft1 = librosa.feature.mfcc(y=data, sr=SAMPLE_RATE, n_mfcc=n_mfcc)
        ft1_trunc = np.hstack((
            np.mean(ft1, axis=1),
            np.std(ft1, axis=1),
            skew(ft1, axis=1),
            np.max(ft1, axis=1),
            np.median(ft1, axis=1),
            np.min(ft1, axis=1),
        ))
        # skew() yields NaN for a coefficient that is constant over time;
        # replace it so scaling / PCA / SVM downstream do not choke on NaNs.
        ft1_trunc = np.nan_to_num(ft1_trunc)
        # Further feature families (ft2, ...) can be summarised the same way
        # and stacked onto ft1_trunc here; keep the fallback width in sync.
        return pd.Series(ft1_trunc)
    except Exception as e:
        # Report the failure instead of swallowing it, and return a
        # same-width placeholder so the resulting frame stays rectangular.
        print(f'Feature extraction failed for {name}: {e!r}')
        return pd.Series(np.zeros(n_mfcc * n_stats))

In [None]:
# Prepare data

# Start each frame from the file names that drive feature extraction.
train_data = pd.DataFrame({'fname': train_df['fname']})
test_data = pd.DataFrame({'fname': audio_test_files})

# Replace each frame by the per-file feature rows produced by get_features.
train_data = train_data['fname'].progress_apply(get_features, path='full_dataset/audio_train/')
print('done loading train mfcc')
test_data = test_data['fname'].progress_apply(get_features, path='full_dataset/audio_test/')
print('done loading test mfcc')

# Re-attach identifiers and targets; test labels are unknown, so use zeros.
train_data['fname'] = train_df['fname']
train_data['label'] = train_df['label']

test_data['fname'] = audio_test_files
test_data['label'] = np.zeros(len(audio_test_files))

### Convert labels to numbers

In [None]:
# Functions from "Random Forest using MFCC features": https://www.kaggle.com/amlanpraharaj/random-forest-using-mfcc-features
# Construct features set: everything except the identifier and target columns
X = train_data.drop(columns=['label', 'fname'])
feature_names = list(X.columns)
X = X.values

# Encode the sorted unique label strings as consecutive integers (both directions)
labels = np.sort(np.unique(train_data['label'].values))
num_class = len(labels)
class_to_index = {name: position for position, name in enumerate(labels)}
index_to_class = dict(enumerate(labels))
y = np.array([class_to_index[name] for name in train_data['label'].values])

In [None]:
# Test feature matrix: same columns as X (identifier and dummy label removed)
X_test = test_data.drop(columns=['label', 'fname']).values

### Features post-process - scaling + PCA

In [None]:
# Apply scaling for PCA
# PCA is driven by variance, so bring every feature to zero mean / unit
# variance first; otherwise large-magnitude features dominate the components.
scaler = StandardScaler()
# Learn the statistics on the training features only, then reuse them for the
# test features so both sets are transformed identically.
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Apply PCA for dimension reduction: keep 65 components, fitted on the scaled training features
pca = PCA(n_components=65)
pca.fit(X_scaled)


In [None]:
# Fit an SVM model

# X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.2, random_state = 42, shuffle = True)

# clf = SVC(kernel = 'rbf', probability=True)



### Searching for optimal hyper-params

In [None]:
# Define the parameter grid for C from 0.001 to 10, gamma from 0.001 to 10
# (log-spaced, one value per decade)
C_grid = [0.001, 0.01, 0.1, 1, 10]
gamma_grid = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C_grid, 'gamma': gamma_grid}

# Project the scaled training features with the fitted PCA and hold out 20%
# for validation. Computed here so this cell does not rely on the split that
# is only present as commented-out code in the SVM cell.
X_pca = pca.transform(X_scaled)
X_train, X_val, y_train, y_val = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, shuffle=True
)

# Cross-validated exhaustive search over the RBF-SVM grid. probability is left
# off here: it is not needed for accuracy scoring and slows every fit.
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Find the best model
print(grid.best_score_)

print(grid.best_params_)

print(grid.best_estimator_)

In [None]:
# Optimal model
# Rebuild the RBF SVM with the hyper-parameters chosen by the grid search
# (requires `grid` to have been fitted). probability=True is needed because
# the final prediction step calls predict_proba to rank the classes.
clf = SVC(kernel='rbf', probability=True, **grid.best_params_)

# clf.fit(X_train, y_train)

# print(accuracy_score(clf.predict(X_val), y_val))

In [None]:
# Fit the entire training sets
# clf.fit(X_pca, y)
# str_preds, _ = convert_to_labels(clf.predict_proba(X_test_pca), index_to_class, k=1)



## Visualizations

## Visualization of classification results

In [None]:
import holoviews as hv
from holoviews import dim, opts
import pandas as pd
from bokeh.plotting import show
import bokeh.io
from sklearn.manifold import TSNE

# Plotting setup for the notebook; statement order is kept as-is because each
# call configures global state of the plotting libraries.

# Send Bokeh output to the notebook rather than a standalone HTML file.
bokeh.io.output_notebook()
# Load the Bokeh backend for HoloViews.
hv.extension('bokeh')
# Use the 'dark_minimal' theme for plots drawn by the Bokeh renderer.
hv.renderer('bokeh').theme = 'dark_minimal'
# Make pandas .plot calls dispatch to the "holoviews" backend.
# NOTE(review): presumably relies on hvplot being installed - confirm.
pd.options.plotting.backend = "holoviews"