In [13]:

from hasp.make_feature_pipeline import make_feature_pipeline
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from hasp.util import combine_classes
import pandas as pd


## Data

In [6]:
from us8kdata.loader import UrbanSound8K
data = UrbanSound8K('../hasp/data')

In [7]:
train_fold = [samples for samples in data.fold_audio_generator(fold=[1, 2])]#, 3, 6, 7, 8, 9, 10])]
val_fold = [samples for samples in data.fold_audio_generator(fold=[4])]
test_fold = [samples for samples in data.fold_audio_generator(fold=[5])]

In [8]:
train_y = data.filter_metadata(fold=[1, 2]).classID
val_y = data.filter_metadata(fold=[4]).classID
test_y = data.filter_metadata(fold=[5]).classID

# Training Pipeline 

In [9]:
pipe = make_feature_pipeline()

In [10]:
full_pipe = Pipeline([
    ('preproc', pipe), 
    ('knn', KNeighborsClassifier())
])

In [11]:
full_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preproc', 'knn', 'preproc__memory', 'preproc__steps', 'preproc__verbose', 'preproc__mean_mfcc', 'preproc__scaler', 'preproc__mean_mfcc__accept_sparse', 'preproc__mean_mfcc__check_inverse', 'preproc__mean_mfcc__feature_names_out', 'preproc__mean_mfcc__func', 'preproc__mean_mfcc__inv_kw_args', 'preproc__mean_mfcc__inverse_func', 'preproc__mean_mfcc__kw_args', 'preproc__mean_mfcc__validate', 'preproc__scaler__copy', 'preproc__scaler__with_mean', 'preproc__scaler__with_std', 'knn__algorithm', 'knn__leaf_size', 'knn__metric', 'knn__metric_params', 'knn__n_jobs', 'knn__n_neighbors', 'knn__p', 'knn__weights'])

In [12]:
kneighbors_score = []

for i in range(1, 17, 1):
    full_pipe.set_params(knn__n_neighbors=i)
    full_pipe.fit(train_fold, train_y)
    
    train_score = full_pipe.score(train_fold, train_y)
    val_score = full_pipe.score(val_fold, val_y)
    
    kneighbors_score.append((i, train_score, val_score))

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(kneighbors_score, columns=['K', 'Train Score', 'Val Score'])
df

Unnamed: 0,K,Train Score,Val Score
0,1,1.0,0.29697
1,2,0.975582,0.30202
2,3,0.971607,0.30404
3,4,0.958546,0.314141
4,5,0.951732,0.308081
5,6,0.941511,0.314141
6,7,0.93356,0.30202
7,8,0.925043,0.29798
8,9,0.91255,0.293939
9,10,0.908575,0.283838


Scoring KNN on validation set shows low scores which mean the model might be overfitting. 

## Training Pipeline 2

Training Pipeline with mean_mfcc__kw_args={'fmin':500, 'fmax':4000}

In [None]:
new_pipe = make_feature_pipeline()

In [None]:
full_pipe2 = Pipeline([
    ('preproc', new_pipe), 
    ('knn', KNeighborsClassifier())
])

In [None]:
full_pipe2.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preproc', 'knn', 'preproc__memory', 'preproc__steps', 'preproc__verbose', 'preproc__mean_mfcc', 'preproc__scaler', 'preproc__mean_mfcc__accept_sparse', 'preproc__mean_mfcc__check_inverse', 'preproc__mean_mfcc__feature_names_out', 'preproc__mean_mfcc__func', 'preproc__mean_mfcc__inv_kw_args', 'preproc__mean_mfcc__inverse_func', 'preproc__mean_mfcc__kw_args', 'preproc__mean_mfcc__validate', 'preproc__scaler__copy', 'preproc__scaler__with_mean', 'preproc__scaler__with_std', 'knn__algorithm', 'knn__leaf_size', 'knn__metric', 'knn__metric_params', 'knn__n_jobs', 'knn__n_neighbors', 'knn__p', 'knn__weights'])

In [None]:
full_pipe2.set_params(preproc__mean_mfcc__kw_args={'fmin':500, 'fmax':4000})

In [None]:
kneighbors_score2 = []

for i in range(1, 17, 1):
    full_pipe2.set_params(knn__n_neighbors=i)
    full_pipe2.fit(train_fold, train_y)
    
    train_score2 = full_pipe2.score(train_fold, train_y)
    val_score2 = full_pipe2.score(val_fold, val_y)
    
    kneighbors_score2.append((i, train_score2, val_score2))

In [None]:
df2 = pd.DataFrame(kneighbors_score2, columns=['K', 'Train Score', 'Val Score'])
df2

Unnamed: 0,K,Train Score,Val Score
0,1,1.0,0.29697
1,2,0.975582,0.30202
2,3,0.971607,0.30404
3,4,0.958546,0.314141
4,5,0.951732,0.308081
5,6,0.941511,0.314141
6,7,0.93356,0.30202
7,8,0.925043,0.29798
8,9,0.91255,0.293939
9,10,0.908575,0.283838


## Conclusion -1

Both instance shows low validation score, best scores are when K=7.

## Training Pipeline 3 

When preproc__mean_mfcc__kw_args={'fmin':500, 'fmax':4000} and knn__weights='distance'.

In [None]:
pipe3 = make_feature_pipeline()

In [None]:
full_pipe3 = Pipeline([
    ('preproc', pipe3),
    ('knn', KNeighborsClassifier(weights='distance'))
])

In [None]:
full_pipe3.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preproc',
                 Pipeline(steps=[('mean_mfcc',
                                  FunctionTransformer(func=<function samples_to_mean_mfcc at 0x10d3f4d30>,
                                                      kw_args={'fmax': 4000,
                                                               'fmin': 500})),
                                 ('scaler', StandardScaler())])),
                ('knn', KNeighborsClassifier(weights='distance'))])>

In [None]:
kneighbors_score3 = []

for i in range(1, 17, 1):
    full_pipe3.set_params(knn__n_neighbors=i)
    full_pipe3.fit(train_fold, train_y)
    
    train_score3 = full_pipe3.score(train_fold, train_y)
    val_score3 = full_pipe3.score(val_fold, val_y)
    
    kneighbors_score3.append((i, train_score3, val_score3))

In [None]:
df3 = pd.DataFrame(kneighbors_score3, columns=['K', 'Train Score', 'Val Score'])
df3

Unnamed: 0,K,Train Score,Val Score
0,1,1.0,0.29697
1,2,1.0,0.29697
2,3,1.0,0.29899
3,4,1.0,0.312121
4,5,1.0,0.311111
5,6,1.0,0.313131
6,7,1.0,0.306061
7,8,1.0,0.308081
8,9,1.0,0.292929
9,10,1.0,0.286869


## Training Pipeline 4

When preproc params are default and KNeighborsClassifier, weights='distance'.

In [None]:
pipe4 = make_feature_pipeline()

In [None]:
full_pipe4 = Pipeline([
    ('preproc', pipe4), 
    ('knn', KNeighborsClassifier(weights='distance'))
])

In [None]:
full_pipe4.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preproc',
                 Pipeline(steps=[('mean_mfcc',
                                  FunctionTransformer(func=<function samples_to_mean_mfcc at 0x10d3f4d30>,
                                                      kw_args={'fmax': None,
                                                               'fmin': 0.0,
                                                               'hop_length': 128,
                                                               'n_fft': 512,
                                                               'n_mfcc': 20,
                                                               'sr': 16000})),
                                 ('scaler', StandardScaler())])),
                ('knn', KNeighborsClassifier(weights='distance'))])>

In [None]:
kneighbors_score4 = []

for i in range(1, 17, 1):
    full_pipe4.set_params(knn__n_neighbors=i)
    full_pipe4.fit(train_fold, train_y)
    
    train_score4 = full_pipe4.score(train_fold, train_y)
    val_score4 = full_pipe4.score(val_fold, val_y)
    
    kneighbors_score4.append((i, train_score4, val_score4))

In [None]:
df4 = pd.DataFrame(kneighbors_score4, columns=['K', 'Train Score', 'Val Score'])
df4

Unnamed: 0,K,Train Score,Val Score
0,1,1.0,0.29697
1,2,1.0,0.29697
2,3,1.0,0.29899
3,4,1.0,0.312121
4,5,1.0,0.311111
5,6,1.0,0.313131
6,7,1.0,0.306061
7,8,1.0,0.308081
8,9,1.0,0.292929
9,10,1.0,0.286869


## Conclusion -2

When weights='distance', all train scores return 1.0. This is because 'distance' means 'weight points by the inverse of their distance', so when training set is used to find the training score, the closest point would be the training set itself, hence returning 1.0. 

## Prediction -1

Using models of KneighborsClassifier weights='uniform' and K=7.

In [None]:
### When preproc__mean_mfcc__kw_args is default.
preproc_pipe = make_feature_pipeline()

In [None]:
new_full_pipe = Pipeline([
    ('preproc', preproc_pipe),
    ('knn', KNeighborsClassifier(n_neighbors=7))
])

In [None]:
new_full_pipe.fit(train_fold, train_y)

In [None]:
pred_y = new_full_pipe.predict(val_fold)

In [None]:
pred_1 = pd.DataFrame(classification_report(val_y, pred_y, output_dict=True)).transpose()
pred_1

Unnamed: 0,precision,recall,f1-score,support
0,0.149425,0.13,0.139037,100.0
1,0.9,0.152542,0.26087,59.0
2,0.142857,0.45,0.216867,100.0
3,0.533333,0.48,0.505263,100.0
4,0.525641,0.41,0.460674,100.0
5,0.346667,0.242991,0.285714,107.0
6,0.566667,0.447368,0.5,38.0
7,0.138889,0.041667,0.064103,120.0
8,0.8125,0.313253,0.452174,166.0
9,0.209756,0.43,0.281967,100.0


In [None]:
print(confusion_matrix(val_y, pred_y))

[[13  0 65  4  0  1  0  0  1 16]
 [ 7  9  9  3  5  2  1  5  5 13]
 [ 2  1 45  7 17  1  3  3  0 21]
 [ 3  0 23 48  3  1  8  2  6  6]
 [ 5  0 23  0 41  5  0 11  0 15]
 [23  0 43  0  5 26  0  2  0  8]
 [ 0  0  8  2  0  0 17  4  0  7]
 [24  0 33  0  0 13  0  5  0 45]
 [ 0  0 38 16  3 23  0  3 52 31]
 [10  0 28 10  4  3  1  1  0 43]]


## Prediction -2

When preproc__mean_mfcc__kw_args={'fmin':500, 'fmax':4000}, K=7 and weights='uniform'.

In [14]:
new_pipe2 = make_feature_pipeline()

In [15]:
new_full_pipe2 = Pipeline([
    ('preproc', new_pipe2),
    ('knn', KNeighborsClassifier(n_neighbors=7))
])

In [16]:
new_full_pipe2.set_params(preproc__mean_mfcc__kw_args={'fmin':500, 'fmax':4000})

In [17]:
new_full_pipe2.fit(train_fold, train_y)

In [18]:
pred_y2 = new_full_pipe2.predict(val_fold)

In [19]:
pred_2 = pd.DataFrame(classification_report(val_y, pred_y2, output_dict=True)).transpose()
pred_2

Unnamed: 0,precision,recall,f1-score,support
0,0.149425,0.13,0.139037,100.0
1,0.9,0.152542,0.26087,59.0
2,0.142857,0.45,0.216867,100.0
3,0.533333,0.48,0.505263,100.0
4,0.525641,0.41,0.460674,100.0
5,0.346667,0.242991,0.285714,107.0
6,0.566667,0.447368,0.5,38.0
7,0.138889,0.041667,0.064103,120.0
8,0.8125,0.313253,0.452174,166.0
9,0.209756,0.43,0.281967,100.0


In [20]:
print(confusion_matrix(val_y, pred_y2))

[[13  0 65  4  0  1  0  0  1 16]
 [ 7  9  9  3  5  2  1  5  5 13]
 [ 2  1 45  7 17  1  3  3  0 21]
 [ 3  0 23 48  3  1  8  2  6  6]
 [ 5  0 23  0 41  5  0 11  0 15]
 [23  0 43  0  5 26  0  2  0  8]
 [ 0  0  8  2  0  0 17  4  0  7]
 [24  0 33  0  0 13  0  5  0 45]
 [ 0  0 38 16  3 23  0  3 52 31]
 [10  0 28 10  4  3  1  1  0 43]]


## Comparison

In [21]:
results = {
    'mean_precision': [pred_1['precision'].mean(), pred_2['precision'].mean()],
    'mean_recall': [pred_1['recall'].mean(), pred_2['recall'].mean()],
    'mean_f1': [pred_1['f1-score'].mean(), pred_2['f1-score'].mean()]
}

NameError: name 'pred_1' is not defined

In [None]:
df = pd.DataFrame(results)
df

Unnamed: 0,mean_precision,mean_recall,mean_f1
0,0.421841,0.308588,0.31512
1,0.421841,0.308588,0.31512


## Conclusion

The preproc__mean_mfcc__kw_args does not influence KNeighborsClassifier model.
The scores here are low comparing to RandomForestClassifiers.

## Comparing classification reports on 10-class and 3-class

In [22]:
new_pred = combine_classes([1,8], pred_y2)
new_val = combine_classes([1,8], val_y)

In [25]:
class_report_10 = pred_2
class_report_10

Unnamed: 0,precision,recall,f1-score,support
0,0.149425,0.13,0.139037,100.0
1,0.9,0.152542,0.26087,59.0
2,0.142857,0.45,0.216867,100.0
3,0.533333,0.48,0.505263,100.0
4,0.525641,0.41,0.460674,100.0
5,0.346667,0.242991,0.285714,107.0
6,0.566667,0.447368,0.5,38.0
7,0.138889,0.041667,0.064103,120.0
8,0.8125,0.313253,0.452174,166.0
9,0.209756,0.43,0.281967,100.0


In [26]:
class_report_3 = pd.DataFrame(classification_report(new_val, new_pred, output_dict=True)).transpose()
class_report_3

Unnamed: 0,precision,recall,f1-score,support
1,0.9,0.152542,0.26087,59.0
7,0.826419,0.989542,0.900654,765.0
8,0.8125,0.313253,0.452174,166.0
accuracy,0.826263,0.826263,0.826263,0.826263
macro avg,0.846306,0.485113,0.537899,990.0
weighted avg,0.82847,0.826263,0.787326,990.0
