## Boiler Plate

In [None]:
# Enable the autoreload extension (mode 2) for imported modules such as audiotransform.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
## Unzipping data and moving it in right location
#!p7zip -d ./data/all.7z
#!mkdir data
#!mv negative data/
#!mv positive data/

## Importing Library and Helper functions

In [None]:
#from audio import * ## Imporing FastAI Audio Library

import os
from functools import partial

import numpy as np
import pandas as pd
import scipy.io.wavfile
import torchaudio

from fastai.data.all import *
from fastai.vision.all import *

from audiotransform import AudioTransform, SpectrogramConfig2, AudioConfig2, label_func

from tqdm import tqdm_notebook ## For progress bars

In [None]:
## Helper functions for model evaluation

## Taken from my cookbooks - https://github.com/aayushmnit/cookbook/blob/master/ml_classification.py
from sklearn.metrics import (
    roc_curve,
    auc,
    roc_auc_score,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    accuracy_score,
    f1_score
)

def plot_confusion_matrix(
    y_true,
    y_pred,
    classes,
    normalize=False,
    title="Confusion matrix",
    cmap=plt.cm.Blues,
):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels, passed straight to
        `sklearn.metrics.confusion_matrix`.
    classes : sequence of str
        Display names used as tick labels on both axes.
    normalize : bool, default False
        When True, every row is divided by its total so each row shows
        per-true-class fractions. Rows with no samples stay at 0 instead of
        producing NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guard against classes that never occur in y_true (row total of 0).
        cm = np.divide(
            cm.astype("float"),
            row_totals,
            out=np.zeros(cm.shape, dtype="float"),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")

    print(cm)

    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = ".2f" if normalize else "d"
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.0
    for i, j in np.ndindex(*cm.shape):
        plt.text(
            j,
            i,
            format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Lay out last so the axis labels are included in the computed margins.
    plt.tight_layout()

## Data Check

This step checks the data and provides some summary statistics, such as the sampling rate of the different audio clips and the length distribution of the wave files.

In [None]:
# Folder that holds the training clips
data_folder = Path("./data/train/mldata/all/")

# Collect the .wav files found under that folder (extend the list for other formats)
audio_files = get_files(
    data_folder,
    extensions=[".wav"],
)

# # You can then explore the audio files with torchaudio
# for audio_file in audio_files:
#     wave, sr = torchaudio.load(audio_file)
#     print(f"Loaded {audio_file}, Sample Rate: {sr}, Waveform Shape: {wave.shape}")



# Integrate with fastai's DataBlock (customized for your use case)
# def label_func(f): return f.parent.name

# audio_block = DataBlock(
#     blocks=(AudioTransform, CategoryBlock),
#     get_items=get_files,
#     get_y=label_func,
#     splitter=RandomSplitter(),
#     item_tfms=[],
#     batch_tfms=[]
# )

# dls = audio_block.dataloaders(data_folder, bs=1)

# xb, yb = dls.one_batch()
# print(xb.shape, yb.shape)

In [None]:
# ## Defining path of modeling related data (Contains two folder positive and negative)
# data_folder = Path("./data/train/mldata/all/") 
# audios = AudioList.from_folder(data_folder)
# len_dict = audios.stats(prec=1)

## Load Data

In [None]:
# Spectrogram settings built with no arguments, wrapped into the audio config
# that every AudioTransform in this notebook is constructed from.
sg_config = SpectrogramConfig2()
config = AudioConfig2(sg_cfg=sg_config)
# Print the resulting configuration for inspection.
print(config)

In [None]:
# ## Definining Audio config needed to create on the fly mel spectograms
# config = AudioConfig(standardize=False, 
#                      sg_cfg=SpectrogramConfig(
#                          f_min=0.0,  ## Minimum frequency to Display
#                          f_max=10000, ## Maximum Frequency to Display
#                          hop_length=256,
#                          n_fft=2560, ## Number of Samples for Fourier
#                          n_mels=256, ## Mel bins
#                          pad=0, 
#                          to_db_scale=True, ## Converting to DB sclae
#                          top_db=100,  ## Top decible sound
#                          win_length=None, 
#                          n_mfcc=20)
#                     )
# config.duration = 4000 ## 4 sec padding or snip
# config.resample_to=20000 ## Every sample at 20000 frequency
# config

**HyperParameter Cheat Sheet - Taken from [here](https://nbviewer.jupyter.org/github/mogwai/fastai_audio/blob/master/tutorials/01_Intro_to_Audio.ipynb)**
- sample_rate, This is not the place to change this, you are just telling librosa what your sample rate is. Usually it is predetermined for you by your dataset but check the resampling section for more info on changing this.
- fmin, minimum frequency to display in spectrogram, this should be low, anything 0-20 seems to work well
- fmax, maximum frequency to display. This should generally be 1/2 of your sample rate, but can be set to 8000 for speech.
- n_mels, How many mel bins to use, this will determine number of pixels tall your sg is. 64-128 are good defaults, but try various values, bigger isn't always better, test for your dataset. Some evidence suggests upscaling the image to a larger size is more effective than
- n_fft, The number of samples you use each time you compute a Fourier Transform. This is the width of the window and hop_length is how much you move the window each step. Increasing n_fft will increase frequency (y-axis) resolution to a point, powers of 2 are faster. Also dependent somewhat on n_mels so 20*n_mels is a common value as less than this can produce empty mel bins (black horizontal lines on sg)
- hop_length, the number of samples between successive frames of your sg. Determines width of image (# samples/hop = width in pixels). Good defaults really depend on dataset and the duration of audio your sg's represent (if they are longer, a larger hop is required to fit on a gpu, but you will be compressing the data). If you go too small, you can get blurring. Anything 64-512 can be good depending on context.
- top_db, Distance between loudest and softest sound you want displayed in spectrogram. If you choose 50db, the brightest pixel will be 50db, and anything that is 50+db lower than that won't be displayed. 80-120 is good.
- power, Honestly not entirely sure how this works. It's set to 1 for "energy" spectrogram and 2 for "power" spectrogram. An energy spectrogram is more detailed (less energy required to show up on the sg) but you don't generally have to worry about this because if you are converting to decibels (you'll do this almost always) it is factored out.

This code creates the audio `DataLoaders`, splits the data into a random 80/20 train/validation split, and takes the label from the folder name.

In [None]:
## Create the training DataLoaders

# Transform applied to each file path to produce the model input.
# NOTE(review): mode='train' presumably enables training-only behaviour inside
# AudioTransform — confirm against audiotransform.py.
audio_transform = AudioTransform(config, mode='train')


# Define your DataBlock
audio_block = DataBlock(
    blocks=(TransformBlock, CategoryBlock),
    # Restrict the items to .wav files so other files under the data folder are
    # never fed to the audio transform. `partial` (not a lambda) keeps the
    # resulting Learner picklable for `learn.export`.
    get_items=partial(get_files, extensions=['.wav']),
    get_x=audio_transform,
    get_y=label_func,
    # Random 80/20 split with a fixed seed so reruns validate on the same clips.
    splitter=RandomSplitter(valid_pct=0.2, seed=4),
    item_tfms=[],
    batch_tfms=[]
)

# Create DataLoaders
dls = audio_block.dataloaders(data_folder, bs=32)

# Sanity check: pull one batch and show the input / label shapes.
xb, yb = dls.one_batch()
print(xb.shape, yb.shape)

In [None]:
# ## Creating Data Loader
# audios = AudioList.from_folder(data_folder, config=config).split_by_rand_pct(.2, seed=4).label_from_folder()

This code creates an AudioDataBunch which applies the defined transformations (in our case frequency masking) on the fly and provides input spectrograms to the model in the defined batch size (64).

In [None]:
# ## Defining Transformation
# tfms = None

# ## Frequency masking:ON
# tfms = get_spectro_transforms(mask_time=False, mask_freq=True, roll=False) 

# ## Creating a databunch
# db = audios.transform(tfms).databunch(bs=64)

# ## Let's insepect some data
# db.show_batch(20)

## Model Training

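The code below creates a ResNet18 model with `models.resnet18()` and wraps it in a fastai `Learner` with mixed-precision training. Note: as written, `models.resnet18()` is called without pretrained weights and the network is passed to `Learner` unchanged, so no ImageNet weights are loaded and the final fully connected layers are not replaced; the original intent was to remove the last 2 fully connected layers, add new fully connected layers, and load the pretrained weights from ImageNet training.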

In [None]:
# Metric reported after every epoch.
metrics=[accuracy]

# Wrap the training DataLoaders and a ResNet18 in a Learner using mixed precision.
# NOTE(review): models.resnet18() is called without weight arguments, so it is
# presumably randomly initialised rather than ImageNet-pretrained — confirm this is intended.
learn = Learner(dls,models.resnet18(), metrics = metrics).to_fp16()

In [None]:
# ## Default learner is ResNet 18 
# learn = audio_learner(db)

This is key feature of FastAI library, this helps us find the ideal learning rate by running model on sample data to see how the accuracy progresses. Output of this step is a learning rate curve (Choose the learning rate where loss starts bumping again)

In [None]:
## Find ideal learning rate
# Run the learning-rate finder on the learner, then plot what the recorder captured.
learn.lr_find()
learn.recorder.plot_lr_find()

Training model, two cool things to highlight - 
- **This model is trained using the [1 cycle learning policy](https://arxiv.org/abs/1803.09820)**, which leads to faster convergence. Here is a [cool blog](https://towardsdatascience.com/finding-good-learning-rate-and-the-one-cycle-policy-7159fe1db5d6) explaining the same if you are not a paper person
- **Differential learning rate** - You want different learning rate for different layer of models. In transfer learning you don't want to change learning rate of early layers as fast as later layers in network. (The slice function allows us to pass that information in FastAI)

In [None]:
## 1-cycle learning (20 epochs, learning rates passed as slice(2e-3, 2e-2))
learn.fit_one_cycle(20, slice(2e-3, 2e-2))

FastAI outputs the model training progress per epoch. Note that the accuracy is only calculated on the validation set (the 20% holdout set created when the data loaders were built).

In [None]:
## Find ideal learning rate
# Second run of the learning-rate finder, after the first training stage,
# to choose the rates for the shorter follow-up fit below.
learn.lr_find()
learn.recorder.plot_lr_find()

In [None]:
## 1-cycle learning (5 more epochs, lower learning rates passed as slice(1e-5, 1e-3))
learn.fit_one_cycle(5, slice(1e-5, 1e-3))

In [None]:
## Exporting the model
# Neither call below creates the target directory, so make sure it exists first;
# otherwise saving fails on a fresh checkout.
Path('models').mkdir(parents=True, exist_ok=True)

# fastai export of the whole Learner (later restored with load_learner)
learn.export('models/stg2-rn18.pkl')

# Plain PyTorch weights only (restored through the '.pt' branch of _load_model)
torch.save(learn.model.state_dict(), 'models/stg2-rn18.pt') # torch version

With just 15 minutes of training we got our accuracy up to ~93.7% on 20% holdout set which was not used for training!

## Model Evaluation

A cool function in fastAI to plot different evaluation measures

In [None]:
def _load_model(mPath, mName="stg2-rn18.pkl"):
    if mName.endswith('.pkl'):
        tmp = load_learner(os.path.join(mPath, mName))    
    elif mName.endswith('.pt'):
        import torch
        # it is a pytorch model
        checkpoint = torch.load(os.path.join(mPath, mName), map_location=torch.device('cpu'))

        # create a dummy dataloader
        metrics = [accuracy]
        # loss_func = CrossEntropyLossFlat()

        # write a dummy wave file
        wave =np.zeros((20000))
        scipy.io.wavfile.write(os.path.join(mPath,'dummy.wav'), 20000, wave)
        testpath = Path(mPath)


        spec_config = SpectrogramConfig2()
        config = AudioConfig2(sg_cfg=spec_config)

        audio_transform = AudioTransform(config, mode='test')


        get_wav_files = lambda x: get_files(x, extensions=['.wav',])
        # Define your DataBlock
        audio_block = DataBlock(
            blocks=(TransformBlock, CategoryBlock),
            get_items=get_wav_files,
            get_x=audio_transform,
            get_y=label_func,
            splitter=RandomSplitter(),
            item_tfms=[],
            batch_tfms=[]
        )

        dls = audio_block.dataloaders(testpath, bs=1)


        tmp = Learner(dls,models.resnet18(), metrics = metrics)


        # load the model
        for n, p in checkpoint.items():
            print(n, p.shape)
        for n, p in tmp.model.state_dict().items():
            print(n, p.shape)
        tmp.model.load_state_dict(checkpoint, strict=True)
    else:raise NotImplementedError("Only .pkl and .pt models are supported")
    return tmp



# Load the model exported by the training section above. Use the same relative
# 'models' directory it was written to instead of a machine-specific absolute path.
learn = _load_model('models', 'stg2-rn18.pkl')


In [None]:
# A Learner restored with load_learner carries empty DataLoaders, so pass the
# validation loader built for training explicitly.
interp = ClassificationInterpretation.from_learner(learn, dl=dls.valid)
interp.plot_confusion_matrix(figsize=(5,5))

Plot top losses helps you plot the 10 most wrong predictions made by the model, so you can listen to or visualize the sound. This helps you understand where the model is not performing well and provides key insights. As we can hear in the examples below, some of these audio clips don't contain an Orca call but the labeling process has marked them positive, and in some cases the model thinks there is an Orca call but nobody tagged it as positive.

In [None]:
# Plot the 10 highest-loss items held by the interpretation object.
# NOTE(review): the `heatmap` keyword is forwarded as-is; confirm the installed fastai version accepts it.
interp.plot_top_losses(10, heatmap = False)

## Model Evaluation on testing set

Defining DataFolder

In [None]:
# Folder that holds the held-out test clips.
test_data_folder = Path("./data/test/all/")
# Display the path as the cell output.
test_data_folder

Creating an AudioBunch

In [None]:
# Get a list of audio files
audio_files = get_files(test_data_folder, extensions=['.wav',])  # adjust extensions as needed

# Evaluation transform, built from the same audio config as training
audio_transform = AudioTransform(config, mode='test')


# Define your DataBlock
audio_block = DataBlock(
    blocks=(TransformBlock, CategoryBlock),
    # Only .wav files are valid inputs for the audio transform
    get_items=partial(get_files, extensions=['.wav']),
    get_x=audio_transform,
    get_y=label_func,
    # Keep EVERY test file: an empty validation index list places all items in
    # the "train" subset, which is the subset the evaluation cells iterate over.
    # (A random split here would silently drop ~20% of the test set.)
    splitter=IndexSplitter([]),
    item_tfms=[],
    batch_tfms=[]
)

# Create DataLoaders
test_dls = audio_block.dataloaders(test_data_folder, bs=1)

# Sanity check: pull one batch and show the input / label shapes.
xb, yb = test_dls.one_batch()
print(xb.shape, yb.shape)



## Also extracting true labels
# Second element of each dataset item is the encoded label; convert it to a plain Python value.
true_value = pd.Series([item[1].cpu().data.numpy().squeeze().tolist() for item in test_dls.train_ds])


In [None]:
# test = AudioList.from_folder(test_data_folder, config=config).split_none().label_from_folder()
# testdb = test.transform(tfms).databunch(bs=64)

# ## Also extracting true labels
# true_value = pd.Series(list(testdb.train_ds.y.items))

Generating predictions : 
- **To-Do** - There should be a better way to do batch scoring; right now we have to score clips one by one

In [None]:
predictions = []
# Stream the inputs through a generator instead of materialising every transformed
# clip first, and pass `total` so the progress bar still shows a length.
for item in tqdm_notebook((sample[0] for sample in test_dls.train_ds), total=len(test_dls.train_ds)):
    # learn.predict(...)[2] holds the per-class probabilities; index 1 is the second class.
    # Store a plain float (not a 0-d tensor) so pandas / sklearn receive numeric data,
    # matching what the official-evaluation scoring loop stores.
    predictions.append(float(learn.predict(item)[2][1]))

Calculating performance measures

In [None]:
# Scores as a Series, plus hard labels using a 0.5 decision threshold.
pred_scores = pd.Series(predictions)
pred_labels = pred_scores > 0.5

# Threshold-free metrics (AUC, average precision) use the raw scores;
# F1 and accuracy use the thresholded labels.
# The last placeholder is {3}: the original reused {1} and printed the F1 score
# under the AveragePrecisionScore heading.
print("AUC Score :{0:.2f} \nF-1 Score :{1:.2f} \nAccuracy Score :{2:.2f} \nAveragePrecisionScore :{3:.2f}".format(
    roc_auc_score(true_value, pred_scores),
    f1_score(true_value, pred_labels),
    accuracy_score(true_value, pred_labels),
    average_precision_score(true_value, pred_scores)
))

Woohoo, the model seems to be performing in line with our initial model training process on this test set. Let's plot a confusion matrix.

In [None]:
# Confusion matrix on the test set, thresholding the predicted scores at 0.5.
plot_confusion_matrix(true_value, pd.Series(predictions)>0.5, classes=["No Orca","Orca"])

## Scoring for official evaluation

Loading the trained model

In [None]:
# Restore the exported Learner from the relative 'models' directory.
# NOTE: load_learner unpickles the file — only load exports from a trusted source.
learn = load_learner('models/stg2-rn18.pkl')
# learn = load_learner("./data/train/mldata/all/models/", 'stg2-rn18.pkl')

Loading the 2 sec audio clips generated in the data preparation step for evaluation

In [None]:
# test_data_folder = Path("./data/test/OrcasoundLab07052019_Test/test2Sec/")
test_data_folder = Path("./data/test/all/")

# Evaluation transform, built from the same audio config as training
audio_transform = AudioTransform(config, mode='test')


# Define your DataBlock
audio_block = DataBlock(
    blocks=(TransformBlock, CategoryBlock),
    # Only .wav files are valid inputs for the audio transform
    get_items=partial(get_files, extensions=['.wav']),
    get_x=audio_transform,
    get_y=label_func,
    # Score EVERY clip: an empty validation index list keeps all items in the
    # "train" subset that the scoring loop iterates over. A random split would
    # drop ~20% of the clips and leave gaps in the per-second submission.
    splitter=IndexSplitter([]),
    item_tfms=[],
    batch_tfms=[]
)

# Create DataLoaders
test_dls = audio_block.dataloaders(test_data_folder, bs=32)

# Sanity check on the TEST loaders (the original inspected the training `dls` here).
xb, yb = test_dls.one_batch()
print(xb.shape, yb.shape)


In [None]:
# test_data_folder = Path("./data/test/OrcasoundLab07052019_Test/test2Sec/")
# tfms=None
# test = AudioList.from_folder(test_data_folder, config=config).split_none().label_empty()
# testdb = test.transform(tfms).databunch(bs=64)

Running through the model and generating predictions

In [None]:
predictions = []
pathList = []

# Pair every file path with its transformed input. The inputs are produced lazily
# by a generator so the clips are not all held in memory before scoring starts;
# `total` gives the progress bar its length.
scoring_pairs = zip(test_dls.items, (sample[0] for sample in test_dls.train_ds))
for pathname, item in tqdm_notebook(scoring_pairs, total=len(test_dls.items)):
    # learn.predict(...)[2] holds the per-class probabilities; keep index 1 as a plain Python float.
    predictions.append(learn.predict(item)[2][1].cpu().data.numpy().tolist())
    pathList.append(str(pathname))

Exporting the predictions

In [None]:
# One row per scored clip. FileName is the basename of the path up to the first "-".
prediction = pd.DataFrame({'FilePath': pathList, 'pred': predictions}).assign(
    FileName=lambda frame: frame.FilePath.map(lambda p: os.path.basename(p).split("-")[0])
)
# Persist only the clip name and its score.
prediction[['FileName', 'pred']].to_csv('./test2Sec.csv', index=False)

Converting the predictions in standard evaluation format

In [None]:
## Load predictions
# Read back the per-clip scores written by the export cell (columns FileName, pred).
test2secDF = pd.read_csv("./test2Sec.csv") 

# Show the loaded table for a visual check.
display(test2secDF)

# ## Clean the predictions(it got converted in string)
# test2secDF['pred'] = test2secDF.pred.apply(lambda x: float(x.split('(')[1].split(')')[0])) 

In [None]:
## Start time of each clip, parsed from its file name:
## the text after the first "__", cut at the first "." and then at the first "_", read as an integer
test2secDF['startTime'] = test2secDF.FileName.map(
    lambda name: int(name.split('__')[1].split('.')[0].split('_')[0])
)

## Order the clips chronologically and renumber the rows from 0
test2secDF = test2secDF.sort_values(by=['startTime'])
test2secDF = test2secDF.reset_index(drop=True)

In [None]:
# Peek at the first rows after sorting by start time.
test2secDF.head()

In [None]:
## Per-second score: mean of each pair of consecutive clip predictions (window of 2).
## The row position becomes the StartTime column.
submission = (
    pd.DataFrame({'pred': list(test2secDF['pred'].rolling(2).mean().values)})
    .rename_axis('StartTime')
    .reset_index()
)

## The first row has no predecessor (rolling mean is NaN there): use the first clip's own score
submission.loc[0, 'pred'] = test2secDF.pred[0]

## Append one extra row after the last StartTime, carrying the last clip's own score
lastLine = pd.DataFrame({
    'StartTime': [submission.StartTime.max() + 1],
    'pred': [test2secDF.pred[test2secDF.shape[0] - 1]],
})
submission = pd.concat((submission, lastLine), ignore_index=True)

## Keep only rows scoring above 0.5; each kept row gets a Duration of 1
finalSubmission = submission[submission.pred > 0.5].reset_index(drop=True)
finalSubmission['Duration'] = 1


In [None]:
## Write the submission file: StartTime and Duration columns only, without the row index
finalSubmission[['StartTime', 'Duration']].to_csv(
    '../evaluation/submission/submission2SecFastAI.csv',
    index=False,
)