In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!unzip /content/gdrive/MyDrive/Kaggle/Pog_Music_Classification/data/mel_spec_aug/mel_spec_test_aug.zip > /dev/null

In [3]:
!pip install -q pytorch_lightning timm torchtoolbox wandb

[K     |████████████████████████████████| 582 kB 5.1 MB/s 
[K     |████████████████████████████████| 431 kB 47.4 MB/s 
[K     |████████████████████████████████| 84 kB 3.5 MB/s 
[K     |████████████████████████████████| 1.8 MB 52.0 MB/s 
[K     |████████████████████████████████| 136 kB 66.5 MB/s 
[K     |████████████████████████████████| 408 kB 55.8 MB/s 
[K     |████████████████████████████████| 596 kB 64.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.5 MB/s 
[K     |████████████████████████████████| 4.0 MB 45.8 MB/s 
[K     |████████████████████████████████| 144 kB 68.2 MB/s 
[K     |████████████████████████████████| 181 kB 69.0 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████████████████████| 271 kB 61.8 MB/s 
[K     |████████████████████████████████| 94 kB 3.4 MB/s 
[K     |████████████████████████████████| 144 kB 67.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 55.5 MB/s 
[K     |██████████████████████

In [4]:
!pip install -q albumentations==0.4.6

[K     |████████████████████████████████| 117 kB 5.2 MB/s 
[K     |████████████████████████████████| 948 kB 50.1 MB/s 
[?25h  Building wheel for albumentations (setup.py) ... [?25l[?25hdone


In [5]:
!pip uninstall -y opencv-python-headless==4.5.5.62



In [6]:
!pip install -q opencv-python-headless==4.5.2.52

[K     |████████████████████████████████| 38.2 MB 134 kB/s 
[?25h

### Inference notebook using models trained on mel spectrograms generated using audio augmentations

In [33]:
import torch
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms
import tqdm
import albumentations as alb
from albumentations.pytorch import ToTensorV2
import sys
import torch.multiprocessing as mp
import os

In [34]:
class TransformationType:
    TORCHVISION = "torchvision"
    ALB = "albumentations"

class Models:
    RESNET34 = "resnet34"
    RESNET50 = "resnet50"
    RESNEXT50 = "resnext50_32x4d"
    EFFNET_B0 = "tf_efficientnet_b0_ns"
    EFFNET_B4 = "tf_efficientnet_b4_ns"

class ImgStats:
    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]    

class WandbConfig:
    WANDB_KEY = ""
    WANDB_RUN_NAME = "melspec_256by313_effnet_run1"
    WANDB_PROJECT = "Pog_MusicClf_Resnext"
    USE_WANDB = False    

class SchedulerConfig:
    SCHEDULER_PATIENCE = 4
    SCHEDULER = 'CosineAnnealingWarmRestarts'
    T_0 = 10 # for CosineAnnealingWarmRestarts
    MIN_LR = 5e-7 # for CosineAnnealingWarmRestarts
    MAX_LR = 1e-2
    STEPS_PER_EPOCH = 0

# CONSTANTS
class Config:
    # whether to use mel spectrograms generated using audio augmentations ( multiple mel spec for one audio)
    USE_MEL_SPEC_AUG = True
    RUNTIME = "COLAB"
    RESUME_FROM_CHKPT = None
    NUM_CLASSES = 19
    BATCH_SIZE = 128
    NUM_FOLDS = 1
    UNFREEZE_EPOCH_NO = 1
    NUM_EPOCHS = 2
    NUM_WORKERS = mp.cpu_count()
    INPUT_IMAGE_SIZE = (128,128)
    IMG_MEAN = ImgStats.IMAGENET_MEAN
    IMG_STD = ImgStats.IMAGENET_STD
    FAST_DEV_RUN = False
    PRECISION = 16    
    PATIENCE = 10    
    SUBSET_ROWS_FRAC = 0.05
    TRAIN_ON_SUBSET = False
    RANDOM_SEED = 42
    MODEL_TO_USE = Models.RESNEXT50
    PRETRAINED = False        
    FIND_LR = False
    WEIGHT_DECAY = 1e-6
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    
    # model hyperparameters
    MODEL_PARAMS = {    
        "drop_out": 0.25,
        "lr": 0.00043
    }

In [35]:
if Config.RUNTIME == "COLAB":
    Config.DATA_ROOT_FOLDER = "/content/gdrive/MyDrive/Kaggle/Pog_Music_Classification/data/"
    Config.IMG_ROOT_FOLDER = "/content/processed_test_aug/mel_spec/"
    Config.MODEL_PATH_BASE = Config.DATA_ROOT_FOLDER + "model/CV/AUG_RESNEXT50/"
elif Config.RUNTIME == "KAGGLE":
    Config.DATA_ROOT_FOLDER = "../input/pog-musicclf-melspec-aug/"
    Config.IMG_ROOT_FOLDER = "/kaggle/input/pog-musicclf-melspec-aug/mel_spec_test_aug/processed_test_aug/mel_spec/"        
else:
    Config.DATA_ROOT_FOLDER = "./data/"
    Config.IMG_ROOT_FOLDER = "./data/processed_train/mel_spec/"

In [36]:
FOLD_BEST_MODELS = [
    #"fold0_resnext50_32x4d_best_model_epoch=11_val_loss=1.5832.ckpt",
    "fold1_resnext50_32x4d_best_model_epoch=9_val_loss=1.4965.ckpt",
    #"fold2_resnext50_32x4d_best_model_epoch=10_val_loss=1.5228.ckpt",
    #"fold3_resnext50_32x4d_best_model_epoch=9_val_loss=1.5181.ckpt",
    #"fold4_resnext50_32x4d_best_model_epoch=8_val_loss=1.5268.ckpt"
]

In [37]:
class AudioMelSpecImgTestDataset(Dataset):
    def __init__(self, df, file_name_col, img_root_folder, transform=None):
        self.df = df
        self.file_name_col = file_name_col        
        self.img_root_folder = img_root_folder
        self.transform = transform        

    def __getitem__(self, index):        
        if Config.USE_MEL_SPEC_AUG:
            mel_spec_img = self.df.loc[index, self.file_name_col]
            img_path = self.img_root_folder + mel_spec_img
        else:            
            file_name_noext = self.df.loc[index, self.file_name_col].split(".")[0]        
            img_path = self.img_root_folder + "/" + file_name_noext + ".jpg" 
        img = Image.open(img_path)
        img_arr = np.array(img)
        if self.transform is not None:
            augmented = self.transform(image=img_arr)
            img_tfmd = augmented["image"]        
        return index, img_tfmd

    def __len__(self):
        return len(self.df)

In [38]:
test_transform = alb.Compose([
        alb.CenterCrop(Config.INPUT_IMAGE_SIZE[0], Config.INPUT_IMAGE_SIZE[1]),
        alb.Normalize(mean=Config.IMG_MEAN, std=Config.IMG_STD),
        ToTensorV2()        
])

In [39]:
df_test = pd.read_csv(Config.DATA_ROOT_FOLDER + "mel_spec_aug/df_test_aug.csv")
df_test.head()

Unnamed: 0.1,Unnamed: 0,song_id,filename,filepath,file_exists,mel_spec
0,0,7072,007072.ogg,test/007072.ogg,True,007072_0.jpg
1,1,7072,007072.ogg,test/007072.ogg,True,007072_2.jpg
2,2,7072,007072.ogg,test/007072.ogg,True,007072_4.jpg
3,3,7072,007072.ogg,test/007072.ogg,True,007072_3.jpg
4,4,7072,007072.ogg,test/007072.ogg,True,007072_1.jpg


In [40]:
ds_test = AudioMelSpecImgTestDataset(
            df_test, 
            file_name_col="mel_spec",        
            img_root_folder=Config.IMG_ROOT_FOLDER, 
            transform=test_transform
        )        

dl_test = DataLoader(ds_test, batch_size=Config.BATCH_SIZE)

In [41]:
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau, OneCycleLR

def get_optimizer(lr, params):
    model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, params), 
            lr=lr,
            weight_decay=Config.WEIGHT_DECAY
        )
    interval = "epoch"
    if SchedulerConfig.SCHEDULER == "CosineAnnealingWarmRestarts":
        lr_scheduler = CosineAnnealingWarmRestarts(
                            model_optimizer, 
                            T_0=SchedulerConfig.T_0, 
                            T_mult=1, 
                            eta_min=SchedulerConfig.MIN_LR, 
                            last_epoch=-1
                        )
    elif SchedulerConfig.SCHEDULER == "OneCycleLR":
        lr_scheduler = OneCycleLR(
            optimizer=model_optimizer,
            max_lr=SchedulerConfig.MAX_LR,
            epochs=Config.NUM_EPOCHS,
            steps_per_epoch=SchedulerConfig.STEPS_PER_EPOCH,
            verbose=True
        )
        interval = "step"
    elif SchedulerConfig.SCHEDULER == "CosineAnnealingLR":
        lr_scheduler = CosineAnnealingLR(model_optimizer, eta_min=SchedulerConfig.MIN_LR, T_max=Config.NUM_EPOCHS)
    else:
        # ReduceLROnPlateau throws an error is parameters are filtered, 
        # refer: https://github.com/PyTorchLightning/pytorch-lightning/issues/8720
        model_optimizer = torch.optim.Adam(
            params, 
            lr=lr,
            weight_decay=Config.WEIGHT_DECAY
        )
        lr_scheduler = ReduceLROnPlateau(
                            model_optimizer, 
                            mode="min",                                                                
                            factor=0.1,
                            patience=SchedulerConfig.SCHEDULER_PATIENCE,
                            min_lr=SchedulerConfig.MIN_LR,                                
                            verbose=True
                        )   
    return {
        "optimizer": model_optimizer, 
        "lr_scheduler": {
            "scheduler": lr_scheduler,
            "interval": interval,
            "monitor": "val_loss",
            "frequency": 1
        }
    }

In [42]:
import torchvision.models as models
import torch.nn as nn
from torch.nn.functional import cross_entropy
import torchmetrics
import timm

class MusicClfLitModel(pl.LightningModule):
    def __init__(self, num_classes, hparams, model_to_use):
        super().__init__()
        self.save_hyperparameters()
        self.lr = hparams["lr"]
        self.num_classes = num_classes              
        self.backbone, self.classifier = self.get_backbone_classifier(model_to_use, hparams["drop_out"], num_classes) 

    @staticmethod
    def get_backbone_classifier(model_to_use, drop_out, num_classes):
        pt_model = timm.create_model(model_to_use, pretrained=Config.PRETRAINED)
        backbone = None
        classifier = None
        if model_to_use in [Models.RESNET34, Models.RESNET50, Models.RESNEXT50]:            
            backbone = nn.Sequential(*list(pt_model.children())[:-1])
            in_features = pt_model.fc.in_features
            classifier = nn.Sequential(
                nn.Dropout(drop_out),
                nn.Linear(in_features, num_classes)
            )
        if model_to_use in [Models.EFFNET_B4, Models.EFFNET_B0]:
            backbone = nn.Sequential(*list(pt_model.children())[:-1])
            in_features = pt_model.classifier.in_features
            classifier = nn.Linear(in_features, num_classes)
                    
        return backbone, classifier

    def forward(self, x):
        features = self.backbone(x)
        features = torch.flatten(features, 1)                
        x = self.classifier(features)
        soft_max = nn.Softmax(dim=1)        
        proba = soft_max(x)
        return proba

    def configure_optimizers(self):
        return get_optimizer(lr=self.lr, params=self.parameters())

    def training_step(self, batch, batch_idx):
        id, X, y = batch
        y_pred = self(X)
        loss = cross_entropy(y_pred, y)
        train_f1 = torchmetrics.functional.f1(preds=y_pred, target=y, num_classes=self.num_classes, average="micro")
        self.log("train_loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        self.log("train_f1", train_f1, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        return loss        

    def validation_step(self, batch, batch_idx):
        id, X, y = batch
        y_pred = self(X)
        val_loss = cross_entropy(y_pred, y)
        current_lr = self.trainer.optimizers[0].param_groups[0]['lr']
        val_f1 = torchmetrics.functional.f1(preds=y_pred, target=y, num_classes=self.num_classes, average="micro")
        self.log("val_loss", val_loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        self.log("val_f1", val_f1, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        self.log("cur_lr", current_lr, prog_bar=True, on_step=True, on_epoch=True, logger=True)
        return {"loss": val_loss, "val_f1": val_f1, "cur_lr": current_lr}

In [43]:
from tqdm.notebook import tqdm
from IPython.display import display

df_submission = pd.read_csv(Config.DATA_ROOT_FOLDER + "sample_submission.csv")    
# records present both in df_test and df_submission
df_subm_test = pd.merge(
    left=df_test,
    right=df_submission,
    how="right",
    on="song_id",
    suffixes=("_test", "_subm")
)
print(f"len(df_subm_test) = {len(df_subm_test)}")
display(df_subm_test.head())

len(df_subm_test) = 25382


Unnamed: 0.1,Unnamed: 0,song_id,filename,filepath,file_exists,mel_spec,genre_id
0,0.0,7072,007072.ogg,test/007072.ogg,True,007072_0.jpg,0
1,1.0,7072,007072.ogg,test/007072.ogg,True,007072_2.jpg,0
2,2.0,7072,007072.ogg,test/007072.ogg,True,007072_4.jpg,0
3,3.0,7072,007072.ogg,test/007072.ogg,True,007072_3.jpg,0
4,4.0,7072,007072.ogg,test/007072.ogg,True,007072_1.jpg,0


In [44]:
# # get the best model (having lowest val loss) for the fold
# best_model_path_val_loss = "/content/gdrive/MyDrive/Kaggle/Pog_Music_Classification/data/model/fold0_best_model_epoch=21_val_loss=1.5357.ckpt"
# print(f"Using best model = {best_model_path_val_loss} for test prediction")
# best_model = MusicClfLitModel.load_from_checkpoint(
#     checkpoint_path=best_model_path_val_loss,
#     num_classes=Config.NUM_CLASSES, 
#     hparams=Config.MODEL_PARAMS,        
#     model_to_use=Config.MODEL_TO_USE
# )
# print(f"Using device = {Config.DEVICE}")
# best_model.to(Config.DEVICE)


In [45]:
#df = df_subm_test[df_subm_test.filename.notnull()]

In [46]:
def get_pred(preds, proba_thresh):
  index_preds = [(index, pred) for index, pred in enumerate(preds)]
  index_preds_sorted = sorted(index_preds, key=lambda item: item[1], reverse=True)  
  pred_label = 0
  for index, pred in index_preds_sorted:
    if pred > proba_thresh[index]:
      pred_label = index
  return pred_label

In [47]:
# test_preds = []
# with torch.no_grad():    
#     for id, X in tqdm(dl_test):
#         id = id.cpu().detach().numpy()        
#         y_preds = best_model(X.to(Config.DEVICE))                
#         test_preds.append(y_preds.cpu().detach().numpy())
# test_preds_final = np.concatenate(test_preds, axis=0)

In [48]:
optimal_proba_dict = {'0': 0.36, '1': 0.30000000000000004, '2': 0.48, '3': 0.24000000000000002, '4': 0.30000000000000004, '5': 0.22, '6': 0.44000000000000006, '7': 0.28, '8': 0.28, '9': 0.26, '10': 0.42000000000000004, '11': 0.26, '12': 0.4, '13': 0.2, '14': 0.33999999999999997, '15': 0.45999999999999996, '16': 0.36, '17': 0.44000000000000006, '18': 0.24000000000000002}
#optimal_proba = list(optimal_proba_dict.values())
optimal_proba = None

In [49]:
for fold in range(Config.NUM_FOLDS):
    fold_best_model = MusicClfLitModel.load_from_checkpoint(
                            checkpoint_path=Config.MODEL_PATH_BASE + FOLD_BEST_MODELS[fold],
                            num_classes=Config.NUM_CLASSES, 
                            hparams=Config.MODEL_PARAMS,        
                            model_to_use=Config.MODEL_TO_USE
                        )
    fold_best_model.to(Config.DEVICE)
    print(f"Using fold {fold} best model = {FOLD_BEST_MODELS[fold]} for test prediction")
    with torch.no_grad():    
        for id, X in tqdm(dl_test):
            id = id.cpu().detach().numpy()  
            y_preds = fold_best_model(X.to(Config.DEVICE))
            y_preds_np = y_preds.cpu().detach().numpy()          
            if optimal_proba is not None:          
                test_genre_id = [get_pred(pred, optimal_proba) for pred in y_preds_np]
            else:
                test_genre_id = np.argmax(y_preds_np, axis=1)
            if Config.NUM_FOLDS == 1:
                df_test.loc[id, "genre_id"] = test_genre_id
            else:
                df_test.loc[id, f"fold{fold}_genre_id"] = test_genre_id                                      
df_test.to_csv(Config.DATA_ROOT_FOLDER + "test_fold_preds.csv")

Using fold 0 best model = fold1_resnext50_32x4d_best_model_epoch=9_val_loss=1.4965.ckpt for test prediction


  0%|          | 0/199 [00:00<?, ?it/s]

In [50]:
def combine_preds(test_row):
    preds = np.zeros(Config.NUM_CLASSES)
    for fold in range(Config.NUM_FOLDS):
        fold_pred = int(test_row[f"fold{fold}_genre_id"])
        preds[fold_pred] += 1
    return np.argmax(preds)

if Config.NUM_FOLDS > 1:
    df_test["genre_id"] = df_test.apply(lambda row:combine_preds(row), axis=1)
df_test.to_csv(Config.DATA_ROOT_FOLDER + "test_fold_preds.csv")    

In [51]:
# with torch.no_grad():    
#   for id, X in tqdm(dl_test):
#     id = id.cpu().detach().numpy()        
#     y_preds = best_model(X.to(Config.DEVICE))
#     y_preds_np = y_preds.cpu().detach().numpy()    
#     if optimal_proba is not None:          
#       test_genre_id = [get_pred(pred, optimal_proba) for pred in y_preds_np]
#     else:
#       test_genre_id = np.argmax(y_preds_np, axis=1)                  
#     df_test.loc[id, "genre_id"] = test_genre_id        

In [52]:
df_test = df_test[df_test.genre_id.notnull()]

In [53]:
# from df_test, group by song_id count(genre_id), select song_id and max count(genre_id)
# this will be new df_test with a unique combination of song_id and genre_id

# group by song_id, genre_id and get the count of each group (under mel_spec column)
df_test_new1 = df_test.groupby(["song_id", "genre_id"], as_index=False)["mel_spec"].count()
# for each song_id get the max count (of mel_spec which is a proxy for genre_id).
# df_test_new2 thus has song_id and the max count of genre_id
df_test_new2 = df_test_new1.groupby(["song_id"], as_index=False)[["mel_spec"]].max()
# join the above two dataframes on song_id and max count to get the genre_id with max_count
df_test_final = pd.merge(
    left=df_test_new1,
    right=df_test_new2,
    how="inner",
    on=["song_id", "mel_spec"]
)
print(f"len(df_test_final) = {len(df_test_final)}")
df_test_final.head()

len(df_test_final) = 5774


Unnamed: 0,song_id,genre_id,mel_spec
0,3,3.0,2
1,3,4.0,2
2,6,11.0,3
3,8,1.0,4
4,11,2.0,2


In [54]:
df_test_final = df_test_final.groupby(["song_id", "mel_spec"], as_index=False)["genre_id"].max()
print(f"len(df_test_final) = {len(df_test_final)}")

len(df_test_final) = 5076


In [55]:
df_invalid = df_subm_test[df_subm_test.filename.isnull()][["song_id", "genre_id"]]
df_preds = pd.concat([df_test_final[["song_id", "genre_id"]], df_invalid], axis=0)
df_preds.genre_id = df_preds.genre_id.astype(int)
print(f"len(df_test_final) = {len(df_test_final)}")
print(f"len(df_preds) = {len(df_preds)}")

len(df_test_final) = 5076
len(df_preds) = 5078


In [56]:
df_preds.head()

Unnamed: 0,song_id,genre_id
0,3,4
1,6,11
2,8,1
3,11,2
4,17,5


In [57]:
df_preds.to_csv(Config.DATA_ROOT_FOLDER + "model/CV/AUG_RESNEXT50/submission_resnext50_aug_bestfold.csv", index=False)