#Clone and import the necessary packages and code and import 

In [1]:
!git clone https://github.com/physionetchallenges/python-classifier-2022.git pythonClassifier2022
!git clone https://github.com/physionetchallenges/evaluation-2022.git evaluation2022

Cloning into 'pythonClassifier2022'...
remote: Enumerating objects: 41, done.[K
remote: Total 41 (delta 0), reused 0 (delta 0), pack-reused 41[K
Unpacking objects: 100% (41/41), 22.95 KiB | 1.43 MiB/s, done.
Cloning into 'evaluation2022'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 39 (delta 20), reused 23 (delta 7), pack-reused 0[K
Unpacking objects: 100% (39/39), 15.65 KiB | 890.00 KiB/s, done.


In [16]:
import numpy as np
import librosa as lb
from tqdm import tqdm
import torch
import os
import pandas as pd

from pythonClassifier2022.helper_code import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import IPython.display as ipd
import librosa.display
from scipy import signal
from scipy.ndimage.interpolation import zoom
from matplotlib import pyplot as plt
from os import listdir
def ls(ruta = '.'):
    return listdir(ruta)
from scipy import signal
from IPython.display import Audio
import seaborn as sns
import IPython.display as ipd
from tqdm import tqdm

  from scipy.ndimage.interpolation import zoom


## copy the data zip file to the temporary directory before unzipping it

In [20]:
!cp '/content/drive/MyDrive/surrey/aihealth/Project_AI_Health/the-circor-digiscope-phonocardiogram-dataset-1.0.3.zip' '/content/data.zip'
!unzip '/content/data.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68711_TV.tsv  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68711_TV.wav  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737.txt  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_AV.hea  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_AV.tsv  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_AV.wav  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_MV.hea  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_MV.tsv  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_MV.wav  
  inflating: the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/68737_PV.hea  
  inflat

# Data processing

## Handling the audio features and demographic features

In this function, we convert normal frame array into the smaller 2D log mel spectogram arrays. First of all, we need to cut the frame into smaller frames sample, and each of the frame has the duration of 4 second with the stride is 1 second (It is similar to the CNN, but the filter size is 4 seconds while the stride is 1 second). After that, for each cropped frame, we convert it into the Log mel spectogram to represent the spectrum in a suitable 2D array.   

In [21]:
import numpy as np
import librosa

def new_audio_features(signal, rate):
  frames = lb.util.frame(signal, frame_length=rate*4, hop_length=rate, axis = 0)
  framesarray = []
  for frame in frames:
    melspec = lb.feature.melspectrogram(y=frame, sr=rate, n_fft=int(0.05 * rate), hop_length=int(0.03 * rate), n_mels=64)
    framesarray.append(lb.power_to_db(melspec))
  return np.array(framesarray)

These functions below will process the demographic data

In [22]:
path_train = '/content/the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/'
patient_files = find_patient_files(path_train)

def extract_label(data):
    murmur = None
    outcome = None
    for l in data.split('\n'):
        if l.startswith('#Murmur:'):
            try:
                murmur = l.split(': ')[1]
            except:
                pass
        
        if l.startswith('#Outcome:'):
            try:
                outcome = l.split(': ')[1]
            except:
                pass
    if murmur is None:
        raise ValueError('No murmur available.')
    if outcome is None:
        raise ValueError('No outcome available.')
    return murmur, outcome

def get_specific_info(data, search_str):
    for line in data.split('\n'):
        if search_str in line:
            label = line.split(': ')[1]
            return label
    raise ValueError('Error: search string not found')

def get_patient_info(data,info):
    id = get_patient_id(data)
    locations = get_locations(data)
    info_v=[]
    murmur, outcome = extract_label(data)
    for i in info:
        info_v.append(get_specific_info(data,i))
    return id,locations,info_v, murmur, outcome 

def get_features(data_folder,data_path,resamp,w,info):

    data = load_patient_data(data_path)
    id,locations,info_v, labelm, labelo  = get_patient_info(data,info)
    
    #age
    age = np.zeros(5)
    age_group=info_v[0]
    if compare_strings(age_group, 'Neonate'):
        age[0] = 1
    elif compare_strings(age_group, 'Infant'):
        age[1] = 1
    elif compare_strings(age_group, 'Child'):
        age[2] = 1
    elif compare_strings(age_group, 'Adolescent'):
        age[3] = 1
    elif compare_strings(age_group, 'Young Adult'):
        age[4] = 1
    else:
        pass

    #sex
    if compare_strings(info_v[1], 'Female'):
        sex = 0
    elif compare_strings(info_v[1], 'Male'):
        sex = 1
    else:
        sex = float('nan') #there is no nan case in this features
    
    #heigth and weight
    # height = info_v[2]
    # weight = info_v[3]

    #pregnancy status
    if compare_strings(info_v[2], 'False'): #info_v[4] to info_v[2]
        preg = 0
    elif compare_strings(info_v[2], 'True'):
        preg = 1
    else:
        preg = float('nan') #there is no nan case in this features

    recording_locations = ['AV', 'MV', 'PV', 'TV', 'Phc']
    
    if len(info_v)>3:
        murmur_locs = info_v[3].split('+') #info_v[5] to info_v[3]
        murmur_locs_cod = np.zeros(5,dtype=int)
        murmur_most_au = np.zeros(5,dtype=int)
        for _,loc in enumerate(murmur_locs):
            if loc in recording_locations:
                murmur_locs_cod[recording_locations.index(loc)]=+1

        if info_v[4] in recording_locations:
            murmur_most_au[recording_locations.index(info_v[4])]=+1
        demo_features = np.hstack((age,sex,preg,murmur_locs_cod,murmur_most_au))
    else:
        print(data_path)    
        demo_features = np.hstack((age,sex,preg))

    signal_features = list()
    num_locations = get_num_locations(data)
    recording_information = data.split('\n')[1:num_locations+1]

    for c,i in enumerate(locations):
        entries = recording_information[c].split(' ')
        locs = np.zeros(5,dtype=int)
        recording_file = entries[2]
        if i in recording_locations:
            locs[recording_locations.index(i)]=+1
        else:
            print('what',i)
        x,rate=lb.load(data_folder+recording_file,sr=resamp)
        x=x/np.max(np.abs(x))
        # signal_features.append(np.hstack((id,locs,audio_features(x,rate,w))))
        signal_features.append((np.hstack((id,locs)), new_audio_features(x,rate)))
    return demo_features.astype(np.float), signal_features, labelm, labelo, id

For each patient, we will obtain different samples made from demographic and signal data, the number of samples for each patients depends on the number of audio location obtained from the dataset

In [23]:
w=0.2
resamp=1000
all_features = []
info='''#Age,
#Sex,
#Pregnancy status,
#Murmur locations,
#Most audible location'''.split(',\n')

patient_files = find_patient_files(path_train)
demo_data=[]
labelmurmurs=[]
labeloutcomes=[]
signal_data=[]
id_data=[]
for i in tqdm(patient_files):
    current=get_features(path_train,i,resamp,w,info)
    demo_data.append(current[0])
    labelmurmurs.append(current[2])
    labeloutcomes.append(current[3])
    id_data.append(current[4])
    signal_data.append(current[1])

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return demo_features.astype(np.float), signal_features, labelm, labelo, id
100%|██████████| 942/942 [02:06<00:00,  7.44it/s]


In [24]:
print(len(demo_data))
print(len(labelmurmurs))
print(len(signal_data))
print(len(id_data))
print(len(labeloutcomes))

942
942
942
942
942


## Split the dataset
We apply the train_test_split function on the combination of demo features and signal features and the combination of the murmur label, outcome label and patient ID. The train set will be 60% of the dataset, while the validate set is 10% and the test set is 30% of the dataset.

In [25]:
from sklearn.model_selection import train_test_split

# labels = [(labellist[i], labeloutcomelist[i], idlist[i]) for i in range(len(labellist))]
features = [(demo_data[i], signal_data[i]) for i in range(len(demo_data))]
labels = [(labelmurmurs[i], labeloutcomes[i], id_data[i]) for i in range(len(demo_data))]
rawtrain_features, rawtest_features, rawtrain_labels, rawtest_labels = train_test_split(features, labels, test_size=0.4, random_state=42)
rawval_features, rawtest_features, rawval_labels, rawtest_labels = train_test_split(rawtest_features, rawtest_labels, test_size=0.75, random_state=42)

In [26]:
rawdemo_datatrain, rawsignal_datatrain = list(map(list, zip(*rawtrain_features)))
rawdemo_dataval, rawsignal_dataval = list(map(list, zip(*rawval_features)))
rawdemo_datatest, rawsignal_datatest = list(map(list, zip(*rawtest_features)))
rawmurmurtrain, rawoutcometrain, rawidtrain = list(map(list, zip(*rawtrain_labels)))
rawmurmurval, rawoutcomeval, rawidval = list(map(list, zip(*rawval_labels)))
rawmurmurtest, rawoutcometest, rawidtest = list(map(list, zip(*rawtest_labels)))

After that, we create different lists of the dataset:
- The data array list for the features obtained in different sets (train, test & val)
- The murmur, outcome label list for different sets (train, test & val)
- The ID list for different sets (train, test & val)

For each separated and converted log mel spectogram array, we assign it with the transformed demographic features, ID. Finally, we can the final data array in both 3 sets to put in the Torch dataloader function.

In [27]:
def get_signalarray(signal_data, index):  
  try:
    k = signal_data[index]
  except:
    print(index)
    print(signal_data)
    return
  signalarray = []
  for i in range(len(k)):
    for j in range(len(k[i][1])):
      signalarray.append(k[i][1][j])
  return torch.from_numpy(np.stack(signalarray))

dataarraytrain = []
dataarrayval = []
dataarraytest = []

murmurlisttrain = []
outcomelisttrain = []
idlisttrain = []

murmurlistval = []
outcomelistval = []
idlistval = []

murmurlisttest = []
outcomelisttest = []
idlisttest = []

labeldict = {
      'Present': 0,
      'Unknown': 1,
      'Absent': 2,
      'Abnormal': 0,
      'Normal': 1 
}


for i in range(len(rawdemo_datatrain)):  
  #train
  signalarraytrain = get_signalarray(rawsignal_datatrain, i)
  for j in range(signalarraytrain.shape[0]):
    dataarraytrain.append((torch.from_numpy(rawdemo_datatrain[i]), signalarraytrain[j]))
    murmurlisttrain.append(torch.tensor(labeldict[rawmurmurtrain[i]]))
    outcomelisttrain.append(torch.tensor(labeldict[rawoutcometrain[i]]))
    idlisttrain.append(rawidtrain[i])

for i in range(len(rawdemo_dataval)):  
  #val
  signalarrayval = get_signalarray(rawsignal_dataval, i)
  for j in range(signalarrayval.shape[0]):
    dataarrayval.append((torch.from_numpy(rawdemo_dataval[i]), signalarrayval[j]))
    murmurlistval.append(torch.tensor(labeldict[rawmurmurval[i]]))
    outcomelistval.append(torch.tensor(labeldict[rawoutcomeval[i]]))
    idlistval.append(rawidval[i])

for i in range(len(rawdemo_datatest)):    
  #test
  signalarraytest = get_signalarray(rawsignal_datatest, i)
  for j in range(signalarraytest.shape[0]):
    dataarraytest.append((torch.from_numpy(rawdemo_datatest[i]), signalarraytest[j]))
    murmurlisttest.append(torch.tensor(labeldict[rawmurmurtest[i]]))
    outcomelisttest.append(torch.tensor(labeldict[rawoutcometest[i]]))
    idlisttest.append(rawidtest[i])

print(len(dataarraytrain))
print(len(idlisttrain))  
print(len(dataarrayval))
print(len(idlistval))
print(len(dataarraytest))
print(len(idlisttest))

36996
36996
6310
6310
17932
17932


# Using torch to develop DataLoader, model, train and generate output

## Training part

Here, we implement a custom dataset using these features obtained from the functions above and apply the Dataloader function using the Dataset, Dataloader from Torch

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

trainlabels = [(murmurlisttrain[i], outcomelisttrain[i], idlisttrain[i]) for i in range(len(murmurlisttrain))]
vallabels = [(murmurlistval[i], outcomelistval[i], idlistval[i]) for i in range(len(murmurlistval))]
testlabels = [(murmurlisttest[i], outcomelisttest[i], idlisttest[i]) for i in range(len(murmurlisttest))]

class NewDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        demo_data = self.data[index][0]
        signal_data = self.data[index][1]
        labelmurmur = self.label[index][0]
        labeloutcome = self.label[index][1]
        id = self.label[index][2]
        return demo_data.float(), signal_data.float(), labelmurmur, labeloutcome, torch.tensor([int(id)])

trainDataset = NewDataset(dataarraytrain, trainlabels)
trainLoader = DataLoader(trainDataset, batch_size=32, shuffle=True)

valDataset = NewDataset(dataarrayval, vallabels)
valLoader = DataLoader(valDataset, batch_size=64, shuffle=True)

testDataset = NewDataset(dataarraytest, testlabels)
testLoader = DataLoader(testDataset, batch_size=64, shuffle=False)

In this part, we develop a model that receive both the demographic features and the signal features as inputs. The demographic feature is put into a fully connected layer first while the signal one get into a feature extractor (this backbone can be resnet, densenet, efficientnet, etc). After that, we concatenate all the obtained features together, put it into another fully connected layer before being separated again to 2 different branches. This time, in each branch, the features will go through several other fully connected layers to obtain the final murmur and outcome outputs. All activation functions used in this model are all ReLU function, and dropout layer is also used to alleviate the overfit problem 

In [9]:
import torch.nn as nn
from torchvision import models
import torch.optim as optim

class NewModel(nn.Module):
  def __init__(self, modelname):
    super(NewModel, self).__init__()
    if modelname == 'resnet50':
      self.signalhandler = models.resnet50()
      self.signalhandler.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias = False)
    elif modelname == 'densenet121':
      self.signalhandler = models.densenet121()
      self.signalhandler.features.conv0 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    elif modelname == 'efficientnetb4':
      self.signalhandler = models.efficientnet_b4()
      self.signalhandler.features[0][0] = nn.Conv2d(1, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    else:
      self.signalhandler = models.resnet50()
      self.signalhandler.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias = False)
    self.fc1 = nn.Linear(17, 256)
    self.relu = nn.ReLU()
    self.fcmerge = nn.Linear(1256, 1256) 
    self.fc2 = nn.Linear(1256, 512)
    self.fc3 = nn.Linear(1256, 512)
    self.fc4 = nn.Linear(512, 256)
    self.fc5 = nn.Linear(512, 256)
    self.fc6 = nn.Linear(256, 3)
    self.fc7 = nn.Linear(256, 2)
    self.dr = nn.Dropout(p=0.2)


  def forward(self, demo, signal):
    demo = self.fc1(demo)
    demo = self.dr(self.relu(demo))
    signal = self.signalhandler(signal.unsqueeze(1))
    merge = torch.cat((demo, signal), dim=1)
    merge = self.relu(self.fcmerge(merge))
    
    murmur = self.dr(self.relu(self.fc2(merge)))
    murmur = self.dr(self.relu(self.fc4(murmur)))
    murmur = self.fc6(murmur)

    outcome = self.dr(self.relu(self.fc3(merge)))
    outcome = self.dr(self.relu(self.fc5(outcome)))
    outcome = self.fc7(outcome)
    return murmur, outcome

After that, we will train the model above, the model name is the name of the backbone used in this architecture to extract the features from the signal data.
- In the code below, the selected optimizer is Adam and the chosen learning rate is 0.0001. After 5 epochs, the learning rate will be reduce to half.
- The loss functions for both branches are cross entropy loss, and the total loss is combination of these loss.
- The model will be trained on 20 epochs 

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

modelname = 'resnet50'
model = NewModel(modelname).to(device)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
# criterion2 = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, amsgrad = True)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Define your training loop
def train(model, trainloader, testloader, criterion, optimizer, num_epochs):
    maxf1 = 0.0
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        epoch_val_loss = 0.0
        epoch_accuracy = 0.0


        model.train()
        for i, (demo_data, signal_data, labelmurmur, labeloutcome, id) in tqdm(enumerate(trainloader), total=len(trainloader)):
            demo_data, signal_data, labelmurmur, labeloutcome, id = demo_data.to(device), signal_data.to(device), labelmurmur.to(device), labeloutcome.to(device), id.to(device)
            optimizer.zero_grad()
            murmur, outcome = model(demo_data, signal_data)
            loss1 = criterion(murmur, labelmurmur) 
            # print(labeloutcome)
            # loss2 = criterion2(outcome, F.one_hot(labeloutcome.squeeze(), num_classes=2).float())
            loss2 = criterion(outcome, labeloutcome) 
            
            loss = loss1 + 0.2*loss2
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

            # murmurtrainloss += loss1.item()
            # outcometrainloss += loss2.item()
        
        scheduler.step()
        epoch_loss = epoch_loss / len(trainloader)

        predlist = []
        labellist = []
        predoutcomelist = []
        labeloutcomelist = []  
        
        model.eval()
        with torch.no_grad():
          for i, (demo_data, signal_data, labelmurmur, labeloutcome, id) in tqdm(enumerate(testloader), total=len(testloader)):
              demo_data, signal_data, labelmurmur, labeloutcome, id = demo_data.to(device), signal_data.to(device), labelmurmur.to(device), labeloutcome.to(device), id.to(device)
              murmur, outcome = model(demo_data, signal_data)
              val_loss1 = criterion(murmur, labelmurmur)
              # val_loss2 = criterion2(outcome, F.one_hot(labeloutcome.squeeze(), num_classes=2).float())
              val_loss2 = criterion(outcome, labeloutcome)
              val_loss = val_loss1 + 0.2*val_loss2 
              epoch_val_loss += val_loss.item()

              # murmurtestloss += val_loss1
              # outcometestloss += val_loss2

              # epoch_accuracy += torch.sum(murmur.argmax(dim = 1) == labelmurmur).item()
              
              predlist.extend(murmur.cpu().numpy())
              labellist.extend(labelmurmur.cpu().numpy())
              predoutcomelist.extend(outcome.cpu().numpy())
              labeloutcomelist.extend(labeloutcome.cpu().numpy())
          
          epoch_val_loss = epoch_val_loss / len(testloader)
          # epoch_accuracy = epoch_accuracy/ (64 * len(testloader)) # accuracy_score(labellist.cpu().numpy(), predlist.cpu().numpy()) 
          f1murmur = f1_score(labellist, np.argmax(predlist, axis = 1), average='macro')
          f1outcome = f1_score(labeloutcomelist, np.argmax(predoutcomelist, axis = 1), average='macro')

        print('Epoch [{}/{}], Loss: {:.4f}, Val_Loss: {:.4f}, F1valmurmur: {:.4f}, F1valoutcome: {:.4f}'.format(epoch+1, num_epochs, epoch_loss, epoch_val_loss, f1murmur, f1outcome))
        
        if f1murmur > maxf1:
          maxf1 = f1murmur
          torch.save(model.state_dict(), f'/content/model{epoch + 1}.pt')

train(model, trainLoader, valLoader, criterion, optimizer, num_epochs=20)

After training, we save the selected model by copying the best model on val dataset to temporary location in colab

In [None]:
!cp /content/model9.pt /content/drive/MyDrive/resnet50_9.pt

## Inference part
We load the selected model and run it on the test dataset to obtain the test outputs

In [None]:
import pandas as pd
listpresent = []
listabsent = []
listunknown = []
listnormal = []
listabnormal = []

model = NewModel('resnet50').to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/resnet50_9.pt'))
model.eval()
with torch.no_grad():
  for i, (demo_data, signal_data, labelmurmur, labeloutcome, id) in tqdm(enumerate(testLoader), total=len(testLoader)):
    demo_data, signal_data, labelmurmur, labeloutcome, id = demo_data.to(device), signal_data.to(device), labelmurmur.to(device), labeloutcome.to(device), id.to(device)
    murmur, outcome = model(demo_data, signal_data)
    for j in range(murmur.shape[0]):
      listpresent.append(murmur[j][0].item())
      listunknown.append(murmur[j][1].item())
      listabsent.append(murmur[j][2].item())
      listabnormal.append(outcome[j][0].item())
      listnormal.append(outcome[j][1].item())
      


100%|██████████| 281/281 [00:13<00:00, 21.37it/s]


We create the dataframe from the generated test outputs

In [None]:
label_list = [item for tensor in murmurlisttest for item in tensor.flatten().tolist()]
label_outcomelist = [item for tensor in outcomelisttest for item in tensor.flatten().tolist()]
outputdf = pd.DataFrame({'Id': idlisttest, 'Present': listpresent, 'Unknown': listunknown, 'Absent': listabsent, 'Abnormal': listabnormal,'Normal': listnormal,  'Label_murmur': label_list, 'Label_outcome': label_outcomelist})
outputdf

Unnamed: 0,Id,Present,Unknown,Absent,Abnormal,Normal,Label_murmur,Label_outcome
0,50049,-11.170255,-0.711617,11.530146,-0.713749,0.626303,2,0
1,50049,-11.672747,-0.668783,11.906554,-0.718791,0.625559,2,0
2,50049,-11.255380,0.635575,9.969642,-0.614303,0.498793,2,0
3,50049,-10.553848,0.206531,9.771252,-0.639451,0.531168,2,0
4,50049,-10.886030,-0.453663,10.899289,-0.719905,0.624366,2,0
...,...,...,...,...,...,...,...,...
17927,84780,-11.487653,-1.199198,12.227393,-0.445041,0.394306,2,1
17928,84780,-6.325760,0.202310,5.602614,-0.301312,0.208638,2,1
17929,84780,-5.394622,1.601373,3.085593,-0.182213,0.008872,2,1
17930,84780,-5.697076,0.663887,4.451768,-0.373214,0.236453,2,1


After that, since the dataframe has many samples having the same patient ID, and we only want 1 specific output (murmur and outcome) for each patient, we just group all the samples with the same patient ID using the mean function.

In [None]:
resultdf = outputdf.groupby('Id').mean()
resultdf

Unnamed: 0_level_0,Present,Unknown,Absent,Abnormal,Normal,Label_murmur,Label_outcome
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14998,-7.125820,-0.091001,6.729200,-0.270682,0.196120,2.0,0.0
2530,-8.121606,-0.278336,8.020247,-0.379237,0.287166,2.0,0.0
31737,-7.896607,-0.429122,7.947450,-0.327838,0.237959,2.0,0.0
40798,-8.700133,-0.475554,8.743419,-0.387091,0.309841,2.0,0.0
46532,-6.778765,0.198867,6.101916,-0.256258,0.146521,2.0,1.0
...,...,...,...,...,...,...,...
85338,-13.398640,0.330079,12.187226,-0.538684,0.447017,2.0,1.0
85339,10.062480,-4.623791,-3.690697,1.381339,-1.423358,0.0,1.0
85340,-7.056693,0.253299,6.252205,-0.270394,0.178029,2.0,1.0
85345,-7.449839,0.528311,6.416335,-0.352005,0.250950,2.0,1.0


We also need to apply softmax for the output of each patient to be similar to the format of the csv file used in the competition

In [None]:
from scipy.special import softmax

newpred = [0 for i in range(len(resultdf))]

resultdf['predmurmur'] = newpred

for i in range(len(resultdf)):
  present = resultdf['Present'][i]
  absent = resultdf['Absent'][i]
  unknown = resultdf['Unknown'][i]
  normal = resultdf['Normal'][i]
  abnormal = resultdf['Abnormal'][i]
  softmaxoutput = softmax(np.array([present, absent, unknown]))
  softmaxoutcomeoutput = softmax(np.array([normal, abnormal]))
  resultdf['Present'][i] = softmaxoutput[0]
  resultdf['Absent'][i] = softmaxoutput[1]
  resultdf['Unknown'][i] = softmaxoutput[2]
  resultdf['Normal'][i] = softmaxoutcomeoutput[1]
  resultdf['Abnormal'][i] = softmaxoutcomeoutput[0]
  # resultdf['Label_murmur'][i] = resultdf['Label_murmur'][i].astype(int)
  resultdf['predmurmur'][i] = np.argmax(np.array(softmaxoutput))

resultdf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resultdf['Present'][i] = softmaxoutput[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resultdf['Absent'][i] = softmaxoutput[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resultdf['Unknown'][i] = softmaxoutput[2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resultdf['Normal'][i] = softmaxoutcomeoutpu

Unnamed: 0_level_0,Present,Unknown,Absent,Abnormal,Normal,Label_murmur,Label_outcome,predmurmur
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14998,9.602120e-07,1.090311e-03,0.998909,0.614627,0.385373,2.0,0.0,1
2530,9.762800e-08,2.488075e-04,0.999751,0.660697,0.339303,2.0,0.0,1
31737,1.314962e-07,2.301447e-04,0.999770,0.637793,0.362207,2.0,0.0,1
40798,2.656554e-08,9.913066e-05,0.999901,0.667507,0.332493,2.0,0.0,1
46532,2.539835e-06,2.723661e-03,0.997274,0.599355,0.400645,2.0,1.0,1
...,...,...,...,...,...,...,...,...
85338,7.730299e-12,7.087665e-06,0.999993,0.728238,0.271762,2.0,1.0,1
85339,9.999985e-01,4.186323e-07,0.000001,0.057071,0.942929,0.0,1.0,0
85340,1.655549e-06,2.475319e-03,0.997523,0.610264,0.389736,2.0,1.0,1
85345,9.479695e-07,2.764782e-03,0.997234,0.646332,0.353668,2.0,1.0,1


Finally, we drop all unnecessary columns, obtain the desired dataframe and save it as a output csv file

In [None]:
newresultdf = resultdf.drop(columns=["Label_murmur", "Label_outcome", "predmurmur"])
newresultdf

Unnamed: 0_level_0,Present,Unknown,Absent,Abnormal,Normal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14998,9.602120e-07,1.090311e-03,0.998909,0.614627,0.385373
2530,9.762800e-08,2.488075e-04,0.999751,0.660697,0.339303
31737,1.314962e-07,2.301447e-04,0.999770,0.637793,0.362207
40798,2.656554e-08,9.913066e-05,0.999901,0.667507,0.332493
46532,2.539835e-06,2.723661e-03,0.997274,0.599355,0.400645
...,...,...,...,...,...
85338,7.730299e-12,7.087665e-06,0.999993,0.728238,0.271762
85339,9.999985e-01,4.186323e-07,0.000001,0.057071,0.942929
85340,1.655549e-06,2.475319e-03,0.997523,0.610264,0.389736
85345,9.479695e-07,2.764782e-03,0.997234,0.646332,0.353668


In [None]:
newresultdf.to_csv('/content/resnet50temp.csv')