

---

# Coswara

---



# Full audio

## Load feature


### Mel-spectrogram

#### ML

In [112]:
import os
import pandas as pd

root = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/full_audio'

ml_mel_coswara = os.path.join(root, 'melspectrogram_coswara_features.csv')

ml_mfcc_13_coswara = os.path.join(root, 'mfcc_13_coswara_features.csv')
ml_mfcc_26_coswara = os.path.join(root, 'mfcc_26_coswara_features.csv')
ml_mfcc_39_coswara = os.path.join(root, 'mfcc_39_coswara_features.csv')

In [110]:
def data_split(label0, label1):
  p180 = 965
  p190 = 1086
  p080 = 2555
  p090 = 2875
  
  train = pd.concat([label0[:p080], label1[:p180]], ignore_index=True)
  val = pd.concat([label0[p080:p090], label1[p180:p190]], ignore_index=True)
  test = pd.concat([label0[p090:], label1[p190:]], ignore_index=True)

  train = train.sample(frac=1).reset_index(drop=True)
  val = val.sample(frac=1).reset_index(drop=True)
  test = test.sample(frac=1).reset_index(drop=True)

  return train, val, test

In [113]:
mel_coswara_df = pd.read_csv(ml_mel_coswara)
mel_coswara_df.label = mel_coswara_df.label.map({
    'healthy': 0,
    'positive_mild': 1,
    'no_resp_illness_exposed': 1,
    'resp_illness_not_identified': 0,
    'positive_moderate': 1,
    'recovered_full': 0,
    'positive_asymp': 1
})
label0 = mel_coswara_df[mel_coswara_df['label']==0].copy().reset_index(drop=True)
label1 = mel_coswara_df[mel_coswara_df['label']==1].copy().reset_index(drop=True)

In [114]:
_, __, ml_test_cos_mel = data_split(label0, label1)

In [115]:
ml_X_test_cos_mel = ml_test_cos_mel.iloc[:, :-1].values
ml_y_test_cos_mel = ml_test_cos_mel.iloc[:, -1].values

#### DL

##### Set up

In [63]:
# from google.colab import drive
# drive.mount('/content/drive')

In [64]:
from IPython.display import clear_output, Audio
import IPython.display as ipd
import os
import numpy as np
import shutil
import pandas as pd
import logging
import warnings

warnings.filterwarnings("ignore")


logger = logging.getLogger('COSWARA - HYBRID MODEL FULL AUDIO')
logger.setLevel(logging.DEBUG)

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# add formatter to ch
ch.setFormatter(formatter)

# add ch to logger
logger.addHandler(ch)
pd.set_option('display.max_colwidth', None)
clear_output()

In [65]:
num_mfcc = 0

root = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/cnn/mobileNet'

coswara_feature = os.path.join(root, 'full_audio/melspectrogram_coswara_features.npy')

coughvid_feature = os.path.join(root, 'full_audio/melspectrogram_coughvid_features.npy')

meta_coswara = os.path.join(root, 'full_audio/coswara_metadata_for_label_matching.csv')
meta_coughvid = os.path.join(root, 'full_audio/coughvid_metadata_for_label_matching.csv')

##### Feature preparation

###### Metadata

In [66]:
# process metadata
meta_df = pd.read_csv(meta_coswara)
meta_df.head()

Unnamed: 0,id,age,covid_status,record_date,is_english_proficiency,gender,country,locality,state,is_returning_user,...,is_fatigue,is_sore_throat,is_ischemic_heart_disease,is_asthma,is_others_preexist_conditions,is_chronic_lung_disease,is_neumonia,cough_path,label,error
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,healthy,2020-04-23,y,male,India,Anantapur,Andhra Pradesh,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/iV3Db6t1T8b7c5HQY2TwxIhjbzD3/cough-shallow.wav,healthy,0
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,healthy,2020-04-20,y,male,India,BENGALURU URBAN,Karnataka,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/AxuYWBN0jFVLINCBqIW5aZmGCdu1/cough-shallow.wav,healthy,0
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,healthy,2020-04-24,y,female,United States,Pittsburgh,Pennsylvania,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/C5eIsssb9GSkaAgIfsHMHeR6fSh1/cough-shallow.wav,healthy,0
3,YjbEAECMBIaZKyfqOvWy5DDImUb2,26,healthy,2020-04-23,y,male,India,Bangalore,Karnataka,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/YjbEAECMBIaZKyfqOvWy5DDImUb2/cough-shallow.wav,healthy,0
4,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,32,healthy,2020-04-22,y,male,India,Nalanda,Bihar,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/aGOvk4ji0cVqIzCs1jHnzlw2UEy2/cough-shallow.wav,healthy,0


In [67]:
logger.info(meta_df.shape)

2022-05-22 02:48:10,286 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4465, 39)
2022-05-22 02:48:10,286 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4465, 39)


In [68]:
mel_feature = np.load(coswara_feature)
meta_df = pd.read_csv(meta_coswara)
meta_df = meta_df[meta_df['error'] == 0].reset_index(drop=True)

In [69]:
logger.info(meta_df.shape)

2022-05-22 02:48:12,044 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 39)
2022-05-22 02:48:12,044 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 39)


In [70]:
meta_df.label.value_counts().sum()

4400

In [71]:
mel_feature[0].shape, len(mel_feature), meta_df.shape

((224, 246), 4400, (4400, 39))

In [72]:
DROP_COLS = [
             'cough_path',
             'error',
             'record_date',
             'id',
             'covid_status',
             'test_date'
]

meta_df.isna().sum()/4400*100 > 50

id                               False
age                              False
covid_status                     False
record_date                      False
is_english_proficiency           False
gender                           False
country                          False
locality                         False
state                            False
is_returning_user                False
is_smoker                         True
is_cold                           True
is_hypertension                   True
is_diabetes                       True
is_cough                          True
date_of_ct_scan                   True
has_ctScan                        True
ct_score                          True
is_diarrheoa                      True
is_fever                          True
is_loss_of_smell                  True
is_muscle_pain                    True
test_type                         True
test_date                         True
test_status                       True
is_using_mask            

In [73]:
meta_df.dtypes

id                                object
age                                int64
covid_status                      object
record_date                       object
is_english_proficiency            object
gender                            object
country                           object
locality                          object
state                             object
is_returning_user                 object
is_smoker                         object
is_cold                           object
is_hypertension                   object
is_diabetes                       object
is_cough                          object
date_of_ct_scan                   object
has_ctScan                        object
ct_score                         float64
is_diarrheoa                      object
is_fever                          object
is_loss_of_smell                  object
is_muscle_pain                    object
test_type                         object
test_date                         object
test_status     

In [74]:
# fill na with unknown value
meta_df.fillna('unknown', inplace=True)
meta_df.isna().sum()

id                               0
age                              0
covid_status                     0
record_date                      0
is_english_proficiency           0
gender                           0
country                          0
locality                         0
state                            0
is_returning_user                0
is_smoker                        0
is_cold                          0
is_hypertension                  0
is_diabetes                      0
is_cough                         0
date_of_ct_scan                  0
has_ctScan                       0
ct_score                         0
is_diarrheoa                     0
is_fever                         0
is_loss_of_smell                 0
is_muscle_pain                   0
test_type                        0
test_date                        0
test_status                      0
is_using_mask                    0
vaccination_status               0
is_breathing_difficulty          0
is_others_resp      

In [75]:
# drop cols
meta_df.drop(DROP_COLS, axis=1, inplace=True)
logger.info(meta_df.shape)

2022-05-22 02:48:12,163 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 33)
2022-05-22 02:48:12,163 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 33)


In [76]:
# replace label with binary numbers
meta_df.label = meta_df.label.map({
    'healthy': 0,
    'positive_mild': 1,
    'no_resp_illness_exposed': 1,
    'resp_illness_not_identified': 0,
    'positive_moderate': 1,
    'recovered_full': 0,
    'positive_asymp': 1
})
meta_df.label.value_counts()

0    3194
1    1206
Name: label, dtype: int64

In [77]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in meta_df.drop(['label', 'age'], axis=1).columns:
  meta_df[col] = meta_df[col].astype('string')
  meta_df[col] = le.fit_transform(meta_df[col].values)

In [78]:
meta_feature = np.array(meta_df.drop(['label'], axis=1))

##### Features

###### 2d

In [79]:
label_0 = meta_df[meta_df['label']==0].copy().reset_index(drop=True)
label_1 = meta_df[meta_df['label']==1].copy().reset_index(drop=True)

In [80]:
# get index of each label 0, 1 to match with extracted features
index_1 = meta_df[meta_df['label']==1].index.tolist()
index_0 = meta_df[meta_df['label']==0].index.tolist()
logger.info(f'{len(index_1)}, {len(index_0)}')

2022-05-22 02:48:12,297 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - 1206, 3194
2022-05-22 02:48:12,297 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - 1206, 3194


In [81]:
# get features for each class
feature_0 = mel_feature[index_0]
feature_1 = mel_feature[index_1]

In [82]:
print(len(feature_1))

1206


In [83]:
del mel_feature

In [84]:
# split data and reshape feature
def data_split(feature_0, feature_1, label0, label1):
  train = np.concatenate((feature_0[:2555], feature_1[:965]), axis=0)
  val = np.concatenate((feature_0[2555:2875], feature_1[965:1086]), axis=0)
  test = np.concatenate((feature_0[2875:], feature_1[1086:]), axis=0)

  y_train = pd.concat([label0[:2555], label1[:965]], ignore_index=True)
  y_val = pd.concat([label0[2555:2875], label1[965:1086]], ignore_index=True)
  y_test = pd.concat([label0[2875:], label1[1086:]], ignore_index=True)

  return train, val, test,\
         np.array(y_train.label), np.array(y_val.label), np.array(y_test.label)

In [85]:
X_train_2d, X_val_2d, X_test_2d, y_train_2d, y_val_2d, y_test_2d = data_split(feature_0, feature_1, label_0, label_1)
print(X_train_2d.shape)

(3520, 224, 246)


In [86]:
y_train_2d

array([0, 0, 0, ..., 1, 1, 1])

In [87]:
del feature_0
del feature_1

###### 1d

In [88]:
feature_0 = meta_feature[index_0]
feature_1 = meta_feature[index_1]

In [89]:
X_train_1d, X_val_1d, X_test_1d, y_train_1d, y_val_1d, y_test_1d = data_split(feature_0, feature_1, label_0, label_1)

In [90]:
from sklearn.utils import shuffle

X_train_2d, X_train_1d, y_train_2d, y_train_1d = shuffle(X_train_2d, X_train_1d, y_train_2d, y_train_1d, random_state=42)
X_val_2d, X_val_1d, y_val_2d, y_val_1d = shuffle(X_val_2d, X_val_1d, y_val_2d, y_val_1d, random_state=42)
X_test_2d, X_test_1d, y_test_2d, y_test_1d = shuffle(X_test_2d, X_test_1d, y_test_2d, y_test_1d, random_state=42)

### MFCC

#### 13

##### ML

In [136]:
import os
import pandas as pd

root = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/full_audio'

ml_mel_coswara = os.path.join(root, 'melspectrogram_coswara_features.csv')

ml_mfcc_13_coswara = os.path.join(root, 'mfcc_13_coswara_features.csv')
ml_mfcc_26_coswara = os.path.join(root, 'mfcc_26_coswara_features.csv')
ml_mfcc_39_coswara = os.path.join(root, 'mfcc_39_coswara_features.csv')

In [137]:
def data_split(label0, label1):
  p180 = 965
  p190 = 1086
  p080 = 2555
  p090 = 2875
  
  train = pd.concat([label0[:p080], label1[:p180]], ignore_index=True)
  val = pd.concat([label0[p080:p090], label1[p180:p190]], ignore_index=True)
  test = pd.concat([label0[p090:], label1[p190:]], ignore_index=True)

  train = train.sample(frac=1).reset_index(drop=True)
  val = val.sample(frac=1).reset_index(drop=True)
  test = test.sample(frac=1).reset_index(drop=True)

  return train, val, test

In [154]:
mfcc_coswara_df = pd.read_csv(ml_mfcc_13_coswara)
mfcc_coswara_df.label = mfcc_coswara_df.label.map({
    'healthy': 0,
    'positive_mild': 1,
    'no_resp_illness_exposed': 1,
    'resp_illness_not_identified': 0,
    'positive_moderate': 1,
    'recovered_full': 0,
    'positive_asymp': 1
})
label0 = mfcc_coswara_df[mfcc_coswara_df['label']==0].copy().reset_index(drop=True)
label1 = mfcc_coswara_df[mfcc_coswara_df['label']==1].copy().reset_index(drop=True)

In [155]:
_, __, ml_test_cos_mfcc = data_split(label0, label1)

In [156]:
ml_X_test_cos_mfcc = ml_test_cos_mfcc.iloc[:, :-1].values
ml_y_test_cos_mfcc = ml_test_cos_mfcc.iloc[:, -1].values

##### DL

###### Set up

In [172]:
# from google.colab import drive
# drive.mount('/content/drive')

In [173]:
from IPython.display import clear_output, Audio
import IPython.display as ipd
import os
import numpy as np
import shutil
import pandas as pd
import logging
import warnings

warnings.filterwarnings("ignore")


logger = logging.getLogger('COSWARA - HYBRID MODEL FULL AUDIO')
logger.setLevel(logging.DEBUG)

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# add formatter to ch
ch.setFormatter(formatter)

# add ch to logger
logger.addHandler(ch)
pd.set_option('display.max_colwidth', None)
clear_output()

In [174]:
n_mfcc = 13
root = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/cnn/mobileNet'

coswara_feature = os.path.join(root, f'full_audio/mfcc_{n_mfcc}_coswara_features.npy')
coughvid_feature = os.path.join(root, f'full_audio/mfcc_{n_mfcc}_coughvid_features.npy')

###### Feature preparation

###### Metadata

In [175]:
# process metadata
meta_df = pd.read_csv(meta_coswara)
meta_df.head()

Unnamed: 0,id,age,covid_status,record_date,is_english_proficiency,gender,country,locality,state,is_returning_user,...,is_fatigue,is_sore_throat,is_ischemic_heart_disease,is_asthma,is_others_preexist_conditions,is_chronic_lung_disease,is_neumonia,cough_path,label,error
0,iV3Db6t1T8b7c5HQY2TwxIhjbzD3,28,healthy,2020-04-23,y,male,India,Anantapur,Andhra Pradesh,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/iV3Db6t1T8b7c5HQY2TwxIhjbzD3/cough-shallow.wav,healthy,0
1,AxuYWBN0jFVLINCBqIW5aZmGCdu1,25,healthy,2020-04-20,y,male,India,BENGALURU URBAN,Karnataka,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/AxuYWBN0jFVLINCBqIW5aZmGCdu1/cough-shallow.wav,healthy,0
2,C5eIsssb9GSkaAgIfsHMHeR6fSh1,28,healthy,2020-04-24,y,female,United States,Pittsburgh,Pennsylvania,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/C5eIsssb9GSkaAgIfsHMHeR6fSh1/cough-shallow.wav,healthy,0
3,YjbEAECMBIaZKyfqOvWy5DDImUb2,26,healthy,2020-04-23,y,male,India,Bangalore,Karnataka,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/YjbEAECMBIaZKyfqOvWy5DDImUb2/cough-shallow.wav,healthy,0
4,aGOvk4ji0cVqIzCs1jHnzlw2UEy2,32,healthy,2020-04-22,y,male,India,Nalanda,Bihar,n,...,,,,,,,,/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/data/coswara/coswara_cough_data/20200424/aGOvk4ji0cVqIzCs1jHnzlw2UEy2/cough-shallow.wav,healthy,0


In [176]:
logger.info(meta_df.shape)

2022-05-22 04:04:43,329 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4465, 39)
2022-05-22 04:04:43,329 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4465, 39)
2022-05-22 04:04:43,329 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4465, 39)
2022-05-22 04:04:43,329 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4465, 39)


In [177]:
mfcc_feature = np.load(coswara_feature)
meta_df = pd.read_csv(meta_coswara)
meta_df = meta_df[meta_df['error'] == 0].reset_index(drop=True)

In [178]:
logger.info(meta_df.shape)

2022-05-22 04:04:49,463 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 39)
2022-05-22 04:04:49,463 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 39)
2022-05-22 04:04:49,463 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 39)
2022-05-22 04:04:49,463 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 39)


In [179]:
meta_df.label.value_counts().sum()

4400

In [180]:
DROP_COLS = [
             'cough_path',
             'error',
             'record_date',
             'id',
             'covid_status',
             'test_date'
]

meta_df.isna().sum()/4400*100 > 50

id                               False
age                              False
covid_status                     False
record_date                      False
is_english_proficiency           False
gender                           False
country                          False
locality                         False
state                            False
is_returning_user                False
is_smoker                         True
is_cold                           True
is_hypertension                   True
is_diabetes                       True
is_cough                          True
date_of_ct_scan                   True
has_ctScan                        True
ct_score                          True
is_diarrheoa                      True
is_fever                          True
is_loss_of_smell                  True
is_muscle_pain                    True
test_type                         True
test_date                         True
test_status                       True
is_using_mask            

In [181]:
# fill na with unknown value
meta_df.fillna('unknown', inplace=True)
meta_df.isna().sum()

id                               0
age                              0
covid_status                     0
record_date                      0
is_english_proficiency           0
gender                           0
country                          0
locality                         0
state                            0
is_returning_user                0
is_smoker                        0
is_cold                          0
is_hypertension                  0
is_diabetes                      0
is_cough                         0
date_of_ct_scan                  0
has_ctScan                       0
ct_score                         0
is_diarrheoa                     0
is_fever                         0
is_loss_of_smell                 0
is_muscle_pain                   0
test_type                        0
test_date                        0
test_status                      0
is_using_mask                    0
vaccination_status               0
is_breathing_difficulty          0
is_others_resp      

In [182]:
# drop cols
meta_df.drop(DROP_COLS, axis=1, inplace=True)
logger.info(meta_df.shape)

2022-05-22 04:04:49,578 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 33)
2022-05-22 04:04:49,578 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 33)
2022-05-22 04:04:49,578 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 33)
2022-05-22 04:04:49,578 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - (4400, 33)


In [183]:
# replace label with binary numbers
meta_df.label = meta_df.label.map({
    'healthy': 0,
    'positive_mild': 1,
    'no_resp_illness_exposed': 1,
    'resp_illness_not_identified': 0,
    'positive_moderate': 1,
    'recovered_full': 0,
    'positive_asymp': 1
})
meta_df.label.value_counts()

0    3194
1    1206
Name: label, dtype: int64

In [184]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in meta_df.drop(['label', 'age'], axis=1).columns:
  meta_df[col] = meta_df[col].astype('string')
  meta_df[col] = le.fit_transform(meta_df[col].values)

In [185]:
meta_feature = np.array(meta_df.drop(['label'], axis=1))

###### 2d

In [186]:
label_0 = meta_df[meta_df['label']==0].copy().reset_index(drop=True)
label_1 = meta_df[meta_df['label']==1].copy().reset_index(drop=True)

In [187]:
# get index of each label 0, 1 to match with extracted features
index_1 = meta_df[meta_df['label']==1].index.tolist()
index_0 = meta_df[meta_df['label']==0].index.tolist()
logger.info(f'{len(index_1)}, {len(index_0)}')

2022-05-22 04:04:49,711 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - 1206, 3194
2022-05-22 04:04:49,711 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - 1206, 3194
2022-05-22 04:04:49,711 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - 1206, 3194
2022-05-22 04:04:49,711 - COSWARA - HYBRID MODEL FULL AUDIO - INFO - 1206, 3194


In [188]:
# get features for each class
feature_0 = mfcc_feature[index_0]
feature_1 = mfcc_feature[index_1]

In [189]:
print(len(feature_1))

1206


In [190]:
del mfcc_feature

In [191]:
# split data and reshape feature
def data_split(feature_0, feature_1, label0, label1):
  train = np.concatenate((feature_0[:2555], feature_1[:965]), axis=0)
  val = np.concatenate((feature_0[2555:2875], feature_1[965:1086]), axis=0)
  test = np.concatenate((feature_0[2875:], feature_1[1086:]), axis=0)

  y_train = pd.concat([label0[:2555], label1[:965]], ignore_index=True)
  y_val = pd.concat([label0[2555:2875], label1[965:1086]], ignore_index=True)
  y_test = pd.concat([label0[2875:], label1[1086:]], ignore_index=True)

  return train, val, test,\
         np.array(y_train.label), np.array(y_val.label), np.array(y_test.label)

In [192]:
X_train_2d, X_val_2d, X_test_2d, y_train_2d, y_val_2d, y_test_2d = data_split(feature_0, feature_1, label_0, label_1)
print(X_train_2d.shape)

(3520, 234, 246)


In [193]:
y_train_2d

array([0, 0, 0, ..., 1, 1, 1])

In [194]:
del feature_0
del feature_1

###### 1d

In [195]:
feature_0 = meta_feature[index_0]
feature_1 = meta_feature[index_1]

In [196]:
X_train_1d, X_val_1d, X_test_1d, y_train_1d, y_val_1d, y_test_1d = data_split(feature_0, feature_1, label_0, label_1)

In [197]:
from sklearn.utils import shuffle

X_train_2d, X_train_1d, y_train_2d, y_train_1d = shuffle(X_train_2d, X_train_1d, y_train_2d, y_train_1d, random_state=42)
X_val_2d, X_val_1d, y_val_2d, y_val_1d = shuffle(X_val_2d, X_val_1d, y_val_2d, y_val_1d, random_state=42)
X_test_2d, X_test_1d, y_test_2d, y_test_1d = shuffle(X_test_2d, X_test_1d, y_test_2d, y_test_1d, random_state=42)

## Bagging ensemble

In [None]:
!pip install catboost

In [45]:
import tensorflow as tf
import pickle
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

In [42]:
def bagging_ensemble(prob: list):
  s_prob = np.zeros(prob[0].shape)
  for l in prob:
    s_prob += l

  return s_prob/len(prob)

### Mel-spectrogram

##### ML

In [12]:
ml_saved_p = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/saved_model'
lgbm_p = os.path.join(ml_saved_p, 'lgbm_full_cv_mel_smote.pkl')
svm_p = os.path.join(ml_saved_p, 'svm_full_cv_mel_smote.pkl')
cat_p = os.path.join(ml_saved_p, 'catboost_full_cv_mel_smote.pkl')
rf_p = os.path.join(ml_saved_p, 'rf_full_cv_mel_smote.pkl')

lgbm = pickle.load(open(lgbm_p, 'rb'))
svm = pickle.load(open(svm_p, 'rb'))
cat = pickle.load(open(cat_p, 'rb'))
rf = pickle.load(open(rf_p, 'rb'))

In [116]:
lgbm_pred = lgbm.predict(ml_X_test_cos_mel)
svm_pred = svm.predict_proba(ml_X_test_cos_mel)[:, 1]
cat_pred = cat.predict_proba(ml_X_test_cos_mel)[:, 1]
rf_pred = rf.predict_proba(ml_X_test_cos_mel)[:, 1]

In [117]:
ml_bagging = bagging_ensemble([lgbm_pred, svm_pred, cat_pred, rf_pred])

In [118]:
fpr_test, tpr_test, thresh_test = roc_curve(ml_y_test_cos_mel, ml_bagging, pos_label=1)
auc_test = roc_auc_score(ml_y_test_cos_mel, ml_bagging)
auc_test

0.5127481713688611

##### DL

In [91]:
# reshape training data

X_test_2d = X_test_2d[..., np.newaxis]
X_test_2d.shape

(439, 224, 246, 1)

In [99]:
from tensorflow import keras

saved_p = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/cnn/mobileNet/saved_model'
mobile = keras.models.load_model(os.path.join(saved_p, 'full_audio_coswara_mel_model_40_8_0.001'))
hybrid = keras.models.load_model(os.path.join(saved_p, 'full_audio_coswara_mel_hybrid_model_40_64_0.001'))

In [100]:
mobilenet_pred = mobile.predict(X_test_2d)[:, 1]
hybrid_pred = hybrid([X_test_2d, X_test_1d])[:, 1]
dl_bagging = bagging_ensemble([mobilenet_pred, hybrid_pred])

In [101]:
fpr_test, tpr_test, thresh_test = roc_curve(y_test_2d, dl_bagging, pos_label=1)
auc_test = roc_auc_score(y_test_2d, dl_bagging)
auc_test

0.7775862068965518

In [132]:
fn_bagging = (0.1*np.array(ml_bagging) + 0.9*np.array(dl_bagging))/2

In [133]:
fpr_test, tpr_test, thresh_test = roc_curve(y_test_2d, fn_bagging, pos_label=1)
auc_test = roc_auc_score(y_test_2d, fn_bagging)
auc_test

0.7755224660397074

### MFCC

#### 13

##### ML

In [135]:
ml_saved_p = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/saved_model'
lgbm_p = os.path.join(ml_saved_p, 'lgbm_full_cv_mfcc13_smote.pkl')
svm_p = os.path.join(ml_saved_p, 'svm_full_cv_mfcc13_smote.pkl')
cat_p = os.path.join(ml_saved_p, 'catboost_full_cv_mfcc13_smote.pkl')
rf_p = os.path.join(ml_saved_p, 'rf_full_cv_mfcc13_smote.pkl')

lgbm = pickle.load(open(lgbm_p, 'rb'))
svm = pickle.load(open(svm_p, 'rb'))
cat = pickle.load(open(cat_p, 'rb'))
rf = pickle.load(open(rf_p, 'rb'))

In [158]:
lgbm_pred = lgbm.predict(ml_X_test_cos_mfcc)
svm_pred = svm.predict_proba(ml_X_test_cos_mfcc)[:, 1]
cat_pred = cat.predict_proba(ml_X_test_cos_mfcc)[:, 1]
rf_pred = rf.predict_proba(ml_X_test_cos_mfcc)[:, 1]

In [162]:
ml_bagging = bagging_ensemble([lgbm_pred, svm_pred, cat_pred, rf_pred])

In [163]:
fpr_test, tpr_test, thresh_test = roc_curve(ml_y_test_cos_mfcc, ml_bagging, pos_label=1)
auc_test = roc_auc_score(ml_y_test_cos_mel, ml_bagging)
auc_test

0.4821055381400209

##### DL

In [198]:
# reshape training data

X_test_2d = X_test_2d[..., np.newaxis]
X_test_2d.shape

(439, 234, 246, 1)

In [199]:
from tensorflow import keras

saved_p = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/cnn/mobileNet/saved_model'
mobile = keras.models.load_model(os.path.join(saved_p, f'full_audio_coswara_mfcc_{n_mfcc}_model_40_32_0.001'))
hybrid = keras.models.load_model(os.path.join(saved_p, f'full_audio_coswara_mfcc_{n_mfcc}_hybrid_model_40_16_0.001'))

In [200]:
mobilenet_pred = mobile.predict(X_test_2d)[:, 1]
hybrid_pred = hybrid([X_test_2d, X_test_1d])[:, 1]
dl_bagging = bagging_ensemble([mobilenet_pred, hybrid_pred])

In [201]:
fpr_test, tpr_test, thresh_test = roc_curve(y_test_2d, dl_bagging, pos_label=1)
auc_test = roc_auc_score(y_test_2d, dl_bagging)
auc_test

0.7567398119122257

In [202]:
mfcc_13_fn_bagging = bagging_ensemble([np.array(ml_bagging), np.array(dl_bagging)])

In [203]:
fpr_test, tpr_test, thresh_test = roc_curve(y_test_2d, mfcc_13_fn_bagging, pos_label=1)
auc_test = roc_auc_score(y_test_2d, mfcc_13_fn_bagging)
auc_test

0.661833855799373

#### 26

#### 39

# Chunks

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.5-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 79.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.5


In [None]:
import pickle, joblib

k = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/saved_model/lgbm_full_cv_mfcc39_smote.pkl'
l = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/saved_model/rf_chunk_cos_mfcc39_smote.pkl'
h = '/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/saved_model/catboost_full_cv_mfcc13_smote.pkl'
# with open('/content/drive/MyDrive/Colab Notebooks/Senior Thesis/Workspace/recognition/ml_model/saved_model/catboost_chunk_cos_mel_raw.pkl', 'rb') as f:
m = pickle.load(open(h, 'rb'))


---

# COUGHVID


---

