In [6]:
import json
import torch
import torch.nn as nn
import librosa 
import numpy as np
import pylab
import os
import librosa.display
from tqdm.notebook import tqdm
import warnings
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch.nn.functional as F
import torch.optim as optim
from audio2numpy import open_audio
from scipy.signal import hilbert
import signal_envelope as se
warnings.filterwarnings("ignore")
import soundfile

In [20]:
# Load the dataset metadata (parsed JSON) from the dataset root folder.
datasetPath = r"dataset/combine-russian-ESC"
metadata_file = os.path.join(datasetPath, 'metadata.json')
with open(metadata_file) as f:
  data = json.load(f)
  
#Smooth values
def movmean(values, window):
    """Return the simple moving average of `values` over `window` samples.

    Args:
        values: 1-D sequence of numbers to smooth.
        window: number of consecutive samples averaged for each output point.

    Returns:
        1-D numpy array produced by a 'valid' convolution with a uniform
        kernel, i.e. only positions where the window fully overlaps the
        input. For inputs at least `window` long this has
        ``len(values) - window + 1`` elements.
    """
    # Uniform kernel: every sample in the window contributes 1/window.
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')

#Feature extraction
# For every metadata entry with a readable audio file, compute one scalar
# feature: the sum of the magnitude of the analytic signal (amplitude
# envelope) of the lightly smoothed waveform. The three label lists are kept
# index-aligned with `areas`.
MISSING_LABEL = 2                    # stored when an entry lacks a given label
SMOOTHING_WINDOW = 3                 # moving-average length applied before the envelope
SUPPORTED_FORMATS = ("mp3", "ogg")   # formats this loop knows how to decode

areas  = []
asymptomatic = []
covid19      = []
verified     = []

for key in tqdm(data):
  if 'filename' not in key:
    continue
  filename = key['filename']
  filepath = os.path.join(datasetPath, "raw", filename)

  # Take the *last* extension, case-insensitively. Splitting on the first "."
  # picked the wrong token for names such as "a.b.mp3".
  extension = os.path.splitext(filename)[1].lstrip(".").lower()
  if extension not in SUPPORTED_FORMATS:
    # An unknown extension used to fall through without loading anything, so
    # the previous iteration's `signal` was silently reused for this entry.
    print(f"Skipping {filename}: unsupported audio format '{extension}'")
    continue

  try:
    if extension == "mp3":
      signal, sr = open_audio(filepath)
    else:
      signal, sr = soundfile.read(filepath)
  except Exception as err:
    # Best-effort: an unreadable file is reported and skipped. Catching
    # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
    print(f"Skipping {filename}: could not decode audio ({err})")
    continue

  # Reduce to a single channel: mono is used as-is; otherwise keep whichever
  # of the first two channels has the larger variance (ties keep channel 0).
  if signal.ndim == 1:
    correct = signal
  elif np.var(signal[:, 1]) > np.var(signal[:, 0]):
    correct = signal[:, 1]
  else:
    correct = signal[:, 0]

  correct = movmean(correct, SMOOTHING_WINDOW)
  area = np.sum(np.abs(hilbert(correct)))

  # Append the feature and its labels together so the four lists stay aligned.
  areas.append(area)
  asymptomatic.append(int(key.get('asymptomatic', MISSING_LABEL)))
  covid19.append(int(key.get('covid19', MISSING_LABEL)))
  verified.append(int(key.get('verified', MISSING_LABEL)))

areas, asymptomatic, covid19, verified = np.array(areas), np.array(asymptomatic), np.array(covid19), np.array(verified)   


  0%|          | 0/1364 [00:00<?, ?it/s]

0
0


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,auc,roc_curve,cohen_kappa_score

#Use LR to find threshold
def train(X, Y):
  """Fit a single-feature logistic regression and print its metrics.

  The model is evaluated on the same samples it was fitted on; no data is
  held out inside this function.

  Args:
    X: feature values; flattened into a column of shape (n_samples, 1).
    Y: class labels; flattened to shape (n_samples,).

  Returns:
    The fitted LogisticRegression instance.

  Prints, in order: the classification report, accuracy, ROC AUC and
  Cohen's kappa.
  """
  features = np.asarray(X).reshape(-1, 1)
  labels = np.asarray(Y).reshape(-1)

  model = LogisticRegression(max_iter=1000).fit(features, labels)
  # Predict once and reuse the result for every metric below.
  predicted = model.predict(features)

  print(classification_report(labels, predicted))
  print(accuracy_score(labels, predicted))
  # NOTE(review): the ROC curve is built from hard class predictions, not
  # from probabilities/scores, so this AUC describes the thresholded
  # classifier only.
  fpr, tpr, _ = roc_curve(labels, predicted)
  print(auc(fpr, tpr))
  print(cohen_kappa_score(labels, predicted))

  return model

In [22]:
#Experiments
# Keep only recordings whose covid19 and verified labels are both present
# (2 is the value stored for a missing label) and agree with each other.
has_labels = (covid19 != 2) & (verified != 2)
mask = has_labels & (covid19 == verified)
X, Y = areas[mask], verified[mask]
neigh = train(X, Y)

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       478
           1       0.90      0.95      0.93       381

    accuracy                           0.93       859
   macro avg       0.93      0.94      0.93       859
weighted avg       0.94      0.93      0.93       859

0.9336437718277066
0.9355829736764076
0.8663210228436167


In [25]:
# Evaluate fixed area cut-offs against the labels: a recording is predicted
# positive (1) when its area is at most the threshold. One loop replaces the
# three copy-pasted report blocks; the printed output is the same.
AREA_THRESHOLDS = (2000, 3000, 4000)

for threshold in AREA_THRESHOLDS:
  predicted = X <= threshold  # computed once per threshold, reused by every metric
  print(classification_report(Y, predicted))
  print(accuracy_score(Y, predicted))
  fpr, tpr, thresholds = roc_curve(Y, predicted)
  print(auc(fpr,tpr))
  print(cohen_kappa_score(Y, predicted))
  print("---")

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       478
           1       0.93      0.91      0.92       381

    accuracy                           0.93       859
   macro avg       0.93      0.93      0.93       859
weighted avg       0.93      0.93      0.93       859

0.9289871944121071
0.926871588750151
0.8557943433976491
---
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       478
           1       0.89      0.97      0.93       381

    accuracy                           0.93       859
   macro avg       0.93      0.94      0.93       859
weighted avg       0.94      0.93      0.93       859

0.9348079161816065
0.9384931747548293
0.8691168708201146
---
              precision    recall  f1-score   support

           0       0.99      0.84      0.91       478
           1       0.83      0.99      0.90       381

    accuracy                           0.91       859
   macr

In [29]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it the 80/20 split, and every metric computed from it,
# changes on each run of the notebook.
SPLIT_SEED = 42

X = np.array(X).reshape(-1,1)   # column of shape (n_samples, 1)
Y = np.array(Y).reshape(-1)     # flat label vector
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=SPLIT_SEED
)

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Standardise the feature with statistics taken from the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# 2-nearest-neighbour classifier fitted on the scaled training split.
classifier = KNeighborsClassifier(n_neighbors=2).fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

# Held-out metrics: report, accuracy, ROC AUC (from hard predictions), kappa.
print(classification_report(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
print(auc(fpr, tpr))
print(cohen_kappa_score(Y_test, Y_pred))


              precision    recall  f1-score   support

           0       0.82      0.96      0.88        93
           1       0.94      0.75      0.83        79

    accuracy                           0.86       172
   macro avg       0.88      0.85      0.86       172
weighted avg       0.87      0.86      0.86       172

0.8604651162790697
0.8519123451749013
0.7147201105736005
