In [2]:
import os, json, pickle, inspect
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.amp import GradScaler
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoConfig, AutoFeatureExtractor

import utils
import commons
import models
from cough_datasets import CoughDatasets, CoughDatasetsCollate

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score, f1_score

from tensorboard.backend.event_processing import event_accumulator

import warnings
warnings.simplefilter("ignore", UserWarning)

# =============================================================
# SECTION: Intialize Data
# =============================================================
INIT = True
MODEL_NAME = "unamed"
CONFIG_PATH = "configs/lstm_cnn.json"

model_dir = os.path.join("./logs", MODEL_NAME)
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

config_save_path = os.path.join(model_dir, "config.json")
if INIT:
    with open(CONFIG_PATH, "r") as f:
      data = f.read()
    with open(config_save_path, "w") as f:
      f.write(data)
else:
    with open(config_save_path, "r") as f:
      data = f.read()

config = json.loads(data)
  
hps = utils.HParams(**config)
hps.model_dir = model_dir

BATCH_SIZE = hps.train.batch_size
ACCUMULATION_STEP = hps.train.accumulation_steps
assert (ACCUMULATION_STEP > 0) and (BATCH_SIZE % ACCUMULATION_STEP == 0)
cur_bs = BATCH_SIZE // ACCUMULATION_STEP

# =============================================================
# SECTION: Loading Data
# =============================================================

##### Label Umum Semua
Diseases_codes = [0, 1]
CLASS_NAMES = ["Healthy", "TB"]

df = pd.read_csv(f'{hps.data.db_path}/{hps.data.metadata_csv}')
df = df[df['cough_score'] >= 0.90].sample(frac=1, random_state=40)

df_solic = df[df['type_cough'] == 0].sample(frac=1, random_state=41)
df_long = df[df['type_cough'] == 1].sample(frac=1, random_state=42) # 0 Solic, 1 Longi
df_long_array = []
for i_rand in range(5):
    df_0 = df_long[df_long['disease_label'] == 0].sample(n=df_solic['disease_label'].value_counts()[0], random_state=i_rand * 4)
    df_1 = df_long[df_long['disease_label'] == 1].sample(n=df_solic['disease_label'].value_counts()[1], random_state=i_rand * 4)
    df_long_array.append(pd.concat([df_0, df_1], ignore_index=True, sort=False))

df = df
#df = df_solic
#df = df_long_array[0]
#df = df_long
print(df.shape)

df_train, df_test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)
#df_train, df_test = df_long, df_solic

df_issue = pd.read_csv("df_issue.csv")
df_train = df_train[~df_train['path_file'].isin(df_issue['wavname'])]
df_test = df_test[~df_test['path_file'].isin(df_issue['wavname'])]

class_frequencies = df_train['disease_label'].value_counts().to_dict()
total_samples = len(df_train)
class_weights = {cls: total_samples / (len(Diseases_codes) * freq) if freq != 0 else 0 for cls, freq in class_frequencies.items()}
weights_list = [class_weights[cls] for cls in Diseases_codes]
class_weights_tensor = torch.tensor(weights_list, device='cuda', dtype=torch.float)
class_weights_tensor = None
print(class_weights_tensor)

# =============================================================
# SECTION: Setup Logger, Dataloader
# =============================================================
logger = utils.get_logger(hps.model_dir)
logger.info(hps)

writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

collate_fn = CoughDatasetsCollate(hps.data.many_class)
train_dataset = CoughDatasets(df_train.values, hps.data, train=True)
val_dataset = CoughDatasets(df_test.values, hps.data, train=False)

#train_sampler = DistributedBucketSampler(train_dataset, cur_bs, [32,300,400,500,600,700,800,900,1000], num_replicas=1, rank=0, shuffle=True)
#train_loader = DataLoader(train_dataset, num_workers=28, shuffle=False, pin_memory=True, collate_fn=collate_fn, batch_sampler=train_sampler)
train_loader = DataLoader(train_dataset, num_workers=28, shuffle=True, batch_size=cur_bs, pin_memory=True, drop_last=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, num_workers=28, shuffle=False, batch_size=hps.train.batch_size, pin_memory=True, drop_last=True, collate_fn=collate_fn)

print(next(iter(train_loader))[1][0].numpy().shape)
# =============================================================
# SECTION: Setup Logger, Dataloader
# =============================================================
logger.info(f"======================================")
logger.info(f"✨ Loss: {hps.train.loss_function}")
logger.info(f"✨ Use Between Class Training: {hps.data.mix_audio}")
logger.info(f"✨ Use Augment: {hps.data.augment_data}")
logger.info(f"✨ Padding Type: {hps.data.pad_types}")
logger.info(f"✨ Using Model: {hps.model.pooling_model}")
logger.info(f"======================================")

epoch_str = 1
global_step = 0

pool_net = getattr(models, hps.model.pooling_model)
pool_model = pool_net(hps.model.feature_dim, **hps.model).cuda()

optimizer_p = torch.optim.AdamW(pool_model.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
scheduler_p = torch.optim.lr_scheduler.ExponentialLR(optimizer_p, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)

class_code_pool_net = inspect.getsource(pool_net)
with open(f'{hps.model_dir}/model_net.py.bak', 'w') as f:
    f.write("import torch\nimport torch.nn as nn\n\n")
    f.write(class_code_pool_net)

# =============================================================
# SECTION: Setup Logger, Dataloader
# =============================================================
best_lost = np.inf
patience_val = []

if hps.train.warm_start:
    if hps.train.from_pretrain:
        print(hps.train.warm_start_checkpoint_pool)
        checkpoint = torch.load(hps.train.warm_start_checkpoint_pool, map_location='cpu', weights_only=True)['model']
        state_dict = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint
        model_dict = {}

        for key, value in state_dict.items():
            if key.startswith('v.'):
                new_key = key[2:]
                model_dict[new_key] = value

        if hasattr(pool_model, 'module'):
            pool_model.module.load_state_dict(model_dict, strict=True)
        else:
            pool_model.load_state_dict(model_dict, strict=True)
    else:
        pool_model = utils.warm_start_model(hps.train.warm_start_checkpoint_pool, pool_model, hps.train.ignored_layer)
else:
    try:
        _, _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "pool_*.pth"),
            pool_model,
            optimizer_p,
            scheduler_p,
        )

        epoch_str += 1
        global_step = (epoch_str - 1) * len(train_loader)

        with open(os.path.join(hps.model_dir, "traindata.pickle"), 'rb') as handle:
            traindata = pickle.load(handle)
            best_lost = traindata['best_lost']
            patience_val = traindata['patience_val']
        
    except Exception as e:
        print(e)

scaler = GradScaler('cuda')
optimizer_p.zero_grad(set_to_none=True)

# =============================================================
# SECTION: Train Epoch
# =============================================================

for batch_idx, (wav_names, audio, attention_masks, dse_ids, spk_ids) in enumerate(tqdm(train_loader)):
    audio = audio.cuda(non_blocking=True).float().squeeze(1)
    attention_masks = attention_masks.cuda(non_blocking=True).float()
    dse_ids = dse_ids.cuda(non_blocking=True).float()
    spk_ids = spk_ids.cuda(non_blocking=True).long()

    break

(656292, 5)
None
INFO:unamed:{'train': {'use_cuda': True, 'log_interval': 20, 'seed': 1234, 'epochs': 10000, 'learning_rate': 0.0005, 'betas': [0.8, 0.99], 'eps': 1e-09, 'lr_decay': 0.999875, 'warmup_steps': 0, 'scheduler': 'noam', 'batch_size': 128, 'accumulation_steps': 2, 'fp16_run': False, 'warm_start': False, 'loss_function': 'CE', 'from_pretrain': False, 'warm_start_checkpoint_pool': './logs/Dim_PoolingSep_VATTTry6_Roberto_normmax/best_pool.pth', 'ignored_layer': []}, 'data': {'max_value_norm': False, 'max_wav_value': 32768.0, 'sampling_rate': 16000, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': 8000.0, 'desired_length': 1.2, 'fade_samples_ratio': 16, 'pad_types': 'zero', 'acoustic_feature': False, 'feature_type': 'melspectogram', 'multimask_augment': False, 'augment_data': False, 'add_noise': False, 'mix_audio': False, 'many_class': 2, 'db_path': '/run/media/fourier/Data1/Pras/Database_ThesisNew/', 'metadata_csv'

  0%|          | 0/9227 [00:01<?, ?it/s]


In [3]:
ssl_model = AutoModel.from_pretrained("openai/whisper-large-v3")

In [7]:
ssl_model.encoder.state_dict()

OrderedDict([('conv1.weight',
              tensor([[[-0.0010, -0.0024,  0.0006],
                       [ 0.0169,  0.0162,  0.0154],
                       [ 0.0104,  0.0071,  0.0079],
                       ...,
                       [-0.0008,  0.0016, -0.0017],
                       [-0.0081, -0.0040, -0.0075],
                       [-0.0114, -0.0128, -0.0128]],
              
                      [[ 0.0003,  0.0008, -0.0003],
                       [-0.0030,  0.0009,  0.0015],
                       [-0.0036,  0.0016,  0.0005],
                       ...,
                       [ 0.0004,  0.0028, -0.0003],
                       [ 0.0019,  0.0028, -0.0015],
                       [ 0.0007, -0.0004,  0.0008]],
              
                      [[ 0.0144, -0.0008, -0.0083],
                       [ 0.0258,  0.0055, -0.0079],
                       [ 0.0073, -0.0018, -0.0107],
                       ...,
                       [ 0.0067,  0.0042,  0.0024],
                      

In [15]:
ssl_model

WhisperModel(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 1280)
    (layers): ModuleList(
      (0-31): 32 x WhisperEncoderLayer(
        (self_attn): WhisperSdpaAttention(
          (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwis

In [3]:
import os, json, pickle, inspect
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.amp import GradScaler
from sklearn.model_selection import train_test_split

import utils
import commons
import models
from cough_datasets import CoughDatasets, CoughDatasetsCollate

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score, f1_score

from tensorboard.backend.event_processing import event_accumulator

import warnings
warnings.simplefilter("ignore", UserWarning)

# =============================================================
# SECTION: Intialize Data
# =============================================================
INIT = False
MODEL_NAME = "resnet_reproduce_cam"
CONFIG_PATH = "configs/lstm_cnn.json"

model_dir = os.path.join("./logs", MODEL_NAME)
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

config_save_path = os.path.join(model_dir, "config.json")
if INIT:
    with open(CONFIG_PATH, "r") as f:
      data = f.read()
    with open(config_save_path, "w") as f:
      f.write(data)
else:
    with open(config_save_path, "r") as f:
      data = f.read()

config = json.loads(data)
  
hps = utils.HParams(**config)
hps.model_dir = model_dir

BATCH_SIZE = hps.train.batch_size
ACCUMULATION_STEP = hps.train.accumulation_steps
assert (ACCUMULATION_STEP > 0) and (BATCH_SIZE % ACCUMULATION_STEP == 0)
cur_bs = BATCH_SIZE // ACCUMULATION_STEP

# =============================================================
# SECTION: Loading Data
# =============================================================

##### Label Umum Semua
Diseases_codes = [0, 1]
CLASS_NAMES = ["Negative TB", "Positive TB"]

df = pd.read_csv(f'{hps.data.db_path}/GoogleHealth/google_tb_metadata.csv')
df['path_file'] = f'GoogleHealth/' + df['path_file'] 

collate_fn = CoughDatasetsCollate(hps.data.many_class)
val_dataset = CoughDatasets(df.values, hps.data, train=False)
val_loader = DataLoader(val_dataset, num_workers=28, shuffle=False, batch_size=hps.train.batch_size, pin_memory=True, drop_last=True, collate_fn=collate_fn)

# =============================================================
# SECTION: Setup Logger, Dataloader
# =============================================================
epoch_str = 1
global_step = 0

pool_net = getattr(models, hps.model.pooling_model)
pool_model = pool_net(hps.model.feature_dim, **hps.model).cuda()

optimizer_p = torch.optim.AdamW(pool_model.parameters(), hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
scheduler_p = torch.optim.lr_scheduler.ExponentialLR(optimizer_p, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)

_, _, _, _, epoch_str = utils.load_checkpoint(
    os.path.join(hps.model_dir, "best_pool.pth"),
    pool_model,
    optimizer_p,
    scheduler_p,
)

False


In [10]:
pool_model.eval() 
all_preds, all_labels, all_wavnames, all_embeddings  = [], [], [], []
with torch.no_grad():
    for batch_idx, (wav_names, audio, attention_masks, dse_ids, spk_ids) in enumerate(tqdm(val_loader)):
        audio = audio.cuda(non_blocking=True).float().squeeze(1)
        attention_masks = attention_masks.cuda(non_blocking=True).float()
        dse_ids = dse_ids.cuda(non_blocking=True).float()
        spk_ids = spk_ids.cuda(non_blocking=True).long()

        x_lengths = torch.tensor(commons.compute_length_from_mask(attention_masks)).cuda(non_blocking=True).long()
        out_model = pool_model(audio)
        outputs = out_model[0]
        
        preds = torch.argmax(outputs, dim=1)
        dse_ids = np.argmax(dse_ids.cpu().detach().numpy(), axis=-1)

        all_wavnames.extend(wav_names)
        all_preds.extend(preds.cpu().numpy())
        #all_embeddings.extend(out_model[1].cpu().numpy())
        all_labels.extend(dse_ids)

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)
all_wavnames = np.array(all_wavnames)
#all_embeddings = np.array(all_embeddings)

accuracy_score(all_labels, all_preds)

100%|██████████| 51/51 [00:01<00:00, 27.10it/s]


0.3241421568627451

In [8]:


df = pd.read_csv(f'{hps.data.db_path}/{hps.data.metadata_csv}')
df = df[df['cough_score'] >= 0.90].sample(frac=1, random_state=40)

df_solic = df[df['type_cough'] == 0].sample(frac=1, random_state=41)
df_long = df[df['type_cough'] == 1].sample(frac=1, random_state=42) # 0 Solic, 1 Longi
df_long_array = []
for i_rand in range(5):
    df_0 = df_long[df_long['disease_label'] == 0].sample(n=df_solic['disease_label'].value_counts()[0], random_state=i_rand * 4)
    df_1 = df_long[df_long['disease_label'] == 1].sample(n=df_solic['disease_label'].value_counts()[1], random_state=i_rand * 4)
    df_long_array.append(pd.concat([df_0, df_1], ignore_index=True, sort=False))

df = df
print(df.shape)

df_train, df_test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)
df_issue = pd.read_csv("df_issue.csv")
df_train = df_train[~df_train['path_file'].isin(df_issue['wavname'])]
df_test = df_test[~df_test['path_file'].isin(df_issue['wavname'])]

class_frequencies = df_train['disease_label'].value_counts().to_dict()
total_samples = len(df_train)
class_weights = {cls: total_samples / (len(Diseases_codes) * freq) if freq != 0 else 0 for cls, freq in class_frequencies.items()}
weights_list = [class_weights[cls] for cls in Diseases_codes]
class_weights_tensor = torch.tensor(weights_list, device='cuda', dtype=torch.float)
print(class_weights_tensor)

# =============================================================
# SECTION: Setup Logger, Dataloader
# =============================================================
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

collate_fn = CoughDatasetsCollate(hps.data.many_class)
train_dataset = CoughDatasets(df_train.values, hps.data, train=True)
val_dataset = CoughDatasets(df_test.values, hps.data, train=False)

#train_sampler = DistributedBucketSampler(train_dataset, cur_bs, [32,300,400,500,600,700,800,900,1000], num_replicas=1, rank=0, shuffle=True)
#train_loader = DataLoader(train_dataset, num_workers=28, shuffle=False, pin_memory=True, collate_fn=collate_fn, batch_sampler=train_sampler)
train_loader = DataLoader(train_dataset, num_workers=28, shuffle=True, batch_size=cur_bs, pin_memory=True, drop_last=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, num_workers=28, shuffle=False, batch_size=hps.train.batch_size, pin_memory=True, drop_last=True, collate_fn=collate_fn)

print(next(iter(train_loader))[1][0].numpy().shape)


(656292, 5)
tensor([1.2743, 0.8229], device='cuda:0')
True
False
(80, 63)


In [9]:
df = pd.read_csv(f'{hps.data.db_path}/{hps.data.metadata_csv}')
df_solic = df[df['type_cough'] == 0].sample(frac=1, random_state=41)
df_long = df[df['type_cough'] == 1].sample(frac=1, random_state=42)

In [16]:
df_long['disease_label'].count()

647060

In [17]:
df_long['disease_label'].value_counts()

disease_label
1    396182
0    250878
Name: count, dtype: int64

In [None]:

df = df[df['cough_score'] >= 0.90].sample(frac=1, random_state=40)

 # 0 Solic, 1 Longi

In [2]:
pool_model.eval() 
all_preds, all_labels, all_wavnames, all_embeddings  = [], [], [], []
with torch.no_grad():
    for batch_idx, (wav_names, audio, attention_masks, dse_ids, spk_ids) in enumerate(tqdm(train_loader)):
        audio = audio.cuda(non_blocking=True).float().squeeze(1)
        attention_masks = attention_masks.cuda(non_blocking=True).float()
        dse_ids = dse_ids.cuda(non_blocking=True).float()
        spk_ids = spk_ids.cuda(non_blocking=True).long()

        x_lengths = torch.tensor(commons.compute_length_from_mask(attention_masks)).cuda(non_blocking=True).long()
        out_model = pool_model(audio)
        outputs = out_model[0]
        
        preds = torch.argmax(outputs, dim=1)
        dse_ids = np.argmax(dse_ids.cpu().detach().numpy(), axis=-1)

        all_wavnames.extend(wav_names)
        all_preds.extend(preds.cpu().numpy())
        all_embeddings.extend(out_model[1].cpu().numpy())
        all_labels.extend(dse_ids)

    for batch_idx, (wav_names, audio, attention_masks, dse_ids, spk_ids) in enumerate(tqdm(val_loader)):
        audio = audio.cuda(non_blocking=True).float().squeeze(1)
        attention_masks = attention_masks.cuda(non_blocking=True).float()
        dse_ids = dse_ids.cuda(non_blocking=True).float()
        spk_ids = spk_ids.cuda(non_blocking=True).long()

        x_lengths = torch.tensor(commons.compute_length_from_mask(attention_masks)).cuda(non_blocking=True).long()
        out_model = pool_model(audio)
        outputs = out_model[0]
        
        preds = torch.argmax(outputs, dim=1)
        dse_ids = np.argmax(dse_ids.cpu().detach().numpy(), axis=-1)

        all_wavnames.extend(wav_names)
        all_preds.extend(preds.cpu().numpy())
        all_embeddings.extend(out_model[1].cpu().numpy())
        all_labels.extend(dse_ids)

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)
all_wavnames = np.array(all_wavnames)
all_embeddings = np.array(all_embeddings)

df_result = pd.DataFrame({
    'wavname': all_wavnames,
    'label': all_labels,
    'pred': all_preds
})

100%|██████████| 9228/9228 [00:44<00:00, 206.97it/s]
100%|██████████| 512/512 [00:09<00:00, 54.26it/s] 


In [4]:
accuracy_score(all_labels, all_preds)

0.9636097361747781

In [8]:
df_mismatch = df_result[df_result['label'] != df_result['pred']]

In [12]:
df_mismatch.to_csv("mismatch_data.csv", index=False)

In [5]:
df_result['wavname'] = hps.data.db_path + df_result['wavname']

In [3]:
from sliceguard import SliceGuard
from sklearn.metrics import accuracy_score

sg = SliceGuard()
issues = sg.find_issues(df_result, features=["wavname"], y="label", y_pred="pred", metric=accuracy_score, precomputed_embeddings={"wavname": all_embeddings})
report_df, spotlight_data_issues, spotlight_dtypes, spotlight_layout = sg.report(no_browser=True)



Using precomputed embeddings.
Pre-reducing feature wavname in mode native.
Using op mix ratio 0.8.
Using num dimensions 32.
The overall metric value is 0.9636168552477565
You didn't specify metric_mode parameter. Using max as default.
Detecting issues for criteria n_slices=20, criterion=drop, min_drop=None, min_support=None.
Identified 20 problematic slices.


In [6]:
from renumics import spotlight

spotlight.show(df_result.reset_index(), issues=spotlight_data_issues, layout=spotlight_layout)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:37719/'), HBox(children=(Button(description=…

In [5]:
combined_selected_rows = []

for now_data_issue in spotlight_data_issues:
    selected_rows = df_result.iloc[now_data_issue.rows]
    combined_selected_rows.append(selected_rows)

# Concatenate all selected rows into a single DataFrame
final_selected_df = pd.concat(combined_selected_rows, ignore_index=True)
final_selected_df['wavname'] = final_selected_df['wavname'].str.replace(hps.data.db_path, "", regex=False)

In [6]:
final_selected_df

Unnamed: 0,wavname,label,pred
0,CombineData2/TB/longitudinal_data/166031369478...,1,0
1,CombineData2/TB/longitudinal_data/163971760433...,1,0
2,CombineData2/TB/longitudinal_data/163958681315...,1,0
3,CombineData2/TB/longitudinal_data/163308643539...,0,1
4,CombineData2/TB/longitudinal_data/164450756765...,0,1
5,CombineData2/TB/longitudinal_data/163446334743...,0,1
6,CombineData2/TB/longitudinal_data/165236057349...,0,1
7,CombineData2/TB/longitudinal_data/164475324249...,0,1
8,CombineData2/TB/longitudinal_data/165225839680...,0,1
9,CombineData2/TB/longitudinal_data/164165415633...,1,0


In [7]:
final_selected_df.to_csv("df_issue2.csv", index=False)

In [None]:
filtered_df_result = df_result[~df_result['wavname'].isin(final_selected_df['wavname'])]

In [1]:
import os, json, pickle, inspect
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.amp import GradScaler
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoConfig

import utils
import commons
import models
from cough_datasets import CoughDatasets, CoughDatasetsCollate

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score, f1_score

from tensorboard.backend.event_processing import event_accumulator

import warnings
warnings.simplefilter("ignore", UserWarning)

# =============================================================
# SECTION: Intialize Data
# =============================================================
INIT = False
MODEL_NAME = "try_lstm_sken3_exludedf_issue"
CONFIG_PATH = "configs/lstm_cnn.json"

model_dir = os.path.join("./logs", MODEL_NAME)
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

config_save_path = os.path.join(model_dir, "config.json")
if INIT:
    with open(CONFIG_PATH, "r") as f:
      data = f.read()
    with open(config_save_path, "w") as f:
      f.write(data)
else:
    with open(config_save_path, "r") as f:
      data = f.read()

config = json.loads(data)
  
hps = utils.HParams(**config)
hps.model_dir = model_dir

BATCH_SIZE = hps.train.batch_size
ACCUMULATION_STEP = hps.train.accumulation_steps
assert (ACCUMULATION_STEP > 0) and (BATCH_SIZE % ACCUMULATION_STEP == 0)
cur_bs = BATCH_SIZE // ACCUMULATION_STEP

# =============================================================
# SECTION: Loading Data
# =============================================================

##### Label Umum Semua
Diseases_codes = [0, 1]
CLASS_NAMES = ["Healthy", "TB"]

df = pd.read_csv(f'{hps.data.db_path}/{hps.data.metadata_csv}')
df = df[df['cough_score'] >= 0.90].sample(frac=1, random_state=40)

df_solic = df[df['type_cough'] == 0].sample(frac=1, random_state=41)
df_long = df[df['type_cough'] == 1].sample(frac=1, random_state=42) # 0 Solic, 1 Longi
df_long_array = []
for i_rand in range(5):
    df_0 = df_long[df_long['disease_label'] == 0].sample(n=df_solic['disease_label'].value_counts()[0], random_state=i_rand * 4)
    df_1 = df_long[df_long['disease_label'] == 1].sample(n=df_solic['disease_label'].value_counts()[1], random_state=i_rand * 4)
    df_long_array.append(pd.concat([df_0, df_1], ignore_index=True, sort=False))

df = df
#df = df_solic
#df = df_long_array[0]
#df = df_long
print(df.shape)

df_train, df_test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)



(656292, 5)
