In [28]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import warnings

# Silence every warning for the rest of the session. Note that this blanket
# filter also hides library deprecation notices raised by later cells.
warnings.filterwarnings('ignore')

In [29]:
# Load the pickled prediction lists for the validation and test splits.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open("preds/all_predictions_val.pkl", "rb") as val_pkl:
    all_predictions_val = pickle.load(val_pkl)

with open("preds/all_predictions_test.pkl", "rb") as test_pkl:
    all_predictions_test = pickle.load(test_pkl)

# Number of prediction entries per split, shown as (val, test).
len(all_predictions_val), len(all_predictions_test)

(2878, 4603)

In [30]:
# Read the word-level tables and replace whatever index was stored with a
# fresh 0..n-1 one, so rows can be paired with predictions by position.
val_preds = pd.read_parquet('data_processed/words/val.parquet')
val_preds = val_preds.reset_index(drop=True)

test_preds = pd.read_parquet('data_processed/words/test.parquet')
test_preds = test_preds.reset_index(drop=True)

In [31]:
# Attach one prediction array to each table row (matched by position);
# every entry is converted to a numpy array first.
val_preds['pred'] = list(map(np.array, all_predictions_val))
test_preds['pred'] = list(map(np.array, all_predictions_test))

In [32]:
# Validation file names, de-duplicated (np.unique also returns them sorted).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('data_processed/words/val_files.pkl', 'rb') as files_pkl:
    val_files = np.unique(pickle.load(files_pkl))

# Mapping whose length defines the number of target classes.
with open("data_processed/disorders_letters.pkl", "rb") as letters_pkl:
    disorders_letters = pickle.load(letters_pkl)

num_classes = len(disorders_letters)

In [33]:
def aggregate_preds(x, letters_by_class=None, n_classes=None):
    """Collapse the row-level predictions of one group into a single score vector.

    For a class with no associated letters, the score is the mean of that
    class's column over all rows of the group. For a class with letters, the
    mean is taken only over rows whose ``<letter>_count`` columns sum to a
    positive value; if the group has no such rows, the score stays 0.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group. Needs a ``pred`` column whose cells are equal-length
        1-D score arrays, plus a ``<letter>_count`` column for every letter
        listed in the mapping.
    letters_by_class : dict, optional
        Class index -> list of letters tied to that class. Defaults to the
        notebook-level ``disorders_letters``.
    n_classes : int, optional
        Length of the returned vector. Defaults to the notebook-level
        ``num_classes``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_classes`` with one aggregated score per class.
    """
    # Fall back to the notebook globals so `groupby(...).apply(aggregate_preds)`
    # keeps working unchanged.
    if letters_by_class is None:
        letters_by_class = disorders_letters
    if n_classes is None:
        n_classes = num_classes

    # Stack once; every per-class average below is a row subset of this matrix.
    scores = np.stack(x['pred'].values)
    overall_mean = scores.mean(axis=0)

    preds = [0] * n_classes

    for i, letters in letters_by_class.items():
        cols = [f'{letter}_count' for letter in letters]
        if len(cols) == 0:
            preds[i] = overall_mean[i]
            continue

        # Per-row total of the relevant letter counts, computed a single time.
        row_totals = x[cols].sum(axis=1)
        if row_totals.sum() > 0:
            has_letter = (row_totals > 0).to_numpy()
            preds[i] = scores[has_letter].mean(axis=0)[i]

    return np.array(preds)

In [34]:
def get_metric(y_true, y_pred, verbose=True):
    """Return the macro-averaged F1 of the arg-max predictions.

    ``y_pred`` holds per-class scores along its last axis; the predicted label
    is the index of the largest score. When ``verbose`` is true the confusion
    matrix is printed before the score is returned.
    """
    predicted_labels = np.argmax(y_pred, axis=-1)

    if verbose:
        print(confusion_matrix(y_true, predicted_labels))

    return f1_score(y_true, predicted_labels, average='macro')


# Validation

In [36]:
def process_y(y):
    """Name the two positional columns and index the frame by audio file name.

    Column ``0`` becomes ``audio_name`` (used as the index) and column ``1``
    becomes ``target``. The input frame is not modified; a new one is returned.
    """
    column_names = {0: 'audio_name', 1: 'target'}
    return y.rename(columns=column_names).set_index('audio_name')
    

# The CSV is read without a header row; process_y names its columns and
# indexes it by audio file name.
y = process_y(pd.read_csv("data/final/train.csv", header=None))

# Keep only the validation files (in val_files order) and collapse the
# single remaining column into a Series.
y = y.loc[val_files].squeeze()
y.head(3)

audio_name
005b4225-c0ca-4335-84a4-2c501aca6871.mp3    0
01195010-508e-426a-9d05-960b3d0e4373.mp3    0
01acd762-14d9-420e-b699-3dc6b22b6c6c.mp3    1
Name: target, dtype: int64

In [37]:
# Show the mapping loaded from disorders_letters.pkl (keys index the classes).
disorders_letters

{0: [], 1: ['р'], 2: ['г'], 3: []}

In [38]:
# One aggregated score vector per validation file.
# Building the Series directly from the groupby result keeps it object-dtype
# from the start (an empty `pd.Series(index=...)` has a pandas-version-dependent
# default dtype). `reindex` then lines the result up with the validation file
# list, leaving NaN for files that have no rows in `val_preds`.
val_preds_agg = (
    val_preds
    .groupby('file')[['pred', 'р_count', 'г_count']]
    .apply(aggregate_preds)
    .reindex(np.unique(val_files))
)

In [39]:
# Files that received no aggregated prediction are still NaN at this point.
missing_mask = val_preds_agg.isna()
nans_count = missing_mask.sum()

# Fallback vector for those files: 1.0 for class 0 and 0.0 for every other class.
# NOTE(review): the variable name suggests these are recordings without the
# target letter -- confirm against the preprocessing step.
value_to_fill_audio_with_no_r = [np.array([1.0] + [0.0] * (num_classes - 1))]

# Work on a copy so the un-filled Series stays available for inspection.
val_preds_agg_full = val_preds_agg.copy()
val_preds_agg_full.loc[missing_mask] = value_to_fill_audio_with_no_r * nans_count

In [40]:
# Turn the Series of per-file score vectors into one 2-D array
# (one row per file, one column per class).
val_preds_agg_full = np.stack(val_preds_agg_full)

In [41]:
# Confusion matrix (printed by get_metric) followed by the macro F1 score.
metric = get_metric(y, val_preds_agg_full, verbose=True)
print("F1 macro:", metric)

# Per-class precision / recall / F1 for the arg-max labels.
val_pred_labels = np.argmax(val_preds_agg_full, axis=-1)
print(classification_report(y, val_pred_labels))

[[233  32   5   1]
 [ 90  62   0   4]
 [ 12   2   8   0]
 [  3   1   1   2]]
F1 macro: 0.4963665353470451
              precision    recall  f1-score   support

           0       0.69      0.86      0.77       271
           1       0.64      0.40      0.49       156
           2       0.57      0.36      0.44        22
           3       0.29      0.29      0.29         7

    accuracy                           0.67       456
   macro avg       0.55      0.48      0.50       456
weighted avg       0.66      0.67      0.65       456



# Test

In [43]:
# Test file list, indexed by audio name, with a `pred` column that starts
# as NaN for every file.
y = process_y(pd.read_csv('data/final/test.csv', header=None)).assign(pred=np.nan)
y

Unnamed: 0_level_0,target,pred
audio_name,Unnamed: 1_level_1,Unnamed: 2_level_1
0291d295-aae8-4aee-98c6-51899e638b50.mp3,,
bebe7afb-0081-4975-81cc-2b62d92376fd.mp3,,
a8273eba-cbee-41ae-8918-28196e693dc7.mp3,,
f6ad5636-d1cd-40f2-95aa-4a2dba5cb2c4.mp3,,
a98eff13-e333-412f-a5df-66d6f77aa4d0.mp3,,
...,...,...
54c991cc-2505-480c-8be2-adb69d896dfc.mp3,,
dceb22e0-5587-40cb-a4ae-d326e6ae17e4.mp3,,
4cdf0a12-bca6-493b-aea4-c120271f8479.mp3,,
3527d1f1-eeb9-4783-b493-27f1dec831a0.mp3,,


In [44]:
# One aggregated score vector per test file, expanded to one column per class.
test_preds_agg = test_preds.groupby('file')[['pred', 'р_count', 'г_count']].apply(aggregate_preds)
test_preds_agg = test_preds_agg.apply(lambda x: pd.Series(x))

# `pred` was created as a float NaN column; cast it to object explicitly so
# storing arrays in it does not rely on pandas' implicit dtype upcast.
y['pred'] = y['pred'].astype(object)
y.loc[test_preds_agg.index, 'pred'] = test_preds_agg.apply(lambda x: x.values, axis=1)

# Files without any aggregated prediction keep NaN in `pred` and fall back to
# class 0. A type check is used instead of `x is not np.nan`: identity
# comparison against NaN is unreliable once values have passed through pandas.
y['target'] = y['pred'].apply(lambda x: np.argmax(x) if isinstance(x, np.ndarray) else 0)

In [45]:
# Discard the helper `pred` column, then move `audio_name` out of the index
# into a regular column, leaving `audio_name` and `target`.
y = y.drop(columns=['pred']).reset_index()

In [46]:
# Write the submission file without a header row and without the index.
# `header=False` is the documented boolean spelling of "no header row".
y.to_csv('preds/wav2vec_final.csv', header=False, index=False)

In [47]:
# Display the final frame that was written to preds/wav2vec_final.csv.
y

Unnamed: 0,audio_name,target
0,0291d295-aae8-4aee-98c6-51899e638b50.mp3,0
1,bebe7afb-0081-4975-81cc-2b62d92376fd.mp3,0
2,a8273eba-cbee-41ae-8918-28196e693dc7.mp3,0
3,f6ad5636-d1cd-40f2-95aa-4a2dba5cb2c4.mp3,0
4,a98eff13-e333-412f-a5df-66d6f77aa4d0.mp3,1
...,...,...
713,54c991cc-2505-480c-8be2-adb69d896dfc.mp3,2
714,dceb22e0-5587-40cb-a4ae-d326e6ae17e4.mp3,0
715,4cdf0a12-bca6-493b-aea4-c120271f8479.mp3,1
716,3527d1f1-eeb9-4783-b493-27f1dec831a0.mp3,1
