
# WavLM Feature Extractor

##### https://github.com/microsoft/unilm/tree/master/wavlm

In [1]:
%run WavLM

In [2]:
import pandas as pd
import numpy as np

In [3]:
import torch
from WavLM import WavLM, WavLMConfig

device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(torch.cuda.get_device_name(0))

# load the pre-trained checkpoints
checkpoint = torch.load('model/WavLM-Base+.pt')
cfg = WavLMConfig(checkpoint['cfg'])
model = WavLM(cfg)
#model = model.to(device) #, dtype=torch.float32)
model.load_state_dict(checkpoint['model'])
model.eval()

Using device: cuda:2
NVIDIA RTX A6000




WavLM(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
        (3): GELU(approximate='none')
      )
      (1-4): 4 x Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU(approximate='none')
      )
      (5-6): 2 x Sequential(
        (0): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU(approximate='none')
      )
    )
  )
  (post_extract_proj): Linear(in_features=512, out_features=768, bias=True)
  (dropout_input): Dropout(p=0.1, inplace=False)
  (dropout_features): Dropout(p=0.1, inplace=False)
  (encoder): TransformerEncoder(
    (pos_conv): Sequential(
      (0): Conv1d(768, 768, kernel_s

In [13]:
# Path to source files
path2 = 'Recola2018_16k/audio/Devel'

In [14]:
import os

# Read a directory and put all files in a list
file_list = []
i = 0 
for path, subdirs, files in os.walk( path2 ):
    for name in files:
        file_list.append( os.path.join( path, name) )
        i += 1
print("Files processed: "+str(i) )


Files processed: 9


In [15]:
import soundfile as sf

# sample rate = 16,000
#  1s = 16,000 x 1 =  16,000
# 60s = 16,000 x 60 = 960,500

length = list()
i      = 0
avg    = 0

for file in file_list:
    data, samplerate = sf.read( file )
    
    if len(data) <= 960500:                         
            print("Audio length: "+str(len(data))+" with less than 30s: "+str(file) )
    #computer average lenght of files
    avg = avg + len(data)
    length.append(len(data))
    i += 1

print( "Files processed: "+str(i) )
print( "Average file length: "+str(avg/i) + " samples   "+str(avg/i/samplerate)+" s   "+str(avg/i/samplerate/60)+" min" )
print( "Max length: "+str(max(length))+ " samples   "+str(max(length)/samplerate)+" s   "+str(max(length)/samplerate/60)+" min" )
print( "Min length: "+str(min(length))+ " samples   "+str(min(length)/samplerate)+" s   "+str(min(length)/samplerate/60)+" min" )

Files processed: 9
Average file length: 4800000.0 samples   300.0 s   5.0 min
Max length: 4800000 samples   300.0 s   5.0 min
Min length: 4800000 samples   300.0 s   5.0 min


In [16]:
sampling_rate = 16000
track_count = 0

for file in file_list:
    data , samplerate = sf.read( file )
    print ("--------------")
    print ("Sample Rate: " + str(samplerate) + " Length: " + str(data.shape) + " " + str( file ) )

    # extract the representation of last layer
    wav_input = torch.from_numpy(data).float()
    wav_input_16khz = torch.unsqueeze(wav_input,0)
    # wav_input_16khz = torch.randn(1,48000)
    # wav_input_16khz = torch.randn(1,2786987)
    if cfg.normalize:
        wav_input_16khz = torch.nn.functional.layer_norm(wav_input_16khz , wav_input_16khz.shape)

    #rep = model.extract_features(wav_input_16khz.to(device, dtype=torch.float32))[0]
    rep = model.extract_features(wav_input_16khz)[0]

    rep[0].shape
    rep_np = rep[0].detach().numpy()
    rep_df = pd.DataFrame(rep_np)

    file_id = 'Recola2018_16k/features/'+file.split('/')[3].split('.')[0]+'.wavlmbasefeat'  
    rep_df.to_csv(file_id)
    
    # Pooling 
    df_temp = pd.DataFrame(rep[0].detach().numpy())
    df_pool = df_temp.rolling(2, step=2).mean().drop(index=0) 
    
    file_id = 'Recola2018_16k/features/'+file.split('/')[3].split('.')[0]+'.wavlmbasefeatpool'  
    df_pool.to_csv(file_id)
    
    print( file )
    
    track_count += 1

--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_7.wav
Recola2018_16k/audio/Devel/dev_7.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_9.wav
Recola2018_16k/audio/Devel/dev_9.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_8.wav
Recola2018_16k/audio/Devel/dev_8.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_1.wav
Recola2018_16k/audio/Devel/dev_1.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_6.wav
Recola2018_16k/audio/Devel/dev_6.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_5.wav
Recola2018_16k/audio/Devel/dev_5.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_2.wav
Recola2018_16k/audio/Devel/dev_2.wav
--------------
Sample Rate: 16000 Length: (4800000,) Recola2018_16k/audio/Devel/dev_3.wav
Recola2018_16k/audio/

In [17]:
rep[0].shape

torch.Size([14999, 768])

### Process Feature Files

In [106]:
# Path to feature files
path_train = 'features/train'
extension = 'wavlmbasefeatpool'

In [107]:
train_files = [file for file in os.listdir(path_train) if file.endswith(extension)]

In [108]:
sorted_train_files = sorted(train_files)
sorted_train_files

['Train_01.wavlmbasefeatpool',
 'Train_02.wavlmbasefeatpool',
 'Train_03.wavlmbasefeatpool',
 'Train_04.wavlmbasefeatpool',
 'Train_05.wavlmbasefeatpool',
 'Train_06.wavlmbasefeatpool',
 'Train_07.wavlmbasefeatpool',
 'Train_08.wavlmbasefeatpool',
 'Train_09.wavlmbasefeatpool',
 'Train_10.wavlmbasefeatpool',
 'Train_11.wavlmbasefeatpool',
 'Train_12.wavlmbasefeatpool',
 'Train_13.wavlmbasefeatpool',
 'Train_14.wavlmbasefeatpool',
 'Train_15.wavlmbasefeatpool',
 'Train_16.wavlmbasefeatpool',
 'Train_17.wavlmbasefeatpool',
 'Train_18.wavlmbasefeatpool',
 'Train_19.wavlmbasefeatpool',
 'Train_20.wavlmbasefeatpool',
 'Train_21.wavlmbasefeatpool',
 'Train_22.wavlmbasefeatpool',
 'Train_23.wavlmbasefeatpool',
 'Train_24.wavlmbasefeatpool',
 'Train_25.wavlmbasefeatpool',
 'Train_26.wavlmbasefeatpool',
 'Train_27.wavlmbasefeatpool',
 'Train_28.wavlmbasefeatpool',
 'Train_29.wavlmbasefeatpool',
 'Train_30.wavlmbasefeatpool',
 'Train_31.wavlmbasefeatpool',
 'Train_32.wavlmbasefeatpool',
 'Train_

In [109]:
dfs = []
for file in sorted_train_files:
    df = pd.read_csv(os.path.join(path_train, file))
    dfs.append(df)

In [110]:
dfs

[      Unnamed: 0         0         1         2         3         4         5  \
 0              5  0.192775  0.021630 -0.002220  0.049548  0.027550 -0.072259   
 1             10 -0.001633 -0.056285  0.016375  0.116277  0.057702  0.007278   
 2             15 -0.009711 -0.050756  0.003903  0.107202  0.087066  0.028435   
 3             20  0.011100 -0.061423 -0.012051  0.114248  0.082298  0.047392   
 4             25  0.002032 -0.075134 -0.036766  0.145119  0.074578  0.036160   
 ...          ...       ...       ...       ...       ...       ...       ...   
 1750        8755  0.118933 -0.034314 -0.128744  0.077383  0.016834 -0.036635   
 1751        8760  0.084248 -0.015669 -0.041884  0.032899  0.079929 -0.081491   
 1752        8765  0.116338 -0.003613  0.052293  0.030137  0.008060  0.038389   
 1753        8770  0.160406  0.101941  0.016634  0.037377 -0.127868 -0.018315   
 1754        8775 -0.083426  0.025483 -0.015569  0.099718 -0.009567 -0.023486   
 
              6         7 

In [111]:
df_feat = pd.concat(dfs, ignore_index=True)

In [112]:
df_feat.drop(df_feat.columns[[0]], axis=1, inplace=True)

In [113]:
df_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.192775,0.021630,-0.002220,0.049548,0.027550,-0.072259,-0.113953,0.026322,0.243111,0.258217,...,0.031259,0.177174,-0.024470,1.580828,0.131061,0.058088,0.060344,0.055556,0.088015,0.534338
1,-0.001633,-0.056285,0.016375,0.116277,0.057702,0.007278,-0.098612,0.030306,0.074493,0.116834,...,0.049916,0.032317,-0.161122,0.897733,-0.007508,0.081411,0.150929,0.016092,0.003430,1.002552
2,-0.009711,-0.050756,0.003903,0.107202,0.087066,0.028435,-0.046143,0.007138,0.061045,0.091797,...,0.087188,-0.021439,-0.148893,0.941788,-0.005991,0.070249,0.156968,0.036499,-0.024933,0.925863
3,0.011100,-0.061423,-0.012051,0.114248,0.082298,0.047392,-0.023478,0.037867,0.000405,0.074239,...,0.085302,-0.045526,-0.160086,1.093111,-0.030713,0.098162,0.145225,0.026590,-0.012685,0.716978
4,0.002032,-0.075134,-0.036766,0.145119,0.074578,0.036160,-0.013421,0.031363,0.066145,0.097691,...,0.065395,-0.022680,-0.172983,0.913236,-0.012158,0.048093,0.143347,0.040120,0.002296,0.863229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56051,-0.057029,0.003183,-0.064779,0.019756,0.008583,0.055907,-0.110969,0.038949,0.018483,0.004246,...,0.046562,-0.052049,-0.027549,0.298962,0.045139,0.023852,0.104877,-0.014643,-0.135014,0.061568
56052,-0.021461,0.084907,-0.018901,-0.001795,0.027498,-0.010944,0.016355,-0.034383,-0.039028,-0.051059,...,0.058767,-0.113810,0.055252,-1.048426,0.015921,0.112068,0.011031,0.054151,-0.049708,-0.006128
56053,0.011502,-0.067663,0.009387,-0.089047,-0.019156,-0.036930,0.129560,-0.120475,-0.086944,-0.067322,...,0.126056,-0.011462,0.095049,0.547388,0.047290,0.097644,0.081951,-0.006497,0.024804,1.693958
56054,0.062623,0.025638,-0.077265,0.101572,0.095471,0.201871,0.028752,-0.012768,-0.022422,-0.038458,...,0.025629,0.038275,-0.051533,0.544126,0.002532,-0.009448,0.100813,-0.106342,-0.083069,-1.055355


### Process Label files

In [122]:
# Path to label files
path_train_labels = 'SEWA16/labels/Train/'
extension = 'csv'

In [123]:
train_files_labels = [file for file in os.listdir(path_train_labels) if file.endswith(extension)]

In [124]:
sorted_train_labels = sorted(train_files_labels)
sorted_train_labels

['Train_01.csv',
 'Train_02.csv',
 'Train_03.csv',
 'Train_04.csv',
 'Train_05.csv',
 'Train_06.csv',
 'Train_07.csv',
 'Train_08.csv',
 'Train_09.csv',
 'Train_10.csv',
 'Train_11.csv',
 'Train_12.csv',
 'Train_13.csv',
 'Train_14.csv',
 'Train_15.csv',
 'Train_16.csv',
 'Train_17.csv',
 'Train_18.csv',
 'Train_19.csv',
 'Train_20.csv',
 'Train_21.csv',
 'Train_22.csv',
 'Train_23.csv',
 'Train_24.csv',
 'Train_25.csv',
 'Train_26.csv',
 'Train_27.csv',
 'Train_28.csv',
 'Train_29.csv',
 'Train_30.csv',
 'Train_31.csv',
 'Train_32.csv',
 'Train_33.csv',
 'Train_34.csv']

In [139]:
dfl = []
for file in sorted_train_labels:
    df2 = pd.read_csv(os.path.join(path_train_labels, file), sep=";")
    df2.drop(df2.columns[[0,1]], axis=1, inplace=True)
    dfl.append(df2)

In [140]:
dfl

[      0.000000.1  0.000000.2  0.000000.3
 0       0.000000    0.000000    0.000000
 1       0.000692    0.000000   -0.000205
 2       0.000692    0.000000   -0.000692
 3       0.000692    0.000000   -0.000692
 4       0.000692    0.000000    0.000000
 ...          ...         ...         ...
 1750    0.037728    0.004450   -0.005821
 1751    0.035572    0.000498   -0.005821
 1752    0.036095    0.000498   -0.005821
 1753    0.036095    0.000498   -0.005821
 1754    0.036095    0.000498   -0.005821
 
 [1755 rows x 3 columns],
       0.000000.1  0.000000.2  0.000000.3
 0       0.000000    0.000000    0.000000
 1       0.000346    0.000000    0.000000
 2       0.000346    0.000000    0.000000
 3       0.000000    0.000000    0.000000
 4       0.000000    0.000000    0.000000
 ...          ...         ...         ...
 1739   -0.059315    0.017056   -0.046187
 1740   -0.059066    0.024804   -0.045043
 1741   -0.057951    0.062731   -0.045043
 1742   -0.025040    0.078112   -0.045043
 1743 

In [143]:
df_lab = pd.concat(dfl, ignore_index=True)

In [144]:
df_lab

Unnamed: 0,0.000000.1,0.000000.2,0.000000.3,0.000191,0.000926,0.000205
0,0.000000,0.000000,0.000000,,,
1,0.000692,0.000000,-0.000205,,,
2,0.000692,0.000000,-0.000692,,,
3,0.000692,0.000000,-0.000692,,,
4,0.000692,0.000000,0.000000,,,
...,...,...,...,...,...,...
56053,0.072071,0.070308,0.002988,,,
56054,0.072071,0.057943,0.002785,,,
56055,0.062396,0.073620,0.003421,,,
56056,0.061745,0.080134,0.004164,,,


In [130]:
df_lab.drop(df_lab.columns[[0,1,4]], axis=1, inplace=True)

In [131]:
df_lab

Unnamed: 0,0.000000.1,0.000000.2,'Train_02','Train_03','Train_04','Train_05','Train_06','Train_07','Train_08',0.000191,...,'Train_25','Train_26','Train_27','Train_28','Train_29','Train_30','Train_31','Train_32','Train_33','Train_34'
0,0.000000,0.000000,,,,,,,,,...,,,,,,,,,,
1,0.000692,0.000000,,,,,,,,,...,,,,,,,,,,
2,0.000692,0.000000,,,,,,,,,...,,,,,,,,,,
3,0.000692,0.000000,,,,,,,,,...,,,,,,,,,,
4,0.000692,0.000000,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56053,0.072071,0.070308,,,,,,,,,...,,,,,,,,,,'Train_34'
56054,0.072071,0.057943,,,,,,,,,...,,,,,,,,,,'Train_34'
56055,0.062396,0.073620,,,,,,,,,...,,,,,,,,,,'Train_34'
56056,0.061745,0.080134,,,,,,,,,...,,,,,,,,,,'Train_34'


In [57]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [58]:
# create classifier and grouping object
clf = make_pipeline(
    StandardScaler(), 
    SVC(gamma='auto'),
)
logo = LeaveOneGroupOut()

def experiment(
    features,
    targets,
    groups,
):        
    truths = []
    preds = []
    
    # leave-one-speaker loop    
    pbar = audeer.progress_bar(
        total=len(groups.unique()),
        desc='Run experiment',
    )
    for train_index, test_index in logo.split(
        features, 
        targets, 
        groups=groups,
    ):
        train_x = features.iloc[train_index]
        train_y = targets[train_index]
        clf.fit(train_x, train_y)
        
        truth_x = features.iloc[test_index]
        truth_y = targets[test_index]
        predict_y = clf.predict(truth_x)
        
        truths.append(truth_y)
        preds.append(predict_y)
        
        pbar.update()
        
    # combine speaker folds
    truth = pd.concat(truths)
    truth.name = 'truth'
    pred = pd.Series(
        np.concatenate(preds),
        index=truth.index,
        name='prediction',
    )
    
    return truth, pred

In [59]:
truth_w2v2, pred_w2v2 = experiment(
    df,
    emotion,
    speaker,
)
audformat.utils.concat([truth_w2v2, pred_w2v2])

NameError: name 'emotion' is not defined