# WavLM Feature Extractor

### for AffWild - 6th ABAW

##### https://github.com/microsoft/unilm/tree/master/wavlm

In [1]:
%run WavLM

In [2]:
import pandas as pd
import numpy as np
import gc

In [3]:
import torch
from WavLM import WavLM, WavLMConfig

# Prefer the GPU with index 2 when CUDA is available, otherwise use the CPU.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Report the name of the device that was actually selected. Querying index 0
# named a different GPU and raised an error on machines without CUDA.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))

# load the pre-trained checkpoints
# map_location='cpu' keeps the checkpoint tensors on the CPU, which is where
# the model lives as long as the `.to(device)` line below stays disabled.
checkpoint = torch.load('model/WavLM-Base+.pt', map_location='cpu')
cfg = WavLMConfig(checkpoint['cfg'])
model = WavLM(cfg)
#model = model.to(device) #, dtype=torch.float32)
model.load_state_dict(checkpoint['model'])
# Switch to inference mode; as the last expression of the cell, the returned
# module is also displayed.
model.eval()

Using device: cuda:2
NVIDIA A100-SXM4-40GB




WavLM(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
        (3): GELU(approximate='none')
      )
      (1-4): 4 x Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU(approximate='none')
      )
      (5-6): 2 x Sequential(
        (0): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU(approximate='none')
      )
    )
  )
  (post_extract_proj): Linear(in_features=512, out_features=768, bias=True)
  (dropout_input): Dropout(p=0.1, inplace=False)
  (dropout_features): Dropout(p=0.1, inplace=False)
  (encoder): TransformerEncoder(
    (pos_conv): Sequential(
      (0): Conv1d(768, 768, kernel_s

In [None]:
# Path to source files
# Root directory that is walked recursively to collect the input files.
path2 = '/home/etsmtl/akoerich/DEV/Affwild/Aff-Wild2-train-batch1-wav'
# Alternative root (validation batch); enable this line instead of the one
# above to process that set.
#path2 = '/home/etsmtl/akoerich/DEV/Affwild/Aff-Wild2-validation-batch2-wav'

In [None]:
import os

# Recursively gather the full path of every file found below path2.
file_list = [
    os.path.join(folder, fname)
    for folder, _subfolders, fnames in os.walk(path2)
    for fname in fnames
]
# Number of collected paths, reported the same way as a running counter.
i = len(file_list)
print("Files processed: "+str(i) )


In [None]:
import soundfile as sf

# Reference sample counts at a sample rate of 16,000 Hz:
#      1 s = 16,000 x 1   =    16,000
#     60 s = 16,000 x 60  =   960,000
#  7.5 min = 16,000 x 450 = 7,200,000
#
# The checks below compare durations in seconds so that they agree with the
# messages they print, whatever the sample rate of a file is. The previous
# raw sample-count limits (960500 and 64000000) did not correspond to the
# 60 s and 7.5 min named in the messages.
min_duration_s = 60
max_duration_s = 7.5 * 60

length = list()   # number of samples of every file, in processing order
i      = 0        # number of files read
avg    = 0        # running total of samples (divided by i for the average)

for file in file_list:
    data, samplerate = sf.read( file )

    duration_s = len(data) / samplerate

    if duration_s <= min_duration_s:
        print("Audio length: "+str(len(data))+" with less than 60 s: "+str(file) )

    if duration_s >= max_duration_s:
        print("Audio length: "+str(len(data))+" higher than 7.5 min: "+str(file) )

    # accumulate the total length to compute the average file length
    avg = avg + len(data)
    length.append(len(data))
    i += 1

print( "Files processed: "+str(i) )
# Skip the summary when nothing was read: the average would divide by zero
# and max()/min() reject an empty list.
if i > 0:
    # The sample-to-time conversions use the sample rate of the last file read.
    print( "Average file length: "+str(avg/i) + " samples   "+str(avg/i/samplerate)+" s   "+str(avg/i/samplerate/60)+" min" )
    print( "Max length: "+str(max(length))+ " samples   "+str(max(length)/samplerate)+" s   "+str(max(length)/samplerate/60)+" min" )
    print( "Min length: "+str(min(length))+ " samples   "+str(min(length)/samplerate)+" s   "+str(min(length)/samplerate/60)+" min" )

In [None]:
# Sample rate the inputs are expected to have (the tensors below are named
# *_16khz); files with another rate only trigger a warning.
sampling_rate = 16000
track_count = 0

# Directory that receives the feature files. The existence check and both
# writes use this single location; switch it together with `path2` when
# processing another split (e.g. 'features/valid/').
out_dir = 'features/train/'
os.makedirs(out_dir, exist_ok=True)

for file in file_list:
    data , samplerate = sf.read( file )
    print ("--------------")
    print ("Sample Rate: " + str(samplerate) + " Length: " + str(data.shape) + " " + "Time: " + str(data.shape[0]/samplerate) + " sec " + str( file ) )

    if samplerate != sampling_rate:
        print("Warning: sample rate " + str(samplerate) + " differs from expected " + str(sampling_rate))

    # Name the outputs after the audio file itself (text before the first
    # dot). Indexing a fixed path component (split('/')[6]) selects a
    # directory name instead whenever the path depth differs, which makes
    # every file map to the same output name.
    stem = os.path.basename(file).split('.')[0]
    file_id = os.path.join(out_dir, stem + '.wavlm')
    pool_id = os.path.join(out_dir, stem + '.pool.wavlm')

    if not os.path.exists(file_id):

        # extract the representation of last layer
        # NOTE(review): assumes `data` is 1-D (mono); confirm for
        # multi-channel files.
        wav_input_16khz = torch.from_numpy(data).float().unsqueeze(0)

        if cfg.normalize:
            wav_input_16khz = torch.nn.functional.layer_norm(wav_input_16khz , wav_input_16khz.shape)

        # Inference only: disabling autograd avoids keeping the activation
        # graph in memory for long recordings.
        with torch.no_grad():
            rep = model.extract_features(wav_input_16khz)[0]

        del wav_input_16khz
        torch.cuda.empty_cache()

        # rep[0] is the first item of the batch of one; convert it once and
        # reuse the frame for both outputs.
        rep_df = pd.DataFrame(rep[0].detach().numpy())
        del rep

        rep_df.to_csv(file_id)

        # Pooling
        # Average each row with its predecessor at every second row; row 0
        # has no predecessor, evaluates to NaN and is dropped.
        df_pool = rep_df.rolling(2, step=2).mean().drop(index=0)
        del rep_df

        df_pool.to_csv(pool_id)
        del df_pool

        print( file )

    else:
        print("Already exists: " + str(file_id))

    track_count += 1

    # Release memory using gc
    gc.collect()

In [None]:
# Scratch check: component 6 of the '/'-split path in `file`, cut at its first '.'.
file.split('/')[6].split('.')[0]

In [None]:
# Scratch check: component 6 of the '/'-split path in `file`.
file.split('/')[6]

In [None]:
# Scratch check: length in minutes of the most recently read audio (`data`).
data.shape[0]/samplerate/60

In [None]:
!pwd

### Process Feature Files

In [None]:
# Path to feature files
path_train = 'features/train'
# File-name suffix used to select the feature files inside path_train.
# NOTE(review): the extraction loop in this notebook writes '*.wavlm' and
# '*.pool.wavlm' files, which this suffix does not match; confirm where the
# 'wavlmbasefeatpool' files come from.
extension = 'wavlmbasefeatpool'

In [None]:
# Keep only the directory entries whose name ends with the chosen suffix.
train_files = [
    entry
    for entry in os.listdir(path_train)
    if entry.endswith(extension)
]

In [None]:
# Order a copy of the feature file names, then display it.
sorted_train_files = list(train_files)
sorted_train_files.sort()
sorted_train_files

In [None]:
# Read every selected feature file into a DataFrame, in sorted name order.
# `df` and `file` stay bound to the last file after the loop; a later cell
# passes `df` on, so the loop variables are kept as they are.
dfs = []
for file in sorted_train_files:
    df = pd.read_csv(os.path.join(path_train, file))
    dfs.append(df)

In [None]:
# Display the list of per-file feature frames.
dfs

In [None]:
# Stack all per-file frames into one frame with a fresh 0..n-1 row index.
df_feat = pd.concat(dfs, ignore_index=True)

In [None]:
# Remove the first column in place.
# NOTE(review): presumably the row index stored by to_csv; confirm.
# Re-running this cell removes one more column each time.
df_feat.drop(df_feat.columns[[0]], axis=1, inplace=True)

In [None]:
# Display the combined feature frame.
df_feat

### Process Label Files

In [None]:
# Path to label files
# NOTE(review): this points at a 'SEWA16' directory while the audio above is
# read from an Aff-Wild2 path; confirm the labels belong to these features.
path_train_labels = 'SEWA16/labels/Train/'
# Rebinds `extension`, which earlier held the feature-file suffix.
extension = 'csv'

In [None]:
# Keep only the directory entries whose name ends with the chosen suffix.
train_files_labels = [
    entry
    for entry in os.listdir(path_train_labels)
    if entry.endswith(extension)
]

In [None]:
# Order a copy of the label file names, then display it.
sorted_train_labels = list(train_files_labels)
sorted_train_labels.sort()
sorted_train_labels

In [None]:
# Load each ';'-separated label file and discard its first two columns.
dfl = []
for label_name in sorted_train_labels:
    label_path = os.path.join(path_train_labels, label_name)
    labels = pd.read_csv(label_path, sep=";")
    dfl.append(labels.drop(columns=labels.columns[[0, 1]]))

In [None]:
# Display the list of per-file label frames.
dfl

In [None]:
# Stack all per-file label frames into one frame with a fresh 0..n-1 row index.
df_lab = pd.concat(dfl, ignore_index=True)

In [None]:
# Display the combined label frame before the column selection.
df_lab

In [None]:
# Remove the columns at positions 0, 1 and 4 in place (needs at least five
# columns). Re-running this cell removes further columns each time.
df_lab.drop(df_lab.columns[[0,1,4]], axis=1, inplace=True)

In [None]:
# Display the label frame after the column selection.
df_lab

In [None]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Build the classifier (feature standardisation followed by an SVC) and the
# splitter that holds out one group per fold.
scaler_step = StandardScaler()
svc_step = SVC(gamma='auto')
clf = make_pipeline(scaler_step, svc_step)
logo = LeaveOneGroupOut()

def experiment(
    features,
    targets,
    groups,
):
    """Run a leave-one-group-out classification and collect its predictions.

    For every fold produced by the module-level ``logo`` splitter, the
    module-level ``clf`` pipeline is refit on the training rows and then
    predicts the held-out rows.

    Args:
        features: pandas DataFrame of inputs, one row per sample.
        targets: pandas Series of labels, aligned row by row with
            ``features``.
        groups: pandas Series of group identifiers, aligned row by row with
            ``features``; each distinct value is held out once.

    Returns:
        A tuple ``(truth, pred)`` of pandas Series sharing one index:
        ``truth`` (named 'truth') holds the held-out labels of all folds in
        fold order, ``pred`` (named 'prediction') the matching predictions.
    """
    truths = []
    preds = []

    # leave-one-speaker loop
    # NOTE: `audeer` has to be imported in the session; no import of it
    # appears in the cells shown.
    pbar = audeer.progress_bar(
        total=len(groups.unique()),
        desc='Run experiment',
    )
    for train_index, test_index in logo.split(
        features,
        targets,
        groups=groups,
    ):
        # The splitter yields positional indices, so features and targets
        # are both selected with .iloc. Plain [] on a Series looks up index
        # labels and only agrees for a default 0..n-1 index.
        train_x = features.iloc[train_index]
        train_y = targets.iloc[train_index]
        clf.fit(train_x, train_y)

        truth_x = features.iloc[test_index]
        truth_y = targets.iloc[test_index]
        predict_y = clf.predict(truth_x)

        truths.append(truth_y)
        preds.append(predict_y)

        pbar.update()

    # combine speaker folds
    truth = pd.concat(truths)
    truth.name = 'truth'
    # Predictions are plain arrays; give them the index of the labels they
    # were predicted for.
    pred = pd.Series(
        np.concatenate(preds),
        index=truth.index,
        name='prediction',
    )

    return truth, pred

In [None]:
# Run the leave-one-group-out experiment and show labels and predictions
# side by side.
# NOTE(review): `emotion`, `speaker` and `audformat` are neither defined nor
# imported in the cells shown, and `df` is only the last feature file read
# by the loading loop; confirm the intended inputs.
truth_w2v2, pred_w2v2 = experiment(
    df,
    emotion,
    speaker,
)
audformat.utils.concat([truth_w2v2, pred_w2v2])