In [None]:
import torch
import numpy as np
import torch.nn.functional as F
import pandas as pd
from utils.utils import *
from utils.get_data import load_data
from utils.get_generated_data import load_generated_data
from math import floor
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import pickle
from os.path import join


# Believability classifier for non-verbal behaviour


In [None]:
class Conv(nn.Module):
    """Basic 1-D convolutional block.

    Applies, in this order: ``Conv1d`` (kernel 3, stride 1, padding 1, so the
    temporal length is preserved), ``Dropout(0.2)``, ``BatchNorm1d`` and an
    in-place ``ReLU``.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by the block.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # The layer order fixes the Sequential indices (0..3) and therefore the
        # parameter names in the state dict ("conv.0.weight", "conv.2.bias", ...).
        layers = [
            nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.Dropout(p=0.2),
            nn.BatchNorm1d(num_features=out_channels),
            nn.ReLU(inplace=True),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Run the block on ``x`` of shape (batch, in_channels, length)."""
        out = self.conv(x)
        return out
    

class Discriminator(pl.LightningModule):
    """Discriminator scoring a behaviour sequence conditioned on audio features.

    The audio branch applies three ``Conv`` blocks, each followed by a max-pool
    of stride 2 (temporal length divided by 8). The behaviour branch applies two
    ``Conv`` blocks, each followed by a max-pool of stride 2 (temporal length
    divided by 4). Both 64-channel outputs are concatenated on the channel axis,
    so the audio input must be such that ``audio_len // 8 == behaviour_len // 4``.

    Args:
        audio_channels: feature size of the audio input (default 1025).
        behaviour_channels: feature size of the behaviour input (default 28).
        seq_len: temporal length of the behaviour input (default 100); it fixes
            the input size of the first linear layer, ``64 * (seq_len // 4)``.

    The defaults reproduce the original hard-coded architecture, so existing
    checkpoints and ``Discriminator()`` calls keep working.
    """

    def __init__(self, audio_channels=1025, behaviour_channels=28, seq_len=100):
        super().__init__()

        if seq_len < 4:
            raise ValueError("seq_len must be at least 4 (two stride-2 poolings are applied)")

        ## encode audio
        self.conv1_audio = Conv(audio_channels, 512)
        self.conv2_audio = Conv(512, 128)
        self.conv3_audio = Conv(128, 64)

        ## encode behaviour
        self.conv1_behaviour = Conv(behaviour_channels, 32)
        self.conv2_behaviour = Conv(32, 64)

        # 64 audio channels + 64 behaviour channels after concatenation.
        self.conv_concat = Conv(128, 64)
        # The behaviour branch pools twice, hence seq_len // 4 time steps remain.
        self.fc1 = torch.nn.Linear(64 * (seq_len // 4), 64)
        self.fc2 = torch.nn.Linear(64, 1)

    def forward(self, x_pose, c_audio):
        """Return a sigmoid score of shape (batch, 1).

        Args:
            x_pose: behaviour tensor of shape (batch, time, behaviour_channels).
            c_audio: audio tensor of shape (batch, audio_time, audio_channels).
        """
        # Conv1d expects (batch, channels, time).
        c = torch.swapaxes(c_audio, 1, 2)
        for audio_block in (self.conv1_audio, self.conv2_audio, self.conv3_audio):
            c = F.max_pool1d(audio_block(c), kernel_size=2, stride=2)

        x = torch.swapaxes(x_pose, 1, 2)
        for behaviour_block in (self.conv1_behaviour, self.conv2_behaviour):
            x = F.max_pool1d(behaviour_block(x), kernel_size=2, stride=2)

        # Channel-wise fusion; both branches must have the same temporal length here.
        x = torch.cat([x, c], dim=1)
        x = self.conv_concat(x)
        # flatten (unlike view) also accepts non-contiguous tensors.
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

In [None]:
def supress_index(raw_labels_list, labels_list, x_audio, x_behaviour):
    """Remove the samples whose raw label contains "silence".

    Args:
        raw_labels_list: sequence of raw labels, one per sample; a sample is
            dropped when ``"silence" in raw_labels_list[i]``.
        labels_list: tensor of labels, indexed along dim 0.
        x_audio: audio tensor, indexed along dim 0.
        x_behaviour: behaviour tensor, indexed along dim 0.

    Returns:
        Tuple ``(audio, behaviour, labels)`` restricted to the kept samples.
    """
    silent_rows = [row for row, raw_label in enumerate(raw_labels_list) if "silence" in raw_label]

    # Boolean mask over the samples: True means "keep".
    keep = torch.ones(x_audio.size(0), dtype=torch.bool)
    keep[silent_rows] = False
    kept_rows = keep.nonzero().squeeze()

    audio_kept = x_audio.index_select(0, kept_rows)
    behaviour_kept = x_behaviour.index_select(0, kept_rows)
    labels_kept = labels_list.index_select(0, kept_rows)
    return audio_kept, behaviour_kept, labels_kept

def reshape_behaviour_for_classif(x_audio, y_behaviour, scaler=None):
    """Scale the behaviour features and return them with the untouched audio.

    Args:
        x_audio: audio tensor, returned unchanged.
        y_behaviour: 3-D behaviour tensor; the last dimension holds the
            features that the scaler operates on.
        scaler: fitted object exposing ``transform``. When ``None``, a
            ``MinMaxScaler((-1, 1))`` is fitted on all rows of ``y_behaviour``.

    Returns:
        Tuple ``(x_audio, y_scaled, scaler)`` where ``y_scaled`` has the shape
        of ``y_behaviour`` and the default torch dtype.
    """
    out_shape = tuple(y_behaviour.shape)
    out_dtype = torch.get_default_dtype()

    # Nothing to scale: avoid handing a zero-row array to an already fitted scaler.
    if scaler is not None and y_behaviour.numel() == 0:
        return x_audio, torch.empty(out_shape, dtype=out_dtype), scaler

    # One (rows, features) array for the whole batch; reshape (unlike view)
    # also works when y_behaviour is not contiguous.
    flat = y_behaviour.detach().cpu().reshape(-1, out_shape[2]).numpy()

    if scaler is None:
        # Only build a scaler when the caller did not provide one.
        scaler = MinMaxScaler((-1, 1)).fit(flat)

    # The scaling is applied per feature, so transforming all rows at once
    # gives the same values as transforming each sample separately.
    scaled = scaler.transform(flat)
    y_scaled = torch.tensor(scaled, dtype=out_dtype).reshape(out_shape)
    return x_audio, y_scaled, scaler

#### Load data and pretrained model

In [None]:
classifier = Discriminator()
# Inference only: eval mode disables Dropout and makes BatchNorm use its stored
# running statistics, so predictions are deterministic and batch-independent.
classifier.eval()

# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
full_state_dict = torch.load("saved_models/believability_classifier.ckpt", map_location=torch.device('cpu'))['state_dict']

# Keep only the discriminator weights and strip the leading "discriminator."
# prefix so the parameter names match those of a standalone Discriminator.
prefix = 'discriminator.'
checkpoint = {
    key[len(prefix):]: value
    for key, value in full_state_dict.items()
    if key.startswith(prefix)
}
print(checkpoint.keys())
classifier.load_state_dict(checkpoint)

In [None]:
# NOTE(review): this name shadows the built-in `dict` for every later cell; it is
# kept because the following cells read `dict["test"]` and `dict["test_generated"]`.
# The meaning of the argument 12 is defined by utils.get_data.load_data — TODO confirm.
dict = load_data(12)

In [None]:
dict["test_generated"] = load_generated_data("25-04-2024_trueness_1_CGAN_17/epoch_450", create_init_files=True)

def trouver_tous_index(tableau, valeur):
    """Return the set of positions in `tableau` whose element equals `valeur`."""
    return {i for i, x in enumerate(tableau) if x == valeur}

# For every generated sample, fetch the audio features of the real test sample
# that has the same (key, interval) pair.
matched_audio = []

for i in range(len(dict["test_generated"]["keys"])):
    key = dict["test_generated"]["keys"][i]
    interval = dict["test_generated"]["interval"][i]
    index_interval = trouver_tous_index(dict["test"]["interval"], interval)
    index_key = trouver_tous_index(dict["test"]["keys"], key)
    index = index_interval.intersection(index_key)
    if not index:
        # Fail with the offending sample instead of an opaque "list index out of range".
        raise IndexError(
            f"no test sample matches generated sample {i} (key={key!r}, interval={interval!r})"
        )
    # min() makes the choice deterministic if several test samples match.
    matched_audio.append(dict["test"]["X_audio_hubert"][min(index)])
dict["test_generated"]["X_audio_hubert"] = torch.stack(matched_audio)

#### Use the discriminator from the trained model of behaviour generation

This classifier/discriminator of generated vs. real behavior was trained as part of a GAN-type model of non-verbal behavior generation. The discriminator parameters of this GAN are extracted and used here to assess the believability of behavior files.

The output is a sigmoid score: it is close to 1 when the classifier judges that the behavior is real and close to 0 when it judges that the behavior is generated.

In [None]:

label = "gender"
tensor = "one_hot_tensor_gender"
nb_labels = 3  # not used in this cell
test = "test"

# Drop the samples labelled as silence before scoring.
test_data_audio, test_data_behaviour, test_labels = supress_index(dict[test][label], dict[test][tensor], dict[test]["X_audio_hubert"], dict[test]["Y_behaviour"])

# NOTE(review): pickle.load can execute arbitrary code; only load a trusted scaler file.
# The context manager closes the file handle once the scaler is loaded.
with open(join("saved_models", 'scaler_y.pkl'), 'rb') as scaler_file:
    y_scaler = pickle.load(scaler_file)
X_test, y_test, _ = reshape_behaviour_for_classif(test_data_audio, test_data_behaviour, y_scaler)

test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
# Shuffling is useless for evaluation and only makes runs harder to compare.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Dropout and BatchNorm must be in eval mode, otherwise the scores are
# stochastic and depend on the composition of each batch.
classifier.eval()
with torch.no_grad():
    all_predictions = []
    for audio, behaviour in test_loader:
        prediction = classifier(behaviour, audio)
        all_predictions.extend(prediction)
# Mean believability score over the test set.
print(torch.mean(torch.stack(all_predictions)))
