# Active Inference Design Agent (demo)

In [1]:
using JLD
using Revise
using Rocket
using ReactiveMP
using GraphPPL
using Distributions
using LinearAlgebra
using Random 
import ProgressMeter
using WAV
using Plots
using ImageCore

In [2]:
# include auxiliary functions for data processing
include("helpers/aida_segmentation.jl")
# include SNR quality metric
include("helpers/aida_snr.jl")
# include AR (autoregressive model) helpers
include("helpers/aida_ar.jl")
# include models and corresponding inference algos
include("models_inferences.jl")

batch_coupled_learning (generic function with 1 method)

In [3]:
include("models/everything.jl")

lar_inference_ex (generic function with 1 method)

In [4]:
# normalizing flows agent
#include("agent/agent_Bart/flow_agent.jl")

In [5]:
"""
    get_sounds_fn(dir_name)

Return the paths of all files found under `dir_name`, searched recursively.

Each path is built with `joinpath(root, file)`, so it starts with `dir_name` as given.
The result is a `Vector{String}`; it is empty when no files are found.
"""
function get_sounds_fn(dir_name)
    # typed container: a bare `[]` would produce a Vector{Any}
    file_names = String[]
    for (root, _, files) in walkdir(dir_name)
        for file in files
            push!(file_names, joinpath(root, file)) # path to file
        end
    end
    return file_names
end

get_sounds_fn (generic function with 1 method)

In [6]:
# initial sampling rate (presumably in Hz);
# NOTE(review): `fs` is reassigned by the `wavread` calls in the cells below
fs = 8000

8000

In this demo we will work with two different contexts (environments): babble and train noises.

You can think of a user who keeps wandering around a train station: sometimes a train arrives and the user hears the train noise. When no train is arriving at the platform, the user hears the babble of people waiting for the train.
When someone starts talking to the user, he/she would (maybe) prefer to dampen the environmental noise and focus only on the speaker.

Another possible scenario you can think of is the user who steps out of the train and goes to the bar, where people produce babble noise :D 

## Let's obtain priors for the environment

To make our model identifiable, we would like to obtain priors for the environmental noises. To do this, we use Voice-Activity-Detection (VAD) to find the silent segments (with no speech). Once those frames are identified, we fit an AR model of order 2 to them to learn the AR parameters.

Surely, you can obtain the priors based on different logic. (to discuss (1) it doesn't have to be AR, (2) it doesn't have to be VAD)

In [7]:
# Load sound from .wav; the second return value of wavread is assigned to `fs`,
# replacing the value set earlier
babble, fs = wavread("sound/AIDA/training/babble/0dB/sp01_babble_sn0.wav")
# split babble into overlapping segments (default 0.01s=10ms, 0.0025=2.5ms)
# NOTE(review): the defaults quoted above live in get_frames (helpers) — confirm there
bbl_seg = get_frames(babble, fs)
# compute number of segments (size of bbl_seg along its first dimension)
bbl_totseg = size(bbl_seg, 1)

376

In [8]:
# Load the train-noise recording; `fs` is reassigned from wavread's second return value
train, fs = wavread("sound/AIDA/training/train/0dB/sp01_train_sn0.wav")
# split the recording into segments with the same helper used for babble
tr_seg = get_frames(train, fs)
# number of segments (size of tr_seg along its first dimension)
tr_totseg = size(tr_seg, 1)

376

In [9]:
# Priors for contexts were obtained through running VAD-AR block
# Each context has: a 1×2 row matrix of AR coefficient means (mη), a 2×2 covariance
# matrix (vη) and a 2-tuple prior for the noise precision (τ).
# NOTE(review): the τ tuple is presumably (shape, rate) of a Gamma prior — confirm
# against coupled_inference.

# prior for train noise
trmη = [0.34546575880691316 -0.16774278649436555]
trvη = [0.00972993440498344 -0.0027103005529199036; -0.0027103005529199036 0.004281987640515784]
# alternative value kept for reference; the active prior below uses 0.05 instead
# trτ  = (41.0, 0.03644943410647206)
trτ  = (41.0, 0.05)

# prior for babble noise
bblmη = [1.1192255902602752 -0.43086292293101314]
bblvη = [0.007837790430663492 -0.005039080815241558; -0.005039080815241558 0.00596413119195013]
# alternative value kept for reference; the active prior below uses 0.05 instead
# bblτ  = (41.0, 0.0029780512310493387);
bblτ  = (41.0, 0.05);

## Source separation
Once the priors for the contexts are identified, we can run a noise reduction algorithm based on coupled AR: AR_speech + AR_environment = output. This algorithm separates speech (**z**) and noise (**x**).

We will split our dataset into a training set and a test set. We use signals from the training set to learn the mapping function between the gains proposed by the agent and the appraisals provided by the user. Bear in mind that this split is not necessary and in theory we can get along without it.

In [10]:
"""
    prior_to_priors(mη, vη, τ, totseg)

Replicate a single "static" prior into per-segment priors, as expected by the coupled AR
model, which works with time-varying priors.

`mη` is read as a row matrix: the AR order is taken from `size(mη, 2)`. `vη` is the
corresponding covariance matrix and `τ` is the precision prior.

Returns `(means, covariances, taus)` where `means` is `totseg × order`, `covariances` is
`totseg × order × order` and `taus` is a vector holding `τ` once per segment.
"""
function prior_to_priors(mη, vη, τ, totseg)
    order = size(mη, 2)
    means = zeros(totseg, order)
    covariances = zeros(totseg, order, order)
    for k in 1:totseg
        # flatten the row matrix into a plain vector before storing it as row k
        flat_mean = reshape(mη, (order,))
        means[k, :] = flat_mean
        covariances[k, :, :] = vη
    end
    taus = [τ for _ in 1:totseg]
    return means, covariances, taus
end

prior_to_priors (generic function with 1 method)

In [11]:
# expand the static train/babble priors to one prior per segment of each recording
trmη_arr, trvη_arr, trτ_arr = prior_to_priors(trmη, trvη, trτ, tr_totseg)
bblmη_arr, bblvη_arr, bblτ_arr = prior_to_priors(bblmη, bblvη, bblτ, bbl_totseg);

In [12]:
"""
    HA_algorithm(segments, priors_η, priors_τ, ar_1_order, ar_2_order, vmp_its)

Source separation based on the coupled AR model. Inference is performed in a batch manner:
`coupled_inference` is called once per segment (row of `segments`).

# Arguments
- `segments`:   segmented audio signal; rows are segments, columns are buffer samples
- `priors_η`:   pair `(means, covariances)` of the AR coefficients, indexed as
                `means[segnum, :]` and `covariances[segnum, :, :]` (see the output format
                of `prior_to_priors`)
- `priors_τ`:   array of tuples containing the prior of the environmental noise precision,
                one entry per segment
- `ar_1_order`: order of the speech signal
- `ar_2_order`: order of the environmental noise signal
- `vmp_its`:    number of variational iterations

# Returns
The tuple `(rmz, rvz, rmθ, rvθ, rγ, rmx, rvx, rmη, rvη, rτ, fe, rmo)`, each indexed by
segment along the first dimension:
- `rmz`, `rvz`: first components of the means / covariances of the speech states `zs`
- `rmθ`, `rvθ`: mean and covariance of `θ`
- `rγ`:         `(shape, rate)` of `γ`
- `rmx`, `rvx`: first components of the means / covariances of the noise states `xs`
- `rmη`, `rvη`: mean and covariance of `η`
- `rτ`:         `(shape, rate)` of `τ`
- `fe`:         per-iteration values returned by `coupled_inference`
                (presumably free energy — confirm in the inference code)
- `rmo`:        sum of the inferred speech and noise means (`rmz .+ rmx`)
"""
function HA_algorithm(segments, priors_η, priors_τ, ar_1_order, ar_2_order, vmp_its)
    totseg = size(segments, 1)
    l      = size(segments, 2) # dimensionality of the buffer

    # storage for the noise branch (x, θ, γ naming follows coupled_inference's outputs)
    rmx = zeros(totseg, l)
    rvx = zeros(totseg, l)
    rmθ = zeros(totseg, ar_1_order)
    rvθ = zeros(totseg, ar_1_order, ar_1_order)
    rγ = fill(tuple(.0, .0), totseg)

    # storage for the speech branch
    rmz = zeros(totseg, l)
    rvz = zeros(totseg, l)
    rmη = zeros(totseg, ar_2_order)
    rvη = zeros(totseg, ar_2_order, ar_2_order)
    rτ = fill(tuple(.0, .0), totseg)

    fe  = zeros(totseg, vmp_its)

    rmo = zeros(totseg, l)

    # run the coupled inference segment by segment and collect posterior statistics
    ProgressMeter.@showprogress for segnum in 1:totseg
        prior_η                           = (priors_η[1][segnum, :], priors_η[2][segnum, :, :])
        prior_τ                           = priors_τ[segnum]
        γ, θ, zs, τ, η, xs, fe[segnum, :] = coupled_inference(segments[segnum, :], prior_η, prior_τ, ar_1_order, ar_2_order, vmp_its)
        mz, vz                            = mean.(zs), cov.(zs)
        mθ, vθ                            = mean(θ), cov(θ)
        # only the first component of every state mean/covariance is stored
        rmz[segnum, :], rvz[segnum, :]    = first.(mz), first.(vz)
        rmθ[segnum, :], rvθ[segnum, :, :] = mθ, vθ
        rγ[segnum]                        = shape(γ), rate(γ)

        mx, vx                            = mean.(xs), cov.(xs)
        mη, vη                            = mean(η), cov(η)
        rmx[segnum, :], rvx[segnum, :]    = first.(mx), first.(vx)
        rmη[segnum, :], rvη[segnum, :, :] = mη, vη
        rτ[segnum]                        = shape(τ), rate(τ)

        # HA part: output is inferred speech plus inferred noise (no gains applied here)
        rmo[segnum, :] = rmz[segnum, :] .+ rmx[segnum, :]
    end
    return rmz, rvz, rmθ, rvθ, rγ, rmx, rvx, rmη, rvη, rτ, fe, rmo
end

HA_algorithm (generic function with 1 method)

#### Obtain the outputs from HA

At this stage we run our inference algorithm to separate **z** and **x**. We write the output into *.jld* files.

You don't need to run this snippet if you haven't changed the default parameters of HA_algorithm and priors (just see *sound/AIDA/separated_jld/training/*)

### Preference learning stage

#### Listening

The user gets to listen to new audio samples with the proposed gains. After each listening, he/she evaluates the performance of the HA output with binary feedback.

## Planning
Few things must be said about the planning stage. 
First of all, planning is the inverse problem of preference learning.
Given the parameters of neural network, the goal prior for the appraisal (1.0) and an informative prior for the future context (we have an idea of how the environment evolves), we want to infer the most suitable gains.
The evolution of the context will be based on HMM model, where the observations are 

For illustration purposes, we will first run the inference algorithm to obtain **z** and **x**. Secondly, we run our agent that proposes gains.

In [13]:
# paths of all files under the test directory (collected recursively)
test_files = get_sounds_fn("sound/AIDA/test/");

In [14]:
# paths of the stored separation results (.jld) for the test set; consumed by the simulation loop
test_jlds = get_sounds_fn("sound/AIDA/separated_jld/test/");

In [15]:
# randomise the presentation order in place
# NOTE(review): no RNG seed is set in the visible cells, so the order differs between runs
shuffle!(test_jlds);

## Train an agent here
The simulation is initialised with random values and a negative response. This should be updated in case it is actually close to good settings. Play with ``n_steps`` for a more fine-grained search.

In [21]:
# load helper functions
include("utils.jl")

get_new_proposal (generic function with 1 method)

In [22]:
# Dimensionality of the gain vector and number of initial data points per context
ndims = 2
npoints = 1

# Candidate gains: an n_steps × n_steps grid over [0, 2] × [0, 2]
n_steps = 20
gridticks = LinRange(0, 2, n_steps)
grid = Iterators.product(gridticks, gridticks)

# Initial data point for train context (random gains, negative response)
x1_train = rand(ndims, npoints)
y1_train = [0.]

# Initial data point for babble context (random gains, negative response)
x1_babble = rand(ndims, npoints)
y1_babble = [0.]

# Parameters for train context. Just random numbers atm
σ_train = 0.2
l_train = 0.5

# Parameters for babble context
σ_babble = 0.2
l_babble = 0.5



# Keep track of last point visited. This is the position of the agent in param space
current_train = x1_train
# fixed: the babble agent must start from its own initial point, not the train one
current_babble = x1_babble


2×1 Matrix{Float64}:
 0.9445637143824641
 0.29427224077866687

In [23]:
# Simulation loop: for every stored separation result, pick the context from the file
# name, let the context-specific agent propose gains, play the resulting signal and
# store the user's appraisal in the dataset of that same context.
for (t, test_jld) in enumerate(test_jlds)
    d = JLD.load(test_jld)
    rmz, rmx = d["rmz"], d["rmx"]
    filename = d["filename"]
    # TODO: context inference

    if occursin("/babble/", filename)
        context = 1.0

        if t % 3 == 0 # This should be done in a smarter way
            σ_babble, l_babble = optimize_hyperparams(x1_babble, y1_babble, [σ_babble, l_babble])
        end
        # Get new proposal and update context specific state
        x2 = get_new_proposal(grid, x1_babble, y1_babble, current_babble, σ_babble, l_babble)
        current_babble = x2

    elseif occursin("/train/", filename)
        context = 0.0

        if t % 3 == 0
            σ_train, l_train = optimize_hyperparams(x1_train, y1_train, [σ_train, l_train])
        end

        # Get new proposal and update context specific state
        x2 = get_new_proposal(grid, x1_train, y1_train, current_train, σ_train, l_train)
        current_train = x2

    else
        println("Wrong file encountered")
        break
    end

    # Get the signal
    rz, rx = get_signal(rmz, fs), get_signal(rmx, fs)

    # Apply params: x2[1] scales the speech estimate, x2[2] the noise estimate
    ha_out = x2[1] .* rz + x2[2] .* rx

    # Let user listen and request feedback
    full_name = "sound/AIDA/planning/ha_out_$(x2[1])_$(x2[2])_"*filename[findfirst("sp", filename)[1]:end]
    WAV.wavwrite(ha_out, fs, full_name)
    WAV.wavplay(full_name)
    println("How's HA output 0..1 ?")
    appraisal = readline()

    # Add to GP dataset of the context that produced the proposal.
    # fixed: babble (context == 1.0) feedback used to be appended to the train dataset
    # and train (context == 0.0) feedback to the babble dataset.
    if context == 1.0
        y1_babble = vcat(y1_babble, parse(Int64, appraisal))
        x1_babble = hcat(x1_babble, collect(x2))
    elseif context == 0.0
        y1_train = vcat(y1_train, parse(Int64, appraisal))
        x1_train = hcat(x1_train, collect(x2))
    end


end

How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 1
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 1
How's HA output 0..1 ?
stdin> 1
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 1
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 0
How's HA output 0..1 ?
stdin> 1
