In [None]:
import librosa
from scipy.io import wavfile as wav
import sounddevice as sd
import numpy as np
import joblib
from keras.models import load_model
from keras.preprocessing import image as kimage
from keras.preprocessing.image import img_to_array
from keras.applications import resnet50
from matplotlib import pyplot as plt
import cv2 as cv
# Haar-cascade frontal-face detector used by process_frame().
# cv.CascadeClassifier does not raise when the XML file cannot be read: it
# returns an empty classifier, which only fails later inside detectMultiScale.
# Check the load here so a missing file is reported where it happens.
face_detector = cv.CascadeClassifier('haarcascade_frontalface_default.xml')
if face_detector.empty():
    raise IOError("Could not load 'haarcascade_frontalface_default.xml'")

In [None]:
# Display names for the image classifier: process_frame() indexes this list
# with the argmax of model_image's prediction, so the order presumably mirrors
# the class order used when that model was trained -- confirm before editing.
nomi = ['Alex', 'Davide', 'Michela']

In [None]:
# NOTE(security): joblib.load unpickles its input and can therefore execute
# arbitrary code -- only load files that come from a trusted source.
# 'kdtrees.joblib' holds the retrieval indexes (unpacked in the next cell);
# 'minmax.joblib' holds the (xmin, xmax) pair that normalize() applies to the
# MFCC features in the live loop.
trees = joblib.load('kdtrees.joblib')
xmin, xmax = joblib.load('minmax.joblib')

In [None]:
# Each entry of `trees` is a (tree, labels, paths) triple. Entry 0 serves the
# "Persona" query and entry 1 the "Animale" query of the live loop: the tree is
# searched with .query(features, k=10) and the returned neighbour indices are
# used to look up `labels_*` (printed) and `paths_*` (images loaded for display).
tree_people, labels_people, paths_people = trees[0]
tree_dogs, labels_dogs, paths_dogs = trees[1]

In [None]:
# Embedding network used by facenet() to build the query vector for the
# nearest-neighbour search.
model_retrival = load_model("facenet_keras.h5")
print("facenet caricata")

# Classifier applied to the normalised MFCCs of the recorded audio clip.
model_audio = load_model("audionet.h5")
print("audionet caricata")

# Classifier applied by process_frame() to each 224x224 face crop.
model_image = load_model('recognet.h5')
print("imagenet caricata")

def l2_normalize(x):
    """Scale ``x`` so that its L2 norm, taken over all elements, equals 1.

    Parameters
    ----------
    x : array_like
        Values to normalise. Integer and boolean inputs are converted to
        float64 first; floating (and complex) inputs keep their dtype.

    Returns
    -------
    numpy.ndarray
        ``x`` divided by the square root of the sum of its squared elements.
        An input whose norm is exactly zero is returned as zeros instead of
        the NaNs a 0/0 division would produce.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        # Squaring in a narrow integer dtype wraps around (uint8: 200 * 200
        # -> 64), which silently corrupts the norm of image arrays.
        x = x.astype(np.float64)

    norm = np.sqrt(np.sum(np.square(x)))
    if norm == 0:
        return np.zeros_like(x)
    return x / norm

def facenet(x):
    """Return the ``model_retrival`` embedding of a single input.

    ``x`` gets a leading batch axis, is passed through ``l2_normalize`` and is
    fed to ``model_retrival.predict``; the first (and only) row of the
    prediction is returned.
    """
    batch = l2_normalize(np.expand_dims(x, axis=0))
    embeddings = model_retrival.predict(batch)
    return embeddings[0, :]

def normalize(x, xmin, xmax):
    """Min-max scale the rows of ``x``, each with its own bounds.

    For every ``i`` in ``range(len(xmin))`` row ``i`` of ``x`` is mapped to
    ``(x[i, :] - xmin[i]) / (xmax[i] - xmin[i])``.

    Returns a list holding one scaled row per entry of ``xmin``.
    """
    scaled_rows = []
    for row in range(len(xmin)):
        shifted = x[row, :] - xmin[row]
        span = xmax[row] - xmin[row]
        scaled_rows.append(shifted / span)
    return scaled_rows

### Live

In [None]:
def process_frame(img):
    """Detect, label and annotate the faces found in one BGR video frame.

    At most ``people_max`` detections (sorted by their box coordinates) are
    handled. On every 4th call -- tracked by the global counter ``k`` -- each
    face crop is classified with ``model_image`` and the global list
    ``real_names`` is rebuilt with one ``(name, score)`` pair per face; on the
    other calls the labels from the last classification are reused. For each
    handled face a rectangle is drawn on ``img`` and, when a label exists at
    the same position in ``real_names``, its text is drawn above the box.

    Parameters
    ----------
    img : numpy.ndarray
        Frame in BGR channel order. It is annotated in place.

    Returns
    -------
    tuple
        ``(img, face)``: the annotated frame and the RGB crop of the last face
        handled, or the whole frame converted to RGB when no face was found.
    """
    global k
    global real_names

    people_max = 3          # upper bound on faces classified / drawn per frame
    color = (255, 0, 0)

    img_gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    # Fallback return value for frames without any detection.
    face = cv.cvtColor(img, cv.COLOR_BGR2RGB)

    detections = face_detector.detectMultiScale(img_gray)
    # The detector result is not always an ndarray when nothing is found (the
    # original guarded `.tolist()` with a bare except), so test the length
    # explicitly instead of swallowing every exception.
    if len(detections) > 0:
        faces = sorted(np.asarray(detections).tolist())
    else:
        faces = []
    # Apply the cap once so classification and drawing see the same faces.
    # Previously a shared counter was not reset between the two loops, which
    # hid boxes on every classification frame.
    faces = faces[:people_max]

    if k % 4 == 0:
        real_names = []

        for (x, y, w, h) in faces:
            # Crop with the box width on the x axis (was x+h).
            face = img[y:y+h, x:x+w, :]
            face = cv.cvtColor(face, cv.COLOR_BGR2RGB)

            img_pixels = cv.resize(face, (224, 224))
            img_pixels = img_to_array(img_pixels)
            img_pixels = np.expand_dims(img_pixels, axis=0)
            img_pixels = resnet50.preprocess_input(img_pixels)
            y_dist = model_image.predict(img_pixels)
            # NOTE(review): class 1 receives a fixed +0.2 boost (capped at
            # 0.9876) before the argmax -- looks like a hand-tuned
            # calibration; confirm it is intended.
            y_dist[0][1] = min(y_dist[0][1]+0.2, 0.9876)
            y_pred = np.argmax(y_dist)
            y_dist = y_dist[0, y_pred]

            if y_dist > 0.6:
                real_names.append((nomi[y_pred], round(y_dist, 4)))
            else:
                real_names.append(('unknown\n', round(1-y_dist, 4)))

    for j, (x, y, w, h) in enumerate(faces):
        # The colour conversion copies the crop, so the annotations drawn
        # below for this face do not end up in the returned `face`.
        face = img[y:y+h, x:x+w, :]
        face = cv.cvtColor(face, cv.COLOR_BGR2RGB)
        cv.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)

        # Labels are refreshed only every 4th frame, so there may be fewer
        # labels than faces; skip the text rather than catching IndexError.
        if j < len(real_names):
            cv.putText(img, (real_names[j][0] + ' ' + str(real_names[j][1])), (int(x), int(y-12)),
                       cv.FONT_HERSHEY_SIMPLEX, 1, color, 2)

    k += 1
    return img, face

In [None]:
duration = 2        # seconds of audio recorded per command
rec_rate = 44100    # recording sample rate in Hz
k = 0               # frame counter read and incremented by process_frame()
real_names = []     # labels of the last classification, owned by process_frame()

# Audio classifier output index -> (message shown on the video, retrieval mode).
_AUDIO_CLASSES = {
    0: ("Alex ha detto animale", "Animale"),
    3: ("Alex ha detto persona", "Persona"),
    4: ("Davide ha detto animale", "Animale"),
    1: ("Davide ha detto persona", "Persona"),
    2: ("Michela ha detto animale", "Animale"),
    5: ("Michela ha detto persona", "Persona"),
}


def _record_command(duration, rec_rate):
    """Record a clip, classify it and return ``(frase, status)``.

    ``frase`` is the message to overlay on the video (including the
    probability) and ``status`` is "Animale", "Persona" or "" when the
    speaker/command is not recognised.
    """
    print("Inizio a registrare\n")
    rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
    wav.write('test.wav', rate=rec_rate, data=(rec))

    file_rate, samples = wav.read('test.wav')
    # Pass the signal by keyword: newer librosa releases reject it positionally.
    mfcc = librosa.feature.mfcc(y=samples*1.0, sr=file_rate)
    mfcc = np.array(normalize(mfcc, xmin, xmax))
    # NOTE(review): reshape() reinterprets the buffer rather than transposing
    # it; presumably this mirrors the preprocessing used to train
    # model_audio -- confirm before changing.
    mfcc = mfcc.reshape(1, mfcc.shape[1], mfcc.shape[0])
    preds_audio = model_audio.predict(mfcc)

    if max(preds_audio[0]) < 0.60:
        # Not confident enough: report the complement of the best score.
        frase, status = "Sconosciuto", ""
        shown_prob = round(1-max(preds_audio[0]), 4)
    else:
        pred_audio = np.argmax(preds_audio)
        frase, status = _AUDIO_CLASSES.get(int(pred_audio), ("Sconosciuto", ""))
        shown_prob = round(preds_audio[0][pred_audio], 4)

    frase = frase + " con probabilita': " + str(shown_prob)
    return frase, status


def _show_matches(face, tree, labels, paths, heading):
    """Print and plot the 10 nearest neighbours of ``face`` found in ``tree``.

    ``labels`` and ``paths`` are indexed with the neighbour indices returned
    by ``tree.query``; ``heading`` prefixes the printed label list.
    """
    face = cv.resize(face, (160, 160))
    query_features = facenet(face).reshape(1, -1)
    dist, ind = tree.query(query_features, k=10)
    lista = [labels[el] for el in ind[0]]
    print(heading + str(lista) + "\n")

    fig = plt.figure(figsize=(25, 275))
    fig.add_subplot(1, 11, 1)
    plt.imshow(face)
    plt.axis('off')
    for i in range(10):
        image_ret = np.array(kimage.load_img(paths[ind[0][i]], target_size=(240, 240)))
        fig.add_subplot(1, 11, i+2)
        plt.imshow(image_ret)
        plt.axis('off')
    plt.show()
    # Release the figure so repeated queries do not accumulate open figures.
    plt.close(fig)


cap = cv.VideoCapture(0)
status = ""
frase = "Premere 'r' per registrare"

try:
    while True:

        r, frame = cap.read()
        if not r:
            # No frame delivered (camera missing or stream ended): stop
            # instead of passing None to process_frame().
            print("Impossibile leggere un frame dalla webcam")
            break

        frame, face = process_frame(frame)
        cv.putText(frame, frase, (15, 37), cv.FONT_HERSHEY_SIMPLEX, 0.75, (255,0,0), 2)
        cv.imshow('Video', frame)

        # Poll the keyboard once per frame. Two separate waitKey() calls let
        # either check consume the key press meant for the other.
        key = cv.waitKey(20) & 0xFF

        # Press R to record and classify a voice command.
        if key == ord('r'):
            frase, status = _record_command(duration, rec_rate)

            if status == "Animale":
                _show_matches(face, tree_dogs, labels_dogs, paths_dogs, "Cani: ")
            elif status == "Persona":
                _show_matches(face, tree_people, labels_people, paths_people, "VIP: ")

        # Press Q to stop streaming.
        if key == ord('q'):
            break
finally:
    # Always free the camera and close the window, even after an exception.
    cap.release()
    cv.destroyAllWindows()