In [4]:
import os

# Set cache directories for XDG and Hugging Face Hub 
# os.environ['XDG_CACHE_HOME'] = 'test/.cache'
# os.environ['HUGGINGFACE_HUB_CACHE'] = 'test/.cache'
from scipy.special import softmax

import torch

# Set device to GPU if available, otherwise use CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import seaborn as sns
from tqdm.notebook import tqdm

from moviepy.editor import VideoFileClip, ImageSequenceClip

import torch
from facenet_pytorch import (MTCNN)

from transformers import (AutoFeatureExtractor,
                          AutoModelForImageClassification,
                          AutoConfig)
                             
from PIL import Image, ImageDraw

def detect_emotions(image):
    """
    Detect the facial expression of a face found in an image.

    Runs the module-level ``mtcnn`` detector on ``image``, crops the first
    bounding box it returns, and classifies the crop with the module-level
    ``extractor`` / ``model`` pair.

    Parameters
    ----------
    image : PIL.Image.Image
        Frame to analyse. It is only read here (``detect`` and ``crop``),
        so no defensive copy is made.

    Returns
    -------
    tuple
        ``(face, class_probabilities)`` where ``face`` is the cropped image
        and ``class_probabilities`` maps every label in
        ``model.config.id2label`` to its softmax probability (a float).
        ``(None, None)`` when the detector reports no face.
    """
    # The first item of the detector result holds the bounding boxes;
    # it is None when nothing was detected.
    boxes = mtcnn.detect(image)[0]
    if boxes is None:
        return None, None

    # Crop the first reported face and pre-process it for the classifier.
    face = image.crop(boxes[0])
    inputs = extractor(images=face, return_tensors="pt")

    # Inference only: skip autograd bookkeeping so no graph is kept per frame.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the class axis, then take the single batch element.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()

    # The loaded model already carries the label mapping of its checkpoint,
    # so there is no need to re-load the configuration on every call.
    id2label = model.config.id2label

    class_probabilities = {
        id2label[index]: prob for index, prob in enumerate(probabilities)
    }
    return face, class_probabilities
# Face detector, placed on the device selected above.
mtcnn = MTCNN(
    device=device,
    keep_all=False,
    post_process=True,
    margin=10,
    factor=0.709,
    thresholds=[0.6, 0.7, 0.7],
)

# The feature extractor and the classifier are loaded from the same
# pre-trained checkpoint, so name it once.
_EXPRESSION_CHECKPOINT = "trpakov/vit-face-expression"
extractor = AutoFeatureExtractor.from_pretrained(_EXPRESSION_CHECKPOINT)
model = AutoModelForImageClassification.from_pretrained(_EXPRESSION_CHECKPOINT)

# Choose a frame
def video_prob(video_data, skips=2):
    """
    Summarise the facial-expression probabilities of a video.

    Every ``skips``-th frame is passed to ``detect_emotions``. Frames in
    which no face is found are left out of the average (previously they
    contributed a row of ``None`` values, which made ``np.mean`` raise).

    Parameters
    ----------
    video_data : sequence of numpy.ndarray
        Video frames, indexable and sliceable along the first axis.
        Each frame is cast to ``uint8`` before being turned into an image.
    skips : int, optional
        Sampling stride over the frames. Defaults to 2 (every other frame).

    Returns
    -------
    numpy.ndarray
        Softmax of the per-class mean probability over all sampled frames
        that contained a face, in the order the classes are returned by
        ``detect_emotions``. If no sampled frame contained a face (or the
        video is empty) a NaN vector with one entry per emotion is returned.

    Raises
    ------
    ValueError
        If ``skips`` is smaller than 1.
    """
    if skips < 1:
        raise ValueError("skips must be a positive integer")

    # Emotion classes; only used to size the "no face anywhere" result.
    emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

    # One row of class probabilities per sampled frame that contains a face.
    all_class_probabilities = []

    for frame in video_data[::skips]:
        # PIL needs 8-bit data to build an image from the raw frame.
        frame = frame.astype(np.uint8)

        face, class_probabilities = detect_emotions(Image.fromarray(frame))

        # No face in this frame: nothing to average, move on.
        if face is None:
            continue

        all_class_probabilities.append(list(class_probabilities.values()))

    # Nothing usable in the whole clip: signal it explicitly with NaNs.
    if not all_class_probabilities:
        return np.full(len(emotions), np.nan)

    all_class_probabilities = np.asarray(all_class_probabilities, dtype=float)
    vid_prob = np.mean(all_class_probabilities, axis=0)

    # NOTE(review): vid_prob is a mean of softmax outputs, i.e. already a
    # probability vector; the extra softmax flattens it. Kept so existing
    # consumers see the same values - confirm whether it is intended.
    return softmax(vid_prob)

Running on device: cpu


Matplotlib created a temporary cache directory at /var/folders/y8/y2lrp6t51wz4np0f_f4kstnh0000gq/T/matplotlib-9hm602hy because the default path (/Users/arjiv_admin/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
Matplotlib is building the font cache; this may take a moment.


In [5]:
vid_prob = []
vid_files = []  # file name for each entry of vid_prob, in the same order
vid_emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

directory = '/Users/arjiv_admin/Desktop/Emotion_POC/Video Segments'
# os.listdir returns entries in arbitrary order; sort so runs are
# reproducible and results can be matched back to their files.
for filename in sorted(os.listdir(directory)):
    # Only process files with an MP4 extension (case-insensitive)
    if not filename.lower().endswith('.mp4'):
        continue

    link = os.path.join(directory, filename)
    clip = VideoFileClip(link)
    try:
        vid_fps = clip.fps
        video = clip.without_audio()
        # Materialise all frames before the reader is closed.
        video_data = np.array(list(video.iter_frames()))
    finally:
        # Release the underlying reader even if decoding fails or the
        # cell is interrupted.
        clip.close()

    prob = video_prob(video_data)
    vid_prob.append(prob)
    vid_files.append(filename)

KeyboardInterrupt: 

In [2]:
from transformers import AutoModelForAudioClassification
import librosa, torch

In [6]:
directory = '/Users/arjiv_admin/Desktop/Emotion_POC/Audio Segments'
modelDom = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Dominance", trust_remote_code=True)
modelVal = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Valence", trust_remote_code=True)
modelAro = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Arousal", trust_remote_code=True)

# Normalisation statistics, read from the dominance model's configuration.
# NOTE(review): the same mean/std (and sampling rate) are applied to the
# valence and arousal models as well - confirm their configs agree.
mean = modelDom.config.mean
std = modelDom.config.std
Audio_pred = []
Audio_files = []  # file name for each entry of Audio_pred, in the same order
Audio_emotions = ["Dominance", "Valence", "Arousal"]
# os.listdir returns entries in arbitrary order; sort so runs are
# reproducible and predictions can be matched back to their files.
for filename in sorted(os.listdir(directory)):
    # Only process files with an MP3 extension (case-insensitive)
    if not filename.lower().endswith('.mp3'):
        continue

    audio_path = os.path.join(directory, filename)
    raw_wav, _ = librosa.load(audio_path, sr=modelDom.config.sampling_rate)

    # Normalise the audio by mean/std (epsilon guards against a zero std)
    norm_wav = (raw_wav - mean) / (std+0.000001)

    # Generate the mask: all ones, one entry per sample
    mask = torch.ones(1, len(norm_wav))

    # Batch it (add a leading dimension)
    wavs = torch.tensor(norm_wav).unsqueeze(0)

    # Inference only: without no_grad every stored prediction would keep
    # its whole autograd graph alive for the rest of the session.
    with torch.no_grad():
        pred = [modelDom(wavs, mask), modelVal(wavs,mask), modelAro(wavs,mask)]
    Audio_pred.append(pred)
    Audio_files.append(filename)

Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

: 