In [7]:
import os

# Set cache directories for XDG and Hugging Face Hub 
# os.environ['XDG_CACHE_HOME'] = 'test/.cache'
# os.environ['HUGGINGFACE_HUB_CACHE'] = 'test/.cache'
from scipy.special import softmax

import torch

# Set device to GPU if available, otherwise use CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import seaborn as sns
from tqdm.notebook import tqdm

from moviepy.editor import VideoFileClip, ImageSequenceClip

import torch
from facenet_pytorch import (MTCNN)

from transformers import (AutoFeatureExtractor,
                          AutoModelForImageClassification,
                          AutoConfig)
                             
from PIL import Image, ImageDraw

def detect_emotions(image):
    """
    Detect emotions from a given image.
    Returns a tuple of the cropped face image and a
    dictionary of class probabilities.
    """
    temporary = image.copy()

    # Detect faces in the image using the MTCNN group model
    sample = mtcnn.detect(temporary)
    if sample[0] is not None:
        box = sample[0][0]

        # Crop the face
        face = temporary.crop(box)

        # Pre-process the face
        inputs = extractor(images=face, return_tensors="pt")

        # Run the image through the model
        outputs = model(**inputs)

        # Apply softmax to the logits to get probabilities
        probabilities = torch.nn.functional.softmax(outputs.logits,
                                                    dim=-1)

        # Retrieve the id2label attribute from the configuration
        config = AutoConfig.from_pretrained(
            "trpakov/vit-face-expression"
        )
        id2label = config.id2label

        # Convert probabilities tensor to a Python list
        probabilities = probabilities.detach().numpy().tolist()[0]

        # Map class labels to their probabilities
        class_probabilities = {
            id2label[i]: prob for i, prob in enumerate(probabilities)
        }

        return face, class_probabilities
    return None, None
mtcnn = MTCNN(
    margin=10,
    thresholds=[0.6, 0.7, 0.7],
    factor=0.709,
    post_process=True,
    keep_all=False,
    device=device
)
    
# Load the pre-trained model and feature extractor
extractor = AutoFeatureExtractor.from_pretrained(
    "trpakov/vit-face-expression"
)
model = AutoModelForImageClassification.from_pretrained(
    "trpakov/vit-face-expression"
)

# Choose a frame
def video_prob(video_data) :    
    skips = 2
    reduced_video = []

    for i in range(0, len(video_data), skips):
        reduced_video.append(video_data[i])

# Define a list of emotions
    emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]


# Create a list to hold the class probabilities for all frames
    all_class_probabilities = []

# Loop over video frames
    for i, frame in enumerate(reduced_video):
    # Convert frame to uint8
        frame = frame.astype(np.uint8)

    # Call detect_emotions to get face and class probabilities
        face, class_probabilities = detect_emotions(Image.fromarray(frame))
    
    # If a face was found
        if face is None:
            class_probabilities = {emotion: None for emotion in emotions}
        
    # Append class probabilities to the list
        all_class_probabilities.append(list(class_probabilities.values()))
    all_class_probabilities = np.asarray(all_class_probabilities)
    vid_prob = np.mean(all_class_probabilities,axis = 0)

    return softmax(vid_prob)

Running on device: cpu


Matplotlib created a temporary cache directory at /var/folders/y8/y2lrp6t51wz4np0f_f4kstnh0000gq/T/matplotlib-9t282jps because the default path (/Users/arjiv_admin/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
Matplotlib is building the font cache; this may take a moment.


In [8]:
vid_prob = []
vid_emotions = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

directory = '/Users/arjiv_admin/Desktop/Emotion_POC/Video Segments'
files = [f for f in os.listdir(directory) if f.lower().endswith('.mp4')]
files.sort(key=lambda f: int(f.split('_')[1].split('.')[0]))
for filename in files:
        print(f"Processing file: {filename}")
        link = os.path.join(directory, filename)
        clip = VideoFileClip(link)
        vid_fps = clip.fps
        video = clip.without_audio()
        video_data = np.array(list(video.iter_frames()))
        prob = video_prob(video_data)
        vid_prob.append(prob)

Processing file: segment_0000.mp4
Processing file: segment_0001.mp4
Processing file: segment_0002.mp4
Processing file: segment_0003.mp4
Processing file: segment_0004.mp4
Processing file: segment_0005.mp4
Processing file: segment_0006.mp4


In [5]:
import os
import torch
import librosa
import numpy as np
from transformers import AutoModelForAudioClassification

os.environ["TOKENIZERS_PARALLELISM"] = "false"

directory = '/Users/arjiv_admin/Desktop/Emotion_POC/Audio Segments'

modelDom = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Dominance", trust_remote_code=True)
modelVal = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Valence", trust_remote_code=True)
modelAro = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Arousal", trust_remote_code=True)

Audio_pred = []
Audio_emotions = ["Dominance", "Valence", "Arousal"]

def Audio_emotion(model, audio_path):
    mean = model.config.mean
    std = model.config.std
    
    raw_wav, sr = librosa.load(audio_path, sr=model.config.sampling_rate)
    
    # normalize the audio by mean/std
    norm_wav = (raw_wav - mean) / (std + 0.000001)
    
    # generate the mask
    mask = torch.ones(1, len(norm_wav))
    
    # batch it (add dim)
    wavs = torch.tensor(norm_wav).unsqueeze(0)
    
    output = model(wavs, mask)
    return output.item()  # Return the scalar value

files = [f for f in os.listdir(directory) if f.lower().endswith('.mp3')]
files.sort(key=lambda f: int(f.split('_')[1].split('.')[0]))

for filename in files:
    print(f"Processing file: {filename}")
    audio_path = os.path.join(directory, filename)
    with torch.no_grad():
        pred_dom = Audio_emotion(modelDom, audio_path)
        pred_val = Audio_emotion(modelVal, audio_path)
        pred_aro = Audio_emotion(modelAro, audio_path)

    pred = [pred_dom, pred_val, pred_aro]
    Audio_pred.append(pred)



Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

Processing file: segment_0000.mp3
Predictions: [0.3891616761684418, 0.32505497336387634, 0.44186151027679443]
Processing file: segment_0001.mp3
Predictions: [0.6499191522598267, 0.15794216096401215, 0.692778468132019]
Processing file: segment_0002.mp3
Predictions: [0.6463385820388794, 0.2735784351825714, 0.7189842462539673]
Processing file: segment_0003.mp3
Predictions: [0.2887377142906189, 0.24893343448638916, 0.3193511366844177]
Processing file: segment_0004.mp3
Predictions: [0.7332590818405151, 0.31650781631469727, 0.7603095769882202]
Processing file: segment_0005.mp3
Predictions: [0.12423272430896759, 0.27066493034362793, 0.1293012946844101]
Processing file: segment_0006.mp3
Predictions: [0.15822279453277588, 0.3611600399017334, 0.10886049270629883]
Final Audio_pred: [[0.3891616761684418, 0.32505497336387634, 0.44186151027679443], [0.6499191522598267, 0.15794216096401215, 0.692778468132019], [0.6463385820388794, 0.2735784351825714, 0.7189842462539673], [0.2887377142906189, 0.248933

In [13]:
len(Audio_pred)

7

In [9]:
import whisper

model = whisper.load_model("base")


In [10]:
from transformers import pipeline

classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [11]:
directory = '/Users/arjiv_admin/Desktop/Emotion_POC/Audio Segments'
text_emotions = []
text_preds = []
files = [f for f in os.listdir(directory) if f.lower().endswith('.mp3')]
files.sort(key=lambda f: int(f.split('_')[1].split('.')[0]))

for filename in files:
    print(f"Processing file: {filename}")
    audio_path = os.path.join(directory, filename)
    result = model.transcribe(audio_path)
    model_outputs = classifier(result["text"])
    text_emotions = [d["label"] for d in model_outputs[0]]
    preds = [d["score"] for d in model_outputs[0]]
    text_preds.append(preds)

Processing file: segment_0000.mp3
Processing file: segment_0001.mp3
Processing file: segment_0002.mp3
Processing file: segment_0003.mp3
Processing file: segment_0004.mp3
Processing file: segment_0005.mp3
Processing file: segment_0006.mp3


In [15]:
import pandas as pd

df_text = pd.DataFrame(text_preds, columns= text_emotions)
df_audio = pd.DataFrame(Audio_pred, columns = Audio_emotions)
df_vid = pd.DataFrame(vid_prob, columns= vid_emotions)
df_text = df_text.add_prefix('text_')
df_audio = df_audio.add_prefix('audio_')
df_vid = df_vid.add_prefix('vid_')
df = pd.concat([df_text,df_audio, df_vid], axis=1)

print(df)


   text_neutral  text_annoyance  text_approval  text_desire  text_curiosity  \
0      0.615272        0.246925       0.099582     0.033347        0.016836   
1      0.347312        0.318416       0.102074     0.075763        0.042557   
2      0.380562        0.365789       0.095972     0.081652        0.054079   
3      0.364445        0.263701       0.147930     0.128249        0.067386   
4      0.699225        0.154372       0.061320     0.018445        0.018356   
5      0.613640        0.283586       0.064604     0.031193        0.024691   
6      0.941637        0.011763       0.009231     0.007006        0.006505   

   text_anger  text_confusion  text_realization  text_disapproval  \
0    0.016201        0.010918          0.007626          0.006645   
1    0.029315        0.019223          0.014425          0.008744   
2    0.020931        0.017390          0.010895          0.008928   
3    0.048579        0.045061          0.022408          0.009948   
4    0.016992        0