# Import dependencies and load data

In [1]:
!pip install opencv-python
!pip install mediapipe
!pip install moviepy
!pip install SpeechRecognition
!pip install librosa
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mediapipe
  Downloading mediapipe-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.9/33.9 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.0 sounddevice-0.4.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import cv2
import mediapipe as mp
import numpy as np
import glob
from scipy.spatial import ConvexHull
import math
import csv
from os import path
import json

from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import librosa
from moviepy.editor import VideoFileClip
import speech_recognition as sr
import moviepy.editor as m_editor

import pandas as pd

In [3]:
## Connect to Google Drive (requires the google.colab runtime); after mounting,
## Drive content is reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Input/output locations for the single-video pipeline.
# Google Drive variants, kept for reference:
# video = "/content/drive/MyDrive/interview.mp4"
# video_cut = "/content/drive/MyDrive/interview-cut.mp4"
# video_out = "/content/drive/MyDrive/interview-output.mp4"
# output_csv = "/content/drive/MyDrive/interview.csv"
# NOTE(review): `video` is rooted two directory levels up while the other
# paths are rooted one level up — confirm this is intentional.
video = '../../data/CHR/3035post.MTS'
video_cut = '../data/3035post-cut.mp4'  # read by the transcript/acoustic cell
video_out = '../output/3035post-output.mp4'
output_csv = '../output/3035post-gesture.csv'
output_audio_csv = '../output/3035post-acoustic.csv'  # written by the transcript/acoustic cell

In [5]:
# Directories for the landmark-based pipeline (Google Drive paths).
# output_dir is searched for "*-landmarks.csv" files and receives the
# "<name>-features.csv" results. Paths are built by plain string
# concatenation, so the trailing slash is required.
input_dir = "/content/drive/MyDrive/train/"
output_dir = "/content/drive/MyDrive/train-landmarks/"

In [6]:
# Initialize MediaPipe's Holistic module
# Short aliases for the MediaPipe solution modules.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
mp_face_mesh = mp.solutions.face_mesh
# Both models are created for streaming input (static_image_mode=False) with
# 0.5 confidence thresholds; FaceMesh is limited to a single face.
# NOTE(review): none of these objects is referenced by the cells visible in
# this notebook — confirm they are still needed.
holistic = mp_holistic.Holistic(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5)

## Define Functions



In [7]:
## Dataframe functions

# function to split values in dataframe
def split_xyz(x):
  """Parse a serialized landmark into its ``(x, y, z)`` float components.

  The value is expected to be a string whose first three lines each have the
  form ``<label>: <number>``; the number after the first ``:`` on each line is
  converted to ``float``.

  Args:
    x: Cell value to parse (normally a ``str``; missing cells arrive as NaN).

  Returns:
    A tuple ``(x, y, z)`` of floats, or ``(nan, nan, nan)`` when the value is
    not a string or does not follow the expected layout.
  """
  try:
    lines = x.split('\n')

    x_val = float(lines[0].split(':')[1].strip())
    y_val = float(lines[1].split(':')[1].strip())
    z_val = float(lines[2].split(':')[1].strip())
    return x_val, y_val, z_val
  except (AttributeError, IndexError, TypeError, ValueError):
    # Only parsing failures are treated as "missing landmark"; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    return np.nan, np.nan, np.nan

In [8]:
## Holistic functions

# Function to calculate the Euclidean distance between two points
def euclidean_distance(p1x, p1y, p2x, p2y):
    """Return the straight-line distance between points (p1x, p1y) and (p2x, p2y)."""
    delta_x = p1x - p2x
    delta_y = p1y - p2y
    squared_length = delta_x ** 2 + delta_y ** 2
    return np.sqrt(squared_length)

# Function to calculate the openness of a pose
def pose_openness(keypoints_x, keypoints_y, image_w, image_h):
    """Measure how "open" a pose is as the area of the keypoints' convex hull.

    Args:
        keypoints_x: Iterable of x coordinates, scaled by ``image_w``.
        keypoints_y: Iterable of y coordinates, scaled by ``image_h``.
        image_w: Horizontal scale factor applied to every x coordinate.
        image_h: Vertical scale factor applied to every y coordinate.

    Returns:
        ``ConvexHull.volume`` of the scaled, integer-truncated points (for
        2-D input SciPy reports the enclosed area in this attribute).
    """
    pixel_points = []
    for norm_x, norm_y in zip(keypoints_x, keypoints_y):
        # Scale to pixel units and truncate to whole pixels.
        pixel_points.append((int(norm_x * image_w), int(norm_y * image_h)))

    return ConvexHull(np.array(pixel_points)).volume

# Function to calculate leaning direction
def leaning_direction(nose_z, lhip_z, rhip_z):
    """Classify the lean from the nose depth relative to the mean hip depth.

    Returns ``"Backward"`` when ``nose_z`` is strictly smaller than the average
    of the two hip z values, and ``"Forward"`` otherwise (including ties).

    NOTE(review): MediaPipe documents smaller z as closer to the camera, which
    would make a smaller nose z a lean *towards* the camera — confirm that the
    two labels are not swapped.
    """
    hip_depth = (lhip_z + rhip_z) / 2
    return "Backward" if nose_z < hip_depth else "Forward"

# Function to calculate head direction
def head_direction(prev_nose_x, prev_nose_y, curr_nose_x, curr_nose_y, image_w, image_h):
    """Classify the nose movement between two frames.

    Coordinates are scaled by the image size and truncated to whole pixels, so
    sub-pixel jitter is reported as ``'STILL'``.

    Args:
        prev_nose_x, prev_nose_y: Nose position in the previous frame.
        curr_nose_x, curr_nose_y: Nose position in the current frame.
        image_w, image_h: Scale factors applied to the x and y coordinates.

    Returns:
        ``(horizontal, vertical)`` where horizontal is ``'RIGHT'``, ``'LEFT'``
        or ``'STILL'`` and vertical is ``'UP'``, ``'DOWN'`` or ``'STILL'``.
        Directions are expressed in image space (as seen on screen).
    """
    delta_x = int(curr_nose_x * image_w) - int(prev_nose_x * image_w)
    delta_y = int(curr_nose_y * image_h) - int(prev_nose_y * image_h)

    horizontal = 'STILL'
    if delta_x > 0:
        horizontal = "RIGHT"
    elif delta_x < 0:
        horizontal = 'LEFT'

    # Image y grows downwards, so a larger y means the nose moved DOWN.
    vertical = 'STILL'
    if delta_y > 0:
        vertical = 'DOWN'
    elif delta_y < 0:
        vertical = 'UP'

    return horizontal, vertical

# Function to calculate angle between three landmarks
def calculate_angle(l1x, l1y, l2x, l2y, l3x, l3y):
    """Return the angle at landmark 2, measured from landmark 1 to landmark 3.

    The result is in degrees; negative values are shifted up by 360 so that
    they become positive.
    """
    heading_to_l3 = math.atan2(l3y - l2y, l3x - l2x)
    heading_to_l1 = math.atan2(l1y - l2y, l1x - l2x)
    angle = math.degrees(heading_to_l3 - heading_to_l1)

    # Shift negative results into the positive range.
    return angle + 360 if angle < 0 else angle

# Function to calculate angle between three landmarks using numpy module
def numpy_angle(l1x,l1y,l2x,l2y,l3x,l3y, width, height):
  """Return the unsigned angle (degrees) at landmark 2 between landmarks 1 and 3.

  Args:
    l1x, l1y: Coordinates of the first landmark.
    l2x, l2y: Coordinates of the vertex landmark.
    l3x, l3y: Coordinates of the third landmark.
    width, height: Scale factors applied to the x and y coordinates so the
      angle is measured in pixel space rather than in the normalized space.

  Returns:
    The angle in degrees, in ``[0, 180]``. ``nan`` is returned when either
    segment has zero length (the angle is undefined) or an input is NaN.
  """
  a = np.array([l1x * width, l1y * height])
  b = np.array([l2x * width, l2y * height])
  c = np.array([l3x * width, l3y * height])

  ba = a - b
  bc = c - b

  norm_product = np.linalg.norm(ba) * np.linalg.norm(bc)
  if norm_product == 0:
    # A landmark coincides with the vertex: no direction, hence no angle.
    return np.nan

  # Rounding can push the ratio marginally outside [-1, 1] for (anti)parallel
  # segments, which would make arccos return NaN; clamp it back first.
  cosine_angle = np.clip(np.dot(ba, bc) / norm_product, -1.0, 1.0)

  return np.degrees(np.arccos(cosine_angle))

# Function to calculate hand orientation
def orientation(l0x, l0y, l9x, l9y):
    """Classify the direction of the segment from point 0 to point 9.

    Returns ``"Right"``/``"Left"`` when the segment is at most 45 degrees off
    the horizontal, ``"Up"``/``"Down"`` when it is steeper, and ``None`` when
    the slope cannot be compared (e.g. NaN coordinates).
    """
    dx = l9x - l0x
    dy = l9y - l0y

    # A near-vertical segment is given an effectively infinite slope, which
    # also avoids dividing by a value close to zero.
    slope = 1000000000 if abs(dx) < 0.05 else abs(dy / dx)

    if 0 <= slope <= 1:
        return "Right" if l9x > l0x else "Left"

    if slope > 1:
        # y decreases upwards
        return "Up" if l9y < l0y else "Down"

    return None

# Initialize Variables

In [20]:
## Holistic
# Cross-frame state read and updated by the feature-extraction loop.
prev_row = pd.Series(dtype='float64')  # stays empty until a first frame has been processed
total_movement, frame_movement, openness_value = 0, 0, 0
head_horizontal = head_vertical = ""

# Minimum summed keypoint displacement for a frame to count as movement;
# adjust to fine-tune movement detection sensitivity.
holistic_threshold = 0.001

# Keypoints whose frame-to-frame displacement is summed into the movement score.
holistic_keypoints1 = ['PL-LEFT_WRIST', 'PL-RIGHT_WRIST', 'PL-LEFT_ANKLE', 'PL-RIGHT_ANKLE']

# Keypoints whose convex hull defines the pose openness.
holistic_keypoints2 = [
    'PL-LEFT_SHOULDER', 'PL-RIGHT_SHOULDER',
    'PL-LEFT_HIP', 'PL-RIGHT_HIP',
    'PL-LEFT_WRIST', 'PL-RIGHT_WRIST',
]

## Arms
left_arm_movement = right_arm_movement = 0
la_counter = ra_counter = 0
la_orientation = la_leaning = None
ra_orientation = ra_leaning = None

## Hand
lh_tip_distance = rh_tip_distance = 0
lh_state = lh_orientation = None
rh_state = rh_orientation = None

# Number of frames in the moving-average window
window_size = 10

In [10]:
# Landmark columns whose serialized value is expanded into -X/-Y/-Z columns.
# "PL-" entries are pose landmarks, "LH-"/"RH-" entries are left/right hand landmarks.
split_columns = [
    # pose
    'PL-LEFT_WRIST', 'PL-RIGHT_WRIST',
    'PL-LEFT_ELBOW', 'PL-RIGHT_ELBOW',
    'PL-LEFT_SHOULDER', 'PL-RIGHT_SHOULDER',
    'PL-LEFT_ANKLE', 'PL-RIGHT_ANKLE',
    'PL-LEFT_HIP', 'PL-RIGHT_HIP',
    'PL-NOSE',
    # hands
    'LH-INDEX_FINGER_TIP', 'RH-INDEX_FINGER_TIP',
    'LH-THUMB_TIP', 'RH-THUMB_TIP',
    'LH-MIDDLE_FINGER_TIP', 'RH-MIDDLE_FINGER_TIP',
    'LH-MIDDLE_FINGER_MCP', 'RH-MIDDLE_FINGER_MCP',
    'LH-RING_FINGER_TIP', 'RH-RING_FINGER_TIP',
    'LH-RING_FINGER_MCP', 'RH-RING_FINGER_MCP',
    'LH-PINKY_TIP', 'RH-PINKY_TIP',
    'LH-PINKY_MCP', 'RH-PINKY_MCP',
    'LH-WRIST', 'RH-WRIST',
]



In [11]:
# Column names of the per-frame features CSV, in the order the values are written.
csv_header = [
    'frame',
    # whole-body features
    'total_movement_per_second', 'pose_openness', 'leaning',
    # head
    'head_horizontal', 'head_vertical',
    # arms
    'left_arm_angle', 'left_arm_v_movement', 'left_arm_h_movement',
    'right_arm_angle', 'right_arm_v_movement', 'right_arm_h_movement',
    # hands
    'left_hand_orientation', 'left_hand_state',
    'right_hand_orientation', 'right_hand_state',
]

# Load landmarks CSV and extract Gesture data

In [12]:
# Discover the landmark CSVs to process. glob yields matches in arbitrary,
# filesystem-dependent order, so sort the names to make runs reproducible.
pattern = output_dir+"*-landmarks.csv"
files = sorted(path.basename(x) for x in glob.glob(pattern))
print(files)

['couch-landmarks.csv', 'interview-landmarks.csv']


In [21]:
# For every "<name>-landmarks.csv" in output_dir, compute per-frame gesture
# features and write them to "<name>-features.csv" in the same directory.
for file in files:
  # Derive the output name by stripping the "-landmarks.csv" suffix. Splitting
  # on the first '.'/'-' truncates names that contain those characters
  # (e.g. "my-video-landmarks.csv" -> "my") and lets outputs overwrite each other.
  landmarks_suffix = '-landmarks.csv'
  if file.endswith(landmarks_suffix):
    filename = file[:-len(landmarks_suffix)]
  else:
    filename = file.split('.')[0].split('-')[0]
  features_csv = output_dir+filename+'-features.csv'

  # Reset the cross-frame state for every video. Without this, the first frame
  # of a video is compared against the last frame of the previous one, the
  # movement total keeps accumulating across videos, and re-running the cell
  # continues from whatever the previous run left behind.
  prev_row = pd.Series(dtype='float64')
  total_movement = 0
  head_horizontal = ""
  head_vertical = ""
  la_orientation = None
  ra_orientation = None
  la_counter = 0
  ra_counter = 0

  # load dataframe
  df = pd.read_csv(output_dir+file)

  # split each serialized landmark column into numeric -X/-Y/-Z columns
  for column in split_columns:
    df[[column+'-X', column+'-Y', column+'-Z']] = df[column].apply(lambda x: pd.Series(split_xyz(x)))
  
  # drop original columns
  df.drop(columns=split_columns, inplace=True)

  # handle missing values by interpolation (at most 10 consecutive gaps are filled)
  # NOTE(review): landmarks that are still NaN afterwards make the int()
  # conversions in pose_openness/head_direction raise — confirm inputs are dense enough.
  df.interpolate(method ='linear', limit_direction ='both', inplace=True, limit=10)

  # split the dataframes: frame metadata is kept as-is, landmarks get smoothed
  df2 = df.loc[:, ~df.columns.isin(["width", "height", "fps", "frame"])]
  df1 = df.loc[:, df.columns.isin(["width", "height", "fps", "frame"])]

  # moving average calculation
  df2 = df2.rolling(window_size, min_periods=1).mean()

  # concatenate dataframes back together
  df = pd.concat([df1, df2], axis=1)


  with open(features_csv, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_header)

    for idx, row in df.iterrows():

      ## Holistic movements
      # Calculate the total movement
      if not prev_row.empty:
        frame_movement = 0
        for kp1 in holistic_keypoints1:
          distance = euclidean_distance(row[kp1+'-X'], row[kp1+'-Y'], prev_row[kp1+'-X'], prev_row[kp1+'-Y'])
          frame_movement += distance
        # ignore displacement below the threshold (treated as jitter)
        if frame_movement > holistic_threshold:
          total_movement += frame_movement
        
        # calculate head direction
        head_horizontal, head_vertical = head_direction(prev_row['PL-NOSE-X'], prev_row['PL-NOSE-Y'], row['PL-NOSE-X'], row['PL-NOSE-Y'], row['width'], row['height'])
      
      # Calculate the pose openness
      kp2_x = [ row[kp2+'-X'] for kp2 in holistic_keypoints2]
      kp2_y = [ row[kp2+'-Y'] for kp2 in holistic_keypoints2]
      openness_value = pose_openness(kp2_x, kp2_y, row['width'], row['height'])

      # Calculate the leaning direction
      leaning_dir = leaning_direction(row['PL-NOSE-Z'], row['PL-LEFT_HIP-Z'], row['PL-RIGHT_HIP-Z'])
      
      ## Hand movements
      # A hand is CLOSED when the middle, ring and pinky tips are above their
      # MCP joints (smaller y) and the index tip is within 0.015 of the thumb tip.
      lh_tip_distance = euclidean_distance(row['LH-INDEX_FINGER_TIP-X'], row['LH-INDEX_FINGER_TIP-Y'], row['LH-THUMB_TIP-X'], row['LH-THUMB_TIP-Y'])
      if row['LH-MIDDLE_FINGER_TIP-Y'] < row['LH-MIDDLE_FINGER_MCP-Y'] and row['LH-RING_FINGER_TIP-Y'] < row['LH-RING_FINGER_MCP-Y'] \
        and row['LH-PINKY_TIP-Y'] < row['LH-PINKY_MCP-Y'] and lh_tip_distance < 0.015:
        lh_state = 'CLOSED'
      else:
        lh_state = 'OPEN'
      lh_orientation = orientation(row['LH-WRIST-X'], row['LH-WRIST-Y'], row['LH-MIDDLE_FINGER_MCP-X'], row['LH-MIDDLE_FINGER_MCP-Y'])

      rh_tip_distance = euclidean_distance(row['RH-INDEX_FINGER_TIP-X'], row['RH-INDEX_FINGER_TIP-Y'], row['RH-THUMB_TIP-X'], row['RH-THUMB_TIP-Y'])
      if row['RH-MIDDLE_FINGER_TIP-Y'] < row['RH-MIDDLE_FINGER_MCP-Y'] and row['RH-RING_FINGER_TIP-Y'] < row['RH-RING_FINGER_MCP-Y'] \
        and row['RH-PINKY_TIP-Y'] < row['RH-PINKY_MCP-Y'] and rh_tip_distance < 0.015:
        rh_state = 'CLOSED'
      else:
        rh_state = 'OPEN'
      rh_orientation = orientation(row['RH-WRIST-X'], row['RH-WRIST-Y'], row['RH-MIDDLE_FINGER_MCP-X'], row['RH-MIDDLE_FINGER_MCP-Y'])


      ## Arm movements
      # Calculate whether the arm is up or down from the elbow angle
      la_angle = numpy_angle(row['PL-LEFT_WRIST-X'], row['PL-LEFT_WRIST-Y'], row['PL-LEFT_ELBOW-X'], row['PL-LEFT_ELBOW-Y'], row['PL-LEFT_SHOULDER-X'], row['PL-LEFT_SHOULDER-Y'], row['width'], row['height'])
      ra_angle = numpy_angle(row['PL-RIGHT_WRIST-X'], row['PL-RIGHT_WRIST-Y'], row['PL-RIGHT_ELBOW-X'], row['PL-RIGHT_ELBOW-Y'], row['PL-RIGHT_SHOULDER-X'], row['PL-RIGHT_SHOULDER-Y'], row['width'], row['height'])

      # The orientation is sticky: it only flips to DOWN above 160 degrees and
      # back to UP below 30 degrees; each DOWN -> UP transition is counted.
      if la_angle > 160:
        la_orientation = "DOWN"
      if la_angle < 30 and la_orientation =='DOWN':
        la_orientation="UP"
        la_counter +=1

      if ra_angle > 160:
        ra_orientation = "DOWN"
      if ra_angle < 30 and ra_orientation =='DOWN':
        ra_orientation="UP"
        ra_counter +=1

      # Calculate whether the arm is leaning forward (wrist further from z=0 than the elbow)
      if abs(row['PL-LEFT_WRIST-Z']) > abs(row['PL-LEFT_ELBOW-Z']) :
        la_leaning = 'FORWARD'
      else:
        la_leaning = 'NOT FORWARD'

      if abs(row['PL-RIGHT_WRIST-Z']) > abs(row['PL-RIGHT_ELBOW-Z']):
        ra_leaning = 'FORWARD'
      else:
        ra_leaning = 'NOT FORWARD'
      
      # write features to csv file
      # NOTE(review): the value written under 'total_movement_per_second' is the
      # running total for the video, not a per-second rate — confirm the intent.
      writer.writerow([row['frame'],
                          total_movement, 
                          openness_value,
                          leaning_dir,
                          head_horizontal,
                          head_vertical,
                          la_angle, 
                          la_orientation, 
                          la_leaning, 
                          ra_angle, 
                          ra_orientation, 
                          ra_leaning, 
                          lh_orientation, 
                          lh_state, 
                          rh_orientation, 
                          rh_state
                      ])
      
      # save previous row
      prev_row = row


# Transcript & Acoustic

In [None]:
# Length of each analysis window, in seconds.
CHUNK_SECONDS = 10

# Load the video
clip = VideoFileClip(video_cut)

# Initialize the speech recognizer (reused for every chunk)
r = sr.Recognizer()

try:
    # Open the output CSV once and keep it open for the whole run
    with open(output_audio_csv, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["time", "avg_pitch", "avg_intensity", "transcription"])

        # Process the audio one chunk at a time; `i` is the chunk start in seconds
        for i in range(0, int(clip.duration), CHUNK_SECONDS):
            # Clamp the window so the last chunk never reaches past the end of the clip
            chunk_end = min(i + CHUNK_SECONDS, clip.duration)
            audio_segment = clip.subclip(i, chunk_end).audio
            if audio_segment is None:
                continue
            audio_segment.write_audiofile("temp_audio.wav")

            # Load the audio file with librosa
            y, sampling_rate = librosa.load("temp_audio.wav")

            # Calculate pitch with librosa
            pitches, magnitudes = librosa.piptrack(y=y, sr=sampling_rate)

            # Calculate average pitch and intensity for this chunk
            # NOTE(review): piptrack leaves every non-peak bin at 0, so these plain
            # means are dominated by zeros — confirm this is the intended statistic.
            avg_pitch = pitches.mean()
            avg_intensity = magnitudes.mean()

            # Transcribe the audio with SpeechRecognition
            with sr.AudioFile("temp_audio.wav") as source:
                audio = r.record(source, duration=CHUNK_SECONDS)  # read the whole chunk
                try:
                    transcription = r.recognize_google(audio)
                except sr.UnknownValueError:
                    # speech was unintelligible
                    transcription = ""
                except sr.RequestError as e:
                    print(f"Could not request results from Google Speech Recognition service; {e}")
                    transcription = ""

            # Write the features to the CSV; flush so finished chunks survive a later failure
            writer.writerow([i, avg_pitch, avg_intensity, transcription])
            file.flush()
finally:
    # Release the ffmpeg reader processes held by the clip
    clip.close()

# Cross-Modality Features

In [None]:
# Load the gesture and acoustic feature tables from the locations defined in
# the configuration cell instead of repeating the literal paths here.
df_gesture = pd.read_csv(output_csv)
df_acoustic = pd.read_csv(output_audio_csv)

# Normalize the key column: both tables are joined on an integer 'time' (seconds)
df_gesture = df_gesture.rename(columns={'time_in_seconds': 'time'})
df_gesture['time'] = df_gesture['time'].astype(int)
df_acoustic['time'] = df_acoustic['time'].astype(int)
# Empty transcriptions are read back as NaN; turn them into empty strings
# rather than the literal text "nan".
df_acoustic['transcription'] = df_acoustic['transcription'].fillna('').astype(str)

# merge_asof requires both frames to be sorted on the key
df_gesture = df_gesture.sort_values('time')
df_acoustic = df_acoustic.sort_values('time')

# Attach to every gesture row the most recent acoustic row at or before its time
df = pd.merge_asof(df_gesture, df_acoustic, on='time', direction='backward')

# print(df)
# Define the correspondence between words and gestures
# For every direction/state keyword: the gesture columns it is checked against
# and the value each of those columns is expected to hold.
word_gesture_map = {
    'up': {'head_vertical': 'up', 
           'left_arm_v_movement': 'up', 
           'right_arm_v_movement': 'up',  
           'left_hand_orientation': 'up',
           'right_hand_orientation': 'up'},
    'down': {'head_vertical': 'down', 
             'left_arm_v_movement': 'down', 
             'right_arm_v_movement': 'down', 
             'left_hand_orientation': 'down', 
             'right_hand_orientation': 'down'},
    'left': {'head_horizontal': 'left',  
             'left_hand_orientation': 'left',
             'right_hand_orientation': 'left'},
    'right': {'head_horizontal': 'right', 
              'left_hand_orientation': 'right',
              'right_hand_orientation': 'right'},
    'open': {'left_hand_state': 'open',
             'right_hand_state': 'open'},
    'closed': {'left_hand_state': 'closed', 
               'right_hand_state': 'closed'},
    'forward': {'leaning': 'forward',
                'left_arm_h_movement': 'forward',
                'right_arm_h_movement': 'forward'},
    'backward': {'leaning': 'backward'}
}

def detect_contradictions(row):
    """Return ``row`` if a spoken keyword disagrees with a recorded gesture.

    Every word of ``row['transcription']`` is looked up in ``word_gesture_map``
    (ignoring case and surrounding punctuation). For a matching word, each
    mapped gesture column is compared case-insensitively with its expected
    value; the first mismatch is printed and the row is returned.

    Args:
        row: A record providing ``'transcription'`` and the gesture columns
            named in ``word_gesture_map``.

    Returns:
        ``row`` itself when a contradiction is found, otherwise ``None``.
        Missing values (a non-string transcription or gesture, e.g. NaN) are
        never counted as a contradiction.
    """
    transcription = row['transcription']
    if not isinstance(transcription, str):
        # No transcription available for this row (e.g. NaN after the merge).
        return None

    # Extract the words from the transcript
    words = transcription.split()

    for word in words:
        keyword = word.strip('.,;:!?"\'').lower()
        expected_gestures = word_gesture_map.get(keyword)
        if expected_gestures is None:
            continue

        # Compare the expected gestures with the actual gestures
        for gesture, expected_value in expected_gestures.items():
            actual_value = row[gesture]
            if not isinstance(actual_value, str):
                # Gesture was not detected for this frame; nothing to compare.
                continue
            if actual_value.upper() != expected_value.upper():
                # There is a contradiction, output the row
                print('contradiction')
                print(actual_value, words)
                return row

    return None

# Evaluate every merged row and keep only the flagged ones.
# detect_contradictions returns the row itself when it finds a contradiction
# and None otherwise. Collecting the rows explicitly always yields a DataFrame
# with df's columns, whereas DataFrame.apply infers the result shape from the
# first row's return value and so can produce differently shaped results.
flagged_rows = [
    flagged
    for flagged in (detect_contradictions(row) for _, row in df.iterrows())
    if flagged is not None
]
contradictions = pd.DataFrame(flagged_rows, columns=df.columns)