In [1]:
import pandas as pd
import numpy as np
import os
import ast

In [2]:
visual_dir = "~/first-impression/txt-dev/openface.csv"
visual_dir = os.path.expanduser(visual_dir)
visual = pd.read_csv(visual_dir)
visual['visual'].head()

0    [0.266977, 0.545247, -0.794625, 0.040729, 0.51...
1    [0.2670715, 0.5488470000000001, -0.792099, 0.0...
2    [0.27448100000000003, 0.5337489999999999, -0.7...
3    [0.26104, 0.5193365, -0.8137179999999999, 0.03...
4    [0.28621566666666665, 0.4919006666666667, -0.8...
Name: visual, dtype: object

In [3]:
# Preprocess strings to handle NaN values
def preprocess_string(s):
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, AttributeError):
        return None

In [5]:
visual['visual'] = visual['visual'].apply(preprocess_string)
visual['visual'].head()

0    [0.266977, 0.545247, -0.794625, 0.040729, 0.51...
1    [0.2670715, 0.5488470000000001, -0.792099, 0.0...
2    [0.27448100000000003, 0.5337489999999999, -0.7...
3    [0.26104, 0.5193365, -0.8137179999999999, 0.03...
4    [0.28621566666666665, 0.4919006666666667, -0.8...
Name: visual, dtype: object

In [6]:
datatype_counts = visual['visual'].apply(type).value_counts()

print(datatype_counts)

visual
<class 'list'>        92150
<class 'NoneType'>       95
Name: count, dtype: int64


In [7]:
datatype_counts = visual['visual'].apply(lambda x: len(x) if isinstance(x, list) else 0).value_counts()

print(datatype_counts)

visual
709    92150
0         95
Name: count, dtype: int64


In [9]:
def split_visual_column(segmented_data):
    gaze = []
    facial_landmarks = []
    action_unit = []
    # Replace problematic rows with the list from the previous or next row
    for i, value in enumerate(segmented_data['visual']):
        try:
            gaze.append(np.array(value[:8+56*5], dtype=np.float32))
            facial_landmarks.append(np.array(value[8+56*5:8+56*5+6+68*5+40], dtype=np.float32))
            action_unit.append(np.array(value[8+56*5+6+68*5+40:], dtype=np.float32))
        except:
            gaze.append(None)
            facial_landmarks.append(None)
            action_unit.append(None)
    return gaze, facial_landmarks, action_unit

gaze, facial_landmarks, action_unit = split_visual_column(visual)
visual['gaze'] = gaze
visual['facial_landmark'] = facial_landmarks
visual['action_unit'] = action_unit

visual['action_unit'].head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14, 0.0, 0.6,...
1    [0.0, 0.0, 0.03, 0.0, 0.0, 0.0, 0.11, 0.0, 0.5...
2    [0.01, 0.0, 0.26333332, 0.0, 0.006666667, 0.04...
3    [0.03, 0.0, 0.56, 0.0, 0.035, 0.245, 0.0, 0.00...
4    [0.03, 0.096666664, 0.5966667, 0.0, 0.09, 0.56...
Name: action_unit, dtype: object

In [10]:
datatype_counts = visual['action_unit'].apply(type).value_counts()

print(datatype_counts)

action_unit
<class 'numpy.ndarray'>    92150
<class 'NoneType'>            95
Name: count, dtype: int64


In [12]:
datatype_counts = visual['action_unit'].apply(lambda x: x.shape if isinstance(x, np.ndarray) else 0).value_counts()

print(datatype_counts)

action_unit
(35,)    92150
0           95
Name: count, dtype: int64


In [125]:
visual['VideoID'] = acoustic['SegmentedBaseName'].str.extract(r'(.*)_segment_\d+')
visual_list = segmented_data.groupby('VideoID')['action_unit'].apply(list)

visual_list.head()

VideoID
-6otZ7M-Mro.003    [[20.703983, 0.0018311196, 20.672487, 20.69632...
-6otZ7M-Mro.005    [[22.989721, 0.023344595, 22.52284, 22.651133,...
-8asrRvfJWA.001    [[32.0135, 0.0038038702, 31.99397, 32.04822, 3...
-9BZ8A9U7TE.002    [[48.35585, 0.003518914, 48.20473, 48.29195, 4...
-AmMDnVl4s8.001    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: acoustic_np, dtype: object

In [142]:
# Function to replace NaNs with element-wise average in a numpy array
def replace_nans_with_avg(row):
    masked_row = []
    for i in range(len(row)):
        if type(row[i]) is np.ndarray:
            masked_row.append(row[i])
            
    masked_row = np.array(masked_row, dtype=np.float32)
    average = np.average(masked_row, axis=0)
    
    # Replace NaNs with corresponding averages
    for i in range(len(row)):
        if type(row[i]) is float:
            row[i] = average
                
    return row

In [143]:
acoustic_np = acoustic_list.reset_index()
# Apply the function to each element of the column 
acoustic_np['acoustic_np'] = acoustic_np['acoustic_np'].apply(lambda x: replace_nans_with_avg(x))

acoustic_np['acoustic_np'].head()

0    [[20.703983, 0.0018311196, 20.672487, 20.69632...
1    [[22.989721, 0.023344595, 22.52284, 22.651133,...
2    [[32.0135, 0.0038038702, 31.99397, 32.04822, 3...
3    [[48.35585, 0.003518914, 48.20473, 48.29195, 4...
4    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: acoustic_np, dtype: object

In [61]:
acoustic['acoustic_np'].head()

0    [35.63478, 0.011104753, 35.34182, 35.80483, 35...
1    [35.191692, 0.005509247, 35.029175, 35.25186, ...
2    [41.460136, 0.07766185, 37.682922, 42.688736, ...
3    [35.7021, 0.017157597, 35.05126, 35.891663, 36...
4    [39.22027, 0.0054359366, 39.095097, 39.249043,...
Name: acoustic_np, dtype: object

In [146]:
data = np.arange(6).reshape((3, 2))
print(data)
print(np.average(data, axis=0))

[[0 1]
 [2 3]
 [4 5]]
[2. 3.]


In [71]:
datatype_counts = acoustic['acoustic_np'].apply(type).value_counts()

print(datatype_counts)

acoustic_np
<class 'list'>    2000
Name: count, dtype: int64


In [141]:
for row in acoustic_np['acoustic_np']:
    for i in range(len(row)):
        if type(row[i]) is float:
            print(row[i])
            break
print("NO NAN")

NO NAN


In [150]:
datatype_counts = acoustic['acoustic_np'].apply(lambda x: 0 if type(x) is float else x.shape).value_counts()

print(datatype_counts)

acoustic_np
(88,)    86660
0         5585
Name: count, dtype: int64


In [29]:
datatype_counts = acoustic_list.apply(len).value_counts()

print(datatype_counts)

acoustic_np
43     80
49     78
48     76
47     75
44     74
       ..
195     1
77      1
9       1
74      1
84      1
Name: count, Length: 81, dtype: int64
