In [8]:
import pandas as pd
import numpy as np
import os
import ast

In [123]:
# Resolve the CSV location (os.path.expanduser replaces the leading "~" with
# the home directory) and read it into a DataFrame.
acoustic_dir = os.path.expanduser(
    "~/first-impression/txt-dev/sample_row_with_acoustic_whisper.csv"
)
acoustic = pd.read_csv(acoustic_dir)
acoustic['acoustic'].head()

0    [35.63478088378906, 0.011104753240942955, 35.3...
1    [35.19169235229492, 0.005509247072041035, 35.0...
2    [41.46013641357422, 0.07766184955835342, 37.68...
3    [35.70209884643555, 0.017157597467303276, 35.0...
4    [39.22026824951172, 0.005435936618596315, 39.0...
Name: acoustic, dtype: object

In [3]:
def preprocess_string_2(s):
    """Parse the text of a Python literal (e.g. ``"[1.0, 2.0]"``) into an object.

    Parameters
    ----------
    s : str
        Text of a Python literal. Non-string input (for example the float NaN
        pandas uses for empty CSV cells) is tolerated and treated as
        unparseable.

    Returns
    -------
    object or None
        The evaluated literal, or ``None`` when ``s`` cannot be evaluated:
        non-string input, bare names such as ``nan`` inside the text, syntax
        errors, or literals that cannot be built (e.g. a dict keyed by a list).
    """
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, AttributeError):
        # TypeError: ast.literal_eval raises it for well-formed but
        # unbuildable literals such as "{[1]: 2}" (unhashable key); without
        # catching it a single bad cell would abort the whole column parse.
        return None

In [124]:
# Turn every stringified feature vector into a Python object; entries that
# cannot be parsed come back as None.
acoustic['acoustic'] = (
    acoustic['acoustic']
    .apply(preprocess_string_2)
)
acoustic['acoustic'].head()

0    [35.63478088378906, 0.011104753240942955, 35.3...
1    [35.19169235229492, 0.005509247072041035, 35.0...
2    [41.46013641357422, 0.07766184955835342, 37.68...
3    [35.70209884643555, 0.017157597467303276, 35.0...
4    [39.22026824951172, 0.005435936618596315, 39.0...
Name: acoustic, dtype: object

In [27]:
# Tally the Python type of every parsed entry.
datatype_counts = (
    acoustic['acoustic']
    .apply(type)
    .value_counts()
)

print(datatype_counts)

acoustic
<class 'list'>        86660
<class 'NoneType'>     5585
Name: count, dtype: int64


In [28]:
# Length of each parsed feature list; anything that is not a list counts as 0.
datatype_counts = (
    acoustic['acoustic']
    .apply(lambda value: 0 if not isinstance(value, list) else len(value))
    .value_counts()
)

print(datatype_counts)

acoustic
88    86660
0      5585
Name: count, dtype: int64


In [125]:
# Video id = the text captured in front of "_segment_<digits>" in the segment name.
acoustic['VideoID'] = (
    acoustic['SegmentedBaseName']
    .str.extract(r'(.*)_segment_\d+')
)

# Convert every parsed entry to a float32 NumPy array.
acoustic['acoustic_np'] = acoustic['acoustic'].apply(
    lambda features: np.array(features, dtype=np.float32)
)

# Gather the per-segment entries of each video into one Python list.
acoustic_list = (
    acoustic
    .groupby('VideoID')['acoustic_np']
    .apply(list)
)

acoustic_list.head()

VideoID
-6otZ7M-Mro.003    [[20.703983, 0.0018311196, 20.672487, 20.69632...
-6otZ7M-Mro.005    [[22.989721, 0.023344595, 22.52284, 22.651133,...
-8asrRvfJWA.001    [[32.0135, 0.0038038702, 31.99397, 32.04822, 3...
-9BZ8A9U7TE.002    [[48.35585, 0.003518914, 48.20473, 48.29195, 4...
-AmMDnVl4s8.001    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: acoustic_np, dtype: object

In [142]:
def replace_nans_with_avg(row):
    """Fill the missing entries of one group with the mean of its valid vectors.

    Parameters
    ----------
    row : sequence
        Entries of a single group. NumPy arrays with at least one dimension
        are treated as valid feature vectors (they must share one shape so
        they can be stacked). ``None``, Python or NumPy float scalars and
        0-d arrays are treated as missing placeholders.

    Returns
    -------
    list
        A new list with the same length and order as ``row``. Valid vectors
        are carried over as the same objects; every missing entry is replaced
        by its own copy of the element-wise float32 mean of the valid vectors;
        any other entry is kept unchanged. When the group has no valid vector
        there is nothing to average and the entries are returned as they were.

    Notes
    -----
    ``row`` itself is never modified, so re-running the calling cell yields
    the same result and the source lists stay intact.
    """
    def _is_vector(item):
        # A usable feature vector: a real array, not a 0-d scalar wrapper.
        return isinstance(item, np.ndarray) and item.ndim > 0

    def _is_missing(item):
        # Scalar placeholders in any of the forms they may arrive in.
        if item is None or isinstance(item, (float, np.floating)):
            return True
        return isinstance(item, np.ndarray) and item.ndim == 0

    filled = list(row)
    vectors = [item for item in filled if _is_vector(item)]
    if not vectors:
        # Averaging an empty stack would only emit a warning and yield NaN.
        return filled

    average = np.asarray(vectors, dtype=np.float32).mean(axis=0)

    # Each gap gets a private copy so later in-place edits of one filled
    # segment cannot leak into the others.
    return [average.copy() if _is_missing(item) else item for item in filled]

In [143]:
# Move the group key out of the index into a regular column, then run
# replace_nans_with_avg on each group's list of entries.
acoustic_np = acoustic_list.reset_index()
acoustic_np['acoustic_np'] = acoustic_np['acoustic_np'].apply(replace_nans_with_avg)

acoustic_np['acoustic_np'].head()

0    [[20.703983, 0.0018311196, 20.672487, 20.69632...
1    [[22.989721, 0.023344595, 22.52284, 22.651133,...
2    [[32.0135, 0.0038038702, 31.99397, 32.04822, 3...
3    [[48.35585, 0.003518914, 48.20473, 48.29195, 4...
4    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: acoustic_np, dtype: object

In [61]:
# Peek at the first five per-segment entries.
acoustic['acoustic_np'].head(5)

0    [35.63478, 0.011104753, 35.34182, 35.80483, 35...
1    [35.191692, 0.005509247, 35.029175, 35.25186, ...
2    [41.460136, 0.07766185, 37.682922, 42.688736, ...
3    [35.7021, 0.017157597, 35.05126, 35.891663, 36...
4    [39.22027, 0.0054359366, 39.095097, 39.249043,...
Name: acoustic_np, dtype: object

In [146]:
# Toy check: averaging along axis 0 collapses the rows, leaving one mean per column.
data = np.arange(6).reshape(3, 2)
print(data)
print(np.average(data, axis=0))

[[0 1]
 [2 3]
 [4 5]]
[2. 3.]


In [71]:
# Tally the Python type of every entry in the column.
datatype_counts = (
    acoustic['acoustic_np']
    .apply(type)
    .value_counts()
)

print(datatype_counts)

acoustic_np
<class 'list'>    2000
Name: count, dtype: int64


In [141]:
# Sanity check: report the first scalar float left in each group, and print
# "NO NAN" only when no group contains one.
found_nan = False
for row in acoustic_np['acoustic_np']:
    for item in row:
        # np.floating also catches NumPy scalars such as np.float32, which
        # are not instances of the built-in float.
        if isinstance(item, (float, np.floating)):
            print(item)
            found_nan = True
            break  # one report per group is enough
if not found_nan:
    print("NO NAN")

NO NAN


In [150]:
# Shape of every entry; plain Python floats have no shape and are tallied as 0.
datatype_counts = (
    acoustic['acoustic_np']
    .apply(lambda entry: entry.shape if type(entry) is not float else 0)
    .value_counts()
)

print(datatype_counts)

acoustic_np
(88,)    86660
0         5585
Name: count, dtype: int64


In [29]:
# Distribution of group sizes: how many entries each group's list holds.
datatype_counts = (
    acoustic_list
    .apply(len)
    .value_counts()
)

print(datatype_counts)

acoustic_np
43     80
49     78
48     76
47     75
44     74
       ..
195     1
77      1
9       1
74      1
84      1
Name: count, Length: 81, dtype: int64
