In [8]:
import os
import pandas as pd
from scipy.io import loadmat

RAW_BASE_PATH = '../../data/raw'
TARGET_SUFFIX = '_CNN_ResNet.mat'

def preview_mat_data_shapes(base_path):
    for user_folder in os.listdir(base_path):
        user_path = os.path.join(base_path, user_folder)

        if not os.path.isdir(user_path) or not user_folder.endswith('_P'):
            continue

        video_dir = os.path.join(user_path, 'video')
        if not os.path.isdir(video_dir):
            continue

        for fname in os.listdir(video_dir):
            if fname.endswith(TARGET_SUFFIX):
                file_path = os.path.join(video_dir, fname)
                try:
                    mat_data = loadmat(file_path)
                    data_keys = [k for k in mat_data.keys() if not k.startswith('__')]
                    if not data_keys:
                        print(f"No valid data found in {file_path}")
                        continue

                    features = mat_data[data_keys[0]]
                    print(f"\nData shape from {file_path}: {features.shape}")
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
                break

preview_mat_data_shapes(RAW_BASE_PATH)



Data shape from ../../data/raw/302_P/video/302_CNN_ResNet.mat: (22766, 2048)

Data shape from ../../data/raw/301_P/video/301_CNN_ResNet.mat: (24721, 2048)


In [None]:
"""
- Scans each user folder under the raw data base path
- Looks for '_CNN_ResNet.mat' files inside the 'video' subfolder of each user folder
- Loads the .mat file and extracts the main feature matrix
- Converts the matrix to a pandas DataFrame
- Renames columns as 'video01_f1', 'video01_f2', etc.
- Saves the processed data as 'processed_video_features01.parquet' inside an interim folder structure
- Creates output directories if they don't exist
"""

import os
import pandas as pd
from scipy.io import loadmat

RAW_BASE_PATH = '../../data/raw'
SAVE_BASE_PATH = '../../data/interim/video_features'
TARGET_SUFFIX = '_CNN_ResNet.mat'

def process_and_save_video_features(base_path, save_base_path):
    for user_folder in os.listdir(base_path):
        user_path = os.path.join(base_path, user_folder)

        if not os.path.isdir(user_path) or not user_folder.endswith('_P'):
            continue

        video_dir = os.path.join(user_path, 'video')
        if not os.path.isdir(video_dir):
            continue

        for fname in os.listdir(video_dir):
            if fname.endswith(TARGET_SUFFIX):
                file_path = os.path.join(video_dir, fname)
                try:
                    # Load the .mat file
                    mat_data = loadmat(file_path)
                    # Find main data key(s)
                    data_keys = [k for k in mat_data.keys() if not k.startswith('__')]
                    if not data_keys:
                        raise ValueError("No valid data found in mat file")

                    features = mat_data[data_keys[0]]

                    df = pd.DataFrame(features)

                    new_columns = [f'video01_f{i+1}' for i in range(df.shape[1])]
                    df.columns = new_columns

                    save_user_dir = os.path.join(save_base_path, user_folder)
                    os.makedirs(save_user_dir, exist_ok=True)
                    save_path = os.path.join(save_user_dir, 'processed_video_features01.parquet')

                    df.to_parquet(save_path, index=False)
                    print(f"✅ Processed and saved: {save_path}")

                except Exception as e:
                    print(f"❌ Error processing {file_path}: {e}")
                break

process_and_save_video_features(RAW_BASE_PATH, SAVE_BASE_PATH)


In [2]:
import os
import pandas as pd

SAVE_BASE_PATH = '../../data/interim/video_features'

def load_and_display_saved_audio_features(save_base_path):
    for user_folder in os.listdir(save_base_path):
        user_path = os.path.join(save_base_path, user_folder)
        if not os.path.isdir(user_path):
            continue

        parquet_file = os.path.join(user_path, 'processed_video_features01.parquet')
        if os.path.isfile(parquet_file):
            try:
                df = pd.read_parquet(parquet_file)
                print(f"\n📊 User {user_folder} - processed_audio_features.parquet")
                display(df.head())  # display first few rows as table
            except Exception as e:
                print(f"❌ Error loading {parquet_file}: {e}")

load_and_display_saved_audio_features(SAVE_BASE_PATH)



📊 User 302_P - processed_audio_features.parquet


Unnamed: 0,video01_f1,video01_f2,video01_f3,video01_f4,video01_f5,video01_f6,video01_f7,video01_f8,video01_f9,video01_f10,...,video01_f2039,video01_f2040,video01_f2041,video01_f2042,video01_f2043,video01_f2044,video01_f2045,video01_f2046,video01_f2047,video01_f2048
0,0.0,0.548137,0.0,0.154201,0.0,0.0,0.392391,0.0,0.0,0.83978,...,0.0,0.0,0.080486,0.0,0.0,0.506909,0.024813,0.0,0.0,0.579549
1,0.0,0.505975,0.0,0.071001,0.0,0.0,0.275212,0.0,0.0,1.023129,...,0.0,0.0,0.102905,0.0,0.0,0.566709,0.0,0.0,0.0,0.768368
2,0.0,0.522661,0.0,0.058863,0.0,0.0,0.30413,0.0,0.0,1.053486,...,0.0,0.0,0.112472,0.0,0.0,0.546014,0.0,0.0,0.0,0.763183
3,0.0,0.527545,0.0,0.091318,0.0,0.0,0.328666,0.0,0.0,0.97408,...,0.0,0.0,0.10807,0.0,0.0,0.559364,0.0,0.0,0.0,0.771671
4,0.0,0.536881,0.0,0.121252,0.0,0.0,0.314275,0.0,0.0,1.019297,...,0.0,0.0,0.099565,0.0,0.0,0.590459,0.0,0.0,0.0,0.723885



📊 User 301_P - processed_audio_features.parquet


Unnamed: 0,video01_f1,video01_f2,video01_f3,video01_f4,video01_f5,video01_f6,video01_f7,video01_f8,video01_f9,video01_f10,...,video01_f2039,video01_f2040,video01_f2041,video01_f2042,video01_f2043,video01_f2044,video01_f2045,video01_f2046,video01_f2047,video01_f2048
0,0.02704,0.377377,0.0,0.339695,1.334366,0.014927,1.398661,0.55805,0.386862,0.615942,...,0.0,1.375125,0.306819,0.0,2.007835,0.129868,2.062427,0.768292,0.0,0.82575
1,0.0,0.332009,0.027214,0.313303,2.138208,0.054268,2.033602,0.701261,0.66342,0.504166,...,0.0,2.159133,0.0,0.0,2.811998,0.043849,2.380494,1.166869,0.0,1.272068
2,0.025878,0.385017,0.000592,0.309159,1.726289,0.0,1.807138,0.628785,0.53089,0.408658,...,0.0,1.731496,0.0,0.0,2.364725,0.177065,2.209539,0.964951,0.0,1.047128
3,0.0,0.348933,0.08876,0.317954,2.232563,0.045081,2.211314,0.676074,0.661535,0.514439,...,0.0,2.238677,0.0,0.0,2.824675,0.027697,2.473712,1.226455,0.0,1.339866
4,0.112566,0.384054,0.04579,0.347998,1.742757,0.018785,1.884668,0.635607,0.508339,0.428812,...,0.0,1.75175,0.07035,0.0,2.281598,0.224951,2.312614,0.982391,0.0,1.06796


In [11]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

SAVE_BASE_PATH = '../../data/interim/video_features'

def apply_pca_to_all_users(base_path):
    for user_folder in os.listdir(base_path):
        user_path = os.path.join(base_path, user_folder)
        if not os.path.isdir(user_path):
            continue

        parquet_file = os.path.join(user_path, 'processed_video_features01.parquet')
        if os.path.isfile(parquet_file):
            try:
                df = pd.read_parquet(parquet_file)

                features = df.values 
                scaler = StandardScaler()
                features_scaled = scaler.fit_transform(features)

                pca = PCA(n_components=100) 
                features_reduced = pca.fit_transform(features_scaled)

              
                reduced_df = pd.DataFrame(features_reduced,
                                          columns=[f'pca_{i+1}' for i in range(features_reduced.shape[1])])

                reduced_parquet_file = os.path.join(user_path, 'processed_video_features01_pca.parquet')
                reduced_df.to_parquet(reduced_parquet_file, index=False)

                print(f"✅ PCA applied and saved for user {user_folder}")

            except Exception as e:
                print(f"❌ Error processing {parquet_file}: {e}")

apply_pca_to_all_users(SAVE_BASE_PATH)


✅ PCA applied and saved for user 302_P
✅ PCA applied and saved for user 301_P


In [12]:
import os
import pandas as pd

SAVE_BASE_PATH = '../../data/interim/video_features'

def print_pca_shapes(base_path):
    for user_folder in os.listdir(base_path):
        user_path = os.path.join(base_path, user_folder)
        if not os.path.isdir(user_path):
            continue

        parquet_file = os.path.join(user_path, 'processed_video_features01_pca.parquet')
        if os.path.isfile(parquet_file):
            try:
                df = pd.read_parquet(parquet_file)
                print(f"User {user_folder} PCA data shape: {df.shape}")
            except Exception as e:
                print(f"Error loading {parquet_file}: {e}")

print_pca_shapes(SAVE_BASE_PATH)


User 302_P PCA data shape: (22766, 100)
User 301_P PCA data shape: (24721, 100)


In [13]:
import os
import pandas as pd

SAVE_BASE_PATH = '../../data/interim/video_features'

def load_and_check_nulls(save_base_path):
    for user_folder in os.listdir(save_base_path):
        user_path = os.path.join(save_base_path, user_folder)
        if not os.path.isdir(user_path):
            continue

        parquet_file = os.path.join(user_path, 'processed_video_features01_pca.parquet')
        if os.path.isfile(parquet_file):
            try:
                df = pd.read_parquet(parquet_file)
                has_nulls = df.isnull().values.any()
                print(f"\n📊 User {user_folder} - processed_audio_features.parquet")
                print(f"Contains null values? {'Yes' if has_nulls else 'No'}")
            except Exception as e:
                print(f"❌ Error loading {parquet_file}: {e}")

load_and_check_nulls(SAVE_BASE_PATH)



📊 User 302_P - processed_audio_features.parquet
Contains null values? No

📊 User 301_P - processed_audio_features.parquet
Contains null values? No
