# DataFrame preparation, filtering, and saving the processed DataFrame

In [None]:
pip install opencv-python numpy pandas scipy

In [None]:
# Define the paths used by the processing loop.
# Replace each '/YOUR PATH/...' placeholder with a real directory before running.
data_path = '/YOUR PATH/csvs'  # directory scanned for the input .csv files
video_path = '/YOUR PATH/mp4s'  # directory searched for the .mp4 matching each CSV
Processed_data_path = '/YOUR PATH/processed_csvs'  # directory the 'processed_<name>' CSVs are written to

In [None]:
# Keypoint names, in the order their (x, y, confidence) triplets appear in the CSV
# (presumably the OpenPose BODY_25 ordering -- confirm against the exporter).
pose_keypoints = [
    'Nose', 'Neck',
    'RShoulder', 'RElbow', 'RWrist',
    'LShoulder', 'LElbow', 'LWrist',
    'MidHip',
    'RHip', 'RKnee', 'RAnkle',
    'LHip', 'LKnee', 'LAnkle',
    'REye', 'LEye', 'REar', 'LEar',
    'LBigToe', 'LSmallToe', 'LHeel',
    'RBigToe', 'RSmallToe', 'RHeel',
]

# Column names assigned to each raw CSV: a leading 'frame' column followed by
# '<keypoint>_x', '<keypoint>_y', '<keypoint>_c' for every keypoint.
new_columns = ['frame'] + [
    f'{name}_{suffix}'
    for name in pose_keypoints
    for suffix in ('x', 'y', 'c')
]

# Coordinate columns kept in the processed output (order is the output column order).
required_columns_25 = [
    'MidHip_x', 'MidHip_y',
    'LKnee_x', 'LKnee_y',
    'LAnkle_x', 'LAnkle_y',
    'Nose_x', 'Nose_y',
    'RKnee_x', 'RKnee_y',
    'RAnkle_x', 'RAnkle_y',
    'Neck_x', 'Neck_y',
    'RWrist_x', 'RWrist_y',
    'LWrist_x', 'LWrist_y',
    'RElbow_x', 'RElbow_y',
    'LShoulder_x', 'RShoulder_x',
    'LShoulder_y', 'RShoulder_y',
    'LHip_x', 'RHip_x',
    'LHip_y', 'RHip_y',
]

# Confidence columns kept in the output and used to mask unreliable coordinates.
confidence_columns = [
    'MidHip_c', 'LKnee_c', 'LAnkle_c', 'RKnee_c', 'RAnkle_c',
    'Neck_c', 'RWrist_c', 'LWrist_c', 'Nose_c', 'RElbow_c',
    'LShoulder_c', 'RShoulder_c', 'RHip_c', 'LHip_c',
]

# Function to get the duration of the video
def get_video_duration(video_path):
    """Return the duration of a video in seconds.

    The duration is computed as the frame count divided by the frame rate,
    both as reported by OpenCV for the opened file.

    Args:
        video_path: Path to the video file.

    Returns:
        The duration in seconds as a float.

    Raises:
        OSError: If OpenCV cannot open the video.
        ValueError: If the reported frame rate is not a positive number.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open video file: {video_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Always release the capture handle, even when a check above fails.
        cap.release()

    # `not fps > 0` also rejects NaN, which a plain `fps <= 0` would let through.
    if not fps > 0:
        raise ValueError(f"Invalid frame rate ({fps}) reported for video file: {video_path}")

    return frame_count / fps


# Butterworth filter
def butter_lowpass_filter(data, cutoff, fs, order=4):
    """Low-pass filter a 1-D signal with a Butterworth filter run forward and backward.

    Args:
        data: Sequence of samples to filter.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Order of the Butterworth design passed to ``butter``.

    Returns:
        The filtered samples as returned by ``filtfilt``.
    """
    # scipy expects the cutoff as a fraction of the Nyquist frequency (fs / 2).
    cutoff_fraction = cutoff / (fs / 2.0)
    numerator, denominator = butter(order, cutoff_fraction, btype='low', analog=False)
    smoothed = filtfilt(numerator, denominator, data)
    return smoothed

def apply_butterworth_filter(df, video_path, cutoff=5, order=4):
    """Low-pass filter every numeric column of ``df``.

    The sampling rate handed to the filter is the number of rows in ``df``
    divided by the duration of the video at ``video_path``.

    Args:
        df: DataFrame whose numeric columns are filtered.
        video_path: Path of the video the rows were extracted from.
        cutoff: Cutoff frequency forwarded to ``butter_lowpass_filter``.
        order: Filter order forwarded to ``butter_lowpass_filter``.

    Returns:
        A new DataFrame with the non-numeric columns first (unchanged),
        followed by the filtered numeric columns.
    """
    # Effective sampling rate: rows of data per second of video.
    sampling_rate = len(df) / get_video_duration(video_path)

    def _smooth(column):
        # Filter one column with the shared cutoff / rate / order settings.
        return butter_lowpass_filter(column, cutoff, sampling_rate, order)

    untouched = df.select_dtypes(exclude=[np.number])
    smoothed = df.select_dtypes(include=[np.number]).apply(_smooth)
    return pd.concat([untouched, smoothed], axis=1)

# Process each CSV file in the directory:
#   1. rename the raw columns, 2. low-pass filter, 3. keep the required columns,
#   4. discard low-confidence coordinates, 5. interpolate the gaps, 6. save.

# Coordinates whose confidence is at or below this value are treated as missing.
CONFIDENCE_THRESHOLD = 0.69

# Create the output directory up front so to_csv does not fail on a fresh setup.
os.makedirs(Processed_data_path, exist_ok=True)

# sorted() makes the processing order (and the log output) reproducible.
for filename in sorted(os.listdir(data_path)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(data_path, filename)
    # The matching video has the same base name; swap only the extension
    # (str.replace would also rewrite a '.csv' occurring earlier in the name).
    video_filename = os.path.splitext(filename)[0] + '.mp4'
    video_file_path = os.path.join(video_path, video_filename)

    if not os.path.exists(video_file_path):
        print(f"Video file {video_file_path} not found for {filename}. Skipping this file.")
        continue

    df_Body25 = pd.read_csv(file_path)

    # Rename columns only if the file has exactly the expected number of columns
    if len(new_columns) != len(df_Body25.columns):
        print(f"Column length mismatch in file {filename}. Skipping this file.")
        continue
    df_Body25.columns = new_columns

    # Apply the Butterworth filter
    # NOTE(review): filtering runs before any interpolation; if the raw CSV can
    # contain NaNs they will likely spread through the filter output -- confirm
    # that this ordering is intended.
    Processed_df = apply_butterworth_filter(df_Body25, video_file_path)

    # Extract the required columns and confidence columns
    df_required_25 = Processed_df[required_columns_25]
    df_confidence_25 = Processed_df[confidence_columns]

    # Interpolate missing values in df_required_25
    df_interpolated_25 = df_required_25.interpolate()

    # Combine the interpolated values and confidence columns into a single DataFrame
    Processed_df = pd.concat([df_interpolated_25, df_confidence_25], axis=1)

    # Blank out coordinates whose keypoint confidence is too low
    for col_confidence in confidence_columns:
        # Pair '<keypoint>_c' with '<keypoint>_x' / '<keypoint>_y' using the full
        # keypoint name, so similarly-prefixed keypoints can never be confused.
        keypoint_name = col_confidence.rsplit('_', 1)[0]
        matching_cols_required = [
            col for col in required_columns_25
            if col.rsplit('_', 1)[0] == keypoint_name
        ]
        if not matching_cols_required:
            continue
        # Vectorized mask instead of a per-row, per-cell Python loop.
        low_confidence = Processed_df[col_confidence] <= CONFIDENCE_THRESHOLD
        Processed_df.loc[low_confidence, matching_cols_required] = np.nan

    # Final interpolation after setting low confidence values to NaN
    Processed_df[required_columns_25] = Processed_df[required_columns_25].interpolate()

    # Save the processed DataFrame to a new CSV file
    output_file_path = os.path.join(Processed_data_path, f'processed_{filename}')
    Processed_df.to_csv(output_file_path, index=False)

    print(f"Processed and saved: {output_file_path}")