<a href="https://colab.research.google.com/github/arham5siddiqui/Mitigating-Linkability-Attacks-through-Differential-Privacy-enabled-Neural-Network-Training/blob/main/Setp2_DifferentialFiltering_ProcessingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
import shutil
from scipy.spatial.transform import Rotation as R

Differential Filtering Function

In [None]:


# Function to compute Hamilton product
def hamilton_product(q1, q2):
    """Return the Hamilton product ``q1 * q2`` of two quaternions.

    Each argument is unpacked into four components in (w, x, y, z) order.
    The result is a NumPy array holding the product in the same order.
    """
    a_w, a_x, a_y, a_z = q1
    b_w, b_x, b_y, b_z = q2

    # Scalar part of the product.
    scalar = a_w * b_w - a_x * b_x - a_y * b_y - a_z * b_z
    # Vector part, one component per line.
    vec_x = a_w * b_x + a_x * b_w + a_y * b_z - a_z * b_y
    vec_y = a_w * b_y + a_y * b_w + a_z * b_x - a_x * b_z
    vec_z = a_w * b_z + a_z * b_w + a_x * b_y - a_y * b_x

    return np.array([scalar, vec_x, vec_y, vec_z])

# Process each CSV file
def _quat_conjugate(q):
    """Return the quaternion conjugate (w, -x, -y, -z) of a (w, x, y, z) array."""
    # np.conjugate is the *complex* conjugate and leaves real arrays untouched,
    # so the vector part has to be negated explicitly.
    return np.asarray(q, dtype=float) * np.array([1.0, -1.0, -1.0, -1.0])


def process_file(file_path, output_path):
    """Add differential (velocity / acceleration) columns to one CSV file.

    Reads ``file_path``, derives the columns below, and writes the augmented
    table to ``output_path`` (without the index):

    * ``QuatAngVelocity.{w,x,y,z}`` - ``q[t] * conj(q[t-1])`` for ``t >= 1``;
      row 0 is left as zeros.
    * ``QuatAngAccel.{w,x,y,z}`` - ``v[t] * conj(v[t-1])`` for ``t >= 2``;
      rows 0 and 1 are left as zeros.
    * ``EulerAngVelocity.{yaw,pitch,roll}`` - first difference of the Euler
      columns; row 0 is zero.
    * ``EulerAngAccel.{yaw,pitch,roll}`` - first difference of the Euler
      velocity; row 0 is zero.

    The input must contain the ``UnitQuaternion.*`` and ``yaw``/``pitch``/
    ``roll`` columns; a missing column raises ``KeyError`` from pandas.
    """
    df = pd.read_csv(file_path)

    # Compute Quaternion Angular Velocity and Angular Acceleration
    quat_cols = ['UnitQuaternion.w', 'UnitQuaternion.x', 'UnitQuaternion.y', 'UnitQuaternion.z']
    # Force float so zeros_like() cannot produce an integer buffer that
    # truncates the products.
    quaternions = df[quat_cols].to_numpy(dtype=float)
    quat_velocity = np.zeros_like(quaternions)
    quat_accel = np.zeros_like(quaternions)

    for t in range(1, len(quaternions)):
        # Relative rotation between consecutive samples. For unit quaternions
        # the conjugate is the inverse.
        quat_velocity[t] = hamilton_product(quaternions[t], _quat_conjugate(quaternions[t - 1]))
        if t > 1:
            # quat_velocity[0] is all zeros (not a rotation), so acceleration
            # only starts once two real velocity samples exist.
            quat_accel[t] = hamilton_product(quat_velocity[t], _quat_conjugate(quat_velocity[t - 1]))

    df['QuatAngVelocity.w'], df['QuatAngVelocity.x'], df['QuatAngVelocity.y'], df['QuatAngVelocity.z'] = quat_velocity.T
    df['QuatAngAccel.w'], df['QuatAngAccel.x'], df['QuatAngAccel.y'], df['QuatAngAccel.z'] = quat_accel.T

    # Compute Euler Angular Velocity and Angular Acceleration (assuming columns are 'yaw', 'pitch', 'roll')
    eulers = df[['yaw', 'pitch', 'roll']].values
    # Prepend the first sample (not zeros) so the first row is a zero
    # difference rather than a copy of the raw angle.
    # NOTE(review): differences are not wrapped at the angle boundary - confirm
    # whether the angles can cross +/-180 degrees (or +/-pi) in this dataset.
    euler_velocity = np.diff(eulers, axis=0, prepend=eulers[:1])
    euler_accel = np.diff(euler_velocity, axis=0, prepend=euler_velocity[:1])

    df['EulerAngVelocity.yaw'], df['EulerAngVelocity.pitch'], df['EulerAngVelocity.roll'] = euler_velocity.T
    df['EulerAngAccel.yaw'], df['EulerAngAccel.pitch'], df['EulerAngAccel.roll'] = euler_accel.T

    # Save the updated DataFrame
    df.to_csv(output_path, index=False)

# Main code to loop through all users and files.
# Input and output point at the same folder, so each CSV is overwritten in
# place with its augmented version.
base_input_folder = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/'  # Update with your folder path
# The original value started with a stray '//' ; a single leading slash is the
# intended absolute path.
base_output_folder = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/'  # Update with your output folder path

for folder_type in ['Train', 'Test']:
    for user_id in range(1, 49):  # Looping through 48 users
        for video_id in range(9):  # Looping through 9 videos
            file_name = f"{user_id}_{folder_type.lower()}_resampled_video_{video_id}.csv"
            # os.path.join avoids the doubled separator produced by gluing a
            # base that already ends in '/' to another '/'.
            input_file = os.path.join(base_input_folder, folder_type, file_name)
            output_file = os.path.join(base_output_folder, folder_type, file_name)

            if os.path.exists(input_file):
                process_file(input_file, output_file)
            else:
                print(f"File {input_file} not found.")



Feature Extraction (rolling statistics over 1-second windows) - Processing Data

In [None]:

def extract_features(df):
    """Return a new frame with 1-second rolling statistics for each feature column.

    ``PlaybackTime`` (seconds) is converted to a timedelta, rows without a
    playback time are dropped, and the rows are sorted by playback time.
    For every column except ``Timestamp``, ``PlaybackTime``, ``userID`` and
    ``videoID`` five columns are appended: ``<col>_max``, ``<col>_min``,
    ``<col>_median``, ``<col>_mean`` and ``<col>_std``, each computed over a
    trailing ``'1s'`` time window.

    The input frame is not modified. The result has a fresh integer index and
    ``PlaybackTime`` as its first column.
    """
    # Work on a copy: converting PlaybackTime on the argument itself would
    # silently change the caller's DataFrame.
    frame = df.copy()
    frame['PlaybackTime'] = pd.to_timedelta(frame['PlaybackTime'], unit='s')  # Convert to timedelta
    frame = (
        frame.dropna(subset=['PlaybackTime'])  # Drop rows where PlaybackTime is NaT
        .sort_values(by='PlaybackTime')        # Time-based rolling needs a monotonic index
        .set_index('PlaybackTime')
    )

    excluded = ('Timestamp', 'PlaybackTime', 'userID', 'videoID')
    feature_columns = [col for col in frame.columns if col not in excluded]

    # Collect the statistics as plain arrays and attach them with one concat.
    # Inserting ~5 columns per feature one at a time fragments the frame.
    stats = {}
    for col in feature_columns:
        window = frame[col].rolling(window='1s')
        stats[f'{col}_max'] = window.max().to_numpy()
        stats[f'{col}_min'] = window.min().to_numpy()
        stats[f'{col}_median'] = window.median().to_numpy()
        stats[f'{col}_mean'] = window.mean().to_numpy()
        stats[f'{col}_std'] = window.std().to_numpy()

    if stats:
        frame = pd.concat([frame, pd.DataFrame(stats, index=frame.index)], axis=1)

    return frame.reset_index()




def process_directory(input_folder, output_folder):
    """Run ``extract_features`` on every CSV in a folder.

    ``output_folder`` is created when it does not exist. Each ``*.csv`` file
    found in ``input_folder`` is loaded, passed through ``extract_features``
    and written to ``output_folder`` as ``processed_<original name>``.
    """
    output_missing = not os.path.exists(output_folder)
    if output_missing:
        os.makedirs(output_folder)

    csv_names = [entry for entry in os.listdir(input_folder) if entry.endswith('.csv')]
    for csv_name in csv_names:
        source_path = os.path.join(input_folder, csv_name)
        target_path = os.path.join(output_folder, f"processed_{csv_name}")

        features = extract_features(pd.read_csv(source_path))
        features.to_csv(target_path, index=False)

# Process Train and Test folders: (source folder, destination folder) pairs.
for source_dir, processed_dir in (
    (
        '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Train',
        '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Processed_Train',
    ),
    (
        '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Test',
        '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Processed_Test',
    ),
):
    process_directory(source_dir, processed_dir)


Additional Features for Video and User Classification

In [None]:

# Function to calculate angle between two vectors
def angle_between(v1, v2):
    """Return the angle in radians between two vectors.

    Each vector is divided by its Euclidean norm plus a small constant, so a
    zero vector does not cause a division by zero. The cosine is clipped to
    [-1, 1] before ``arccos`` is applied.
    """
    epsilon = 1e-8  # Small constant to avoid division by zero
    unit_first = v1 / (np.linalg.norm(v1) + epsilon)
    unit_second = v2 / (np.linalg.norm(v2) + epsilon)
    cosine = np.dot(unit_first, unit_second)
    return np.arccos(np.clip(cosine, -1.0, 1.0))

# Function for advanced feature extraction
def _rowwise_angle(block_a, block_b):
    """Angle in radians between matching rows of two 2-D float arrays.

    Uses the same formula as ``angle_between`` (norm + 1e-8, cosine clipped to
    [-1, 1]) applied to every row at once.
    """
    if block_a.shape[1] != block_b.shape[1]:
        # A column window that runs past the last column is shorter than 4;
        # fail loudly instead of letting NumPy broadcast mismatched widths.
        raise ValueError(
            f"column windows have different widths: {block_a.shape[1]} vs {block_b.shape[1]}"
        )
    unit_a = block_a / (np.linalg.norm(block_a, axis=1, keepdims=True) + 1e-8)
    unit_b = block_b / (np.linalg.norm(block_b, axis=1, keepdims=True) + 1e-8)
    cosine = np.sum(unit_a * unit_b, axis=1)
    return np.arccos(np.clip(cosine, -1.0, 1.0))


# Function for advanced feature extraction
def advanced_feature_extraction(df):
    """Append pairwise angle columns to ``df`` and return it.

    * For every pair of columns whose name contains ``'Quaternion'``, a
      ``QuaternionAngle_<col1>_<col2>`` column holds the row-wise angle
      between the 4-column window starting at ``col1`` and the 4-column
      window starting at ``col2``.
    * For every pair of columns named yaw/pitch/roll (any letter case), an
      ``EulerAngle_<col1>_<col2>`` column holds the angle between the two
      values treated as 1-element vectors.

    ``df`` is modified in place (columns are added) and also returned.

    NOTE(review): the windows start at *every* Quaternion column, not only the
    ``.w`` ones, so most windows straddle several quantities. The angle between
    1-element vectors only encodes whether the two values share a sign.
    Confirm both are intended.
    """
    quaternion_cols = [col for col in df.columns if 'Quaternion' in col]
    euler_cols = [col for col in df.columns if col.lower() in ['yaw', 'pitch', 'roll']]

    # Calculate angles between Quaternion vectors
    for i, col1 in enumerate(quaternion_cols):
        for col2 in quaternion_cols[i + 1:]:
            col1_pos = df.columns.get_loc(col1)
            col2_pos = df.columns.get_loc(col2)
            # Whole-column slices replace the per-row df.apply of the original
            # implementation; the values are the same, computed in one pass.
            first = df.iloc[:, col1_pos:col1_pos + 4].to_numpy(dtype=float)
            second = df.iloc[:, col2_pos:col2_pos + 4].to_numpy(dtype=float)
            df[f'QuaternionAngle_{col1}_{col2}'] = _rowwise_angle(first, second)

    # Calculate angles between Euler vectors
    for i, col1 in enumerate(euler_cols):
        for col2 in euler_cols[i + 1:]:
            # Read the columns by their actual names. Looking them up by
            # col.lower() raised KeyError for 'Yaw' / 'Pitch' / 'Roll'.
            first = df[[col1]].to_numpy(dtype=float)
            second = df[[col2]].to_numpy(dtype=float)
            df[f'EulerAngle_{col1}_{col2}'] = _rowwise_angle(first, second)

    return df

# Function to process all files in a directory
def process_advanced_features(input_folder, output_folder):
    """Apply ``advanced_feature_extraction`` to every CSV in a folder.

    Creates ``output_folder`` if it is missing, then writes one
    ``advanced_<original name>`` CSV per ``*.csv`` file in ``input_folder``.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for entry in os.listdir(input_folder):
        if not entry.endswith('.csv'):
            continue  # Only CSV files are processed

        source_path = os.path.join(input_folder, entry)
        target_path = os.path.join(output_folder, f"advanced_{entry}")

        # Load, enrich with the advanced features, and store the result
        enriched = advanced_feature_extraction(pd.read_csv(source_path))
        enriched.to_csv(target_path, index=False)

# Paths to the input and output folders (Update these paths)
input_train_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Processed_Train'
input_test_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Processed_Test'
output_train_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Advanced_Features_Train'
output_test_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Advanced_Features_Test'

# Run the function for train and test datasets.
# The train call was missing: the train paths were defined but never used, so
# Advanced_Features_Train was never produced.
process_advanced_features(input_train_path, output_train_path)
process_advanced_features(input_test_path, output_test_path)


Code to duplicate the Processed_Test & Processed_Train folders and modify the CSV files (deleting the PlaybackTime and Timestamp columns)


In [None]:

# Root folder holding the processed datasets
base_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/'  # Replace with your folder path

# Source folders
train_folder, test_folder = (
    os.path.join(base_path, 'Processed_Train'),
    os.path.join(base_path, 'Processed_Test'),
)

# Folders that will hold the duplicates
train_duplicate_folder, test_duplicate_folder = (
    os.path.join(base_path, 'Processed_Train_Duplicate'),
    os.path.join(base_path, 'Processed_Test_Duplicate'),
)

# Make sure both duplicate folders exist before copying into them
for duplicate_folder in (train_duplicate_folder, test_duplicate_folder):
    if not os.path.exists(duplicate_folder):
        os.makedirs(duplicate_folder)

# Function to copy only CSV files from one folder to another
def copy_csv_files(src_folder, dest_folder):
    """Copy every ``*.csv`` file from ``src_folder`` into ``dest_folder``.

    Files are copied with ``shutil.copy2`` under their original names; entries
    that do not end in ``.csv`` are ignored.
    """
    csv_names = [entry for entry in os.listdir(src_folder) if entry.endswith('.csv')]
    for csv_name in csv_names:
        shutil.copy2(
            os.path.join(src_folder, csv_name),
            os.path.join(dest_folder, csv_name),
        )

# Populate each duplicate folder from its source folder
for source_dir, duplicate_dir in (
    (train_folder, train_duplicate_folder),
    (test_folder, test_duplicate_folder),
):
    copy_csv_files(source_dir, duplicate_dir)

# Function to delete specific columns from all CSV files in a folder
def delete_columns_from_csv(folder_path):
    """Strip ``PlaybackTime`` and ``Timestamp`` from every CSV in a folder.

    Each ``*.csv`` file in ``folder_path`` is read, the two columns are
    removed when present, and the file is rewritten in place without the
    index.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.csv'):
            continue

        csv_path = os.path.join(folder_path, entry)
        table = pd.read_csv(csv_path)

        # Only drop the columns that actually exist in this file
        unwanted = [name for name in ('PlaybackTime', 'Timestamp') if name in table.columns]
        table = table.drop(columns=unwanted)

        # Overwrite the original file with the reduced table
        table.to_csv(csv_path, index=False)

# Strip the time columns from every CSV in both duplicate folders
for duplicate_dir in (train_duplicate_folder, test_duplicate_folder):
    delete_columns_from_csv(duplicate_dir)

print("Process completed.")


Process completed.


Drop Unnecessary Columns from Large Combined Dataset csv files (Both Train & Test datasets)

In [None]:
import pandas as pd

# Source CSV and the reduced copy that will be written
input_file_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Combined_Test.csv'
output_file_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Test.csv'

# Columns that are removed from every chunk
cols_to_drop = ['Timestamp', 'PlaybackTime']

# True until the first chunk has been written (that chunk carries the header)
first_chunk = True

# Rows per chunk (adjust based on your available memory)
chunksize = 10 ** 6

# Stream the file chunk by chunk so it never has to fit in memory at once
for chunk in pd.read_csv(input_file_path, chunksize=chunksize):
    chunk = chunk.drop(columns=cols_to_drop)

    # The first chunk creates the file with a header; later chunks append rows only
    chunk.to_csv(
        output_file_path,
        index=False,
        mode='w' if first_chunk else 'a',
        header=first_chunk,
    )
    first_chunk = False


Remove rows that contain empty (NaN) values, processing the file chunk by chunk, and then overwrite the original file

In [None]:

file_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Test.csv'

# Chunk size
chunk_size = 10**6  # Modify this value based on available memory (Colab has 12 GB, so 10^6 size is base)

# Temporary file path
temp_file_path = '/content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/temp_Reduced_Combined_Test.csv'

# Read the large CSV file in chunks
first_chunk = True
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    # Remove rows with any empty columns
    cleaned = chunk.dropna()

    # The first chunk (re)creates the temp file with a header; later chunks
    # append rows only.
    cleaned.to_csv(
        temp_file_path,
        mode='w' if first_chunk else 'a',
        header=first_chunk,
        index=False,
    )
    first_chunk = False

if first_chunk:
    # No chunk was read, so no temp file exists. Deleting the original here
    # (as remove + rename did) would lose the file.
    print(f"No data read from {file_path}; file left unchanged.")
else:
    # os.replace swaps the cleaned file in as a single step, so the original is
    # never deleted before its replacement is in place.
    os.replace(temp_file_path, file_path)
    print(f"Completed cleaning {file_path}.")


Completed cleaning /content/drive/MyDrive/MSc Project/Formated_Data/Experiment_1/TrainTestDataset/Reduced_Combined_Test.csv.
