### Preprocessing the original .mat data from Dan into a usable form for our model

#### Extract relevant fields from the .mat files and convert into .csv files

In [1]:
import scipy.io
import pandas as pd
import numpy as np
import os

# Define the fields you want to convert
fields_to_convert = ['map', 'startLoc', 'targLoc', 'stepDist', 'pathLength', 'path']

# Function to convert structured array to DataFrame
def structured_array_to_df(struct_array):
    if struct_array.dtype.names:
        return pd.DataFrame({name: struct_array[name].tolist() for name in struct_array.dtype.names})
    else:
        return pd.DataFrame(struct_array)

# Directory containing the .mat files
input_dir = 'BehaviouralData'
# Get a list of all .mat files in the directory
mat_files = [f for f in os.listdir(input_dir) if f.endswith('.mat')]

# Process each .mat file
for mat_file in mat_files:
    # Load the .mat file
    mat_data = scipy.io.loadmat(os.path.join(input_dir, mat_file))
    out_data = mat_data['out']
    
    # Create a directory for this .mat file
    output_dir = os.path.join(input_dir, os.path.splitext(mat_file)[0])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Iterate over each field to convert
    for field_name in fields_to_convert:
        if field_name in out_data.dtype.names:
            field_value = out_data[field_name][0, 0]
            if isinstance(field_value, np.ndarray):
                field_df = structured_array_to_df(field_value)
                field_df.to_csv(os.path.join(output_dir, f"{field_name}.csv"), index=False)
                print(f"Converted {field_name} to DataFrame and saved as CSV in {output_dir}.")
            else:
                print(f"{field_name} is not an ndarray and was not converted.")
        else:
            print(f"{field_name} does not exist in the .mat file and was not converted.")


Converted map to DataFrame and saved as CSV in BehaviouralData/Subject9Block1Data.
Converted startLoc to DataFrame and saved as CSV in BehaviouralData/Subject9Block1Data.
Converted targLoc to DataFrame and saved as CSV in BehaviouralData/Subject9Block1Data.
Converted stepDist to DataFrame and saved as CSV in BehaviouralData/Subject9Block1Data.
Converted pathLength to DataFrame and saved as CSV in BehaviouralData/Subject9Block1Data.
Converted path to DataFrame and saved as CSV in BehaviouralData/Subject9Block1Data.
Converted map to DataFrame and saved as CSV in BehaviouralData/Subject13Block1Data.
Converted startLoc to DataFrame and saved as CSV in BehaviouralData/Subject13Block1Data.
Converted targLoc to DataFrame and saved as CSV in BehaviouralData/Subject13Block1Data.
Converted stepDist to DataFrame and saved as CSV in BehaviouralData/Subject13Block1Data.
Converted pathLength to DataFrame and saved as CSV in BehaviouralData/Subject13Block1Data.
Converted path to DataFrame and saved a

In [None]:

# Define the fields you want to convert
fields_to_convert = ['map', 'startLoc', 'targLoc', 'stepDist', 'pathLength', 'path']

# Function to convert structured array to DataFrame
def structured_array_to_df(struct_array):
    if struct_array.dtype.names:
        return pd.DataFrame({name: struct_array[name].tolist() for name in struct_array.dtype.names})
    else:
        return pd.DataFrame(struct_array)

# Directory containing the .mat files
input_dir = 'BehaviouralData'
# Get a list of all .mat files in the directory
mat_files = [f for f in os.listdir(input_dir) if f.endswith('.mat')]

# Process each .mat file
for mat_file in mat_files:
    # Load the .mat file
    mat_data = scipy.io.loadmat(os.path.join(input_dir, mat_file))
    out_data = mat_data['out']
    
    # Create a directory for this .mat file
    output_dir = os.path.join(input_dir, os.path.splitext(mat_file)[0])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Iterate over each field to convert
    for field_name in fields_to_convert:
        if field_name in out_data.dtype.names:
            field_value = out_data[field_name][0, 0]
            if isinstance(field_value, np.ndarray):
                field_df = structured_array_to_df(field_value)
                field_df.to_csv(os.path.join(output_dir, f"{field_name}.csv"), index=False)
                print(f"Converted {field_name} to DataFrame and saved as CSV in {output_dir}.")
            else:
                print(f"{field_name} is not an ndarray and was not converted.")
        else:
            print(f"{field_name} does not exist in the .mat file and was not converted.")


#### Create single DataFrames for each .mat files
* Each row representing each trial and use coordinates instead of object keys
* An optimality assessment is added

In [16]:
import pandas as pd
import numpy as np
import os
import glob

# Define the base directory and the output directory
base_dir = 'BehaviouralData'
output_dir = 'PreprocessedData'

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Get the list of all folders matching the pattern "Subject*Block*Data"
folders = glob.glob(os.path.join(base_dir, 'Subject*Block*Data'))

# Process each folder
for folder in folders:
    folder_name = os.path.basename(folder)
    
    # Read in the CSV files
    path_file = pd.read_csv(os.path.join(folder, 'path.csv'))
    map_file = pd.read_csv(os.path.join(folder, 'map.csv'))
    startLoc_file = pd.read_csv(os.path.join(folder, 'startLoc.csv'))
    targLoc_file = pd.read_csv(os.path.join(folder, 'targLoc.csv'))
    stepDist_file = pd.read_csv(os.path.join(folder, 'stepDist.csv'))
    pathLength_file = pd.read_csv(os.path.join(folder, 'pathLength.csv'))

    # Assuming the columns based on the output provided
    # Update column names if necessary
    trial_col = '5'  # trial column
    location_col = '0'  # current location column
    target_col = '6'  # target column

    # Create a new DataFrame for path sequences
    path_sequences = pd.DataFrame(columns=['trial', 'location_sequence', 'target', 'startLoc', 'stepDist', 'pathLength'])

    # Iterate over unique trials
    for trial in path_file[trial_col].unique():
        # Filter rows for the current trial
        trial_data = path_file[path_file[trial_col] == trial]
        
        # Extract location sequence and target
        location_sequence = trial_data[location_col].tolist()
        location_sequence = [int(loc) for loc in location_sequence]  # Convert locations to integers
        target = int(trial_data[target_col].iloc[0])  # Target location should be consistent within the trial and convert to integer
        
        # Validate target with targLoc_file
        if target != int(targLoc_file.iloc[int(trial) - 1, 0]):
            raise ValueError(f"Target location mismatch for trial {trial}")
        
        # Get corresponding start location, step distance, and path length
        startLoc = int(startLoc_file.iloc[int(trial) - 1, 0])
        stepDist = int(stepDist_file.iloc[int(trial) - 1, 0])
        pathLength = int(pathLength_file.iloc[int(trial) - 1, 0])
        
        # Append data to the new DataFrame
        path_sequences = pd.concat([path_sequences, pd.DataFrame([{
            'trial': int(trial),
            'location_sequence': location_sequence,
            'target': target,
            'startLoc': startLoc,
            'stepDist': stepDist,
            'pathLength': pathLength
        }])], ignore_index=True)

    # Clean and prepare the map_file
    map_file = map_file.drop(columns=[map_file.columns[0], map_file.columns[-1]])  # Drop the first and last columns which contain NaNs
    map_file = map_file.apply(pd.to_numeric, errors='coerce')  # Convert all values to numeric
    map_file = map_file.dropna().astype(int)  # Drop any remaining NaNs and convert to integers

    # Create a dictionary to map location keys to coordinates
    location_to_coords = {}
    for row_idx, row in map_file.iterrows():
        for col_idx, val in enumerate(row):
            location_to_coords[val] = (row_idx - 1, col_idx)  # Adjust row_idx by subtracting 1

    # Create a new DataFrame for coordinate sequences
    coordinate_sequences = pd.DataFrame(columns=['trial', 'path', 'end', 'start', 'stepDist', 'pathLength'])

    # Transform the path_sequences into coordinate_sequences
    for _, row in path_sequences.iterrows():
        trial = row['trial']
        location_sequence = row['location_sequence']
        path = [location_to_coords[loc] for loc in location_sequence]
        end = location_to_coords[row['target']]
        start = location_to_coords[row['startLoc']]
        stepDist = row['stepDist']
        pathLength = row['pathLength']
        
        # Append the transformed data to the new DataFrame
        coordinate_sequences = pd.concat([coordinate_sequences, pd.DataFrame([{
            'trial': trial,
            'path': path,
            'end': end,
            'start': start,
            'stepDist': stepDist,
            'pathLength': pathLength
        }])], ignore_index=True)

    # Add optimality assessment to the coordinate_sequences DataFrame
    coordinate_sequences['optimal_response'] = coordinate_sequences['pathLength'] == coordinate_sequences['stepDist']

    # Save the coordinate_sequences DataFrame to a CSV file
    output_file = os.path.join(output_dir, f'{folder_name}.csv')
    coordinate_sequences.to_csv(output_file, index=False)

#### Validate variables in the data in coordinate format

In [1]:
import pandas as pd
import os
from scipy.spatial.distance import cityblock

# Define the processed data directory
processed_data_dir = 'PreprocessedBlockData'

# List all .csv files in the processed data directory
csv_files = [f for f in os.listdir(processed_data_dir) if f.endswith('.csv')]

# Function to validate the criteria for a given DataFrame
def validate_data(df, file_name):
    for idx, row in df.iterrows():
        path = row['path']
        end = row['end']
        start = row['start']
        stepDist = row['stepDist']
        pathLength = row['pathLength']
        
        # Validate "end" is the last location in "path"
        if end != path[-1]:
            print(f"Validation error in file {file_name}: 'end' does not match the last location in 'path' for trial {row['trial']}")
        
        # Validate "start" is the first location in "path"
        if start != path[0]:
            print(f"Validation error in file {file_name}: 'start' does not match the first location in 'path' for trial {row['trial']}")
        
        # Validate "stepDist" is the Manhattan distance between "end" and "start"
        if stepDist != cityblock(end, start):
            print(f"Validation error in file {file_name}: 'stepDist' is incorrect for trial {row['trial']}")
        
        # Validate "pathLength" is the number of steps taken in "path"
        if pathLength != (len(path) - 1):
            print(f"Validation error in file {file_name}: 'pathLength' is incorrect for trial {row['trial']}")

# Track the number of files checked
files_checked = 0

# Load and validate each CSV file
for csv_file in csv_files:
    file_path = os.path.join(processed_data_dir, csv_file)
    df = pd.read_csv(file_path)
    
    # Convert the string representation of lists back to lists
    df['path'] = df['path'].apply(eval)
    df['end'] = df['end'].apply(eval)
    df['start'] = df['start'].apply(eval)
    
    # Validate the DataFrame
    print(f"Checking file: {csv_file}")
    validate_data(df, csv_file)
    
    # Increment the files checked counter
    files_checked += 1

print(f"Validation complete and any error has been printed. Total number of files checked: {files_checked}")


Checking file: Subject8Block2Data.csv
Checking file: Subject16Block2Data.csv
Checking file: Subject15Block3Data.csv
Checking file: Subject26Block1Data.csv
Checking file: Subject22Block1Data.csv
Checking file: Subject1Block1Data.csv
Checking file: Subject25Block1Data.csv
Checking file: Subject6Block1Data.csv
Checking file: Subject21Block1Data.csv
Checking file: Subject2Block1Data.csv
Checking file: Subject18Block1Data.csv
Checking file: Subject8Block3Data.csv
Checking file: Subject16Block3Data.csv
Checking file: Subject15Block2Data.csv
Checking file: Subject6Block3Data.csv
Checking file: Subject26Block2Data.csv
Checking file: Subject25Block3Data.csv
Checking file: Subject18Block3Data.csv
Checking file: Subject1Block2Data.csv
Checking file: Subject2Block3Data.csv
Checking file: Subject21Block3Data.csv
Checking file: Subject22Block2Data.csv
Checking file: Subject8Block1Data.csv
Checking file: Subject16Block1Data.csv
Checking file: Subject15Block1Data.csv
Checking file: Subject6Block2Data.

#### Join different blocks from each participant into single .csv file

In [14]:
import os
import pandas as pd
from glob import glob

# Define input and output directories
input_dir = "PreprocessedBlockData"
output_dir = "PreprocessedSubjectData"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get all unique subjects by extracting unique parts of filenames
file_pattern = os.path.join(input_dir, "Subject*Block*Data.csv")
all_files = glob(file_pattern)

# Extract unique subject identifiers
subjects = set()
for file in all_files:
    parts = os.path.basename(file).split('Block')[0]  # 'SubjectXX'
    subjects.add(parts)

print(f"Found subjects: {subjects}")

# Process each subject
for subject in subjects:
    subject_files = sorted(glob(os.path.join(input_dir, f"{subject}Block*Data.csv")))

    if not subject_files:
        print(f"No files found for subject {subject}")
        continue

    print(f"Processing subject: {subject}")
    print(f"Files: {subject_files}")

    combined_data = pd.DataFrame()
    trial_offset = 0

    for block_file in subject_files:
        if not os.path.isfile(block_file):
            print(f"File not found: {block_file}")
            continue

        try:
            block_data = pd.read_csv(block_file)
            if block_data.empty:
                print(f"Warning: The file {block_file} is empty.")
                continue

            print(f"Reading file: {block_file}")
            print(f"Data preview:\n{block_data.head()}")

            # Adjust trial numbers
            block_data['trial'] += trial_offset
            trial_offset = block_data['trial'].max()

            # Concatenate data
            combined_data = pd.concat([combined_data, block_data], ignore_index=True)
        except Exception as e:
            print(f"Error reading {block_file}: {e}")
            continue
    
    if combined_data.empty:
        print(f"Warning: No data combined for subject {subject}.")
        continue
    
    output_file = os.path.join(output_dir, f"{subject}.csv")
    combined_data.to_csv(output_file, index=False)
    print(f"Combined data saved for subject {subject} to {output_file}")

print("All files have been combined and saved.")



Found subjects: {'Subject24', 'Subject1', 'Subject3', 'Subject27', 'Subject10', 'Subject17', 'Subject26', 'Subject14', 'Subject4', 'Subject9', 'Subject23', 'Subject2', 'Subject18', 'Subject21', 'Subject25', 'Subject8', 'Subject6', 'Subject13', 'Subject19', 'Subject15', 'Subject22', 'Subject20', 'Subject16'}
Processing subject: Subject24
Files: ['PreprocessedBlockData/Subject24Block1Data.csv', 'PreprocessedBlockData/Subject24Block2Data.csv', 'PreprocessedBlockData/Subject24Block3Data.csv']
Reading file: PreprocessedBlockData/Subject24Block1Data.csv
Data preview:
   trial                                              path     end   start  \
0      1  [(1, 1), (1, 2), (0, 2), (0, 3), (1, 3), (2, 3)]  (2, 3)  (1, 1)   
1      2                  [(2, 3), (1, 3), (0, 3), (0, 2)]  (0, 2)  (2, 3)   
2      3                  [(0, 2), (0, 1), (1, 1), (1, 0)]  (1, 0)  (0, 2)   
3      4          [(1, 0), (1, 1), (0, 1), (0, 2), (0, 3)]  (0, 3)  (1, 0)   
4      5                  [(0, 3), (0, 2),