In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from tensorflow.keras.layers import Embedding, Flatten, LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import Input



In [27]:
# Load the data
data_path = "../data/Eye-tracking Output/cleaned_data.csv"
df = pd.read_csv(data_path)


In [31]:
## Normalizing data ##

# Feature selection of relevant columns
relevant_columns = ['Participant', 'Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
                    'Tracking Ratio [%]', 'Category Right', 'Trial',
                    'Stimulus', 'Gender', 'Age', 'Class', 'CARS Score']

df_relevant = df[relevant_columns]

# Filling NaNs in 'CARS Score' with 0
df_relevant['CARS Score'].fillna(0, inplace=True)
df_relevant['Age'].fillna(0, inplace=True)

# List of numerical columns to scale
numerical_columns = ['Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
                     ] # Lidt i tvivl om vi skal have 'Age' med her. CARS og Age giver også 0. Pis lort

# Convert columns to numeric, coercing errors to NaN
for col in numerical_columns:
    df_relevant[col] = pd.to_numeric(df_relevant[col], errors='coerce')

# Define a function to fill NaN with the mean of the previous and next row
def fill_with_row_mean(df_relevant, col):
    # First, forward fill the first NaN (if any)
    df_relevant[col] = df_relevant[col].fillna(method='ffill')
    
    # Then, fill the rest with the mean of the previous and next row
    df_relevant[col] = df_relevant[col].fillna((df_relevant[col].shift(1) + df_relevant[col].shift(-1)) / 2)
    
    return df_relevant[col]

# Apply this function to each numerical column
for col in numerical_columns:
    df_relevant[col] = fill_with_row_mean(df_relevant, col)

# Handle any remaining NaNs, especially at the end of the DataFrame
df_relevant.fillna(method='bfill', inplace=True)

# Normalize data per combination of Trial, Participant, and Stimulus
for (trial, participant, stimulus), group_data in df_relevant.groupby(['Trial', 'Participant', 'Stimulus']):
    scaler = MinMaxScaler()
    # Apply the scaler to all numerical columns for this group
    df_relevant.loc[group_data.index, numerical_columns] = scaler.fit_transform(group_data[numerical_columns])

# For some reason Age and CARS Score are not scaled properly, so we do it manually
scaler = MinMaxScaler()
df_relevant[['CARS Score', 'Age', 'Tracking Ratio [%]']] = scaler.fit_transform(df_relevant[['CARS Score', 'Age', 'Tracking Ratio [%]']])

# Save the normalized data
df_relevant.to_csv("../data/Eye-tracking Output/normalized_data.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['CARS Score'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Age'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant[col] = pd.to_numeric(df_relevant[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentati

In [52]:
df_relevant = pd.read_csv("../data/Eye-tracking Output/normalized_data.csv")
# Label encoding for participant, subject, and trial. 
# Input dimensions are the number of unique values in each column and output is the square root of the input
embeddings = []
inputs = ['Participant', 'Stimulus', 'Trial']
input_dims = [58, 114, 34]
output_dims = [8, 11, 7]

for input_name, input_dim, output_dim in zip(inputs, input_dims, output_dims):
    le = LabelEncoder()
    df_relevant[input_name] = le.fit_transform(df_relevant[input_name])
    
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(input_dim=input_dim, output_dim=output_dim)(input_layer)
    
    embeddings.append(embedding_layer)

# Concatenate embeddings
concatenated = Concatenate()(embeddings)

# print(df_relevant['Stimulus'].unique())
print(concatenated)


KerasTensor(type_spec=TensorSpec(shape=(None, 1, 26), dtype=tf.float32, name=None), name='concatenate_8/concat:0', description="created by layer 'concatenate_8'")


In [53]:
# checking the variables to convert into dummy variables.
from sklearn.preprocessing import LabelEncoder

for column in df_relevant.columns:
    print(column, " dtype: ", df_relevant.dtypes[column])
    # if df_relevant.dtypes[column] == "object":
    #     print(column)

# print(df_relevant["Gender"].unique())
# print(df_relevant["Category Left"].unique())
# print(df_relevant["Category Right"].unique())
# print(df_relevant["Trial"].unique())
# print(df_relevant["Stimulus"].unique())


#print(df_relevant["Gender"].unique())
le_gen = LabelEncoder()
df_relevant['Gender'] = le_gen.fit_transform(df_relevant['Gender']) # M, F


le_class = LabelEncoder()
df_relevant['Class'] = le_class.fit_transform(df_relevant['Class']) # ASD, TD


# Assuming 'Category' is your categorical variable in a DataFrame df
df_encoded = pd.get_dummies(df_relevant, columns=['Category Right'], prefix=['Category Right']) # one hot encoding

# print(df_encoded.iloc[:, 16:22].head(5)) # checking how it looks.

for column in df_encoded.columns: #convert int64 into float64 so network it expects the same value
    if df_encoded.dtypes[column] == "int64" or df_encoded.dtypes[column] == "uint8":
        df_encoded[column] = df_encoded[column].astype('float64')

for column in df_encoded.columns:
    print(column, " dtype: ", df_encoded.dtypes[column])
#print(df_relevant["Class"].unique())
print(df_relevant['Participant'].unique())


Participant  dtype:  int64
Point of Regard Right X [px]  dtype:  float64
Point of Regard Right Y [px]  dtype:  float64
Tracking Ratio [%]  dtype:  float64
Category Right  dtype:  object
Trial  dtype:  int64
Stimulus  dtype:  int64
Gender  dtype:  object
Age  dtype:  float64
Class  dtype:  object
CARS Score  dtype:  float64
Participant  dtype:  float64
Point of Regard Right X [px]  dtype:  float64
Point of Regard Right Y [px]  dtype:  float64
Tracking Ratio [%]  dtype:  float64
Trial  dtype:  float64
Stimulus  dtype:  float64
Gender  dtype:  float64
Age  dtype:  float64
Class  dtype:  float64
CARS Score  dtype:  float64
Category Right_-  dtype:  float64
Category Right_Blink  dtype:  float64
Category Right_Fixation  dtype:  float64
Category Right_Saccade  dtype:  float64
Category Right_Separator  dtype:  float64
[11 12 13 14 15 16 17 18 19 20 21 22 23 25 26 34 38 39 40 41 42 43 44 45
 46 47 48 49 50 51 54 55  0  9 10  1 24  2 27 28 29 30 31 32 33 35 36  3
 37  4 52 53  5  6  7  8]


In [5]:
#tror ikke vi skal bruge den her

# # Save only the normalized columns into a new DataFrame
# normalized_columns = ['Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
#                       'Tracking Ratio [%]',
#                       'CARS Score', 'Age']
# df_normalized = df_relevant[normalized_columns]


# # Combining all data into a single tensor
# # reshaping the normalized data into 3d np array
# normalized_np = np.stack([df_normalized[col].values for col in df_normalized.columns], 1)
# # converting from np array to keras tensors
# normalized_tensor = tf.convert_to_tensor(normalized_np, dtype=tf.float32)

# # Add a dimension to normalized_tensor and df_encoded
# # This transforms them from shape (905519, 7) and (905519, 19) to (905519, 1, 7) and (905519, 1, 19)
# normalized_tensor_3d = tf.expand_dims(normalized_tensor, axis=1)
# df_encoded_3d = tf.expand_dims(df_encoded, axis=1)

# # Now you can concatenate along the last axis
# df_all = tf.keras.layers.Concatenate(axis=-1)([normalized_tensor_3d, df_encoded_3d, concatenated])





In [56]:
# train test split. infør også evt padding

participant_ids = df_encoded['Participant'].unique()

train_participant_ids, test_participant_ids = train_test_split(participant_ids, test_size=0.2, random_state=42)

train_data = df_encoded[np.isin(df_encoded['Participant'], train_participant_ids)]
test_data = df_encoded[np.isin(df_encoded['Participant'], test_participant_ids)]


def create_tensors(data, samples_per_tensor=60): # this deletes the last Tensor but we could use padding for that
    num_tensors = len(data) // samples_per_tensor
    data_tensors = np.array_split(data[:num_tensors * samples_per_tensor], num_tensors)
    return np.stack(data_tensors)

train_data_tensors = create_tensors(train_data)
test_data_tensors = create_tensors(test_data)

print(train_data_tensors.shape)
print(test_data_tensors.shape)

for i in train_data.columns:
    print(i, train_data.dtypes[i])


(11145, 60, 15)
(3946, 60, 15)
Participant float64
Point of Regard Right X [px] float64
Point of Regard Right Y [px] float64
Tracking Ratio [%] float64
Trial float64
Stimulus float64
Gender float64
Age float64
Class float64
CARS Score float64
Category Right_- float64
Category Right_Blink float64
Category Right_Fixation float64
Category Right_Saccade float64
Category Right_Separator float64


In [7]:
# print(X.shape)

indices = torch.randperm(len(X))[:200]

lort = X[indices] # making a subset of 200

# lort.shape 

In [10]:
from tensorflow.keras.models import Sequential
# Define the LSTM model
# Define your LSTM model

model = Sequential()

# Add an LSTM layer with, for example, 50 units
model.add(LSTM(units=50, input_shape=(60, 30), return_sequences=True))

# Add a Dense layer with the number of classes as the output dimension and activation function
model.add(Dense(units=50, activation='relu'))

model.add(Flatten())

# Compile the model with an optimizer, loss function, and metrics
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.add(Dense(units=1, activation='sigmoid'))

# Display a summary of the model's architecture
model.summary()