In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from tensorflow.keras.layers import Embedding, Flatten, LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import Input



In [3]:
# Load the data
data_path = "../data/Eye-tracking Output/cleaned_data.csv"
df = pd.read_csv(data_path)


In [43]:
## Normalizing data ##

# Feature selection of relevant columns
relevant_columns = ['Participant', 'Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
                    'Tracking Ratio [%]', 'Category Right',
                    'Stimulus', 'Gender', 'Age', 'Class', 'Trial', 'Pupil Diameter Right [mm]', 'Time.s']

df_relevant = df[relevant_columns]

# Filling NaNs in 'CARS Score' with 0
df_relevant['Age'].fillna(0, inplace=True)

# List of numerical columns to scale
numerical_columns = ['Point of Regard Right X [px]', 'Point of Regard Right Y [px]', 'Pupil Diameter Right [mm]', 'Time.s'
                     ] # Lidt i tvivl om vi skal have 'Age' med her. CARS og Age giver også 0. Pis lort

# Convert columns to numeric, coercing errors to NaN
for col in numerical_columns:
    df_relevant[col] = pd.to_numeric(df_relevant[col], errors='coerce')

# Define a function to fill NaN with the mean of the previous and next row
def fill_with_row_mean(df_relevant, col):
    # First, forward fill the first NaN (if any)
    df_relevant[col] = df_relevant[col].fillna(method='ffill')
    
    # Then, fill the rest with the mean of the previous and next row
    df_relevant[col] = df_relevant[col].fillna((df_relevant[col].shift(1) + df_relevant[col].shift(-1)) / 2)
    
    return df_relevant[col]

# Apply this function to each numerical column
for col in numerical_columns:
    df_relevant[col] = fill_with_row_mean(df_relevant, col)

# Handle any remaining NaNs, especially at the end of the DataFrame
df_relevant.fillna(method='bfill', inplace=True)

# Normalize data per combination of Trial, Participant, and Stimulus
for (trial, participant, stimulus), group_data in df_relevant.groupby(['Trial', 'Participant', 'Stimulus']):
    scaler = MinMaxScaler()
    # Apply the scaler to all numerical columns for this group
    df_relevant.loc[group_data.index, numerical_columns] = scaler.fit_transform(group_data[numerical_columns])

# For some reason Age and CARS Score are not scaled properly, so we do it manually
scaler = MinMaxScaler()
df_relevant[['Age', 'Tracking Ratio [%]']] = scaler.fit_transform(df_relevant[['Age', 'Tracking Ratio [%]']])

# Save the normalized data
df_relevant.to_csv("../data/Eye-tracking Output/normalized_data.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Age'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant[col] = pd.to_numeric(df_relevant[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant[col] = df_relevant[col].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexe

In [4]:
df_relevant = pd.read_csv("../data/Eye-tracking Output/normalized_data.csv")
# Label encoding for participant, subject, and trial. 
# Input dimensions are the number of unique values in each column and output is the square root of the input
# embeddings = []
# inputs = ['Stimulus', 'Trial']
# input_dims = [114, 34]
# output_dims = [11, 7]

# for input_name, input_dim, output_dim in zip(inputs, input_dims, output_dims):
#     le = LabelEncoder()
#     df_relevant[input_name] = le.fit_transform(df_relevant[input_name])
    
#     input_layer = Input(shape=(1,))
#     embedding_layer = Embedding(input_dim=input_dim, output_dim=output_dim)(input_layer)
    
#     embeddings.append(embedding_layer)

# # Concatenate embeddings
# concatenated = Concatenate()(embeddings)

# # print(df_relevant['Stimulus'].unique())
# print(concatenated)


In [5]:
# checking the variables to convert into dummy variables.
from sklearn.preprocessing import LabelEncoder

# for column in df_relevant.columns:
#     print(column, " dtype: ", df_relevant.dtypes[column])
    # if df_relevant.dtypes[column] == "object":
    #     print(column)

# print(df_relevant["Gender"].unique())
# print(df_relevant["Category Left"].unique())
# print(df_relevant["Category Right"].unique())
# print(df_relevant["Trial"].unique())
# print(df_relevant["Stimulus"].unique())


#print(df_relevant["Gender"].unique())
le_gen = LabelEncoder()
df_relevant['Gender'] = le_gen.fit_transform(df_relevant['Gender']) # M, F


le_class = LabelEncoder()
df_relevant['Class'] = le_class.fit_transform(df_relevant['Class']) # ASD, TD


# Assuming 'Category' is your categorical variable in a DataFrame df
df_encoded = pd.get_dummies(df_relevant, columns=['Category Right', 'Stimulus'], prefix=['Category Right', 'Stimulus']) # one hot encoding

# print(df_encoded.iloc[:, 16:22].head(5)) # checking how it looks.

for column in df_encoded.columns: #convert int64 into float64 so network it expects the same value
    if df_encoded.dtypes[column] == "int64" or df_encoded.dtypes[column] == "uint8":
        df_encoded[column] = df_encoded[column].astype('float64')

for column in df_encoded.columns:
    print(column, " dtype: ", df_encoded.dtypes[column])
#print(df_relevant["Class"].unique())
#print(df_relevant['Participant'].unique())



Participant  dtype:  float64
Point of Regard Right X [px]  dtype:  float64
Point of Regard Right Y [px]  dtype:  float64
Tracking Ratio [%]  dtype:  float64
Gender  dtype:  float64
Age  dtype:  float64
Class  dtype:  float64
Trial  dtype:  object
Pupil Diameter Right [mm]  dtype:  float64
Time.s  dtype:  float64
Category Right_-  dtype:  float64
Category Right_Blink  dtype:  float64
Category Right_Fixation  dtype:  float64
Category Right_Saccade  dtype:  float64
Category Right_Separator  dtype:  float64
Stimulus_01 coucou g.jpg  dtype:  float64
Stimulus_01 neutre3.avi  dtype:  float64
Stimulus_01vnvg151201b1.avi  dtype:  float64
Stimulus_02 coucou d.jpg  dtype:  float64
Stimulus_02 devant.jpg  dtype:  float64
Stimulus_02 neutre visage gris.jpg  dtype:  float64
Stimulus_03 devant.jpg  dtype:  float64
Stimulus_03 regard chien g.jpg  dtype:  float64
Stimulus_03 vole triste vs joie1.avi  dtype:  float64
Stimulus_04 b joie triste - copie.jpg  dtype:  float64
Stimulus_04 regard chien d.jpg  

In [5]:
#tror ikke vi skal bruge den her

# # Save only the normalized columns into a new DataFrame
# normalized_columns = ['Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
#                       'Tracking Ratio [%]',
#                       'CARS Score', 'Age']
# df_normalized = df_relevant[normalized_columns]


# # Combining all data into a single tensor
# # reshaping the normalized data into 3d np array
# normalized_np = np.stack([df_normalized[col].values for col in df_normalized.columns], 1)
# # converting from np array to keras tensors
# normalized_tensor = tf.convert_to_tensor(normalized_np, dtype=tf.float32)

# # Add a dimension to normalized_tensor and df_encoded
# # This transforms them from shape (905519, 7) and (905519, 19) to (905519, 1, 7) and (905519, 1, 19)
# normalized_tensor_3d = tf.expand_dims(normalized_tensor, axis=1)
# df_encoded_3d = tf.expand_dims(df_encoded, axis=1)

# # Now you can concatenate along the last axis
# df_all = tf.keras.layers.Concatenate(axis=-1)([normalized_tensor_3d, df_encoded_3d, concatenated])





In [50]:
# train test split. infør også evt padding

import numpy as np
from sklearn.model_selection import train_test_split

# Assuming your data is loaded into a variable called 'df_encoded'
# Modify the following line based on the actual column name of 'Class'

# Extract the unique participant IDs
participant_ids = df_encoded['Participant'].unique()

# Split participant IDs into train and test sets
train_participant_ids, test_participant_ids = train_test_split(participant_ids, test_size=0.2, random_state=42)

# Filter data based on participant IDs
train_data = df_encoded[np.isin(df_encoded['Participant'], train_participant_ids)]
test_data = df_encoded[np.isin(df_encoded['Participant'], test_participant_ids)]

# Extract 'Class' column index dynamically and 
class_column_index = df_encoded.columns.get_loc('Class')
participant_column_index = df_encoded.columns.get_loc('Participant')


# Extract 'Class' values for train and test
train_labels = train_data.iloc[:, class_column_index].values
test_labels = test_data.iloc[:, class_column_index].values


train_labels = pd.get_dummies(train_labels, columns=['Class'], prefix=['Class']) # one hot encoding
test_labels = pd.get_dummies(test_labels, columns=['Class'], prefix=['Class']) # one hot encoding


# Drop the 'Class' and 'Participant' column from the data
train_data = train_data.drop(columns=['Class', 'Participant', 'Trial'])
test_data = test_data.drop(columns=['Class', 'Participant', 'Trial'])



# Convert data into tensors with 60 samples each
def create_tensors(data, labels, samples_per_tensor=60):
    num_tensors = len(data) // samples_per_tensor
    data_tensors = np.array_split(data[:num_tensors * samples_per_tensor], num_tensors)
    labels_tensors = np.array_split(labels[:num_tensors * samples_per_tensor], num_tensors)
    return np.stack(data_tensors), np.stack(labels_tensors) #, np.expand_dims(np.stack(labels_tensors), axis=-1)

train_data_tensors, train_label_tensors = create_tensors(train_data.values, train_labels)
test_data_tensors, test_label_tensors = create_tensors(test_data.values, test_labels)

# checking if everythin looks good
print(train_data_tensors.shape)
print(test_data_tensors.shape)
print(train_label_tensors.shape)
print(test_label_tensors.shape)

#checking up on my data before feeding it to network.
for i in train_data.columns:
    print(i, train_data.dtypes[i])


   ['Class']_0.0  ['Class']_1.0
0              1              0
1              1              0
2              1              0
3              1              0
4              1              0 (236795, 2)
(11145, 60, 126)
(3946, 60, 126)
(11145, 60, 2)
(3946, 60, 2)
Point of Regard Right X [px] float64
Point of Regard Right Y [px] float64
Tracking Ratio [%] float64
Gender float64
Age float64
Pupil Diameter Right [mm] float64
Time.s float64
Category Right_- float64
Category Right_Blink float64
Category Right_Fixation float64
Category Right_Saccade float64
Category Right_Separator float64
Stimulus_01 coucou g.jpg float64
Stimulus_01 neutre3.avi float64
Stimulus_01vnvg151201b1.avi float64
Stimulus_02 coucou d.jpg float64
Stimulus_02 devant.jpg float64
Stimulus_02 neutre visage gris.jpg float64
Stimulus_03 devant.jpg float64
Stimulus_03 regard chien g.jpg float64
Stimulus_03 vole triste vs joie1.avi float64
Stimulus_04 b joie triste - copie.jpg float64
Stimulus_04 regard chien d.jpg float64

In [39]:
# print(X.shape)

indices = torch.randperm(len(train_data_tensors))[:200]

lort = train_data_tensors[indices] # making a subset of 200

lort.shape 

(200, 60, 124)

In [21]:
from tensorflow.keras.models import Sequential
# Define the LSTM model

def classification_model():
    model = Sequential()

    # Add an LSTM layer with, for example, 50 units
    model.add(LSTM(units=50, input_shape=(60, 126), return_sequences=True))

    # Add a Dense layer with the number of classes as the output dimension and activation function
    model.add(Dense(units=50))

    model.add(Dense(units=2, activation='softmax'))

    # Compile the model with an optimizer, loss function, and metrics
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


    return model

model = classification_model()

model.fit(train_data_tensors, train_label_tensors, epochs = 5, batch_size= 1, validation_data= (test_data_tensors, test_label_tensors))


model.save('modelus.h5')

# Display a summary of the model's architecture
model.summary()

preds = model.predict(test_data_tensors)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 60, 50)            35400     
                                                                 
 dense_20 (Dense)            (None, 60, 50)            2550      
                                                                 
 dense_21 (Dense)            (None, 60, 2)             102       
                                                                 
Total params: 38052 (148.64 KB)
Trainable params: 38052 (148.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [73]:
print(preds.shape)
print(test_label_tensors.shape)
# Assuming preds is the output of model.predict
import pandas as pd

# Assuming preds is the output of model.predict
predicted_class_labels = preds.argmax(axis=-1)
predicted_class_labels_df = pd.DataFrame({'Predicted_Class': predicted_class_labels[:, 0]}) #puts into dataframe

# Print the first 50 rows
#print(predicted_class_labels_df.head(50))

true_class_labels = test_label_tensors.argmax(axis=-1)
true_class_labels_df = pd.DataFrame({'Predicted_Class': true_class_labels[:, 0]}) #puts into dataframe


# for i in predicted_class_labels_df['Predicted_Class'][60:100]:
#     print(i)

# for i in true_class_labels_df['Predicted_Class'][60:100]:
#     print(i)




(3946, 60, 2)
(3946, 60, 2)


In [None]:
#
