In [50]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from tensorflow.keras.layers import Embedding, Flatten, LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import Input



In [57]:
# Load the data
data_path = "../data/Eye-tracking Output/cleaned_data.csv"
df = pd.read_csv(data_path)


In [59]:
## Normalizing data ##

# Feature selection of relevant columns
relevant_columns = ['Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
                    'Pupil Diameter Left [mm]', 'Pupil Diameter Right [mm]', 'Tracking Ratio [%]',
                    'Category Right', 'Participant', 'Trial', 'Stimulus', 'Gender', 'Age', 'Class', 'CARS Score']
df_relevant = df[relevant_columns]

# Filling NaNs in 'CARS Score' with 0
df_relevant['CARS Score'].fillna(0, inplace=True)

# List of numerical columns to scale
numerical_columns = ['Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
                     'Pupil Diameter Left [mm]', 'Pupil Diameter Right [mm]', 'Tracking Ratio [%]', 'CARS Score', 'Age'] # Lidt i tvivl om vi skal have 'Age' med her. CARS og Age giver også 0. Pis lort

# Convert columns to numeric, coercing errors to NaN
for col in numerical_columns:
    df_relevant[col] = pd.to_numeric(df_relevant[col], errors='coerce')

# Define a function to fill NaN with the mean of the previous and next row
def fill_with_row_mean(df, col):
    # First, forward fill the first NaN (if any)
    df[col] = df[col].fillna(method='ffill')
    
    # Then, fill the rest with the mean of the previous and next row
    df[col] = df[col].fillna((df[col].shift(1) + df[col].shift(-1)) / 2)
    
    return df[col]

# Apply this function to each numerical column
for col in numerical_columns:
    df_relevant[col] = fill_with_row_mean(df_relevant, col)

# Handle any remaining NaNs, especially at the end of the DataFrame
df_relevant.fillna(method='bfill', inplace=True)

# Normalize data per combination of Trial, Participant, and Stimulus
for (trial, participant, stimulus), group_data in df_relevant.groupby(['Trial', 'Participant', 'Stimulus']):
    scaler = MinMaxScaler()
    # Apply the scaler to all numerical columns for this group
    df_relevant.loc[group_data.index, numerical_columns] = scaler.fit_transform(group_data[numerical_columns])

# Save the normalized data
df_relevant.to_csv("../data/Eye-tracking Output/normalized_data.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['CARS Score'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant[col] = pd.to_numeric(df_relevant[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [52]:
# Label encoding
le_participants = LabelEncoder()
df_relevant['Participant'] = le_participants.fit_transform(df_relevant['Participant'])

le_stimulus = LabelEncoder()
df_relevant['Stimulus'] = le_stimulus.fit_transform(df_relevant['Stimulus'])

# Define the model
participant_input = Input(shape=(1,))
participant_embedding = Embedding(input_dim=58, output_dim=8)(participant_input)

stimulus_input = Input(shape=(1,))
stimulus_embedding = Embedding(input_dim=114, output_dim=11)(stimulus_input)

# Additional inputs can be added here if needed
# ...

# Concatenate embeddings and pass through LSTM
concatenated = Concatenate()([participant_embedding, stimulus_embedding])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Participant'] = le_participants.fit_transform(df_relevant['Participant'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Stimulus'] = le_stimulus.fit_transform(df_relevant['Stimulus'])


In [47]:
# checking the variables to convert into dummy variables.
from sklearn.preprocessing import LabelEncoder

for column in df_relevant.columns:
    if df_relevant.dtypes[column] == "object":
        print(column)

# print(df_relevant["Gender"].unique())
# print(df_relevant["Category Left"].unique())
# print(df_relevant["Category Right"].unique())
# print(df_relevant["Trial"].unique())
# print(df_relevant["Stimulus"].unique())


#print(df_relevant["Gender"].unique())
# le_gen = LabelEncoder()
# df_relevant['Gender'] = le_gen.fit_transform(df_relevant['Gender']) # M, F




# le_class = LabelEncoder()
# df_relevant['Class'] = le_class.fit_transform(df_relevant['Class']) # ASD, TD


# Assuming 'Category' is your categorical variable in a DataFrame df
df_encoded = pd.get_dummies(df_relevant, columns=['Category Left'], prefix=['Category Left']) # one hot encoding

print(df_encoded.iloc[:, 16:22].head(5)) # checking how it looks.



#print(df_relevant["Class"].unique())




# This creates one-hot encoded columns for 'Category'



Category Right
Trial
Gender
Class
[110   2 113  80  89  90  81  76  78  70  72  73  74  75  77  79  82  83
  84  85  86  91  92  93  94 112 108  15  20  21  24  29  32  36  41  46
   3   6  10  13  17  18  22  25  30  31  40  42  50  52  55  59  66  67
 109  27  68  87  95  97  99 101 103 106  35  39  45  49  51  57  62  65
 107   0   4   7  11  14  16  19  23  26  34  38  44  48  53  58  61  63
  71 111  28  69  88  96  98 100 102 104 105  33  37  43  47  54  56  60
  64   1   5   8   9  12]


KeyError: "None of [Index(['Category Left'], dtype='object')] are in the [columns]"

In [31]:
# Splitting the data into training and test sets

def create_sequences(data, sequence_length):
    sequences = []
    labels = []  # Adjust based on your task
    for i in range(len(data) - sequence_length):
        sequence = data[i:i + sequence_length]
        label = ...  # Define your label
        sequences.append(sequence)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequence_length = 60  # 1 second of data - Er lidt usikker på denne her, da det ligner at de har et sample hvert 20. millisekund.
data_sequences, labels = create_sequences(df_relevant.values, sequence_length)

# Reshape for LSTM
X = data_sequences.reshape((data_sequences.shape[0], sequence_length, data_sequences.shape[2]))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, shuffle=False)



In [7]:
# print(X.shape)

indices = torch.randperm(len(X))[:200]

lort = X[indices] # making a subset of 200

# lort.shape 

In [None]:

# Define the LSTM model
model = Sequential()
# Add LSTM layers and other necessary layers
# ...

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
# ...

# Evaluate the model
# ...

# Make predictions
# ...

NameError: name 'Sequential' is not defined