# LT Machine Learning

Analyze with auto-ML Laughing Together data

## 1. Import and prepare data

The data will have this format: 
- features:
    - for each participant and each interval, coherence for each channel combination (one value per channel combination - 10 in total)
    - list of intervals (1 for video 1, 2 for video 2, 3 for interaction 1 and 4 for interaction 5)
    - liking pre (average of scores)
    - laughter group (laughter, control)
    - interaction group (interaction, control)
    - age
    - gender
- outcome:
    - liking post (average of scores)

### 1.1 General settings

In [447]:
import numpy as np
import pandas as pd

# Set random seed for replicability
np.random.seed(42)

# Set coherence data folder
coherence_folder = "Z:/projects/LT/LT_adults/Carolina_analyses/fNIRS/data_prep/data"

# Set data folder where the rest is saved
data_path = "Y:/Documents/Projects/LT_machine_learning/Data"


### 1.2 Coherence data

#### 1.2.1 Coherence during videos

In [448]:
# load video data

video_filename = coherence_folder + "/Data_ROI_laughter_all.csv"
df = pd.read_csv(video_filename)

df.head()

Unnamed: 0,IFGr_IFGr,IFGr_IFGl,IFGr_TPJr,IFGr_TPJl,IFGl_IFGr,IFGl_IFGl,IFGl_TPJr,IFGl_TPJl,TPJr_IFGr,TPJr_IFGl,TPJr_TPJr,TPJr_TPJl,TPJl_IFGr,TPJl_IFGl,TPJl_TPJr,TPJl_TPJl,Interval,Pair,Group,Segment
0,0.253717,0.276044,0.258718,0.285384,0.249407,0.215108,0.21434,0.219691,,,,,,,,,1,1,IC,laughter
1,0.231923,0.289607,0.239531,0.248008,0.232951,0.28887,0.260857,0.253972,,,,,,,,,2,1,IC,laughter
2,,0.229439,,,,0.198435,,,,0.211421,,,,0.229041,,,1,2,IC,laughter
3,,0.453068,,,,0.422309,,,,0.217067,,,,0.174231,,,2,2,IC,laughter
4,0.288269,0.257061,0.267321,0.27445,0.352842,0.303648,0.305565,0.343827,0.187786,0.294686,0.193177,0.183882,0.194623,0.247054,0.233287,0.187955,1,3,IC,laughter


In [449]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   IFGr_IFGr  177 non-null    float64
 1   IFGr_IFGl  190 non-null    float64
 2   IFGr_TPJr  155 non-null    float64
 3   IFGr_TPJl  162 non-null    float64
 4   IFGl_IFGr  184 non-null    float64
 5   IFGl_IFGl  197 non-null    float64
 6   IFGl_TPJr  160 non-null    float64
 7   IFGl_TPJl  167 non-null    float64
 8   TPJr_IFGr  155 non-null    float64
 9   TPJr_IFGl  168 non-null    float64
 10  TPJr_TPJr  133 non-null    float64
 11  TPJr_TPJl  141 non-null    float64
 12  TPJl_IFGr  148 non-null    float64
 13  TPJl_IFGl  161 non-null    float64
 14  TPJl_TPJr  127 non-null    float64
 15  TPJl_TPJl  135 non-null    float64
 16  Interval   199 non-null    int64  
 17  Pair       199 non-null    int64  
 18  Group      199 non-null    object 
 19  Segment    199 non-null    object 
dtypes: float64

In [450]:
# average redundant channels

# Create new dataframe including the variables to keep

video_data = df[['Pair', 'Group', 'Segment', 'Interval', 'IFGr_IFGr', 'IFGl_IFGl', 'TPJr_TPJr', 'TPJl_TPJl']].copy()

# Define the ROI pairs and the columns to average
roi_pairs = [
    ('IFGl_IFGr', 'IFGr_IFGl'),
    ('IFGr_TPJr', 'TPJr_IFGr'),
    ('IFGr_TPJl', 'TPJl_IFGr'),
    ('IFGl_TPJr', 'TPJr_IFGl'),
    ('IFGl_TPJl', 'TPJl_IFGl'),
    ('TPJr_TPJl', 'TPJl_TPJr'),
]

# Iterate through the pairs and compute the mean for each one
for col1, col2 in roi_pairs:
    # Compute row-wise mean for the pair of columns
    video_data[f'{col1}'] = df[[col1, col2]].mean(axis=1)

# Check the updated dataframe
video_data.head()

Unnamed: 0,Pair,Group,Segment,Interval,IFGr_IFGr,IFGl_IFGl,TPJr_TPJr,TPJl_TPJl,IFGl_IFGr,IFGr_TPJr,IFGr_TPJl,IFGl_TPJr,IFGl_TPJl,TPJr_TPJl
0,1,IC,laughter,1,0.253717,0.215108,,,0.262726,0.258718,0.285384,0.21434,0.219691,
1,1,IC,laughter,2,0.231923,0.28887,,,0.261279,0.239531,0.248008,0.260857,0.253972,
2,2,IC,laughter,1,,0.198435,,,0.229439,,,0.211421,0.229041,
3,2,IC,laughter,2,,0.422309,,,0.453068,,,0.217067,0.174231,
4,3,IC,laughter,1,0.288269,0.303648,0.193177,0.187955,0.304951,0.227554,0.234536,0.300126,0.29544,0.208585


In [451]:
video_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pair       199 non-null    int64  
 1   Group      199 non-null    object 
 2   Segment    199 non-null    object 
 3   Interval   199 non-null    int64  
 4   IFGr_IFGr  177 non-null    float64
 5   IFGl_IFGl  197 non-null    float64
 6   TPJr_TPJr  133 non-null    float64
 7   TPJl_TPJl  135 non-null    float64
 8   IFGl_IFGr  199 non-null    float64
 9   IFGr_TPJr  179 non-null    float64
 10  IFGr_TPJl  176 non-null    float64
 11  IFGl_TPJr  195 non-null    float64
 12  IFGl_TPJl  193 non-null    float64
 13  TPJr_TPJl  154 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 21.9+ KB


#### 1.2.2 Coherence during free interaction

In [452]:
# load free interaction data

interaction_filename = coherence_folder + "/Data_ROI_interaction_all.csv"
df = pd.read_csv(interaction_filename)

df.head()

Unnamed: 0,IFGr_IFGr,IFGr_IFGl,IFGr_TPJr,IFGr_TPJl,IFGl_IFGr,IFGl_IFGl,IFGl_TPJr,IFGl_TPJl,TPJr_IFGr,TPJr_IFGl,TPJr_TPJr,TPJr_TPJl,TPJl_IFGr,TPJl_IFGl,TPJl_TPJr,TPJl_TPJl,Interval,Pair,Group,Segment
0,,0.261954,0.229777,0.24825,,0.234832,0.296894,0.306257,,,,,,,,,1,1,IC,interaction
1,,,,,,0.278682,0.259312,0.236873,,,,,,,,,2,1,IC,interaction
2,,0.225737,,,,0.254043,,,,0.281262,,,,0.276343,,,1,2,IC,interaction
3,,0.196668,,,,0.221411,,,,0.293187,,,,0.20042,,,2,2,IC,interaction
4,0.238468,0.300067,0.21371,0.174938,0.227807,0.342319,0.262133,0.181897,0.244471,0.254158,0.190864,0.197932,0.261414,0.31828,0.255131,0.221397,1,3,IC,interaction


In [453]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   IFGr_IFGr  137 non-null    float64
 1   IFGr_IFGl  156 non-null    float64
 2   IFGr_TPJr  125 non-null    float64
 3   IFGr_TPJl  134 non-null    float64
 4   IFGl_IFGr  147 non-null    float64
 5   IFGl_IFGl  171 non-null    float64
 6   IFGl_TPJr  137 non-null    float64
 7   IFGl_TPJl  150 non-null    float64
 8   TPJr_IFGr  131 non-null    float64
 9   TPJr_IFGl  146 non-null    float64
 10  TPJr_TPJr  114 non-null    float64
 11  TPJr_TPJl  126 non-null    float64
 12  TPJl_IFGr  126 non-null    float64
 13  TPJl_IFGl  140 non-null    float64
 14  TPJl_TPJr  105 non-null    float64
 15  TPJl_TPJl  121 non-null    float64
 16  Interval   194 non-null    int64  
 17  Pair       194 non-null    int64  
 18  Group      194 non-null    object 
 19  Segment    194 non-null    object 
dtypes: float64

In [454]:
# average redundant channels

# Create new dataframe including the variables to keep

interaction_data = df[['Pair', 'Group', 'Segment', 'Interval', 'IFGr_IFGr', 'IFGl_IFGl', 'TPJr_TPJr', 'TPJl_TPJl']].copy()

# Define the ROI pairs and the columns to average
roi_pairs = [
    ('IFGl_IFGr', 'IFGr_IFGl'),
    ('IFGr_TPJr', 'TPJr_IFGr'),
    ('IFGr_TPJl', 'TPJl_IFGr'),
    ('IFGl_TPJr', 'TPJr_IFGl'),
    ('IFGl_TPJl', 'TPJl_IFGl'),
    ('TPJr_TPJl', 'TPJl_TPJr'),
]

# Iterate through the pairs and compute the mean for each one
for col1, col2 in roi_pairs:
    # Compute row-wise mean for the pair of columns
    interaction_data[f'{col1}'] = df[[col1, col2]].mean(axis=1)


# Change labelling of interval: instead of 1 and 2, 3 and 4
interaction_data['Interval'] = interaction_data['Interval']+ 2

# Check the updated dataframe
interaction_data.head()

Unnamed: 0,Pair,Group,Segment,Interval,IFGr_IFGr,IFGl_IFGl,TPJr_TPJr,TPJl_TPJl,IFGl_IFGr,IFGr_TPJr,IFGr_TPJl,IFGl_TPJr,IFGl_TPJl,TPJr_TPJl
0,1,IC,interaction,3,,0.234832,,,0.261954,0.229777,0.24825,0.296894,0.306257,
1,1,IC,interaction,4,,0.278682,,,,,,0.259312,0.236873,
2,2,IC,interaction,3,,0.254043,,,0.225737,,,0.281262,0.276343,
3,2,IC,interaction,4,,0.221411,,,0.196668,,,0.293187,0.20042,
4,3,IC,interaction,3,0.238468,0.342319,0.190864,0.221397,0.263937,0.22909,0.218176,0.258146,0.250088,0.226531


In [455]:
interaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pair       194 non-null    int64  
 1   Group      194 non-null    object 
 2   Segment    194 non-null    object 
 3   Interval   194 non-null    int64  
 4   IFGr_IFGr  137 non-null    float64
 5   IFGl_IFGl  171 non-null    float64
 6   TPJr_TPJr  114 non-null    float64
 7   TPJl_TPJl  121 non-null    float64
 8   IFGl_IFGr  178 non-null    float64
 9   IFGr_TPJr  162 non-null    float64
 10  IFGr_TPJl  163 non-null    float64
 11  IFGl_TPJr  184 non-null    float64
 12  IFGl_TPJl  185 non-null    float64
 13  TPJr_TPJl  137 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 21.3+ KB


#### 1.2.3 Append coherence data in one dataframe

In [456]:
# append the two dataframes

coherence_data = pd.concat([video_data, interaction_data], ignore_index=True)

coherence_data.head()

# save for later use
filename_data = data_path + "/coherence_data.csv"
coherence_data.to_csv(filename_data, index=True)

In [457]:
coherence_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393 entries, 0 to 392
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pair       393 non-null    int64  
 1   Group      393 non-null    object 
 2   Segment    393 non-null    object 
 3   Interval   393 non-null    int64  
 4   IFGr_IFGr  314 non-null    float64
 5   IFGl_IFGl  368 non-null    float64
 6   TPJr_TPJr  247 non-null    float64
 7   TPJl_TPJl  256 non-null    float64
 8   IFGl_IFGr  377 non-null    float64
 9   IFGr_TPJr  341 non-null    float64
 10  IFGr_TPJl  339 non-null    float64
 11  IFGl_TPJr  379 non-null    float64
 12  IFGl_TPJl  378 non-null    float64
 13  TPJr_TPJl  291 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 43.1+ KB


In [458]:
coherence_data.describe()

Unnamed: 0,Pair,Interval,IFGr_IFGr,IFGl_IFGl,TPJr_TPJr,TPJl_TPJl,IFGl_IFGr,IFGr_TPJr,IFGr_TPJl,IFGl_TPJr,IFGl_TPJl,TPJr_TPJl
count,393.0,393.0,314.0,368.0,247.0,256.0,377.0,341.0,339.0,379.0,378.0,291.0
mean,13.188295,2.486005,0.278476,0.275465,0.280907,0.276633,0.278824,0.274032,0.275987,0.271864,0.272334,0.277691
std,7.427896,1.12051,0.062932,0.060387,0.062117,0.060263,0.050774,0.049678,0.050294,0.051936,0.049237,0.049425
min,1.0,1.0,0.131923,0.154268,0.147143,0.139612,0.156761,0.131825,0.162377,0.152551,0.170665,0.162374
25%,7.0,1.0,0.233011,0.231536,0.234332,0.228557,0.241112,0.242223,0.239869,0.238155,0.236484,0.243337
50%,13.0,2.0,0.273019,0.266415,0.27772,0.27111,0.273881,0.268057,0.273256,0.268829,0.268162,0.270801
75%,20.0,3.0,0.318583,0.314437,0.323039,0.313877,0.311751,0.304163,0.304178,0.300299,0.301968,0.309163
max,26.0,4.0,0.504713,0.468392,0.452858,0.458713,0.453068,0.479757,0.459239,0.542336,0.44653,0.428475


### 1.3 Questionnaire data

In [459]:
# load questionnaire data

questionnaire_filename = data_path + "/Excel_Auswertung_2(Franzi).xlsx"

sheet_names = pd.ExcelFile(questionnaire_filename).sheet_names
print(sheet_names)

['Participant Information', 'General information_v1', 'PANAS_v1', 'Handiness', 'BFI-10', 'Videos', 'PANAS_v2', 'Free Time Studie', 'Questions_v2', 'IOS']


#### 1.3.1 Participant Information

In [460]:
sheet_name = 'Participant Information'

# Load the specified sheet into a DataFrame
df = pd.read_excel(questionnaire_filename, sheet_name=sheet_name)

# get only relevant rows and columns
part_info = df.iloc[2:, 0:3]

# Rename the columns (headers)
part_info.columns = ['Participant', 'Age', 'Gender']

# convert Age to numeric
part_info['Age'] = pd.to_numeric(part_info['Age'], errors='coerce')

# convert Gender to categorical
part_info['Gender'] = part_info['Gender'].astype('category')

# Drop rows where 'Column 1' contains the letter 'P'
part_info = part_info[~part_info['Participant'].str.contains('P', case=False, na=False)]

# Drop rows where any column contains NaN
part_info = part_info.dropna(how='any')

# split group, pair number and participant label into three different columns
part_info[['Group', 'Pair', 'Participant']] = part_info['Participant'].str.extract(r'([A-Za-z]+)\s*(\d+)\s*([A-Za-z])')

# Reset the index
part_info.reset_index(drop=True, inplace=True)

part_info.head()

  warn(msg)


Unnamed: 0,Participant,Age,Gender,Group,Pair
0,A,21.0,f,IL,1
1,B,18.0,f,IL,1
2,A,21.0,f,IL,2
3,B,18.0,f,IL,2
4,A,19.0,f,IL,3


#### 1.3.2 Liking_pre

In [461]:
sheet_name = 'General information_v1'

# Load the specified sheet into a DataFrame
df = pd.read_excel(questionnaire_filename, sheet_name=sheet_name)

# get only relevant rows and columns
df = df.iloc[4:, ]

# Set the first row as header and reset the dataframe
df.columns = df.iloc[0]  # Set first row as column headers
df = df.drop(4)  # Drop the first row, as it is now the header

# Rename the columns (headers)
df.columns = ['Participant', 'Comfort_Pre', 'Likeable_Pre', 'More_Time_Pre']

# convert scores to numeric
df[['Comfort_Pre', 'Likeable_Pre', 'More_Time_Pre']] = df[['Comfort_Pre', 'Likeable_Pre', 'More_Time_Pre']].apply(pd.to_numeric, errors = 'coerce')

# Drop rows where 'Column 1' contains the letter 'P'
df = df[~df['Participant'].str.contains('P', case=False, na=False)]

# Drop rows where any column contains NaN
df = df.dropna(how='any')

# Reset the index (optional)
df.reset_index(drop=True, inplace=True)

# split group, pair number and participant label into three different columns
df[['Group', 'Pair', 'Participant']] = df['Participant'].str.extract(r'([A-Za-z]+)\s*(\d+)\s*([A-Za-z])')

# create mean liking
df['Mean_liking_Pre'] = df[['Comfort_Pre', 'Likeable_Pre', 'More_Time_Pre']].mean(axis = 1)

df.head()

Unnamed: 0,Participant,Comfort_Pre,Likeable_Pre,More_Time_Pre,Group,Pair,Mean_liking_Pre
0,A,6.0,5.0,0.0,IL,1,3.666667
1,B,7.0,7.0,7.0,IL,1,7.0
2,A,7.0,8.0,6.0,IL,2,7.0
3,B,6.0,7.0,6.0,IL,2,6.333333
4,A,8.0,8.0,8.0,IL,3,8.0


In [462]:
# drop unncecessary variables
df = df.drop(['Comfort_Pre', 'Likeable_Pre', 'More_Time_Pre'], axis = 1)

# merge the two dataframes

part_info = pd.merge(part_info, df, on=["Participant", "Group", "Pair"], how="outer")

part_info.head()


Unnamed: 0,Participant,Age,Gender,Group,Pair,Mean_liking_Pre
0,A,19.0,f,IC,1,8.0
1,A,21.0,f,IC,2,7.333333
2,A,22.0,f,IC,3,6.666667
3,A,21.0,m,IC,4,5.333333
4,A,22.0,f,IC,5,6.666667


In [463]:
part_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Participant      204 non-null    object  
 1   Age              204 non-null    float64 
 2   Gender           204 non-null    category
 3   Group            204 non-null    object  
 4   Pair             204 non-null    object  
 5   Mean_liking_Pre  204 non-null    float64 
dtypes: category(1), float64(2), object(3)
memory usage: 8.4+ KB


#### 1.3.3 Liking_post

In [464]:
sheet_name = 'Questions_v2'

# Load the specified sheet into a DataFrame
df = pd.read_excel(questionnaire_filename, sheet_name=sheet_name)

# get only relevant rows and columns
df = df.iloc[3:, ]

# Set the first row as header and reset the dataframe
df.columns = df.iloc[0]  # Set first row as column headers
df = df.drop(3)  # Drop the first row, as it is now the header

# Rename the columns (headers)
df.columns = ['Participant', 'Comfort_Post', 'Likeable_Post', 'More_Time_Post']

# convert scores to numeric
df[['Comfort_Post', 'Likeable_Post', 'More_Time_Post']] = df[['Comfort_Post', 'Likeable_Post', 'More_Time_Post']].apply(pd.to_numeric, errors = 'coerce')

# Drop rows where 'Column 1' contains the letter 'P'
df = df[~df['Participant'].str.contains('P', case=False, na=False)]

# Drop rows where any column contains NaN
df = df.dropna(how='any')

# Reset the index (optional)
df.reset_index(drop=True, inplace=True)

# split group, pair number and participant label into three different columns
df[['Group', 'Pair', 'Participant']] = df['Participant'].str.extract(r'([A-Za-z]+)\s*(\d+)\s*([A-Za-z])')

# create mean liking
df['Mean_liking_Post'] = df[['Comfort_Post', 'Likeable_Post', 'More_Time_Post']].mean(axis = 1)

df.head()

Unnamed: 0,Participant,Comfort_Post,Likeable_Post,More_Time_Post,Group,Pair,Mean_liking_Post
0,A,6.0,6.0,0.0,IL,1,4.0
1,B,7.0,7.0,7.0,IL,1,7.0
2,A,7.0,8.0,4.0,IL,2,6.333333
3,B,7.0,7.0,7.0,IL,2,7.0
4,A,8.0,8.0,6.0,IL,3,7.333333


In [465]:
# drop unncecessary variables
df = df.drop(['Comfort_Post', 'Likeable_Post', 'More_Time_Post'], axis = 1)

# merge the two dataframes

part_info = pd.merge(part_info, df, on=["Participant", "Group", "Pair"], how="outer")

# convert Pair to numeric
part_info['Pair'] = pd.to_numeric(part_info['Pair'])

part_info.head()

Unnamed: 0,Participant,Age,Gender,Group,Pair,Mean_liking_Pre,Mean_liking_Post
0,A,19.0,f,IC,1,8.0,8.0
1,A,21.0,f,IC,2,7.333333,8.0
2,A,22.0,f,IC,3,6.666667,7.0
3,A,21.0,m,IC,4,5.333333,4.333333
4,A,22.0,f,IC,5,6.666667,6.666667


In [466]:
part_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Participant       204 non-null    object  
 1   Age               204 non-null    float64 
 2   Gender            204 non-null    category
 3   Group             204 non-null    object  
 4   Pair              204 non-null    int64   
 5   Mean_liking_Pre   204 non-null    float64 
 6   Mean_liking_Post  204 non-null    float64 
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 10.0+ KB


In [467]:
# save data for later use
filename_data = data_path + "/questionnaire_LT_data.csv"
df.to_csv(filename_data, index=True)

## 1.4 Merge all data

In [468]:
print("coherence Pair values:", coherence_data['Pair'].unique())
print("coherence_data Group values:", coherence_data['Group'].unique())
print("part_info Pair values:", part_info['Pair'].unique())
print("part_info Group values:", part_info['Group'].unique())

coherence Pair values: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 21 22 23 24 25
 18 26]
coherence_data Group values: ['IC' 'IL' 'NIC' 'NIL']
part_info Pair values: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26]
part_info Group values: ['IC' 'IL' 'NIC' 'NIL']


In [469]:
# merge the two dataframes

all_data = pd.merge(part_info, coherence_data, on=["Group", "Pair"], how="inner")

all_data.head()

Unnamed: 0,Participant,Age,Gender,Group,Pair,Mean_liking_Pre,Mean_liking_Post,Segment,Interval,IFGr_IFGr,IFGl_IFGl,TPJr_TPJr,TPJl_TPJl,IFGl_IFGr,IFGr_TPJr,IFGr_TPJl,IFGl_TPJr,IFGl_TPJl,TPJr_TPJl
0,A,19.0,f,IC,1,8.0,8.0,laughter,1,0.253717,0.215108,,,0.262726,0.258718,0.285384,0.21434,0.219691,
1,A,19.0,f,IC,1,8.0,8.0,laughter,2,0.231923,0.28887,,,0.261279,0.239531,0.248008,0.260857,0.253972,
2,A,19.0,f,IC,1,8.0,8.0,interaction,3,,0.234832,,,0.261954,0.229777,0.24825,0.296894,0.306257,
3,A,19.0,f,IC,1,8.0,8.0,interaction,4,,0.278682,,,,,,0.259312,0.236873,
4,A,21.0,f,IC,2,7.333333,8.0,laughter,1,,0.198435,,,0.229439,,,0.211421,0.229041,


In [470]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786 entries, 0 to 785
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Participant       786 non-null    object  
 1   Age               786 non-null    float64 
 2   Gender            786 non-null    category
 3   Group             786 non-null    object  
 4   Pair              786 non-null    int64   
 5   Mean_liking_Pre   786 non-null    float64 
 6   Mean_liking_Post  786 non-null    float64 
 7   Segment           786 non-null    object  
 8   Interval          786 non-null    int64   
 9   IFGr_IFGr         628 non-null    float64 
 10  IFGl_IFGl         736 non-null    float64 
 11  TPJr_TPJr         494 non-null    float64 
 12  TPJl_TPJl         512 non-null    float64 
 13  IFGl_IFGr         754 non-null    float64 
 14  IFGr_TPJr         682 non-null    float64 
 15  IFGr_TPJl         678 non-null    float64 
 16  IFGl_TPJr         758 non-

### 1.3 Clean data (real data only)

In [471]:
# Set pair, participant and segment as indexes

#all_data.set_index(["Pair", "Participant", "Segment"], inplace=True)
all_data['combined'] = all_data['Group'] + all_data['Pair'].astype(str) + '_' + all_data['Participant'] + '_' + all_data['Interval'].astype(str)

all_data.set_index('combined', inplace=True)

# divide group into two variables: laughter_group (0,1) and interaction_group(0,1)
all_data['laughter_group'] = (all_data['Group'].str.contains('L', case=False, na=False))
all_data['interaction_group'] = (~all_data['Group'].str.contains('N', case=False, na=False))

# drop unnecessary variables
all_data = all_data.drop(['Participant', 'Pair', 'Segment', 'Group', 'Interval'], axis = 1)


# Display the updated DataFrame with multi-index
all_data.sample(5)


Unnamed: 0_level_0,Age,Gender,Mean_liking_Pre,Mean_liking_Post,IFGr_IFGr,IFGl_IFGl,TPJr_TPJr,TPJl_TPJl,IFGl_IFGr,IFGr_TPJr,IFGr_TPJl,IFGl_TPJr,IFGl_TPJl,TPJr_TPJl,laughter_group,interaction_group
combined,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
NIL24_B_4,23.0,f,6.0,5.666667,0.329731,0.264475,0.31438,0.233995,0.322295,0.231246,0.226679,0.26649,0.258251,0.247747,True,False
IC10_A_4,22.0,f,6.0,6.666667,0.203247,0.237688,,,0.216901,0.247509,0.227032,0.30519,0.214944,,False,True
NIC12_A_3,19.0,f,5.666667,6.666667,,0.254039,,,0.330648,,,0.269665,0.357894,,False,False
NIC1_A_3,19.0,f,7.666667,8.0,,0.274304,,,0.404315,0.267253,0.27222,0.282518,0.223234,,False,False
NIC18_A_3,24.0,m,5.333333,6.0,,0.154268,0.243228,0.37647,0.205237,0.175599,0.190539,0.29458,0.331686,0.264672,False,False


In [472]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 786 entries, IC1_A_1 to NIL9_B_4
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Age                786 non-null    float64 
 1   Gender             786 non-null    category
 2   Mean_liking_Pre    786 non-null    float64 
 3   Mean_liking_Post   786 non-null    float64 
 4   IFGr_IFGr          628 non-null    float64 
 5   IFGl_IFGl          736 non-null    float64 
 6   TPJr_TPJr          494 non-null    float64 
 7   TPJl_TPJl          512 non-null    float64 
 8   IFGl_IFGr          754 non-null    float64 
 9   IFGr_TPJr          682 non-null    float64 
 10  IFGr_TPJl          678 non-null    float64 
 11  IFGl_TPJr          758 non-null    float64 
 12  IFGl_TPJl          756 non-null    float64 
 13  TPJr_TPJl          582 non-null    float64 
 14  laughter_group     786 non-null    bool    
 15  interaction_group  786 non-null    bool    
dtypes:

In [473]:
# save data for later use
filename_data = data_path + "/all_LT_data.csv"
all_data.to_csv(filename_data, index=True)

#### 1.3.1 First strategy: eliminate all rows with missing data

In [474]:
# Drop rows where any column contains NaN
all_data = all_data.dropna(how='any')

all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 372 entries, IC3_A_1 to NIL9_B_4
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Age                372 non-null    float64 
 1   Gender             372 non-null    category
 2   Mean_liking_Pre    372 non-null    float64 
 3   Mean_liking_Post   372 non-null    float64 
 4   IFGr_IFGr          372 non-null    float64 
 5   IFGl_IFGl          372 non-null    float64 
 6   TPJr_TPJr          372 non-null    float64 
 7   TPJl_TPJl          372 non-null    float64 
 8   IFGl_IFGr          372 non-null    float64 
 9   IFGr_TPJr          372 non-null    float64 
 10  IFGr_TPJl          372 non-null    float64 
 11  IFGl_TPJr          372 non-null    float64 
 12  IFGl_TPJl          372 non-null    float64 
 13  TPJr_TPJl          372 non-null    float64 
 14  laughter_group     372 non-null    bool    
 15  interaction_group  372 non-null    bool    
dtypes:

In [475]:
all_data.head()

Unnamed: 0_level_0,Age,Gender,Mean_liking_Pre,Mean_liking_Post,IFGr_IFGr,IFGl_IFGl,TPJr_TPJr,TPJl_TPJl,IFGl_IFGr,IFGr_TPJr,IFGr_TPJl,IFGl_TPJr,IFGl_TPJl,TPJr_TPJl,laughter_group,interaction_group
combined,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
IC3_A_1,22.0,f,6.666667,7.0,0.288269,0.303648,0.193177,0.187955,0.304951,0.227554,0.234536,0.300126,0.29544,0.208585,False,True
IC3_A_2,22.0,f,6.666667,7.0,0.361407,0.284197,0.228178,0.232146,0.299744,0.327344,0.291087,0.274232,0.29131,0.214959,False,True
IC3_A_3,22.0,f,6.666667,7.0,0.238468,0.342319,0.190864,0.221397,0.263937,0.22909,0.218176,0.258146,0.250088,0.226531,False,True
IC4_A_1,21.0,m,5.333333,4.333333,0.348762,0.350074,0.285111,0.224902,0.344359,0.261578,0.217713,0.207882,0.188696,0.253347,False,True
IC4_A_2,21.0,m,5.333333,4.333333,0.343636,0.286722,0.221597,0.252993,0.337413,0.335984,0.341463,0.350946,0.324989,0.262662,False,True


In [476]:
all_data.index

Index(['IC3_A_1', 'IC3_A_2', 'IC3_A_3', 'IC4_A_1', 'IC4_A_2', 'IC4_A_3',
       'IC4_A_4', 'IC6_A_1', 'IC6_A_2', 'IC6_A_3',
       ...
       'NIL5_B_1', 'NIL5_B_2', 'NIL5_B_3', 'NIL8_B_1', 'NIL8_B_2', 'NIL8_B_3',
       'NIL9_B_1', 'NIL9_B_2', 'NIL9_B_3', 'NIL9_B_4'],
      dtype='object', name='combined', length=372)

In [477]:
all_data.index.nunique()

372

Potential problems in the data:
- Many missing values for coherence! How to deal with that? Participant exclusion, channel exclusion or inputation?

## 2. Auto ML

### 2.1 Create test and train data

In [478]:
from sklearn.model_selection import train_test_split

# Load data
#data = pd.read_csv(filename_data)

data = all_data

# Shuffle and split the dataset into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)  # 80% train, 20% test

# Check the resulting shapes
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (297, 16)
Test data shape: (75, 16)


### 2.2 AutoGluon regressor

In [479]:
model_path = "Y:\Documents\Projects\LT_machine_learning\Models\Liking_post"  # Replace this with the desired folder path

from autogluon.tabular import TabularPredictor
predictor = TabularPredictor(label="Mean_liking_Post",
                             eval_metric="mean_absolute_percentage_error", path = model_path).fit(train_data, time_limit=500, )

  model_path = "Y:\Documents\Projects\LT_machine_learning\Models\Liking_post"  # Replace this with the desired folder path


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.8
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
Memory Avail:       18.18 GB / 31.71 GB (57.3%)
Disk Space Avail:   6.28 GB / 9.77 GB (64.3%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.

In [None]:
# here is the code if I wanted to pre-set the hyperparameter

from autogluon.tabular import TabularPredictor

# Define custom hyperparameters
hyperparameters = {
    'NN_TORCH': [{}],  # Use the default settings for NN_TORCH
    'GBM': [
        {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},  # Ensure ag_args is a dict
        {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0}},
    ],
    'CAT': [{}],
    'XGB': [{}],
    'FASTAI': [{}],
    'RF': [
        {'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}},
    ],
    'XT': [
        {'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}},
    ],
    'KNN': [
        {'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}},
        {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}
    ],
}

# Now use these hyperparameters when fitting the model
predictor = TabularPredictor(label="joint_action_performance",
                             eval_metric="mean_absolute_percentage_error").fit(
    train_data, time_limit=500, hyperparameters=hyperparameters
)



In [480]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.08035,mean_absolute_percentage_error,0.037034,4.013341,0.0,0.067521,2,True,12
1,KNeighborsDist,-0.08598,mean_absolute_percentage_error,0.020038,0.026292,0.020038,0.026292,1,True,2
2,XGBoost,-0.087699,mean_absolute_percentage_error,0.003994,0.745207,0.003994,0.745207,1,True,9
3,ExtraTreesMSE,-0.092245,mean_absolute_percentage_error,0.043147,0.539078,0.043147,0.539078,1,True,7
4,LightGBMLarge,-0.097686,mean_absolute_percentage_error,0.006003,1.475219,0.006003,1.475219,1,True,11
5,RandomForestMSE,-0.102782,mean_absolute_percentage_error,0.052939,0.523771,0.052939,0.523771,1,True,5
6,CatBoost,-0.108962,mean_absolute_percentage_error,0.000968,1.163818,0.000968,1.163818,1,True,6
7,KNeighborsUnif,-0.110363,mean_absolute_percentage_error,0.02176,0.032705,0.02176,0.032705,1,True,1
8,NeuralNetTorch,-0.110431,mean_absolute_percentage_error,0.004547,1.682604,0.004547,1.682604,1,True,10
9,NeuralNetFastAI,-0.111833,mean_absolute_percentage_error,0.006999,1.699102,0.006999,1.699102,1,True,8


### 2.3 Test the model

In [481]:
from autogluon.tabular import TabularPredictor
pred = TabularPredictor.load(model_path)

In [482]:
preds = pred.predict( test_data, model="WeightedEnsemble_L2" )
preds

combined
NIL16_B_2    4.823733
IC17_A_1     7.470622
IC9_A_3      5.984307
NIL10_B_2    6.400978
IL6_A_2      5.716728
               ...   
IL25_A_4     6.316867
IC6_B_3      6.856371
NIC8_B_1     4.278941
NIC21_B_2    6.254253
NIL23_B_2    6.346714
Name: Mean_liking_Post, Length: 75, dtype: float32

In [483]:
metrics_data = pred.evaluate_predictions(test_data["Mean_liking_Post"], preds)
metrics_data

{'mean_absolute_percentage_error': -0.09619269072108302,
 'root_mean_squared_error': -0.7096959947942022,
 'mean_squared_error': -0.5036684050269323,
 'mean_absolute_error': -0.5470707257588704,
 'r2': 0.5147264577403995,
 'pearsonr': 0.7311660230432856,
 'median_absolute_error': -0.4890875816345215}

### 2.4 Visualize the model

In [484]:
from autogluon.tabular import TabularPredictor
predictor = TabularPredictor.load(model_path)
path_to_png = predictor.plot_ensemble_model()
path_to_png

'Y:\\Documents\\Projects\\LT_machine_learning\\Models\\Liking_post\\ensemble_model.png'

## 3. Calculate feature importance

In [486]:
feature_importances = pred.feature_importance(test_data, model="WeightedEnsemble_L2", num_shuffle_sets=10)
feature_importances

Computing feature importance via permutation shuffling for 15 features using 75 rows with 10 shuffle sets...
	161.88s	= Expected runtime (16.19s per shuffle set)
	4.21s	= Actual runtime (Completed 10 of 10 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Mean_liking_Pre,0.085444,0.015024,1.155762e-08,10,0.100884,0.070005
Age,0.020151,0.004202,5.124614e-08,10,0.024469,0.015833
TPJr_TPJl,0.002461,0.00105,2.026007e-05,10,0.00354,0.001382
IFGl_TPJl,0.001655,0.001959,0.01278493,10,0.003669,-0.000358
interaction_group,0.001526,0.000698,3.471859e-05,10,0.002243,0.000809
laughter_group,0.000697,0.000386,0.0001457082,10,0.001094,0.0003
IFGr_TPJl,0.000458,0.000795,0.05077457,10,0.001275,-0.000358
IFGr_IFGr,0.000357,0.001472,0.2311222,10,0.00187,-0.001155
Gender,0.000167,0.001013,0.3072876,10,0.001208,-0.000874
IFGl_IFGl,-0.000437,0.001223,0.856099,10,0.00082,-0.001694


##