In [1]:
import os
import shap

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# get data from all 3 datasets
# Each per-dataset frame is collected in a list and concatenated once after the
# loop; concatenating onto an initially empty DataFrame inside the loop copies
# the accumulated data on every iteration and triggers pandas' empty-entry
# concat FutureWarning.
feature_frames = []
ema_frames = []
bfi_frames = []

for i in range(2, 5):
    curr_folder = './data/globem/INS-W_'+str(i)
    # keep the id columns plus every normalized 7-day-history feature column
    rapids_df = pd.read_csv(
        curr_folder+'/FeatureData/rapids.csv',
        usecols=lambda col: (col.endswith('7dhist') and 'norm' in col) or col in ('pid', 'date'),
    )
    feature_frames.append(rapids_df)

    ema_df = pd.read_csv(curr_folder+'/SurveyData/ema.csv')
    ema_frames.append(ema_df)

    bfi_df = pd.read_csv(curr_folder+'/SurveyData/pre.csv')[['pid', 'date',
                                    'BFI10_extroversion_PRE',
                                    'BFI10_agreeableness_PRE',
                                    'BFI10_conscientiousness_PRE',
                                    'BFI10_neuroticism_PRE',
                                    'BFI10_openness_PRE']]
    bfi_frames.append(bfi_df)

full_feature_data = pd.concat(feature_frames, ignore_index=True)
full_ema_data = pd.concat(ema_frames, ignore_index=True)
full_bfi_data = pd.concat(bfi_frames, ignore_index=True)

print(len(full_feature_data))
# NOTE: the row index is written as an extra unnamed first column; readers of
# these files must select the columns they need by name.
full_feature_data.to_csv('merged_unfiltered_feature_data.csv')
full_ema_data.to_csv('merged_unfiltered_ema_data.csv')
full_bfi_data.to_csv('merged_unfiltered_bfi_data.csv')

55342


In [3]:
# merge wrt pid and date
# Reload the survey files written by the loading cell, keep only the columns
# needed here, and discard rows with any missing survey value.
ema_columns = ['pid', 'date', 'positive_affect_EMA', 'negative_affect_EMA']
bfi_columns = [
    'pid',
    'BFI10_extroversion_PRE',
    'BFI10_agreeableness_PRE',
    'BFI10_conscientiousness_PRE',
    'BFI10_neuroticism_PRE',
    'BFI10_openness_PRE',
]

ema_data = pd.read_csv('merged_unfiltered_ema_data.csv')[ema_columns]
clean_ema_data = ema_data.dropna()
bfi_data = pd.read_csv('merged_unfiltered_bfi_data.csv')[bfi_columns]
clean_bfi_data = bfi_data.dropna()

# Features are matched to EMA answers per participant and day, then the BFI
# scores are attached per participant.
# NOTE(review): if a pid occurs more than once in the BFI file, the second
# merge repeats that participant's rows — confirm pids are unique.
merged_data = full_feature_data.merge(clean_ema_data, on=['pid', 'date'], how='inner')
merged_data = merged_data.merge(clean_bfi_data, on=['pid'], how='inner')

n_rows, n_columns = merged_data.shape
print(n_columns)
print(n_rows)

215
10618


In [4]:
# no longer need pid and date
# The identifiers were only needed as merge keys; drop them so that only
# feature and target columns remain.
identifier_columns = ['pid', 'date']
merged_data = merged_data.drop(columns=identifier_columns)
print(merged_data.shape[1])

213


In [5]:
# drop string columns, can come back to this and one hot encode these but doesnt seem worth it
# A column is dropped as soon as at least one of its values is a Python str.
string_columns = [
    name
    for name in merged_data.columns
    if merged_data[name].apply(lambda value: isinstance(value, str)).any()
]
merged_data = merged_data.drop(columns=string_columns)
print(merged_data.shape[1])

212


In [6]:
# impute wrt nn for NaN data
# Each missing value is filled from the single nearest row (n_neighbors=1).
numeric_data = merged_data.select_dtypes(include=['number'])
imputer = KNNImputer(n_neighbors=1)
# Fit on exactly the numeric columns that are written back below. Fitting on
# the whole frame only works while every column happens to be numeric;
# otherwise the imputer fails or its output no longer lines up with
# numeric_data.columns.
imputed_data = imputer.fit_transform(numeric_data)
merged_data[numeric_data.columns] = imputed_data
# NOTE(review): the EMA target columns are still part of merged_data at this
# point, so they take part in the nearest-neighbour search, and the imputer is
# fit on all rows before any cross-validation split — confirm this is intended.

In [7]:
# normalize bfi data
# Rescale each of the five BFI-10 trait columns to the [0, 1] range.
bfi_traits = ('extroversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness')
columns_to_normalize = [f'BFI10_{trait}_PRE' for trait in bfi_traits]

scaler = MinMaxScaler()
bfi_values = merged_data[columns_to_normalize]
scaler.fit(bfi_values)
merged_data[columns_to_normalize] = scaler.transform(bfi_values)

In [8]:
# make sure no na at this point
# Evaluates to True if any cell anywhere in the frame is still missing.
merged_data.isna().to_numpy().any()

np.False_

In [9]:
# Persist the imputed, BFI-scaled frame (features and both EMA targets).
imputed_data_path = 'merged_knn_imputed_filtered_data_panas.csv'
merged_data.to_csv(imputed_data_path)

In [10]:
# split into our feature and target variables
target_columns = ['positive_affect_EMA', 'negative_affect_EMA']
bfi_feature_columns = [
    'BFI10_extroversion_PRE',
    'BFI10_agreeableness_PRE',
    'BFI10_conscientiousness_PRE',
    'BFI10_neuroticism_PRE',
    'BFI10_openness_PRE',
]

# Targets are pulled out first, then removed from the feature frame.
pos_y = merged_data[target_columns[0]]
neg_y = merged_data[target_columns[1]]
merged_data = merged_data.drop(columns=target_columns)
# Same features without the personality scores, for the with/without comparison.
merged_data_no_bfi = merged_data.drop(columns=bfi_feature_columns)

In [22]:
# get statistically relevant features for neg, with bfi and without bfi
# A GLM is fit on all features and only the columns whose coefficient p-value
# is below 0.05 are kept.
# NOTE(review): no `family` is passed to sm.GLM, so statsmodels' default family
# (Gaussian) is used despite the `gamma_*` variable names, and no intercept
# column is added to the design matrix — confirm both are intended.
gamma_model = sm.GLM(neg_y, merged_data)
gamma_results = gamma_model.fit()
p_values = gamma_results.pvalues
significant_columns = p_values[p_values < 0.05].index
new_merged_data_neg = merged_data[significant_columns]

gamma_model = sm.GLM(neg_y, merged_data_no_bfi)
gamma_results = gamma_model.fit()
p_values = gamma_results.pvalues
significant_columns = p_values[p_values < 0.05].index
new_merged_data_no_bfi_neg = merged_data_no_bfi[significant_columns]
# Printed only after the frame exists: printing it before this assignment
# raises NameError on a fresh kernel (it previously relied on a stale variable
# from an earlier run).
print(new_merged_data_no_bfi_neg.columns)

Index(['f_loc:phone_locations_doryab_normalizedlocationentropy:7dhist',
       'f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain_norm:7dhist',
       'f_slp:fitbit_sleep_summary_rapids_sumdurationawakemain_norm:7dhist',
       'f_slp:fitbit_sleep_summary_rapids_sumdurationinbedmain_norm:7dhist',
       'f_slp:fitbit_sleep_summary_rapids_avgdurationawakemain_norm:7dhist',
       'f_slp:fitbit_sleep_summary_rapids_countepisodemain_norm:7dhist',
       'f_slp:fitbit_sleep_summary_rapids_lastbedtimemain_norm:7dhist',
       'f_slp:fitbit_sleep_intraday_rapids_maxdurationawakeunifiedmain_norm:7dhist',
       'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain_norm:7dhist',
       'f_slp:fitbit_sleep_intraday_rapids_countepisodeawakeunifiedmain_norm:7dhist',
       'f_slp:fitbit_sleep_intraday_rapids_ratiocountasleepunifiedwithinmain_norm:7dhist',
       'f_slp:fitbit_sleep_intraday_rapids_ratiocountawakeunifiedwithinmain_norm:7dhist',
       'f_call:phone_calls_rapids_in

In [12]:
# get columns in one but not the other
# Compare the features selected without BFI against those selected with BFI
# for the negative-affect target.
no_bfi_neg_columns = set(new_merged_data_no_bfi_neg.columns)
with_bfi_neg_columns = set(new_merged_data_neg.columns)

columns_in_df1_not_in_df2 = no_bfi_neg_columns - with_bfi_neg_columns
columns_in_df2_not_in_df1 = with_bfi_neg_columns - no_bfi_neg_columns

print("Columns only in not BFI predicting neg panas:")
for column_name in columns_in_df1_not_in_df2:
    print(f"  - {column_name}")

print('---------')
print("Columns only in BFI predicting neg panas:")
for column_name in columns_in_df2_not_in_df1:
    print(f"  - {column_name}")

Columns only in not BFI predicting neg panas:
  - f_screen:phone_screen_rapids_sumdurationunlock_locmap_home_norm:7dhist
  - f_steps:fitbit_steps_summary_rapids_minsumsteps_norm:7dhist
  - f_screen:phone_screen_rapids_sumdurationunlock_locmap_study_norm:7dhist
  - f_loc:phone_locations_doryab_loglocationvariance_norm:7dhist
  - f_steps:fitbit_steps_intraday_rapids_maxdurationactivebout_norm:7dhist
  - f_steps:fitbit_steps_intraday_rapids_maxdurationsedentarybout_norm:7dhist
  - f_blue:phone_bluetooth_doryab_countscansmostfrequentdeviceacrosssegmentsothers_norm:7dhist
  - f_call:phone_calls_rapids_outgoing_sumduration_norm:7dhist
  - f_loc:phone_locations_doryab_timeathome_norm:7dhist
  - f_slp:fitbit_sleep_summary_rapids_lastbedtimemain_norm:7dhist
  - f_loc:phone_locations_doryab_locationentropy_norm:7dhist
  - f_loc:phone_locations_barnett_siglocentropy_norm:7dhist
  - f_loc:phone_locations_doryab_timeattop2location_norm:7dhist
  - f_blue:phone_bluetooth_doryab_countscansmostfrequent

In [13]:
# get statistically relevant features for pos, with bfi and without bfi
# Same selection as for the negative target: fit a GLM on all features and
# keep the columns whose coefficient p-value is below 0.05.

# with BFI columns
gamma_model = sm.GLM(pos_y, merged_data)
gamma_results = gamma_model.fit()
p_values = gamma_results.pvalues
significant_columns = p_values.index[(p_values < 0.05).to_numpy()]
new_merged_data_pos = merged_data.loc[:, significant_columns]

# without BFI columns
gamma_model = sm.GLM(pos_y, merged_data_no_bfi)
gamma_results = gamma_model.fit()
p_values = gamma_results.pvalues
significant_columns = p_values.index[(p_values < 0.05).to_numpy()]
new_merged_data_no_bfi_pos = merged_data_no_bfi.loc[:, significant_columns]

In [14]:
# get columns in one but not the other
# Compare the features selected without BFI against those selected with BFI
# for the positive-affect target.
columns_in_df1_not_in_df2 = set(new_merged_data_no_bfi_pos.columns) - set(new_merged_data_pos.columns)

columns_in_df2_not_in_df1 = set(new_merged_data_pos.columns) - set(new_merged_data_no_bfi_pos.columns)

print("Columns only in not BFI predicting pos panas:")
for col in columns_in_df1_not_in_df2:
    print(f"  - {col}")

print('---------')
# This second list holds the columns selected only when BFI is included; the
# heading previously repeated "not BFI" from the first list.
print("Columns only in BFI predicting pos panas:")
for col in columns_in_df2_not_in_df1:
    print(f"  - {col}")

Columns only in not BFI predicting pos panas:
  - f_loc:phone_locations_doryab_homelabel_norm:7dhist
  - f_loc:phone_locations_doryab_numberofsignificantplaces_norm:7dhist
  - f_call:phone_calls_rapids_outgoing_modeduration_norm:7dhist
  - f_call:phone_calls_rapids_outgoing_meanduration_norm:7dhist
  - f_loc:phone_locations_doryab_avglengthstayatclusters_norm:7dhist
  - f_screen:phone_screen_rapids_mindurationunlock_locmap_study_norm:7dhist
  - f_blue:phone_bluetooth_doryab_countscansleastfrequentdeviceacrossdatasetall_norm:7dhist
  - f_slp:fitbit_sleep_summary_rapids_sumdurationawakemain_norm:7dhist
  - f_blue:phone_bluetooth_doryab_countscansmostfrequentdeviceacrosssegmentsothers_norm:7dhist
  - f_loc:phone_locations_barnett_maxdiam_norm:7dhist
  - f_wifi:phone_wifi_connected_rapids_countscansmostuniquedevice_norm:7dhist
  - f_loc:phone_locations_barnett_rog_norm:7dhist
  - f_slp:fitbit_sleep_summary_rapids_lastbedtimemain_norm:7dhist
  - f_loc:phone_locations_doryab_avgspeed_norm:7d

In [15]:
# Shared cross-validation setup for the model cells below.
# 5 shuffled folds with a fixed seed so every model is scored on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# greater_is_better=False makes the scorer report negated MSE, so the scores
# printed by the model cells are negative.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

In [16]:
# Baseline: MSE of predicting the mean negative-affect score for every row.
neg_mean_prediction = np.full(len(neg_y), np.mean(neg_y))
print(mean_squared_error(neg_mean_prediction, neg_y))

12.008874462706526


In [17]:
# no bfi panas negative
# Grid-search a random forest on the GLM-selected features (without BFI) for
# the negative-affect target, then re-score the best configuration.

# Fixed seed for the forests: without it the grid-search winner and all
# reported scores change from run to run.
rf_seed = 42
rf = RandomForestRegressor(random_state=rf_seed)
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=kf,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(new_merged_data_no_bfi_neg, neg_y)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

best_params = grid_search.best_params_
final_rf_model = RandomForestRegressor(**best_params, random_state=rf_seed)
# Fitted on all rows so a trained model is available after this cell;
# cross_val_score below works on clones and does not use this fit.
final_rf_model.fit(new_merged_data_no_bfi_neg, neg_y)
final_cv_scores = cross_val_score(
    final_rf_model,
    new_merged_data_no_bfi_neg,
    neg_y,
    cv=kf,
    scoring=mse_scorer
)

print("Final model cross-validated scores:", final_cv_scores)
print("Final model mean score:", np.mean(final_cv_scores))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 35}
Best cross-validated score: -9.949285375948776
Final model cross-validated scores: [-10.64256455  -9.57779819  -9.99539997  -9.89220854 -10.40540222]
Final model mean score: -10.102674694004687


In [18]:
# with bfi panas negative
# Grid-search a random forest on the GLM-selected features (including BFI) for
# the negative-affect target, then re-score the best configuration.

# Fixed seed for the forests: without it the grid-search winner and all
# reported scores change from run to run.
rf_seed = 42
rf = RandomForestRegressor(random_state=rf_seed)
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=kf,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(new_merged_data_neg, neg_y)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

best_params = grid_search.best_params_
final_rf_model = RandomForestRegressor(**best_params, random_state=rf_seed)
# Fitted on all rows so a trained model is available after this cell;
# cross_val_score below works on clones and does not use this fit.
final_rf_model.fit(new_merged_data_neg, neg_y)
final_cv_scores = cross_val_score(
    final_rf_model,
    new_merged_data_neg,
    neg_y,
    cv=kf,
    scoring=mse_scorer
)

print("Final model cross-validated scores:", final_cv_scores)
print("Final model mean score:", np.mean(final_cv_scores))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 35}
Best cross-validated score: -7.866726324760887
Final model cross-validated scores: [-8.41396752 -7.41902648 -8.17363598 -7.56772328 -7.60103616]
Final model mean score: -7.835077882364585


In [19]:
# Baseline: MSE of predicting the mean positive-affect score for every row.
pos_mean_prediction = np.full(len(pos_y), np.mean(pos_y))
print(mean_squared_error(pos_mean_prediction, pos_y))

22.479801143006924


In [20]:
# no bfi panas positive
# Grid-search a random forest on the GLM-selected features (without BFI) for
# the positive-affect target, then re-score the best configuration.

# Fixed seed for the forests: without it the grid-search winner and all
# reported scores change from run to run.
rf_seed = 42
rf = RandomForestRegressor(random_state=rf_seed)
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=kf,
    verbose=1,
    n_jobs=-1  # Use all available cores
)
grid_search.fit(new_merged_data_no_bfi_pos, pos_y)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

best_params = grid_search.best_params_
final_rf_model = RandomForestRegressor(**best_params, random_state=rf_seed)
# Fitted on all rows so a trained model is available after this cell;
# cross_val_score below works on clones and does not use this fit.
final_rf_model.fit(new_merged_data_no_bfi_pos, pos_y)
final_cv_scores = cross_val_score(
    final_rf_model,
    new_merged_data_no_bfi_pos,
    pos_y,
    cv=kf,
    scoring=mse_scorer
)

print("Final model cross-validated scores:", final_cv_scores)
print("Final model mean score:", np.mean(final_cv_scores))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 40}
Best cross-validated score: -16.805635714862916
Final model cross-validated scores: [-17.52383076 -16.48733869 -16.41281191 -16.75352727 -16.69563483]
Final model mean score: -16.774628689555232


In [21]:
# with bfi panas positive
# Grid-search a random forest on the GLM-selected features (including BFI) for
# the positive-affect target, then re-score the best configuration.

# Fixed seed for the forests: without it the grid-search winner and all
# reported scores change from run to run.
rf_seed = 42
rf = RandomForestRegressor(random_state=rf_seed)
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=kf,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(new_merged_data_pos, pos_y)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

best_params = grid_search.best_params_
final_rf_model = RandomForestRegressor(**best_params, random_state=rf_seed)
# Fitted on all rows so a trained model is available after this cell;
# cross_val_score below works on clones and does not use this fit.
final_rf_model.fit(new_merged_data_pos, pos_y)
final_cv_scores = cross_val_score(
    final_rf_model,
    new_merged_data_pos,
    pos_y,
    cv=kf,
    scoring=mse_scorer
)

print("Final model cross-validated scores:", final_cv_scores)
print("Final model mean score:", np.mean(final_cv_scores))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits




Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}
Best cross-validated score: -10.821474541110614
Final model cross-validated scores: [-11.11672084 -10.36566076 -10.83627607 -11.33154764 -10.8128228 ]
Final model mean score: -10.892605625298817
