In [50]:
from datasets import load_from_disk
import os
from copy import deepcopy
from collections import Counter
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, ks_2samp

In [3]:
# Load the five v6.2 dataset variants in one go; the tuple of paths is in the
# same order as the names on the left-hand side.
full_data, dev_data, dev_rand_data, dev_norm_data, dev_idio_data = map(
    load_from_disk,
    (
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max90days_v6_60combined_5fold_oots_shuffled_v6.2",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_devset_oots_shuffled_v6.2",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_randomized_oots_shuffled_v6.2",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_normative_oots_shuffled_v6.2",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_idio_oots_shuffled_v6.2",
    ),
)

In [4]:
full_data

Dataset({
    features: ['seq_id', 'time_ids', 'embeddings_subscales', 'mask_subscales', 'embeddings_lang', 'mask_lang', 'outcomes', 'outcomes_mask', 'orig_time_ids', 'folds', 'oots_mask', 'embeddings_subscales_z', 'embeddings_lang_z'],
    num_rows: 258
})

In [5]:
dev_data

Dataset({
    features: ['seq_id', 'time_ids', 'embeddings_subscales', 'mask_subscales', 'embeddings_lang', 'mask_lang', 'outcomes', 'outcomes_mask', 'orig_time_ids', 'folds', 'oots_mask', 'embeddings_subscales_z', 'embeddings_lang_z', 'lastobserved_subscales', 'lastobserved_lang'],
    num_rows: 207
})

In [6]:
dev_rand_data

Dataset({
    features: ['seq_id', 'time_ids', 'embeddings_subscales', 'mask_subscales', 'embeddings_lang', 'mask_lang', 'outcomes', 'outcomes_mask', 'orig_time_ids', 'folds', 'oots_mask', 'embeddings_subscales_z', 'embeddings_lang_z', 'lastobserved_subscales', 'lastobserved_lang'],
    num_rows: 207
})

In [7]:
dev_norm_data

Dataset({
    features: ['seq_id', 'time_ids', 'embeddings_subscales', 'mask_subscales', 'embeddings_lang', 'mask_lang', 'outcomes', 'outcomes_mask', 'orig_time_ids', 'folds', 'oots_mask', 'embeddings_subscales_z', 'embeddings_lang_z', 'lastobserved_subscales', 'lastobserved_lang'],
    num_rows: 207
})

In [8]:
dev_idio_data

Dataset({
    features: ['seq_id', 'time_ids', 'embeddings_subscales', 'mask_subscales', 'embeddings_lang', 'mask_lang', 'outcomes', 'outcomes_mask', 'orig_time_ids', 'folds', 'oots_mask', 'embeddings_subscales_z', 'embeddings_lang_z', 'lastobserved_subscales', 'lastobserved_lang'],
    num_rows: 207
})

In [9]:
from datasets import Dataset, DatasetDict, concatenate_datasets
def get_datasetDict(train_data, val_data=None, test_data=None, val_folds:list=None, test_folds:list=None, fold_col:str='folds'):
    """
        Returns a Huggingface datasets.DatasetDict with a 'train' split and optional 'val'/'test' splits.

        Args:
            train_data, val_data, test_data: a datasets.Dataset, or a plain dict that is converted
                with Dataset.from_dict. Splits passed as None are omitted.
            val_folds: values of `fold_col`; rows of 'train' whose fold is in this collection are
                moved into 'val' (replacing any `val_data`). Requires a 'train' split.
            test_folds: same as `val_folds`, but rows are moved into 'test'. Applied after
                `val_folds`, so rows already moved to 'val' are not considered.
            fold_col: name of the fold-assignment column; it is dropped from every split that has it.

        Returns:
            DatasetDict in which every split carries:
              - 'ooss_mask': 0 for rows that come from 'train', 1 for held-out rows;
              - 'time_ids': kept if already present, otherwise 0..len(outcomes_mask)-1 per row.
            When both 'train' and 'val' exist, 'val' holds the 'train' rows followed by the
            held-out validation rows.
    """

    def _as_dataset(data):
        """Convert a plain dict to a Dataset; pass Dataset objects through unchanged."""
        return Dataset.from_dict(data) if isinstance(data, dict) else data

    def _tag_split(split, ooss_flag):
        """Drop the fold column (when present) and add a constant 'ooss_mask' column."""
        if fold_col in split.features:
            split = split.remove_columns(fold_col)
        # len(split) is the row count, so no column has to be materialised just to measure it
        return split.add_column('ooss_mask', [ooss_flag] * len(split))

    def create_default_time_ids(instance):
        """
            Creates a default time_ids for the instance assuming no breaks in timestep
        """
        instance['time_ids'] = list(range(len(instance['outcomes_mask'])))
        return instance

    datasetDict = DatasetDict()
    if train_data is not None: datasetDict['train'] = _as_dataset(train_data)
    if val_data is not None: datasetDict['val'] = _as_dataset(val_data)
    if test_data is not None: datasetDict['test'] = _as_dataset(test_data)

    if val_folds is not None:
        val_folds = set(val_folds)
        datasetDict['val'] = datasetDict['train'].filter(lambda example: example[fold_col] in val_folds)
        datasetDict['train'] = datasetDict['train'].filter(lambda example: example[fold_col] not in val_folds)

    if test_folds is not None:
        test_folds = set(test_folds)
        datasetDict['test'] = datasetDict['train'].filter(lambda example: example[fold_col] in test_folds)
        datasetDict['train'] = datasetDict['train'].filter(lambda example: example[fold_col] not in test_folds)

    # Note: ooss_mask indicates whether a sequence is out of sample or not
    # Hence all the train sequences will have ooss_mask as 0
    # All the held-out val and test sequences will have ooss_mask as 1
    has_train = 'train' in datasetDict
    if has_train:
        datasetDict['train'] = _tag_split(datasetDict['train'], 0)

    if 'val' in datasetDict:
        datasetDict['val'] = _tag_split(datasetDict['val'], 1)
        if has_train:
            # 'val' = train rows (ooss_mask 0) followed by the held-out rows (ooss_mask 1)
            datasetDict['val'] = concatenate_datasets([datasetDict['train'], datasetDict['val']])

    if 'test' in datasetDict:
        datasetDict['test'] = _tag_split(datasetDict['test'], 1)

    for dataset_name in datasetDict:
        if 'time_ids' not in datasetDict[dataset_name].features:
            datasetDict[dataset_name] = datasetDict[dataset_name].map(create_default_time_ids)

    return datasetDict

In [10]:
def calc_traintest_instances(dataset):
    """
        Counts timesteps per (sequence, time) bucket.

        Buckets are named by two flags:
            - 'ws' / 'os': the row's 'ooss_mask' is 0 / 1
            - 'wt' / 'ot': the timestep's 'oots_mask' is 0 / anything else
        giving the keys 'wswt', 'wsot', 'oswt', 'osot' (always in that order).
        Rows whose 'ooss_mask' is neither 0 nor 1 are ignored.

        Args:
            dataset: a datasets.Dataset, or any iterable of row mappings, with the keys
                'ooss_mask' (scalar), 'oots_mask' (one entry per timestep) and
                'outcomes_mask' (one indexable entry per timestep).

        Returns:
            (total_metadata, valid_metadata): two dicts keyed by bucket name.
            total_metadata counts every timestep; valid_metadata counts only the
            timesteps whose first 'outcomes_mask' entry equals 1.
    """
    cols = ['ooss_mask', 'outcomes_mask', 'oots_mask']
    # Drop the columns we do not read when the container supports it, so that
    # iterating a Dataset does not decode unrelated (possibly large) columns.
    if hasattr(dataset, 'select_columns'):
        dataset = dataset.select_columns(cols)

    buckets = ('wswt', 'wsot', 'oswt', 'osot')
    total_metadata = dict.fromkeys(buckets, 0)
    valid_metadata = dict.fromkeys(buckets, 0)

    # Single pass over the rows instead of two filtered copies with duplicated loops.
    for row in dataset:
        if row['ooss_mask'] == 0:
            seq_prefix = 'ws'
        elif row['ooss_mask'] == 1:
            seq_prefix = 'os'
        else:
            continue

        for oots_mask, outcomes_mask in zip(row['oots_mask'], row['outcomes_mask']):
            bucket = seq_prefix + ('wt' if oots_mask == 0 else 'ot')
            total_metadata[bucket] += 1
            # Only the first entry of the per-timestep outcomes mask decides validity.
            valid_metadata[bucket] += int(outcomes_mask[0] == 1)

    return total_metadata, valid_metadata

In [11]:
# Bucket counts for each currently loaded dataset variant.
# With val_folds=[k] the 'val' split (train rows + held-out fold k) is inspected;
# with val_folds=[] nothing is held out, so the 'train' split is inspected.
meta_cols = ['folds', 'oots_mask', 'outcomes_mask', 'seq_id']

# full data: fold 4 held out
temp_datasetDict = get_datasetDict(full_data.select_columns(meta_cols), val_folds=[4])
full_totmeta, full_validmeta = calc_traintest_instances(temp_datasetDict['val'])

# dev set: fold 0 held out
temp_datasetDict = get_datasetDict(dev_data.select_columns(meta_cols), val_folds=[0])
dev_totmeta, dev_validmeta = calc_traintest_instances(temp_datasetDict['val'])

# randomized dev set: no fold held out
temp_rand_datasetDict = get_datasetDict(dev_rand_data.select_columns(meta_cols), val_folds=[])
devrand_totmeta, devrand_validmeta = calc_traintest_instances(temp_rand_datasetDict['train'])

# normative dev set: fold 1 held out
temp_datasetDict = get_datasetDict(dev_norm_data.select_columns(meta_cols), val_folds=[1])
devnorm_totmeta, devnorm_validmeta = calc_traintest_instances(temp_datasetDict['val'])

# idio dev set: no fold held out
temp_datasetDict = get_datasetDict(dev_idio_data.select_columns(meta_cols), val_folds=[])
devidio_totmeta, devidio_validmeta = calc_traintest_instances(temp_datasetDict['train'])

Filter:   0%|          | 0/258 [00:00<?, ? examples/s]

Filter:   0%|          | 0/258 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/207 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Flattening the indices:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Filter:   0%|          | 0/258 [00:00<?, ? examples/s]

Filter:   0%|          | 0/258 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/155 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/52 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/104 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

In [12]:
# One row per dataset variant, one column per bucket (wswt / wsot / oswt / osot).
variant_names = ['full', 'dev', 'dev_rand', 'dev_norm', 'dev_idio']
total_meta_df = pd.DataFrame(
    [full_totmeta, dev_totmeta, devrand_totmeta, devnorm_totmeta, devidio_totmeta],
    index=variant_names,
)
valid_meta_df = pd.DataFrame(
    [full_validmeta, dev_validmeta, devrand_validmeta, devnorm_validmeta, devidio_validmeta],
    index=variant_names,
)

# Evaluation timesteps are every bucket except 'wswt'. The buckets are selected by
# name (not by position) so the sum does not depend on the column order of the dicts.
eval_buckets = ['wsot', 'oswt', 'osot']
total_meta_df['total_eval'] = total_meta_df[eval_buckets].sum(axis=1)
valid_meta_df['total_eval'] = valid_meta_df[eval_buckets].sum(axis=1)

In [13]:
total_meta_df

Unnamed: 0,wswt,wsot,oswt,osot,total_eval
full,12213,6416,3009,1573,10998
dev,6200,3100,2080,1040,6220
dev_rand,7435,4985,0,0,4985
dev_norm,6240,0,6180,0,6180
dev_idio,6624,5796,0,0,5796


In [14]:
# Share of all counted timesteps that fall in an evaluation bucket (total_eval) rather than in 'wswt'.
total_meta_df['total_eval']/(total_meta_df['total_eval'] + total_meta_df['wswt'])

full        0.473827
dev         0.500805
dev_rand    0.401369
dev_norm    0.497585
dev_idio    0.466667
dtype: float64

In [15]:
valid_meta_df

Unnamed: 0,wswt,wsot,oswt,osot,total_eval
full,10299,5283,2529,1300,9112
dev,5269,2620,1705,875,5200
dev_rand,5484,4985,0,0,4985
dev_norm,5314,0,5155,0,5155
dev_idio,5583,4886,0,0,4886


In [16]:
# Same share as for the totals, but restricted to the timesteps counted as valid.
valid_meta_df['total_eval']/(valid_meta_df['total_eval'] + valid_meta_df['wswt'])

full        0.469425
dev         0.496705
dev_rand    0.476168
dev_norm    0.492406
dev_idio    0.466711
dtype: float64

In [17]:
# Fraction of 'wswt' timesteps that are counted as valid, per dataset variant.
valid_meta_df['wswt']/total_meta_df['wswt']

full        0.843282
dev         0.849839
dev_rand    0.737592
dev_norm    0.851603
dev_idio    0.842844
Name: wswt, dtype: float64

In [18]:
# Fraction of evaluation timesteps (total_eval) that are counted as valid, per dataset variant.
valid_meta_df['total_eval']/total_meta_df['total_eval']

full        0.828514
dev         0.836013
dev_rand    1.000000
dev_norm    0.834142
dev_idio    0.842995
Name: total_eval, dtype: float64

## V6.3 data

In [35]:
# Re-bind the five dataset names to their v6.3 counterparts; the tuple of paths
# is in the same order as the names on the left-hand side.
full_data, dev_data, dev_rand_data, dev_norm_data, dev_idio_data = map(
    load_from_disk,
    (
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max90days_v6_60combined_5fold_oots_shuffled_v6.3",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_devset_oots_shuffled_v6.3",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_randomized_oots_shuffled_v6.3",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_normative_oots_shuffled_v6.3",
        "/cronus_data/avirinchipur/ptsd_stop/forecasting/datasets/PCLsubscales_selfreportZ_roberta_laL23Z_merged_PCL_1_days_ahead_reset_time2zero2_max60days_v6_40combined_idio_oots_shuffled_v6.3",
    ),
)

In [36]:
# Bucket counts for each currently loaded dataset variant.
# With val_folds=[k] the 'val' split (train rows + held-out fold k) is inspected;
# with val_folds=[] nothing is held out, so the 'train' split is inspected.
meta_cols = ['folds', 'oots_mask', 'outcomes_mask', 'seq_id']

# full data: fold 4 held out
temp_datasetDict = get_datasetDict(full_data.select_columns(meta_cols), val_folds=[4])
full_totmeta, full_validmeta = calc_traintest_instances(temp_datasetDict['val'])

# dev set: fold 0 held out
temp_datasetDict = get_datasetDict(dev_data.select_columns(meta_cols), val_folds=[0])
dev_totmeta, dev_validmeta = calc_traintest_instances(temp_datasetDict['val'])

# randomized dev set: no fold held out
temp_datasetDict = get_datasetDict(dev_rand_data.select_columns(meta_cols), val_folds=[])
devrand_totmeta, devrand_validmeta = calc_traintest_instances(temp_datasetDict['train'])

# normative dev set: fold 1 held out
temp_datasetDict = get_datasetDict(dev_norm_data.select_columns(meta_cols), val_folds=[1])
devnorm_totmeta, devnorm_validmeta = calc_traintest_instances(temp_datasetDict['val'])

# idio dev set: no fold held out
temp_datasetDict = get_datasetDict(dev_idio_data.select_columns(meta_cols), val_folds=[])
devidio_totmeta, devidio_validmeta = calc_traintest_instances(temp_datasetDict['train'])

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


In [37]:
# One row per dataset variant, one column per bucket (wswt / wsot / oswt / osot).
variant_names = ['full', 'dev', 'dev_rand', 'dev_norm', 'dev_idio']
total_meta_df = pd.DataFrame(
    [full_totmeta, dev_totmeta, devrand_totmeta, devnorm_totmeta, devidio_totmeta],
    index=variant_names,
)
valid_meta_df = pd.DataFrame(
    [full_validmeta, dev_validmeta, devrand_validmeta, devnorm_validmeta, devidio_validmeta],
    index=variant_names,
)

# Evaluation timesteps are every bucket except 'wswt'. The buckets are selected by
# name (not by position) so the sum does not depend on the column order of the dicts.
eval_buckets = ['wsot', 'oswt', 'osot']
total_meta_df['total_eval'] = total_meta_df[eval_buckets].sum(axis=1)
valid_meta_df['total_eval'] = valid_meta_df[eval_buckets].sum(axis=1)

In [38]:
total_meta_df

Unnamed: 0,wswt,wsot,oswt,osot,total_eval
full,12213,6416,3009,1573,10998
dev,6200,3100,2080,1040,6220
dev_rand,6210,6210,0,0,6210
dev_norm,6240,0,6180,0,6180
dev_idio,7740,7740,0,0,7740


In [39]:
# Share of all counted timesteps that fall in an evaluation bucket (total_eval) rather than in 'wswt'.
total_meta_df['total_eval']/(total_meta_df['total_eval'] + total_meta_df['wswt'])

full        0.473827
dev         0.500805
dev_rand    0.500000
dev_norm    0.497585
dev_idio    0.500000
dtype: float64

In [40]:
valid_meta_df

Unnamed: 0,wswt,wsot,oswt,osot,total_eval
full,10299,5283,2529,1300,9112
dev,5269,2620,1705,875,5200
dev_rand,5226,5243,0,0,5243
dev_norm,5314,0,5155,0,5155
dev_idio,6524,6517,0,0,6517


In [41]:
# Same share as for the totals, but restricted to the timesteps counted as valid.
valid_meta_df['total_eval']/(valid_meta_df['total_eval'] + valid_meta_df['wswt'])

full        0.469425
dev         0.496705
dev_rand    0.500812
dev_norm    0.492406
dev_idio    0.499732
dtype: float64

In [42]:
# Fraction of 'wswt' timesteps that are counted as valid, per dataset variant.
valid_meta_df['wswt']/total_meta_df['wswt']

full        0.843282
dev         0.849839
dev_rand    0.841546
dev_norm    0.851603
dev_idio    0.842894
Name: wswt, dtype: float64

In [43]:
# Fraction of evaluation timesteps (total_eval) that are counted as valid, per dataset variant.
valid_meta_df['total_eval']/total_meta_df['total_eval']

full        0.828514
dev         0.836013
dev_rand    0.844283
dev_norm    0.834142
dev_idio    0.841990
Name: total_eval, dtype: float64

## Face-validity checks: compare outcome distributions with `ttest_ind` (or `ks_2samp`)

In [64]:
# First test: idio vs normative on train data (only the valid ws-wt samples)
# Normative holds fold 1 out; idio holds nothing out. Only the 'train' split is kept.
outcome_cols = ['folds', 'oots_mask', 'outcomes_mask', 'seq_id', 'outcomes']
norm_traindata = get_datasetDict(
    dev_norm_data.select_columns(outcome_cols),
    val_folds=[1],
)['train']
idio_traindata = get_datasetDict(
    dev_idio_data.select_columns(outcome_cols),
    val_folds=[],
)['train']


  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


In [82]:
def _valid_wswt_outcomes(dataset):
    """
    Collect the outcome values of every valid, within-time timestep in `dataset`.

    A timestep is kept when its 'outcomes_mask' entry is 1 and its 'oots_mask'
    entry is 0. Returns a flat Python list of the kept values, in row order.
    """
    collected = []
    for row in dataset:
        # assumes one outcome per timestep, so the flattened arrays line up
        # element-for-element with 'oots_mask' — TODO confirm
        row_outcomes = np.asarray(row['outcomes']).reshape(-1)
        row_outcomes_mask = np.asarray(row['outcomes_mask']).reshape(-1)
        row_oots_masks = np.asarray(row['oots_mask']).reshape(-1)
        keep = (row_outcomes_mask == 1) & (row_oots_masks == 0)
        collected.extend(row_outcomes[keep].tolist())
    return collected


# Both groups go through the same filter, so the normative sample is restricted
# to within-time timesteps exactly like the idio sample.
norm_traindata_outcomes = _valid_wswt_outcomes(norm_traindata)
idio_traindata_outcomes = _valid_wswt_outcomes(idio_traindata)

In [None]:
# Two-sided Welch t-test (equal_var=False): idio vs normative training outcomes.
t_stat, pval = ttest_ind(
    idio_traindata_outcomes,
    norm_traindata_outcomes,
    equal_var=False,
    alternative='two-sided',
)
print(f"t-statistic: {round(t_stat, 3)}, p-value: {round(pval, 4)}")

t-statistic: -0.784, p-value: 0.4329


In [90]:
# Second test: dev vs normative on train data (valid, within-time timesteps only).
# NOTE(review): the original note here read "dev > normative (alternate hyp: more)",
# yet the test below is two-sided — confirm whether alternative='greater' was intended.
dev_traindata = get_datasetDict(
    dev_data.select_columns(['folds', 'oots_mask', 'outcomes_mask', 'seq_id', 'outcomes']),
    val_folds=[0],
)['train']

dev_traindata_outcomes = []
for row in dev_traindata:
    row_outcomes = np.asarray(row['outcomes']).ravel()
    row_outcomes_mask = np.asarray(row['outcomes_mask']).ravel()
    row_oots_masks = np.asarray(row['oots_mask']).ravel()
    # keep timesteps that are within-time (oots_mask == 0) and valid (outcomes_mask == 1)
    keep = (row_oots_masks == 0) & (row_outcomes_mask == 1)
    dev_traindata_outcomes += row_outcomes[keep].tolist()

t_stat, pval = ttest_ind(
    dev_traindata_outcomes,
    norm_traindata_outcomes,
    equal_var=False,
    alternative='two-sided',
)
print(f"t-statistic: {round(t_stat, 3)}, p-value: {round(pval, 4)}")

t-statistic: -0.54, p-value: 0.5891


In [92]:
# Third test: dev_rand vs dev on train data (valid, within-time timesteps only).
# NOTE(review): the original note here read "dev != dev_rand (alternate hyp: same)";
# the alternative of the two-sided test below is that the means differ — confirm the intent.
devrand_traindata = get_datasetDict(
    dev_rand_data.select_columns(['folds', 'oots_mask', 'outcomes_mask', 'seq_id', 'outcomes']),
    val_folds=[],
)['train']

devrand_traindata_outcomes = []
for row in devrand_traindata:
    row_outcomes = np.asarray(row['outcomes']).ravel()
    row_outcomes_mask = np.asarray(row['outcomes_mask']).ravel()
    row_oots_masks = np.asarray(row['oots_mask']).ravel()
    # keep timesteps that are within-time (oots_mask == 0) and valid (outcomes_mask == 1)
    keep = (row_oots_masks == 0) & (row_outcomes_mask == 1)
    devrand_traindata_outcomes += row_outcomes[keep].tolist()

t_stat, pval = ttest_ind(
    devrand_traindata_outcomes,
    dev_traindata_outcomes,
    equal_var=False,
    alternative='two-sided',
)
print(f"t-statistic: {round(t_stat, 3)}, p-value: {round(pval, 4)}")

t-statistic: -0.672, p-value: 0.5015
