In [1]:
import pickle
import torch

from scipy.stats import pearsonr
import pandas as pd

In [2]:
FOLD4_PRED_PATH = "/data/avirinchipur/EMI/outputs/ptsd_stop/PCL_avg_roba128/ptsd_stop_PCL_avg_v1.f/69384f7a0ab14eff85abdc4a112fc5d3/preds.pkl"
FOLD3_PRED_PATH = "/data/avirinchipur/EMI/outputs/ptsd_stop/PCL_avg_roba128/ptsd_stop_PCL_avg_v1.f/21855bd3734b402cbd79eca70793d7a5/preds.pkl"
FOLD2_PRED_PATH = "/data/avirinchipur/EMI/outputs/ptsd_stop/PCL_avg_roba128/ptsd_stop_PCL_avg_v1.f/f6983d7cc2dc4a739b1903e980a32a51/preds.pkl"
FOLD1_PRED_PATH = "/data/avirinchipur/EMI/outputs/ptsd_stop/PCL_avg_roba128/ptsd_stop_PCL_avg_v1.f/2abba26646944fc98658d956ed6ff022/preds.pkl"
FOLD0_PRED_PATH = "/data/avirinchipur/EMI/outputs/ptsd_stop/PCL_avg_roba128/ptsd_stop_PCL_avg_v1.f/01b63d2a5247459686cfc43ec4f39ea0/preds.pkl"

In [3]:
fold4_preds = pickle.load(open(FOLD4_PRED_PATH, "rb"))
fold3_preds = pickle.load(open(FOLD3_PRED_PATH, "rb"))
fold2_preds = pickle.load(open(FOLD2_PRED_PATH, "rb"))
fold1_preds = pickle.load(open(FOLD1_PRED_PATH, "rb"))
fold0_preds = pickle.load(open(FOLD0_PRED_PATH, "rb"))

In [4]:
fold4_preds.keys()

dict_keys(['train', 'val', 'test'])

In [5]:
fold4_preds['val'].keys()

dict_keys(['preds', 'target', 'mask', 'infill_mask'])

In [6]:
def get_ytrue(preds_dict):
    y_true = []
    for batch in preds_dict['val']['target'][-1]:
        temp = batch[:, 0].cpu().numpy().tolist()
        y_true.extend(temp)
    return y_true

def get_avg_ypred(preds_dict, epoch=-1):
    # Strategy 1 for y_pred: Get the avg y_pred indicated by the mask = 1
    y_pred_avg = []
    for idx, batch in enumerate(preds_dict['val']['preds'][epoch]):
        temp_mask = preds_dict['val']['mask'][epoch][idx]
        temp = (batch * temp_mask).sum(1)/torch.sum(temp_mask, 1).to(torch.float)
        y_pred_avg.extend(temp.cpu().numpy().tolist())
    return y_pred_avg

def get_nth_ypred(preds_dict, epoch=-1, p=1.0):
    assert p>=0 and p<=1, "n should be >=0 and <=1. Represents the nth percentile"
    
    y_pred_last = []
    for idx, batch in enumerate(preds_dict['val']['preds'][epoch]):
        # Find the index where the mask is 0 and subtract 1 from it to get to the index of the last 1
        last_idx = torch.argmin(preds_dict['val']['mask'][epoch][idx].to(torch.int), dim=-1) - 1
        # find the index where there last_idx is < 0 and set it to the number of columns in mask 
        last_idx[last_idx<0] = preds_dict['val']['mask'][epoch][idx].shape[1] - 1
        for row in range(len(batch)):
            y_pred_last.append(batch[row, int(last_idx[row]*p)].item())
    return y_pred_last

In [9]:
folds_y_pred = {"y_avg": [], "y_last": [], "y_threeforths": [],  "y_half": [], "y_quarter": [], "y_tenth": [], "y_fifth": [], "y_first": []}
folds_y_true = []
for fold in [fold0_preds, fold1_preds, fold2_preds, fold3_preds, fold4_preds]:
    folds_y_true.extend(get_ytrue(fold))
    folds_y_pred["y_avg"].extend(get_avg_ypred(fold))
    folds_y_pred["y_last"].extend(get_nth_ypred(fold, p=1.0))
    folds_y_pred["y_threeforths"].extend(get_nth_ypred(fold, p=0.75))
    folds_y_pred["y_half"].extend(get_nth_ypred(fold, p=0.5))
    folds_y_pred["y_quarter"].extend(get_nth_ypred(fold, p=0.25))
    folds_y_pred["y_tenth"].extend(get_nth_ypred(fold, p=0.1))
    folds_y_pred["y_fifth"].extend(get_nth_ypred(fold, p=0.05))
    folds_y_pred["y_first"].extend(get_nth_ypred(fold, p=0.01))

pearsonr_yavg = pearsonr(folds_y_true, folds_y_pred["y_avg"])
pearsonr_ylast = pearsonr(folds_y_true, folds_y_pred["y_last"])
pearsonr_ythreeforths = pearsonr(folds_y_true, folds_y_pred["y_threeforths"])
pearsonr_yhalf = pearsonr(folds_y_true, folds_y_pred["y_half"])
pearsonr_yquarter = pearsonr(folds_y_true, folds_y_pred["y_quarter"])
pearsonr_ytenth = pearsonr(folds_y_true, folds_y_pred["y_tenth"])
pearsonr_yfifth = pearsonr(folds_y_true, folds_y_pred["y_fifth"])
pearsonr_yfirst = pearsonr(folds_y_true, folds_y_pred["y_first"])

# Print results_df with desc, pearson, p_val. upto 3 digits of precision
results_df = pd.DataFrame({"desc": ["y_avg", "y_last", "y_threeforths" ,"y_half", "y_quarter", "y_tenth", "y_fifth", "y_first"],
                            "pearson": [pearsonr_yavg[0], pearsonr_ylast[0], pearsonr_ythreeforths[0], pearsonr_yhalf[0], pearsonr_yquarter[0], pearsonr_ytenth[0], pearsonr_yfifth[0], pearsonr_yfirst[0]],
                            "p_val": [pearsonr_yavg[1], pearsonr_ylast[1], pearsonr_ythreeforths[1], pearsonr_yhalf[1], pearsonr_yquarter[1], pearsonr_ytenth[1], pearsonr_yfifth[1], pearsonr_yfirst[1]]})
results_df = results_df.round(3)
print(results_df)


            desc  pearson  p_val
0          y_avg    0.636  0.000
1         y_last    0.667  0.000
2  y_threeforths    0.668  0.000
3         y_half    0.614  0.000
4      y_quarter    0.568  0.000
5        y_tenth    0.460  0.000
6        y_fifth    0.422  0.000
7        y_first    0.198  0.028


DLATK Baseline for the same fold setup for avg RoBERTa predicting the avg scores: 0.5327

Command: ./dlatkInterface.py -d ptsd_stop -t whisper_transcripts_v1 -c user_id --group_freq_thresh 100 -f 'feat$dr_rpca_128_fb20$whisper_transcripts_v1$user_id' --nfold_regression --outcome_table outcomes_user_level_v2 --outcomes PCL_avg --fold_column folds_col --where " num_days >=40 "  

Note: This uses slightly more longitudinal data than the EMI model. 

## Fold 4 Analysis

In [6]:
y_true = []
for batch in fold4_preds['val']['target'][-1]:
    temp = batch[:, 0].cpu().numpy().tolist()
    y_true.extend(temp)

In [7]:
# Strategy 1 for y_pred: Get the avg y_pred indicated by the mask = 1
y_pred_avg = []
for idx, batch in enumerate(fold4_preds['val']['preds'][-1]):
    temp_mask = fold4_preds['val']['mask'][-1][idx]
    temp = (batch * temp_mask).sum(1)/torch.sum(temp_mask, 1).to(torch.float)
    y_pred_avg.extend(temp.cpu().numpy().tolist())


In [8]:
pearson_avg_pred = pearsonr(y_true, y_pred_avg)
print (f"Pearson correlation for avg pred: {pearson_avg_pred[0].round(3)}")

Pearson correlation for avg pred: 0.733


In [9]:
# Strategy 1 for y_pred: Get the last valid y_pred indicated by the mask = 1
y_pred_last = []
for idx, batch in enumerate(fold4_preds['val']['preds'][-1]):
    temp_mask_idx = torch.argmin(fold4_preds['val']['mask'][-1][idx].to(torch.int), dim=-1) - 1
    temp_mask_idx[temp_mask_idx < 0] = -1
    for row in range(len(batch)):
        y_pred_last.append(batch[row, temp_mask_idx[row]].item())

In [10]:
pearson_last_pred = pearsonr(y_true, y_pred_last)
print (f"Pearson correlation for last pred: {pearson_last_pred[0].round(3)}")

Pearson correlation for last pred: 0.704


## Fold 3 Analysis

In [11]:
y_true = []
for batch in fold3_preds['val']['target'][-1]:
    temp = batch[:, 0].cpu().numpy().tolist()
    y_true.extend(temp)

In [12]:
# Strategy 1 for y_pred: Get the avg y_pred indicated by the mask = 1
y_pred_avg = []
for idx, batch in enumerate(fold3_preds['val']['preds'][-1]):
    temp_mask = fold3_preds['val']['mask'][-1][idx]
    temp = (batch * temp_mask).sum(1)/torch.sum(temp_mask, 1).to(torch.float)
    y_pred_avg.extend(temp.cpu().numpy().tolist())


In [13]:
pearson_avg_pred = pearsonr(y_true, y_pred_avg)
print (f"Pearson correlation for avg pred: {pearson_avg_pred[0].round(3)}")

Pearson correlation for avg pred: 0.791


In [14]:
# Strategy 1 for y_pred: Get the last valid y_pred indicated by the mask = 1
y_pred_last = []
for idx, batch in enumerate(fold3_preds['val']['preds'][-1]):
    temp_mask_idx = torch.argmin(fold3_preds['val']['mask'][-1][idx].to(torch.int), dim=-1) - 1
    temp_mask_idx[temp_mask_idx < 0] = -1
    for row in range(len(batch)):
        y_pred_last.append(batch[row, temp_mask_idx[row]].item())

In [15]:
pearson_last_pred = pearsonr(y_true, y_pred_last)
print (f"Pearson correlation for last pred: {pearson_last_pred[0].round(3)}")

Pearson correlation for last pred: 0.798


## Fold 2 Analysis

In [22]:
y_true = []
for batch in fold2_preds['val']['target'][-1]:
    temp = batch[:, 0].cpu().numpy().tolist()
    y_true.extend(temp)

In [25]:
# Strategy 1 for y_pred: Get the avg y_pred indicated by the mask = 1
y_pred_avg = []
for idx, batch in enumerate(fold2_preds['val']['preds'][-1]):
    temp_mask = fold2_preds['val']['mask'][-1][idx]
    temp = (batch * temp_mask).sum(1)/torch.sum(temp_mask, 1).to(torch.float)
    y_pred_avg.extend(temp.cpu().numpy().tolist())


In [26]:
pearson_avg_pred = pearsonr(y_true, y_pred_avg)
print (f"Pearson correlation for avg pred: {pearson_avg_pred[0].round(3)}")

Pearson correlation for avg pred: 0.465


In [30]:
# Strategy 1 for y_pred: Get the last valid y_pred indicated by the mask = 1
y_pred_last = []
for idx, batch in enumerate(fold2_preds['val']['preds'][-1]):
    temp_mask_idx = torch.argmin(fold2_preds['val']['mask'][-1][idx].to(torch.int), dim=-1) - 1
    temp_mask_idx[temp_mask_idx < 0] = -1
    for row in range(len(batch)):
        y_pred_last.append(batch[row, temp_mask_idx[row]].item())

In [31]:
pearson_last_pred = pearsonr(y_true, y_pred_last)
print (f"Pearson correlation for last pred: {pearson_last_pred[0].round(3)}")

Pearson correlation for last pred: 0.534


## Fold 1 Analysis

In [None]:
y_true = []
for batch in fold3_preds['val']['target'][-1]:
    temp = batch[:, 0].cpu().numpy().tolist()
    y_true.extend(temp)

In [None]:
# Strategy 1 for y_pred: Get the avg y_pred indicated by the mask = 1
y_pred_avg = []
for idx, batch in enumerate(fold3_preds['val']['preds'][-1]):
    temp_mask = fold3_preds['val']['mask'][-1][idx]
    temp = (batch * temp_mask).sum(1)/torch.sum(temp_mask, 1).to(torch.float)
    y_pred_avg.extend(temp.cpu().numpy().tolist())


In [None]:
pearson_avg_pred = pearsonr(y_true, y_pred_avg)
print (f"Pearson correlation for avg pred: {pearson_avg_pred[0].round(3)}")

Pearson correlation for avg pred: 0.791


In [None]:
# Strategy 1 for y_pred: Get the last valid y_pred indicated by the mask = 1
y_pred_last = []
for idx, batch in enumerate(fold3_preds['val']['preds'][-1]):
    temp_mask_idx = torch.argmin(fold3_preds['val']['mask'][-1][idx].to(torch.int), dim=-1) - 1
    temp_mask_idx[temp_mask_idx < 0] = -1
    for row in range(len(batch)):
        y_pred_last.append(batch[row, temp_mask_idx[row]].item())

In [None]:
pearson_last_pred = pearsonr(y_true, y_pred_last)
print (f"Pearson correlation for last pred: {pearson_last_pred[0].round(3)}")

Pearson correlation for last pred: 0.798
