In [None]:
%load_ext autoreload
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import re
%autoreload 2

In [None]:
# Notebook-wide settings: display limit, random seed and subset switch.
pd.set_option('display.max_rows', 100)
SEED = 42
USE_nG = True # creating data only from the nG subset (patient ids starting with 'n' or 'G')

In [None]:
# The file holds a pickled dict (see the cell that saves `metadata` further down),
# hence allow_pickle=True and .item() to unwrap the 0-d object array.
# The 'train' and 'test' entries are read in the next cell.
nG_subs = np.load('./metadata/metadata_fcd_nG.npy', allow_pickle=True).item()

In [None]:
# All subject ids of the loaded metadata: train entries followed by test entries.
all_subs_idcs = np.concatenate((nG_subs['train'], nG_subs['test']))

In [None]:
# Source spreadsheets. The last link carries the sheet_id used below; the first
# two are presumably earlier versions of the table — TODO confirm.
# https://docs.google.com/spreadsheets/d/1MDleLmQ0Nlcg62x95e3xnkc5_j_i4IK_KQEHccDosG8/edit?usp=sharing
# https://docs.google.com/spreadsheets/d/1_TbYJj9JY-QTdFE5-KeVG1gusTMbRUE5/edit#gid=676408086
# https://docs.google.com/spreadsheets/d/1PqzgvTJNxgObtIMg42Xvj232geOmuT4Q/edit?usp=sharing&ouid=111984848783696185973&rtpof=true&sd=true
sheet_id = "1PqzgvTJNxgObtIMg42Xvj232geOmuT4Q"
sheet_name = "Alekseev"
# Request the sheet as CSV (tqx=out:csv) so it can be passed to pd.read_csv.
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

In [None]:
# Previously saved cross-validation folds (pickled objects, hence allow_pickle=True).
# Each entry is indexed by 'val' in the volume computation below.
folds_cv = np.load('./metadata/folds_cv_nG.npy', allow_pickle=True)
# Alternative input: the stratified folds written later in this notebook.
#folds_cv_new = np.load('./metadata/stratified_vol_loc_cv_nG.npy', allow_pickle=True)

In [None]:
import nibabel
import os
import nilearn
from nilearn import plotting
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



# Directory holding one label volume per subject, named `<subject>.nii.gz`.
LABEL_DIR = '/workspace/RawData/Features/preprocessed_data/label_bernaskoni'


def _count_label_voxels(subject_id):
    """Return the number of non-zero voxels in the subject's label volume."""
    label_img = nibabel.load(os.path.join(LABEL_DIR, f'{subject_id}.nii.gz'))
    # count_nonzero ignores the stored label value, so a mask holding values
    # other than 1 (subject n50 was seen with label 2) is still counted per voxel.
    return np.count_nonzero(label_img.get_fdata())


# fold index (as str) -> mean label volume over that fold's validation subjects
unit_vol_per_fold = {}
# subject id (as str) -> label volume in voxels
all_units = {}

for fold_idx, fold in enumerate(folds_cv):
    val_subjects = fold['val']
    val_units = [_count_label_voxels(subject) for subject in val_subjects]

    for subject, units in zip(val_subjects, val_units):
        all_units[str(subject)] = units

    # An empty validation split has no mean; report NaN instead of dividing by zero.
    if len(val_subjects):
        unit_vol_per_fold[str(fold_idx)] = sum(val_units) / len(val_subjects)
    else:
        unit_vol_per_fold[str(fold_idx)] = float('nan')

# Display the per-fold means as the cell output.
unit_vol_per_fold

{'0': 2993.7,
 '1': 4390.4,
 '2': 3807.1,
 '3': 2291.7,
 '4': 3865.8,
 '5': 3481.0,
 '6': 3760.4,
 '7': 4008.4,
 '8': 4566.5}

In [None]:
# Show the per-subject label volumes (non-zero voxel counts) collected above.
all_units.values()

In [None]:
# 5th, 50th and 95th percentiles of the per-subject label volumes.
np.quantile(list(all_units.values()), [0.05, 0.5, 0.95])

In [None]:
# Distribution of per-subject label volumes: box plot on top, histogram below.
volumes = list(all_units.values())
fig, (ax_box, ax_hist) = plt.subplots(2)

# Pass the values as `x=` explicitly. A bare positional argument is interpreted
# as `x` by older seaborn releases but as `data` by newer ones, which changes the
# box orientation; `x=` keeps the box horizontal, matching the histogram axis.
sns.boxplot(x=volumes, ax=ax_box)
sns.histplot(volumes, kde=True, binrange=(0, 16000), bins=15, ax=ax_hist)
ax_hist.set(xlabel='label volume (non-zero voxels)')
plt.show()

In [None]:
# Columns taken from the annotation sheet; everything else is ignored.
_annotation_columns = ['subject', 'label', 'front_pariet', 'occipit', 'tempor', 'isolated', 'blurring']

# First row of the sheet is the header; no column is used as the index.
df = pd.read_csv(url, usecols=_annotation_columns, header=0, index_col=None)

In [None]:
# One row per subject: the subject id and its label volume in voxels.
# Naming the columns in the constructor (instead of renaming 0/1 in place
# afterwards) also keeps the 'subject' column present when `all_units` is empty,
# so the merge below does not fail on a missing key.
volume_frame = pd.DataFrame(list(all_units.items()), columns=['subject', 'volume'])

In [None]:
# Attach the label volume to each annotated subject.
new = pd.merge(df, volume_frame, how='inner', on='subject')  # inner join: keep only subjects present in both tables

In [None]:
# Remove the row limit so whole tables are displayed from here on.
pd.set_option('display.max_rows', None)

In [None]:
def _volume_category(volume):
    """Bucket a label volume (voxels) into 'small', 'mean' or 'big'."""
    # NOTE(review): the 1250 / 5200 cut-offs are presumably derived from the
    # volume distribution inspected above — confirm.
    if volume < 1250:
        return 'small'
    if volume < 5200:
        return 'mean'
    return 'big'


new['volume_cat'] = list(map(_volume_category, new['volume']))

In [None]:
# Number of subjects in each volume category.
new['volume_cat'].value_counts()

In [None]:
# Stratification key: "<label>_<volume category>", built from the string forms
# of both columns.
_label_text = new['label'].astype(str)
_volume_cat_text = new['volume_cat'].astype(str)
new['stratify'] = _label_text + '_' + _volume_cat_text

In [None]:
# Number of cross-validation folds.
num = 9

# Shuffled, seeded splitter so the stratified folds are reproducible.
# (The former `folds = list(range(num))` placeholder was never read; `folds`
# is created where the splits are collected.)
skf = StratifiedKFold(n_splits=num, random_state=SEED, shuffle=True)

In [None]:
# Subject ids to split, stratified by the combined label/volume key.
train_val_ds_indcs = new['subject'].values
# `split` returns a one-shot generator. Materialise it so that re-running the
# cell that consumes it does not silently iterate over nothing.
train_val_split = list(skf.split(train_val_ds_indcs, new['stratify']))

In [None]:
# Turn each (train, validation) index pair into subject ids and collect them.
folds = []
for train_index, val_index in train_val_split:
    train_subjects = train_val_ds_indcs[train_index]
    val_subjects = train_val_ds_indcs[val_index]
    print(f'Train {train_subjects}', '\n', f'Validation {val_subjects}')
    print(len(train_subjects), len(val_subjects))
    folds.append({'train': train_subjects, 'val': val_subjects})

# If `train_val_split` was an already-consumed generator the loop above ran
# zero times; refuse to overwrite the saved folds with an incomplete list.
if len(folds) != num:
    raise RuntimeError(
        f'Expected {num} folds but collected {len(folds)}; '
        're-run the cell that creates train_val_split before saving.'
    )
# Only train/val entries are stored; no separate test entry is appended.
np.save('metadata/stratified_vol_loc_cv_nG.npy', folds)

### Old table

In [None]:
# NOTE(review): `url` is built earlier for the sheet read with the columns
# subject/label/...; this cell requests patient/is_good/localization/comments,
# so it presumably has to be pointed at the older sheet — confirm before re-running.
df = pd.read_csv(url,
                 header=0,
                 usecols=['patient', 'is_good', 'localization', 'comments'],
                 index_col=None, dtype={'patient':str,
                                        'is_good':str,
                                        'localization':str,
                                        'comments':str})

if USE_nG:
    # Keep patients whose id starts with 'n' or 'G'. `.str[0]` yields NaN for a
    # missing id, which `isin` treats as "no match" instead of raising.
    df = df[df['patient'].str[0].isin(['n', 'G'])]
# Copy the filtered rows so that later edits to df_good act on an independent
# frame and do not trigger SettingWithCopyWarning.
df_good = df.query('is_good == "1"').copy()

In [None]:
# Localization labels and their frequencies before the manual regrouping.
df_good['localization'].value_counts()

In [None]:
# Merge rare localization labels into broader groups (old label -> new label).
manual_mapping = {'right tempor occipit':'occipit',
                  'left tempor occipit':'occipit',
                  'left occipit tempor':'occipit',
                  'left occipit':'occipit',
                  'right occipit':'occipit',
                  'left front tempor':'left tempor'
                 }

# Exact-match replacement of whole cell values; labels not listed are kept.
# Building a new frame with `assign` avoids writing into a filtered slice
# (SettingWithCopyWarning) and makes the cell safe to re-run.
df_good = df_good.assign(localization=df_good['localization'].replace(manual_mapping))

In [None]:
# Localization frequencies after applying manual_mapping.
df_good['localization'].value_counts()

In [None]:
# Hold out a test set, stratified by the `localization` column.
# test_size is an int, i.e. an absolute number of test rows (10), not a fraction.
df_train, df_test = train_test_split(df_good, 
                                     stratify=df_good['localization'],
                                     random_state=SEED,
                                     test_size=10)

In [None]:
# Sanity check: (rows, columns) of the train and test frames.
df_train.shape, df_test.shape

In [None]:
# Patient ids of each split, keyed by split name. np.save stores the dict as a
# pickled object, which is why it is read back with allow_pickle=True and .item().
_split_frames = (('train', df_train), ('test', df_test))
metadata = {split_name: frame['patient'].values for split_name, frame in _split_frames}
np.save('metadata/metadata_fcd_nG.npy', metadata)

### Add cross-validation metadata dataset

In [None]:
# Number of cross-validation folds.
num = 9

# Shuffled, seeded splitter so the folds are reproducible.
# (The former `folds = list(range(num))` placeholder was never read; `folds`
# is created where the splits are collected.)
kf = KFold(n_splits=num, random_state=SEED, shuffle=True)

In [None]:
# Subjects used for cross-validation: the first 90 rows of df_good.
# NOTE(review): this slices df_good rather than df_train, so it may overlap with
# df_test from the stratified hold-out split above — confirm this is intended.
train_val_ds_indcs = df_good['patient'].values[:90]
# Disabled: a separate test slice is not built while no hyperparameters are tuned.
# test_ds_indcs = df_good['patient'].values[80:]  # Till we are not setting any hyperparams

In [None]:
# `split` returns a one-shot generator. Materialise it so that re-running the
# cell that consumes it does not silently iterate over nothing.
train_val_split = list(kf.split(train_val_ds_indcs))

In [None]:
# Turn each (train, validation) index pair into subject ids and collect them.
folds = []
for train_index, val_index in train_val_split:
    train_subjects = train_val_ds_indcs[train_index]
    val_subjects = train_val_ds_indcs[val_index]
    print(f'Train {train_subjects}', '\n', f'Validation {val_subjects}')
    print(len(train_subjects), len(val_subjects))
    folds.append({'train': train_subjects, 'val': val_subjects})

# If `train_val_split` was an already-consumed generator the loop above ran
# zero times; refuse to overwrite the saved folds with an incomplete list.
if len(folds) != num:
    raise RuntimeError(
        f'Expected {num} folds but collected {len(folds)}; '
        're-run the cell that creates train_val_split before saving.'
    )
# Only train/val entries are stored; no separate test entry is appended.
np.save('metadata/folds_cv_nG.npy', folds)

In [None]:
# Sanity check: number of folds collected above.
len(folds)