In [None]:
import os
import pandas as pd
import numpy as np
import tqdm
import random
import pingouin as pg
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
from scipy import io
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import nibabel as nib
from scipy.linalg import pinv, pinv2
from statannot import add_stat_annotation
from statsmodels import robust
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import math
from sklearn.model_selection import cross_val_score
from scipy.stats import skew, skewtest
from sklearn.metrics import explained_variance_score, r2_score
from scipy.stats import ttest_rel

# Train

In [None]:
# Subjects that entered the training parallel-ICA run, parsed from the file
# listing stored next to the ICA results.
topdir = '/users/jsh/data/IG/ICA/2023-04-25/pca_ref/result_nucleus/30'
sbj_list = pd.read_csv(topdir + '/first_para_ica_sel_data.txt', header = None)
sbj_list.columns = ['temp']
# Label-based slice (end inclusive): keeps the leading half of the listing.
sbj_list = sbj_list.loc[0:len(sbj_list)/2]

# Path component 9 is stored as the subject folder, component 8 as the group.
path_parts = sbj_list.temp.str.split('/')
sbj_list['sbj'] = path_parts.str[9]
sbj_list['case'] = path_parts.str[8]
sbj_list = sbj_list.dropna(axis = 0)  # rows whose path is too short to parse
sbj_info = sbj_list[['sbj', 'case']]

# Folder names lack the underscore used in `subjectkey`: insert it after the
# fourth character.
sbjlist = [name[:4]+'_'+name[4:] for name in sbj_info['sbj'].tolist()]

# SST table: selected subjects, baseline visit only.
demo = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_sst02.txt', sep = '\t')
demo = demo.loc[demo['subjectkey'].isin(sbjlist) & (demo['eventname'] == 'baseline_year_1_arm_1')]

# Scanner table: selected subjects.
# NOTE(review): not restricted to one visit here -- confirm it holds a single
# row per subject, otherwise the merge below multiplies rows.
scanner = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_mri01.txt', sep = '\t')
scanner = scanner.loc[scanner['subjectkey'].isin(sbjlist)]

demo_scanner = pd.merge(demo, scanner, on = 'subjectkey')
demo_exclude = demo_scanner[['subjectkey', 'interview_age_x', 'sex', 'mri_info_manufacturer']]

# Handedness score
temp = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_ehis01.txt', sep = '\t')[['subjectkey', 'ehi_y_ss_scoreb']]
demo_exclude = pd.merge(demo_exclude, temp).astype({'ehi_y_ss_scoreb': float})

# Genetic batch effect
batch = pd.read_csv('/data5/open_data/ABCD/3.0/genomics_sample03/ABCD_release3.0_.batch_info.txt', sep = '\t')
batch = batch[['abcd.id_redcap', 'BATCH']]
batch.columns = ['subjectkey', 'BATCH']
demo_exclude = pd.merge(demo_exclude, batch)


# Classify every training subject by which SST contrast images exist on disk:
#   both_run -> run 1 and run 2 present
#   run1     -> only run 1 present
#   run2     -> only run 2 present
basedir = '/users/jsh/data/IG/ICA/2023-04-25/input_origin'
control_dir = basedir + '/control'
ADHD_dir = basedir + '/ADHD'

control_subjects = os.listdir(control_dir)
ADHD_subjects = os.listdir(ADHD_dir)

both_run = []
run1 = []
run2 = []

for group_dir, group_subjects in ((control_dir, control_subjects), (ADHD_dir, ADHD_subjects)):
    for i in group_subjects:
        targetdir = group_dir + '/' + i + '/fMRI'
        present = set(os.listdir(targetdir))  # list the folder once per subject
        has_run1 = 'con_not_masked1.nii' in present
        has_run2 = 'con_not_masked2.nii' in present
        subjectkey = i[:4] + '_' + i[4:]

        # Each file is tested explicitly. Writing `'a' and 'b' in files` only
        # tests 'b' (a non-empty string is always truthy), which would file
        # run-2-only subjects under both_run.
        if has_run1 and has_run2:
            both_run.append(subjectkey)
        elif has_run1:
            run1.append(subjectkey)
        elif has_run2:
            run2.append(subjectkey)

In [None]:
# Stop-signal task (SST) behaviour: baseline visit only, bookkeeping columns removed.
data = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_sst02.txt', sep = '\t')

data = data.drop(['collection_id', 'abcd_sst02_id', 'dataset_id', 'src_subject_id', 'interview_date', 'collection_title'], axis = 1)
data = data.loc[data['eventname'] == 'baseline_year_1_arm_1']
data = data.drop(['eventname'], axis = 1)

# Subjects that entered the training ICA, parsed from the file listing.
sbj_list = pd.read_csv('/users/jsh/data/IG/ICA/2023-04-25/pca_ref/result_nucleus/30/first_para_ica_sel_data.txt', header = None)
sbj_list.columns = ['temp']
# .copy(): the columns added below are written to an independent frame rather
# than to a selection of the raw listing (avoids pandas' chained-assignment warning).
sbj_list = sbj_list.loc[0:len(sbj_list)/2].copy()
path_parts = sbj_list['temp'].str.split('/')
sbj_list['sbj'] = path_parts.str[9]
sbj_list['case'] = path_parts.str[8]
sbj_list = sbj_list.dropna(axis = 0)
sbj_info = sbj_list[['sbj', 'case']].copy()

# Folder name -> ABCD subjectkey: insert '_' after the fourth character.
sbj_info['subjectkey'] = sbj_info['sbj'].str[:4] + '_' + sbj_info['sbj'].str[4:]
data_merged = pd.merge(sbj_info, data, how = 'inner', on = 'subjectkey')

# Subject loadings estimated by the training ICA, one matrix per modality.
coeff = io.loadmat('/users/jsh/data/IG/ICA/2023-04-25/pca_ref/result_nucleus/30/first_para_ica_ica.mat')
fMRI_A_origin = pd.DataFrame(coeff['loadingCoeff'][0][0])
Gene_A_origin = pd.DataFrame(coeff['loadingCoeff'][0][1])

# Component numbers are 1-based (fMRI component 14, gene component 21).
# NOTE(review): these assignments align on the row index, i.e. row k of
# data_merged receives loading row k. That is only correct if the inner merge
# kept every listed subject, in listing order -- confirm, e.g. by comparing
# len(data_merged) with the number of rows in the loading matrices.
data_merged['fMRI'] = fMRI_A_origin[14 - 1]
data_merged['Gene'] = Gene_A_origin[21 - 1]

# Subjects with only run 1 on disk: overwrite the all-run SST summaries with
# the run-1 summaries.
run1_rows = data_merged['subjectkey'].isin(run1)
for all_col, r1_col in [
    ('tfmri_sst_all_beh_crgo_rt', 'tfmri_sst_r1_beh_crgo_rt'),
    ('tfmri_sst_all_beh_crgo_mrt', 'tfmri_sst_r1_beh_crgo_mrt'),
    ('tfmri_sst_all_beh_incrs_mrt', 'tfmri_sst_r1_beh_incrs_mrt'),
    ('tfmri_sst_all_beh_crgo_stdrt', 'tfmri_sst_r1_beh_crgo_stdrt'),
]:
    data_merged.loc[run1_rows, all_col] = data_merged[run1_rows][r1_col].tolist()

# Cast the behavioural measures and trial counts to float.
float_cols = [
    'tfmri_sst_all_beh_crgo_rt',
    'tfmri_sst_all_beh_crgo_mrt',
    'tfmri_sst_all_beh_incrs_mrt',
    'tfmri_sst_all_beh_tot_mssd',
    'tfmri_sst_all_beh_total_mssrt',
    'tfmri_sst_all_beh_total_issrt',
    'tfmri_sst_all_beh_crgo_stdrt',
    'tfmri_sst_all_beh_go_nt',
    'tfmri_sst_r1_beh_go_nt',
    'tfmri_sst_all_beh_crgo_nt',    # all correct go
    'tfmri_sst_r1_beh_crgo_nt',     # r1 correct go
    'tfmri_sst_all_beh_incrgo_nt',  # all incorrect go
    'tfmri_sst_r1_beh_incrgo_nt',   # r1 incorrect go
]
data_merged = data_merged.astype({col: 'float' for col in float_cols})

# Add age / sex / scanner / handedness / genotyping batch.
data_final = pd.merge(data_merged, demo_exclude, on = 'subjectkey')
data_final['interview_age'] = data_final['interview_age'].astype(float)

# Same run-1 substitution for the go-trial counts.
run1_rows = data_final['subjectkey'].isin(run1)
for all_col, r1_col in [
    ('tfmri_sst_all_beh_crgo_nt', 'tfmri_sst_r1_beh_crgo_nt'),
    ('tfmri_sst_all_beh_incrgo_nt', 'tfmri_sst_r1_beh_incrgo_nt'),
]:
    data_final.loc[run1_rows, all_col] = data_final[run1_rows][r1_col].tolist()

# Trial-count-weighted mean RT over go trials.
# NOTE(review): the incorrect-go count weights `..._incrs_mrt`, not an
# `..._incrgo_*` RT column -- confirm this is the intended measure.
n_correct_go = data_final['tfmri_sst_all_beh_crgo_nt']
n_incorrect_go = data_final['tfmri_sst_all_beh_incrgo_nt']
weighted_rt_sum = data_final['tfmri_sst_all_beh_crgo_mrt'] * n_correct_go + data_final['tfmri_sst_all_beh_incrs_mrt'] * n_incorrect_go
data_final['ALL_MEAN_GORT'] = weighted_rt_sum / (n_correct_go + n_incorrect_go)

# ICE = mean-method SSRT minus mean stop-signal delay.
data_final['ICE'] = data_final['tfmri_sst_all_beh_total_mssrt'] - data_final['tfmri_sst_all_beh_tot_mssd']

# NIH Toolbox total composite (uncorrected) as the cognition measure.
# NOTE(review): this table is not filtered by visit -- confirm one row per subject.
iq = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_tbss01.txt', sep = '\t')[['subjectkey', 'nihtbx_totalcomp_uncorrected']]
data_final = pd.merge(data_final, iq, on = 'subjectkey').astype({'nihtbx_totalcomp_uncorrected':'float'})
# Earlier merges left two sex columns; the one suffixed _x is kept.
data_final['sex'] = data_final['sex_x']

keep_cols = [
    'case', 'subjectkey', 'interview_age', 'sex', 'mri_info_manufacturer', 'BATCH', 'fMRI', 'Gene',
    'tfmri_sst_all_beh_crgo_rt', 'tfmri_sst_all_beh_crgo_mrt', 'tfmri_sst_all_beh_incrs_mrt',
    'tfmri_sst_all_beh_tot_mssd', 'tfmri_sst_all_beh_total_mssrt',
    'tfmri_sst_all_beh_total_issrt', 'tfmri_sst_all_beh_crgo_stdrt',
    'nihtbx_totalcomp_uncorrected', 'ehi_y_ss_scoreb',
    'ALL_MEAN_GORT', 'ICE',
]
data_final = data_final[keep_cols]

# UPPS / BIS+BAS trait scores, baseline visit only.
trait_cols = [
    'upps_y_ss_negative_urgency', 'upps_y_ss_lack_of_planning', 'upps_y_ss_sensation_seeking',
    'upps_y_ss_positive_urgency', 'upps_y_ss_lack_of_perseverance',
    'bis_y_ss_bism_sum', 'bis_y_ss_basm_rr', 'bis_y_ss_basm_drive',
]
temp = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_mhy02.txt', sep = '\t')
temp = temp.loc[temp['eventname'] == 'baseline_year_1_arm_1']
temp = temp[['subjectkey'] + trait_cols]

data_final = pd.merge(data_final, temp, how = 'inner', on = 'subjectkey')
data_final = data_final.astype({col: float for col in trait_cols})

# Outlier flag, filled in per behaviour by the regression cell.
data_final['exclude'] = False
data_train = data_final

# Test

In [None]:
# fMRI: stack every validation subject's contrast vector into one
# subjects x features matrix (controls first, then ADHD).
#
# NOTE(review): row order follows os.listdir(); the cells that rebuild the
# subject list call os.listdir() again on the same folders and must see the
# same order.

inputdir = '/users/jsh/data/IG/ICA/2023-04-25/input_valid'
HC_target = os.listdir(inputdir + '/control')
ADHD_target = os.listdir(inputdir + '/ADHD')

print("=" * 50)
print("Start")
print("=" * 50)

# Collect all columns first and build the frame once: inserting one column at
# a time fragments the DataFrame and triggers pandas' PerformanceWarning.
fmri_columns = {}

for group_prefix, group_subjects in (('/control/', HC_target), ('/ADHD/', ADHD_target)):
    for i in tqdm_notebook(group_subjects):
        sample = pd.read_csv(inputdir + group_prefix + i + '/fMRI/smooth8_within.txt', header = None)[0].tolist()
        j = len(fmri_columns)
        # Same labels as before: integer 0 for the first subject, then '1', '2', ...
        fmri_columns[0 if j == 0 else '{}'.format(j)] = sample

j = len(fmri_columns)  # number of subjects loaded
base = pd.DataFrame(fmri_columns)

# One row per subject.
fMRI_X = np.transpose(base)

In [None]:
# Gene validation dataset: one predicted-expression vector per subject,
# controls first, then ADHD.

inputdir = '/users/jsh/data/IG/ICA/2023-04-25/input_valid'
HC_target = os.listdir(inputdir + '/control')
ADHD_target = os.listdir(inputdir + '/ADHD')

j = 0

for banner_line in ("=" * 50, "Start", "=" * 50):
    print(banner_line)

for group_prefix, group_subjects in (('/control/', HC_target), ('/ADHD/', ADHD_target)):
    for i in tqdm_notebook(group_subjects):

        sample = pd.read_csv(inputdir + group_prefix + i + '/Gene/gtex_v7_Brain_Nucleus_accumbens_basal_ganglia_predicted_expression.txt', header = None)[0].tolist()

        if j == 0:
            # First subject defines the frame (column label 0) and its row index.
            base = pd.DataFrame(sample)
        else:
            # Assigned as a one-column frame, so values are aligned on the row
            # index of `base` rather than required to match in length.
            temp = pd.DataFrame(sample)
            base['{}'.format(j)] = temp
        j = j+1

# One row per subject.
Gene_X = np.transpose(base)

In [None]:
import re

result_dir = '/users/jsh/data/IG/ICA/2023-04-25/pca_ref/result_nucleus/30'


def _natural_sort_key(filename):
    """Sort key ordering embedded integers numerically ('_2' before '_10')."""
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', filename)]


# os.listdir() returns entries in arbitrary order, but components are later
# picked by position (e.g. `14 - 1`), so the component files are sorted by
# their embedded numbers to make row k of S the k-th component.
# NOTE(review): assumes the component number is encoded in the file name --
# confirm against the files in result_dir.

# --- fMRI component maps (one .img per component) ---
fMRI_S_list = [i for i in os.listdir(result_dir) if '.img' in i]
# The loading-coefficient images are not component maps.
fMRI_S_list.remove('first_para_ica_comp_feature_1_load_coeff_.img')
fMRI_S_list.remove('first_para_ica_comp_feature_2_load_coeff_.img')
fMRI_S_list.sort(key = _natural_sort_key)

# The mask does not change between components: load it once.
mask = nib.load("/users/jsh/data/IG/mask/SST_group_dilate-1.nii")
mask_np = np.array(mask.get_fdata())
in_mask = np.where(mask_np == 1)

j = 0

for i in fMRI_S_list:
    temp = nib.load(result_dir + '/' + i)
    temp = np.array(temp.get_fdata())[in_mask]  # keep in-mask voxels only
    if j == 0:
        base = pd.DataFrame(temp)
    else:
        base['{}'.format(j)] = temp
    j = j + 1

# One row per component.
fMRI_S = base.transpose()

###

# --- Gene component maps (one .asc per component) ---
Gene_S_list = sorted((i for i in os.listdir(result_dir) if '.asc' in i), key = _natural_sort_key)

j = 0

for i in Gene_S_list:
    temp = pd.read_csv(result_dir + '/' + i, header = None)[0]
    if j == 0:
        base = pd.DataFrame(temp)
    else:
        base['{}'.format(j)] = temp
    j = j + 1

# One row per component.
Gene_S = base.transpose()

In [None]:
# Project the validation data onto the training components: A = X * pinv(S),
# giving one loading per subject (row) and component (column).

# fMRI component
fMRI_S_pinv = np.linalg.pinv(np.array(fMRI_S))
fMRI_A = np.dot(np.array(fMRI_X), fMRI_S_pinv)
fMRI_A_df = pd.DataFrame(fMRI_A)

# Gene component
Gene_S_pinv = np.linalg.pinv(np.array(Gene_S))
Gene_A = np.dot(np.array(Gene_X), Gene_S_pinv)
Gene_A_df = pd.DataFrame(Gene_A)

In [None]:
# Validation subjects: controls first, then ADHD, in directory-listing order.
inputdir = '/users/jsh/data/IG/ICA/2023-04-25/input_valid'
HC_target = os.listdir(inputdir + '/control')
ADHD_target = os.listdir(inputdir + '/ADHD')
valid_target = HC_target + ADHD_target

# Folder name -> ABCD subjectkey: insert '_' after the fourth character.
sbjlist = [name[:4] + '_' + name[4:] for name in valid_target]

# SST table: validation subjects, baseline visit only.
demo = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_sst02.txt', sep = '\t')
demo = demo.loc[demo['subjectkey'].isin(sbjlist) & (demo['eventname'] == 'baseline_year_1_arm_1')]

# Scanner table: validation subjects.
# NOTE(review): not restricted to one visit here -- confirm it holds a single
# row per subject, otherwise the merge below multiplies rows.
scanner = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_mri01.txt', sep = '\t')
scanner = scanner.loc[scanner['subjectkey'].isin(sbjlist)]

demo_scanner = pd.merge(demo, scanner, on = 'subjectkey')

demo_exclude = demo_scanner[['subjectkey', 'interview_age_x', 'sex', 'mri_info_manufacturer']]

# Handedness score
temp = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_ehis01.txt', sep = '\t')[['subjectkey', 'ehi_y_ss_scoreb']]
demo_exclude = pd.merge(demo_exclude, temp).astype({'ehi_y_ss_scoreb': float})

# Genetic batch effect
batch = pd.read_csv('/data5/open_data/ABCD/3.0/genomics_sample03/ABCD_release3.0_.batch_info.txt', sep = '\t')
batch = batch[['abcd.id_redcap', 'BATCH']]
batch.columns = ['subjectkey', 'BATCH']
demo_exclude = pd.merge(demo_exclude, batch)

###

# Classify every validation subject by which SST contrast images exist on disk:
#   both_run -> run 1 and run 2 present
#   run1     -> only run 1 present
#   run2     -> only run 2 present
basedir = '/users/jsh/data/IG/ICA/2023-04-25/input_valid'
control_dir = basedir + '/control'
ADHD_dir = basedir + '/ADHD'

control_subjects = os.listdir(control_dir)
ADHD_subjects = os.listdir(ADHD_dir)

both_run = []
run1 = []
run2 = []

for group_dir, group_subjects in ((control_dir, control_subjects), (ADHD_dir, ADHD_subjects)):
    for i in group_subjects:
        targetdir = group_dir + '/' + i + '/fMRI'
        present = set(os.listdir(targetdir))  # list the folder once per subject
        has_run1 = 'con_not_masked1.nii' in present
        has_run2 = 'con_not_masked2.nii' in present
        subjectkey = i[:4] + '_' + i[4:]

        # Each file is tested explicitly. Writing `'a' and 'b' in files` only
        # tests 'b' (a non-empty string is always truthy), which would file
        # run-2-only subjects under both_run.
        if has_run1 and has_run2:
            both_run.append(subjectkey)
        elif has_run1:
            run1.append(subjectkey)
        elif has_run2:
            run2.append(subjectkey)

In [None]:
# Stop-signal task (SST) behaviour: baseline visit only, bookkeeping columns removed.
data = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_sst02.txt', sep = '\t')

data = data.drop(['collection_id', 'abcd_sst02_id', 'dataset_id', 'src_subject_id', 'interview_date', 'collection_title'], axis = 1)
data = data.loc[data['eventname'] == 'baseline_year_1_arm_1']
data = data.drop(['eventname'], axis = 1)

sbj_list = sbjlist

# Diagnostic group per subject, derived from the folder each subject lives in
# instead of hard-coded group sizes.
case_by_key = {name[:4] + '_' + name[4:]: 'control' for name in HC_target}
case_by_key.update({name[:4] + '_' + name[4:]: 'ADHD' for name in ADHD_target})

# Attach group and projected loadings to the subject list BEFORE any merge, so
# each value stays with its subject even if an inner merge drops or reorders
# rows. Row k of the loading frames is the k-th subject of `sbjlist`
# (controls first, then ADHD); component numbers are 1-based.
sbj_info = pd.DataFrame()
sbj_info['subjectkey'] = sbjlist
sbj_info['case'] = [case_by_key[key] for key in sbjlist]
sbj_info['fMRI'] = fMRI_A_df[14 - 1]
sbj_info['Gene'] = Gene_A_df[21 - 1]

data_merged = pd.merge(sbj_info, data, how = 'inner', on = 'subjectkey')

# Subjects with only run 1 on disk: overwrite the all-run SST summaries with
# the run-1 summaries.
run1_rows = data_merged['subjectkey'].isin(run1)
for all_col, r1_col in [
    ('tfmri_sst_all_beh_crgo_rt', 'tfmri_sst_r1_beh_crgo_rt'),
    ('tfmri_sst_all_beh_crgo_mrt', 'tfmri_sst_r1_beh_crgo_mrt'),
    ('tfmri_sst_all_beh_incrs_mrt', 'tfmri_sst_r1_beh_incrs_mrt'),
    ('tfmri_sst_all_beh_crgo_stdrt', 'tfmri_sst_r1_beh_crgo_stdrt'),
]:
    data_merged.loc[run1_rows, all_col] = data_merged[run1_rows][r1_col].tolist()

# Cast the behavioural measures and trial counts to float.
float_cols = [
    'tfmri_sst_all_beh_crgo_rt',
    'tfmri_sst_all_beh_crgo_mrt',
    'tfmri_sst_all_beh_incrs_mrt',
    'tfmri_sst_all_beh_tot_mssd',
    'tfmri_sst_all_beh_total_mssrt',
    'tfmri_sst_all_beh_total_issrt',
    'tfmri_sst_all_beh_crgo_stdrt',
    'tfmri_sst_all_beh_go_nt',
    'tfmri_sst_r1_beh_go_nt',
    'tfmri_sst_all_beh_crgo_nt',    # all correct go
    'tfmri_sst_r1_beh_crgo_nt',     # r1 correct go
    'tfmri_sst_all_beh_incrgo_nt',  # all incorrect go
    'tfmri_sst_r1_beh_incrgo_nt',   # r1 incorrect go
]
data_merged = data_merged.astype({col: 'float' for col in float_cols})

# Add age / sex / scanner / handedness / genotyping batch.
data_final = pd.merge(data_merged, demo_exclude, on = 'subjectkey')

data_final['interview_age'] = data_final['interview_age'].astype(float)

# Same run-1 substitution for the go-trial counts.
run1_rows = data_final['subjectkey'].isin(run1)
for all_col, r1_col in [
    ('tfmri_sst_all_beh_crgo_nt', 'tfmri_sst_r1_beh_crgo_nt'),
    ('tfmri_sst_all_beh_incrgo_nt', 'tfmri_sst_r1_beh_incrgo_nt'),
]:
    data_final.loc[run1_rows, all_col] = data_final[run1_rows][r1_col].tolist()

# Trial-count-weighted mean RT over go trials.
# NOTE(review): the incorrect-go count weights `..._incrs_mrt`, not an
# `..._incrgo_*` RT column -- confirm this is the intended measure.
data_final['ALL_MEAN_GORT'] = (data_final['tfmri_sst_all_beh_crgo_mrt'] * data_final['tfmri_sst_all_beh_crgo_nt'] + data_final['tfmri_sst_all_beh_incrs_mrt'] * data_final['tfmri_sst_all_beh_incrgo_nt']) / (data_final['tfmri_sst_all_beh_crgo_nt'] + data_final['tfmri_sst_all_beh_incrgo_nt'])
# ICE = mean-method SSRT minus mean stop-signal delay.
data_final['ICE'] = data_final['tfmri_sst_all_beh_total_mssrt'] - data_final['tfmri_sst_all_beh_tot_mssd']

# NIH Toolbox total composite (uncorrected) as the cognition measure.
iq = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_tbss01.txt', sep = '\t')
iq = iq[['subjectkey', 'nihtbx_totalcomp_uncorrected']]
data_final = pd.merge(data_final, iq, on = 'subjectkey')
data_final = data_final.astype({'nihtbx_totalcomp_uncorrected':'float'})
# Earlier merges left two sex columns; the one suffixed _x is kept.
data_final['sex'] = data_final['sex_x']

data_final = data_final[['case', 'subjectkey', 'interview_age', 'sex', 'mri_info_manufacturer', 'BATCH', 'fMRI', 'Gene',
                         'tfmri_sst_all_beh_crgo_rt', 'tfmri_sst_all_beh_crgo_mrt', 'tfmri_sst_all_beh_incrs_mrt',
                         'tfmri_sst_all_beh_tot_mssd', 'tfmri_sst_all_beh_total_mssrt',
                         'tfmri_sst_all_beh_total_issrt', 'tfmri_sst_all_beh_crgo_stdrt',
                         'nihtbx_totalcomp_uncorrected',
                         'ALL_MEAN_GORT', 'ICE', 'ehi_y_ss_scoreb']]

# UPPS / BIS+BAS trait scores, baseline visit only.
trait_cols = ['upps_y_ss_negative_urgency', 'upps_y_ss_lack_of_planning', 'upps_y_ss_sensation_seeking',
              'upps_y_ss_positive_urgency', 'upps_y_ss_lack_of_perseverance',
              'bis_y_ss_bism_sum', 'bis_y_ss_basm_rr', 'bis_y_ss_basm_drive']
temp = pd.read_csv('/users/jsh/data/IG/Demographic+Behavior/abcd_mhy02.txt', sep = '\t')
temp = temp.loc[temp['eventname'] == 'baseline_year_1_arm_1']
temp = temp[['subjectkey'] + trait_cols]

data_final = pd.merge(data_final, temp, how = 'inner', on = 'subjectkey')
data_final = data_final.astype({col: float for col in trait_cols})

data_test = data_final

# Regression

In [None]:
# Behavioural / cognitive / trait outcomes analysed in the rest of the notebook.
behavior_list = [
    # SST performance
    'tfmri_sst_all_beh_crgo_rt', 'tfmri_sst_all_beh_crgo_mrt', 'tfmri_sst_all_beh_incrs_mrt', 'tfmri_sst_all_beh_tot_mssd',
    'tfmri_sst_all_beh_total_mssrt', 'tfmri_sst_all_beh_total_issrt', 'tfmri_sst_all_beh_crgo_stdrt', 'ALL_MEAN_GORT',
    # cognition
    'nihtbx_totalcomp_uncorrected',
    # UPPS
    'upps_y_ss_negative_urgency', 'upps_y_ss_positive_urgency', 'upps_y_ss_lack_of_perseverance',
    'upps_y_ss_lack_of_planning', 'upps_y_ss_sensation_seeking',
    # BIS/BAS
    'bis_y_ss_bism_sum', 'bis_y_ss_basm_rr', 'bis_y_ss_basm_drive',
]

# Control vs ADHD t-test on the validation set, one outcome at a time.
is_control = data_test['case'] == 'control'
is_adhd = data_test['case'] == 'ADHD'

for behavior in behavior_list:
    print(behavior)
    print(pg.ttest(data_test.loc[is_control, behavior], data_test.loc[is_adhd, behavior]))

In [None]:
df = pd.DataFrame()

for behavior in behavior_list:
    
    cor_var = []

    # Train
    data_train['fMRI_Gene'] = data_train['fMRI'] * data_train['Gene']
    
    data_final_HC = data_train.loc[data_train['case'] == 'control']
    data_final_HC['exclude'] = False
    data_final_ADHD = data_train.loc[data_train['case'] == 'ADHD']
    data_final_ADHD['exclude'] = False
    
    #
    mad = data_final_HC[['fMRI']].apply(robust.mad)['fMRI']
    data_final_HC.loc[data_final_HC['fMRI'] > np.median(data_final_HC['fMRI']) + 3 * mad, 'exclude'] = True
    data_final_HC.loc[data_final_HC['fMRI'] < np.median(data_final_HC['fMRI']) - 3 * mad, 'exclude'] = True

    mad = data_final_HC[['Gene']].apply(robust.mad)['Gene']
    data_final_HC.loc[data_final_HC['Gene'] > np.median(data_final_HC['Gene']) + 3 * mad, 'exclude'] = True
    data_final_HC.loc[data_final_HC['Gene'] < np.median(data_final_HC['Gene']) - 3 * mad, 'exclude'] = True
    
    mad = data_final_HC[[behavior]].apply(robust.mad)[behavior]
    data_final_HC.loc[data_final_HC[behavior] > np.median(data_final_HC[behavior]) + 3 * mad, 'exclude'] = True
    data_final_HC.loc[data_final_HC[behavior] < np.median(data_final_HC[behavior]) - 3 * mad, 'exclude'] = True
    
    #
    mad = data_final_ADHD[['fMRI']].apply(robust.mad)['fMRI']
    data_final_ADHD.loc[data_final_ADHD['fMRI'] > np.median(data_final_ADHD['fMRI']) + 3 * mad, 'exclude'] = True
    data_final_ADHD.loc[data_final_ADHD['fMRI'] < np.median(data_final_ADHD['fMRI']) - 3 * mad, 'exclude'] = True

    mad = data_final_ADHD[['Gene']].apply(robust.mad)['Gene']
    data_final_ADHD.loc[data_final_ADHD['Gene'] > np.median(data_final_ADHD['Gene']) + 3 * mad, 'exclude'] = True
    data_final_ADHD.loc[data_final_ADHD['Gene'] < np.median(data_final_ADHD['Gene']) - 3 * mad, 'exclude'] = True
    
    mad = data_final_ADHD[[behavior]].apply(robust.mad)[behavior]
    data_final_ADHD.loc[data_final_ADHD[behavior] > np.median(data_final_ADHD[behavior]) + 3 * mad, 'exclude'] = True
    data_final_ADHD.loc[data_final_ADHD[behavior] < np.median(data_final_ADHD[behavior]) - 3 * mad, 'exclude'] = True   
    
    ####
    data_final_HC = data_final_HC.drop(data_final_HC[data_final_HC['exclude'] == True].index)
    data_final_ADHD = data_final_ADHD.drop(data_final_ADHD[data_final_ADHD['exclude'] == True].index)
    
    data_final_ = pd.concat([data_final_HC, data_final_ADHD])
        
    ####
    data_target = data_final_HC

    data_target = data_target[['fMRI', 'Gene', 'sex', 'interview_age','mri_info_manufacturer', 'BATCH', behavior, 'fMRI_Gene']]
    data_target = data_target.dropna(axis = 0)
    
    
    # log transform
    statistic, p_value = skewtest(data_target[behavior])
    if p_value < 0.05:
        if behavior=='tfmri_sst_all_beh_crgo_rt':
            data_target[behavior] = np.arcsin(np.sqrt(data_target[behavior]))
        else:
            data_target[behavior] = np.log1p(data_target[behavior])
        print(behavior, " is skewed!!!!")
        is_skew = True
    else:
        is_skew = False 
        
    # For partial_corr
    data_par = pd.get_dummies(data_target, columns = ['sex', 'mri_info_manufacturer', 'BATCH'])
    cov_col = data_par.columns.tolist()
    cov_col.remove('fMRI')
    cov_col.remove('Gene')
    cov_col.remove('fMRI_Gene')
    cov_col.remove(behavior) 
    cor_var.extend([pg.partial_corr(data = data_par, x = 'fMRI', y = behavior, covar = cov_col)['r'][0],
                  pg.partial_corr(data = data_par, x = 'fMRI', y = behavior, covar = cov_col)['p-val'][0],
                  pg.partial_corr(data = data_par, x = 'Gene', y = behavior, covar = cov_col)['r'][0],
                  pg.partial_corr(data = data_par, x = 'Gene', y = behavior, covar = cov_col)['p-val'][0]])
    
    
    ####    
    y_train = data_target[behavior].values
    
    X_train_o = data_target[['mri_info_manufacturer', 'BATCH']]
    X_train_os = data_target[['sex']]    

    X_base_train_s = data_target[['interview_age']]
    X_fmri_train_s = data_target[['fMRI', 'interview_age']]
    X_gene_train_s = data_target[['Gene', 'interview_age']]
    X_both_train_s = data_target[['fMRI', 'Gene', 'interview_age']]
    X_med_train_s = data_target[['fMRI', 'Gene', 'fMRI_Gene', 'interview_age']]

    scaler_base = StandardScaler() 
    scaler_fmri = StandardScaler()
    scaler_gene = StandardScaler()
    scaler_both = StandardScaler()
    scaler_med = StandardScaler()
    
    one_en = OneHotEncoder()
    one_sex = OneHotEncoder(drop = 'first')
    
    X_train_o = one_en.fit_transform(X_train_o)
    X_train_os = one_sex.fit_transform(X_train_os)    
    
    X_base_train_s = scaler_base.fit_transform(X_base_train_s)
    X_base_train = np.concatenate([X_base_train_s, X_train_o.toarray(), X_train_os.toarray()], axis = 1)   
    
    X_fmri_train_s = scaler_fmri.fit_transform(X_fmri_train_s)
    X_fmri_train = np.concatenate([X_fmri_train_s, X_train_o.toarray(), X_train_os.toarray()], axis = 1)

    X_gene_train_s = scaler_gene.fit_transform(X_gene_train_s)
    X_gene_train = np.concatenate([X_gene_train_s, X_train_o.toarray(), X_train_os.toarray()], axis = 1)
    
    X_both_train_s = scaler_both.fit_transform(X_both_train_s)
    X_both_train = np.concatenate([X_both_train_s, X_train_o.toarray(), X_train_os.toarray()], axis = 1)
    
    X_med_train_s = scaler_med.fit_transform(X_med_train_s)
    X_med_train = np.concatenate([X_med_train_s, X_train_o.toarray(), X_train_os.toarray()], axis = 1)    
    
    ####
    alphas = np.logspace(-3, 3, 100)
    
    param_grid = {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.2, 0.3]
    }

#     lr_base = LinearRegression()
    lr_base = RidgeCV(alphas = alphas, cv = 5)
    lr_base.fit(X_base_train, y_train)
    
    lr_fmri = RidgeCV(alphas = alphas, cv = 5)
    lr_fmri.fit(X_fmri_train, y_train)
    
    lr_gene = RidgeCV(alphas = alphas, cv = 5)
    lr_gene.fit(X_gene_train, y_train) 
    
    lr_both = RidgeCV(alphas = alphas, cv = 5)
    lr_both.fit(X_both_train, y_train)
    
    lr_med = RidgeCV(alphas = alphas, cv = 5)
    lr_med.fit(X_med_train, y_train)

    ####
    # Test
    data_test['fMRI_Gene'] = data_test['fMRI'] * data_test['Gene']
    
    data_final_HC = data_test.loc[data_test['case'] == 'control']
    data_final_HC['exclude'] = False

    data_final_ADHD = data_test.loc[data_test['case'] == 'ADHD']
    data_final_ADHD['exclude'] = False
    
    #
    mad = data_final_HC[['fMRI']].apply(robust.mad)['fMRI']
    data_final_HC.loc[data_final_HC['fMRI'] > np.median(data_final_HC['fMRI']) + 3 * mad, 'exclude'] = True
    data_final_HC.loc[data_final_HC['fMRI'] < np.median(data_final_HC['fMRI']) - 3 * mad, 'exclude'] = True

    mad = data_final_HC[['Gene']].apply(robust.mad)['Gene']
    data_final_HC.loc[data_final_HC['Gene'] > np.median(data_final_HC['Gene']) + 3 * mad, 'exclude'] = True
    data_final_HC.loc[data_final_HC['Gene'] < np.median(data_final_HC['Gene']) - 3 * mad, 'exclude'] = True
    
    mad = data_final_HC[[behavior]].apply(robust.mad)[behavior]
    data_final_HC.loc[data_final_HC[behavior] > np.median(data_final_HC[behavior]) + 3 * mad, 'exclude'] = True
    data_final_HC.loc[data_final_HC[behavior] < np.median(data_final_HC[behavior]) - 3 * mad, 'exclude'] = True
    
    #
    mad = data_final_ADHD[['fMRI']].apply(robust.mad)['fMRI']
    data_final_ADHD.loc[data_final_ADHD['fMRI'] > np.median(data_final_ADHD['fMRI']) + 3 * mad, 'exclude'] = True
    data_final_ADHD.loc[data_final_ADHD['fMRI'] < np.median(data_final_ADHD['fMRI']) - 3 * mad, 'exclude'] = True

    mad = data_final_ADHD[['Gene']].apply(robust.mad)['Gene']
    data_final_ADHD.loc[data_final_ADHD['Gene'] > np.median(data_final_ADHD['Gene']) + 3 * mad, 'exclude'] = True
    data_final_ADHD.loc[data_final_ADHD['Gene'] < np.median(data_final_ADHD['Gene']) - 3 * mad, 'exclude'] = True
    
    mad = data_final_ADHD[[behavior]].apply(robust.mad)[behavior]
    data_final_ADHD.loc[data_final_ADHD[behavior] > np.median(data_final_ADHD[behavior]) + 3 * mad, 'exclude'] = True
    data_final_ADHD.loc[data_final_ADHD[behavior] < np.median(data_final_ADHD[behavior]) - 3 * mad, 'exclude'] = True   
    
    #
    data_final_HC = data_final_HC.drop(data_final_HC[data_final_HC['exclude'] == True].index)
    data_final_ADHD = data_final_ADHD.drop(data_final_ADHD[data_final_ADHD['exclude'] == True].index)
    
    data_final_ = pd.concat([data_final_HC, data_final_ADHD])    
    
    ####
    data_target_test = data_final_HC

    data_target_test = data_target_test[['fMRI', 'Gene', 'sex', 'interview_age', 'mri_info_manufacturer', 'BATCH', behavior, 'fMRI_Gene']]
    data_target_test = data_target_test.dropna(axis = 0)
    
    
    if is_skew:
        if behavior=='tfmri_sst_all_beh_crgo_rt':
            data_target_test[behavior] = np.arcsin(np.sqrt(data_target_test[behavior]))
        else:
            data_target_test[behavior] = np.log1p(data_target_test[behavior])
            
    # For partial_corr
    data_par = pd.get_dummies(data_target_test, columns = ['sex', 'mri_info_manufacturer', 'BATCH'])
    cov_col = data_par.columns.tolist()
    cov_col.remove('fMRI')
    cov_col.remove('Gene')
    cov_col.remove('fMRI_Gene')
    cov_col.remove(behavior)

    cor_var.extend([pg.partial_corr(data = data_par, x = 'fMRI', y = behavior, covar = cov_col)['r'][0],
                  pg.partial_corr(data = data_par, x = 'fMRI', y = behavior, covar = cov_col)['p-val'][0],
                  pg.partial_corr(data = data_par, x = 'Gene', y = behavior, covar = cov_col)['r'][0],
                  pg.partial_corr(data = data_par, x = 'Gene', y = behavior, covar = cov_col)['p-val'][0]])   
    
    ##
    y_true = data_target_test[behavior].values
    
    X_test_o = data_target_test[['mri_info_manufacturer', 'BATCH']]
    X_test_os = data_target_test[['sex']]    
        
    X_base_test_s = data_target_test[['interview_age']]
    X_fmri_test_s = data_target_test[['fMRI', 'interview_age']]
    X_gene_test_s = data_target_test[['Gene', 'interview_age']]
    X_both_test_s = data_target_test[['fMRI', 'Gene', 'interview_age']]
    X_med_test_s = data_target_test[['fMRI', 'Gene', 'fMRI_Gene', 'interview_age']]

    #
    # Build the test design matrices with the encoders and scalers that were
    # fitted on the training data earlier in this iteration. Nothing is re-fitted
    # here, so train and test features share one coordinate system.
    X_test_o = one_en.transform(X_test_o)
    X_test_os = one_sex.transform(X_test_os)

    X_base_test_s = scaler_base.transform(X_base_test_s)
    X_base_test = np.concatenate([X_base_test_s, X_test_o.toarray(), X_test_os.toarray()], axis = 1)

    X_fmri_test_s = scaler_fmri.transform(X_fmri_test_s)
    X_fmri_test = np.concatenate([X_fmri_test_s, X_test_o.toarray(), X_test_os.toarray()], axis = 1)

    X_gene_test_s = scaler_gene.transform(X_gene_test_s)
    X_gene_test = np.concatenate([X_gene_test_s, X_test_o.toarray(), X_test_os.toarray()], axis = 1)

    X_both_test_s = scaler_both.transform(X_both_test_s)
    X_both_test = np.concatenate([X_both_test_s, X_test_o.toarray(), X_test_os.toarray()], axis = 1)

    # transform, not fit_transform: re-fitting on the test set would replace the
    # training mean/scale that lr_med was trained with.
    X_med_test_s = scaler_med.transform(X_med_test_s)
    X_med_test = np.concatenate([X_med_test_s, X_test_o.toarray(), X_test_os.toarray()], axis = 1)
    
    ###
    y_base_pred = lr_base.predict(X_base_test)
    y_fmri_pred = lr_fmri.predict(X_fmri_test)
    y_gene_pred = lr_gene.predict(X_gene_test)
    y_both_pred = lr_both.predict(X_both_test)
    y_med_pred = lr_med.predict(X_med_test)
    
    y_true = y_true.reshape(-1)
    y_fmri_pred = y_fmri_pred.reshape(-1)
    y_gene_pred = y_gene_pred.reshape(-1)
    y_both_pred = y_both_pred.reshape(-1)
    y_med_pred = y_med_pred.reshape(-1)