In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scipy.stats as stats

import csv
import os

In [2]:
# Load the subject sheet and derive, for every subject, a text group label
# plus a zero-padded string ID.
filename_info = 'THE_WHEEL_12_31_2019_CHESTFINALSUBJECTS.xls'
pt_info = pd.read_excel(filename_info)

# Numeric Group_Arb2 code -> text label; codes absent from this mapping
# come out of Series.map as NaN.
group_code_to_label = {
    0: 'control',
    1: 'pah',
    5: 'epah',
}
pt_info['label'] = pt_info['Group_Arb2'].map(group_code_to_label)

# Subject IDs are handled as strings, left-padded with zeros to width 3.
sid_as_text = pt_info['SID'].astype(str)
pt_info['SID'] = sid_as_text.str.zfill(3)

# SID -> label lookup used when assembling per-subject rows.
label_dict = {
    subject_id: subject_label
    for subject_id, subject_label in zip(pt_info['SID'].to_list(),
                                         pt_info['label'].to_list())
}
# label_dict

In [4]:
# Collect every CSV under the "FDs_data" folder that belongs to a known subject.
#
# A file is kept when:
#   * its name ends with `extension`,
#   * its name does not contain '-checkpoint',
#   * its directory path does not contain 'Visual',
#   * the first three characters of its name equal one of the SIDs in pt_info.

search_dir = os.path.join(os.getcwd(), 'FDs_data')

print(f'Performing search in:\n{search_dir}\n')

extension = 'csv'

sid_list = pt_info['SID'].to_list()
# Set membership replaces a linear scan of sid_list for every candidate file.
known_sids = set(sid_list)

csvFile_list = []
for root, dirs_list, files_list in os.walk(search_dir):
    # The check is on the full directory path, so everything below a
    # 'Visual' folder is excluded as well.
    if 'Visual' in root:
        continue
    for file_name in files_list:
        if not file_name.endswith(extension) or '-checkpoint' in file_name:
            continue
        # `file_name` is already the base name, so the SID prefix is read from
        # it directly instead of splitting the joined path on '\\' (which only
        # works with Windows path separators).
        if file_name[:3] in known_sids:
            csvFile_list.append(os.path.join(root, file_name))

print(f'Compiled {len(csvFile_list)} files!')

Performing search in:
C:\Users\andre\Documents\GitHub\sampleNotebook\FDs_data

Compiled 14468 files!


In [7]:
# Compile the fractal dimension (FD) stored in each matching CSV into one
# DataFrame per lobe.
#
# Layout of the result:
#   * one row per subject (SID)
#   * one column per (source folder, vessel type) pair, named
#     "<folder>_<UNSEP|ARTERY|VEIN>__Fractal_Dimension"

lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']  # '' -> files tagged neither "Artery" nor "Vein"
offset_list = ['']                     # '' -> only paths that do not contain "offset"


def _read_fractal_dimension(csv_path):
    """Return the fractal dimension stored in row 0, column 1 of ``csv_path``.

    The remainder of the file is parsed too, so that a malformed file raises
    here instead of contributing a value:

    * rows 1 and 2, column 1 must convert to float;
    * rows 4 onward must form a 2-D integer table with at least two columns.

    Raises:
        OSError: the file cannot be opened.
        IndexError, ValueError: the file does not have the layout above.
    """
    with open(csv_path) as csv_file:
        csv_data = list(csv.reader(csv_file))

    # Parsed for validation only; the values themselves are not stored.
    np.array([csv_data[1][1], csv_data[2][1]]).astype(float)
    sc_arr = np.asarray(csv_data[4:]).astype(int)
    if sc_arr.ndim != 2 or sc_arr.shape[1] < 2:
        raise ValueError('expected a two-column integer table from row 4 onward')

    return float(csv_data[0][1])


def _matches_artvein(base_name, artvein):
    """Tell whether ``base_name`` belongs to the requested vessel type.

    A non-empty ``artvein`` must appear capitalised in the name; an empty one
    selects names containing neither 'Artery' nor 'Vein'.
    """
    if artvein:
        return artvein.capitalize() in base_name
    return 'Artery' not in base_name and 'Vein' not in base_name


# Index the files once by the 3-character subject prefix of their base name,
# keeping the original list order, instead of rescanning the whole list for
# every subject / vessel combination. os.path is used rather than splitting on
# '\\' so the lookup does not depend on Windows path separators.
files_by_sid = {}
for csv_path in csvFile_list:
    base_name = os.path.basename(csv_path)
    folder = os.path.basename(os.path.dirname(csv_path))
    has_offset = 'offset' in csv_path.lower()  # checked on the complete path
    files_by_sid.setdefault(base_name[:3], []).append(
        (csv_path, base_name, folder, has_offset))

for lobe in lobe_list:
    df_list = []
    for sid in sid_list:
        df_dict = {
            'SID':   sid,
            'lobe':  lobe,
            'label': label_dict[sid],
        }
        for artvein in artvein_list:
            artvein_label = artvein.upper() if artvein else 'UNSEP'
            for offset in offset_list:
                for csv_path, base_name, folder, has_offset in files_by_sid.get(sid, []):
                    # Check for lobe and artvein matching in file name
                    if lobe not in base_name:
                        continue
                    if not _matches_artvein(base_name, artvein):
                        continue
                    # Check for offset matching in complete path name
                    if bool(offset) != has_offset:
                        continue

                    try:
                        fractal_dimension = _read_fractal_dimension(csv_path)
                    except Exception as exc:
                        # Best effort: report the unreadable file and move on.
                        # `Exception` (not a bare except) lets KeyboardInterrupt
                        # stop the loop, and the cause is shown.
                        print(f'ERROR:\n{csv_path}\n{exc!r}\n')
                        continue

                    df_dict[f'{folder}_{artvein_label}__Fractal_Dimension'] = fractal_dimension

        df_list.append(df_dict)
    df = pd.DataFrame(df_list)
#     df.to_csv(f'{lobe}_FDs_combined_04_14_22_RANK.csv', index=False)

In [8]:
# Show the DataFrame assigned to `df` by the compilation cell
# (one row per subject, one column per "<folder>_<vessel>__Fractal_Dimension").
df

Unnamed: 0,SID,lobe,label,FDs_UNSEP__Fractal_Dimension,FDs_20Size_UNSEP__Fractal_Dimension,FDs_20SizeBy1_UNSEP__Fractal_Dimension,FDs_DNN_SCALE_UNSEP__Fractal_Dimension,FDs_DNN_SCALE_20SizeBy1_UNSEP__Fractal_Dimension,FDs_DNN_SCALE_ManySizeBy1_UNSEP__Fractal_Dimension,FDs_FACTOR_0_5_SCALE_UNSEP__Fractal_Dimension,...,FDs_FACTOR_0_8_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_0_9_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_1_0_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_1_1_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_1_2_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_1_3_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_1_4_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_FACTOR_1_5_SCALE_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_ManySizeBy1_ArtVein_VEIN__Fractal_Dimension,FDs_ManySize_ArtVein_VEIN__Fractal_Dimension
0,062,whole,control,2.409303,2.376474,2.386155,2.243210,2.085325,2.306883,2.181775,...,2.189684,2.207202,2.222486,2.236951,2.249749,2.262801,2.273160,2.283074,2.222486,2.179863
1,063,whole,control,2.386010,2.426932,2.440630,2.250393,2.135780,2.296203,2.208323,...,2.224028,2.238603,2.252029,2.263385,2.273744,2.284047,2.294183,2.302417,2.252029,2.253795
2,064,whole,control,2.377754,2.332468,2.344820,2.288855,2.089522,2.374310,2.225141,...,2.277502,2.293320,2.304803,2.317732,2.329709,2.340463,2.350034,2.359935,2.304803,2.292369
3,065,whole,control,2.309298,2.353453,2.369045,2.181870,2.088724,2.258664,2.129457,...,2.178067,2.192833,2.205509,2.216933,2.228611,2.239242,2.249312,2.258494,2.205509,2.217023
4,068,whole,control,2.412060,2.417303,2.431406,2.298395,2.122803,2.341462,2.261121,...,2.261985,2.277823,2.290022,2.303085,2.314127,2.325187,2.335328,2.343729,2.290022,2.283432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,176,whole,epah,2.383102,2.406897,2.418782,2.215161,2.122524,2.250233,2.162447,...,2.142843,2.161507,2.176850,2.191739,2.205090,2.217834,2.228966,2.239545,2.176850,2.177068
87,177,whole,epah,2.445889,2.402788,2.411878,2.284308,2.046702,2.332085,2.260301,...,2.225961,2.243033,2.259034,2.273648,2.286553,2.298928,2.311450,2.320998,2.259034,2.251452
88,187,whole,epah,2.438324,2.406307,2.415677,2.225169,2.033859,2.247007,2.209742,...,2.163869,2.181594,2.195828,2.211258,2.224857,2.238081,2.250979,2.262662,2.195828,2.193285
89,286,whole,epah,2.301947,2.346956,2.352179,2.159690,2.044381,2.199058,2.108943,...,2.107702,2.123380,2.138006,2.150885,2.162123,2.173622,2.184121,2.192465,2.138006,2.178493


In [None]:
# WHOLE_ART_TBV
# WHOLE_VEN_TBV

# Two-group comparison of a single feature: independent-samples t-test and
# Mann-Whitney U test, printing the p-value of each.
#
# NOTE(review): `calculated_features` and `feature` are not defined in the
# visible cells, and `lobe` / `artvein` are whatever values were last bound
# before this cell runs -- confirm they are set before executing.

group = ['control', 'pah']

# Rows belonging to the selected lobe and vessel type.
is_lobe = calculated_features['lobe'].eq(lobe)
is_vessel = calculated_features['artery_or_vein'].eq(artvein)
mask = is_lobe & is_vessel

# Feature values of each group within that selection.
in_first_group = calculated_features['label'].eq(group[0]) & mask
in_second_group = calculated_features['label'].eq(group[1]) & mask
X1 = calculated_features.loc[in_first_group, feature]
X2 = calculated_features.loc[in_second_group, feature]

pvalue = stats.ttest_ind(X1, X2).pvalue
print(f't-test_{group[0]}_{group[1]}:    {pvalue}')

pvalue = stats.mannwhitneyu(X1, X2).pvalue
print(f'wilcoxon_{group[0]}_{group[1]}:  {pvalue}')
print()