In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scipy.stats as stats

import csv
import os

In [5]:
# Load the subject spreadsheet and derive a text label for every subject.
filename_info = 'THE_WHEEL_12_31_2019_CHESTFINALSUBJECTS.xls'

# Numeric codes in the 'Group_Arb2' column and the label each one receives.
# Codes that are not listed here end up as NaN in the 'label' column.
group_code_to_label = {
    0: 'control',
    1: 'pah',
    5: 'epah',
}

pt_info = pd.read_excel(filename_info)
pt_info['label'] = pt_info['Group_Arb2'].map(group_code_to_label)

# Subject IDs are kept as strings, left-padded with zeros to at least 3 characters.
pt_info['SID'] = pt_info['SID'].astype(str).str.zfill(3)
sid_list = pt_info['SID'].to_list()

# SID -> label lookup; if an SID occurs on several rows, the last row wins.
label_dict = {
    sid: label
    for sid, label in zip(sid_list, pt_info['label'].to_list())
}

In [14]:
# Compile all spreadsheets in the "FDs_data" folder
#
# A file is kept when all of the following hold:
#   * its folder path does not contain 'Visual'
#   * its name ends with '.csv' and does not contain '-checkpoint'
#   * the first 3 characters of its name are a known subject ID (sid_list)

search_dir = os.path.join(os.getcwd(), 'FDs_data')

print(f'Performing search in:\n{search_dir}\n')

extension = 'csv'

# Set membership replaces a scan over every subject ID for every file.
sid_set = set(sid_list)

# The subject check uses the bare file name handed out by os.walk instead of
# splitting the joined path on '\\', so it works with any path separator.
csvFile_list = [
    os.path.join(root, file_name)
    for root, _dirs_list, files_list in os.walk(search_dir)
    if 'Visual' not in root
    for file_name in files_list
    if file_name.endswith(f'.{extension}')
    and '-checkpoint' not in file_name
    and file_name[:3] in sid_set
]

print(f'Compiled {len(csvFile_list)} files!')

Performing search in:
C:\Users\andre\Documents\GitHub\sampleNotebook\FDs_data

Compiled 14741 files!


In [21]:
# Compiles FD data into one df

# Separate spreadsheets for each lobe
# All artvein are in columns rather than rows
# --> each row is one subject

lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']
offset_list = ['']

# Errors that mean "this one file could not be read or parsed". Anything else
# (including KeyboardInterrupt) is allowed to propagate instead of being hidden.
file_errors = (OSError, csv.Error, ValueError, IndexError)

# (file name, full path) pairs, split once up front. os.path handles the
# platform's separator, so the matching below is not tied to '\\'.
named_paths = [(os.path.basename(csvFile), csvFile) for csvFile in csvFile_list]

# Ratio column -> (fractal-dimension column it is derived from, TBV column).
fd_ratio_sources = {
    'WHOLE_ART_TBV_fdRatio': ('FDs_ArtVein_ARTERY__Fractal_Dimension', 'WHOLE_ART_TBV'),
    'WHOLE_VEN_TBV_fdRatio': ('FDs_ArtVein_VEIN__Fractal_Dimension', 'WHOLE_VEN_TBV'),
    'WHOLE_TBV_fdRatio': ('FDs_UNSEP__Fractal_Dimension', 'WHOLE_TBV'),
}

for lobe in lobe_list:
    df_list = []
    for sid in sid_list:
        df_dict = {
            'SID':             sid,
            'lobe':            lobe,
            'label':           label_dict[sid],
        }

        # Check for sid and lobe matching in file name (done once per subject,
        # not once per artvein/offset combination)
        files_sid_lobe = [(name, path) for name, path in named_paths
                          if name[:3] == sid and lobe in name]

        for artvein in artvein_list:
            # Check for artvein matching in file name
            if artvein:
                files_artvein = [(name, path) for name, path in files_sid_lobe
                                 if artvein.capitalize() in name]
            else:
                files_artvein = [(name, path) for name, path in files_sid_lobe
                                 if 'Artery' not in name and 'Vein' not in name]

            artvein_label = 'UNSEP' if not artvein else artvein.upper()

            for offset in offset_list:
                # Check for offset matching in complete path name
                if offset:
                    files_matching = [path for _, path in files_artvein
                                      if 'offset' in path.lower()]
                else:
                    files_matching = [path for _, path in files_artvein
                                      if 'offset' not in path.lower()]

                for f in files_matching:
                    try:
                        # newline='' is what the csv module expects for its input files
                        with open(f, newline='') as csv_file:
                            csv_data = list(csv.reader(csv_file))

                        # Name of the folder that directly contains the file
                        folder = os.path.basename(os.path.dirname(f))

                        # The value of interest is read from the second column.
                        # If several files map to the same column name, the last one read wins.
                        if '_fd' in f:
                            df_dict[f'{folder}_{artvein_label}__Fractal_Dimension'] = float(csv_data[0][1])
                        elif '_surfaceArea' in f:
                            # NOTE(review): the 1e-2 / 1e-3 factors look like unit conversions -- confirm.
                            df_dict[f'{folder}_{artvein_label}__SurfaceArea'] = float(csv_data[0][1]) * 1e-2
                            df_dict[f'{folder}_{artvein_label}__Volume'] = float(csv_data[1][1]) * 1e-3
                            # SAV is the ratio of the raw (unscaled) surface area and volume values.
                            df_dict[f'{folder}_{artvein_label}__SAV'] = float(csv_data[0][1]) / float(csv_data[1][1])
                    except file_errors as err:
                        # Best effort: report the file and the reason, then carry on.
                        print(f'ERROR ({type(err).__name__}: {err}):\n{f}\n')

        ### ADD ADDITIONAL COLUMNS OF DATA HERE ###
        sid_rows = pt_info['SID'].eq(sid)
        WHOLE_ART_TBV = pt_info.loc[sid_rows, 'WHOLE_ART_TBV'].iloc[0]
        WHOLE_VEN_TBV = pt_info.loc[sid_rows, 'WHOLE_VEN_TBV'].iloc[0]
        WHOLE_TBV = WHOLE_ART_TBV + WHOLE_VEN_TBV

        df_dict['WHOLE_ART_TBV'] = WHOLE_ART_TBV
        df_dict['WHOLE_VEN_TBV'] = WHOLE_VEN_TBV
        df_dict['WHOLE_TBV'] = WHOLE_TBV

        # A subject without a fractal-dimension file gets NaN ratios (and a
        # warning) instead of aborting the whole compilation with a KeyError.
        for ratio_key, (fd_key, tbv_key) in fd_ratio_sources.items():
            if fd_key not in df_dict:
                print(f'WARNING: no {fd_key} found for SID {sid}; {ratio_key} set to NaN')
            df_dict[ratio_key] = df_dict.get(fd_key, np.nan) / df_dict[tbv_key]

        df_list.append(df_dict)
    df = pd.DataFrame(df_list)
#     df.to_csv(f'{lobe}_FDs_combined_04_14_22_RANK.csv', index=False)

# Table built for the last lobe in lobe_list, one row per subject.
df_features = df

In [16]:
# Export the per-subject feature table. `df_features` is used rather than `df`
# because `df` is rebound to the statistics summary further down, which would
# make this export depend on the order the cells were run in.
# NOTE(review): the summary cell at the end also writes 'test.csv' and will
# overwrite this file.
df_features.to_csv('test.csv', index=False)

In [23]:
# Compare one feature between two subject groups: count, median (IQR),
# independent-samples t-test and Mann-Whitney U test.
#
# Other feature columns that can be used here:
# WHOLE_ART_TBV
# WHOLE_VEN_TBV
# WHOLE_TBV

# SurfaceArea_UNSEP__SurfaceArea
# SurfaceArea_UNSEP__Volume
# SurfaceArea_UNSEP__SAV
# SurfaceArea_ArtVein_ARTERY__SurfaceArea
# SurfaceArea_ArtVein_ARTERY__Volume
# SurfaceArea_ArtVein_ARTERY__SAV
# SurfaceArea_ArtVein_VEIN__SurfaceArea
# SurfaceArea_ArtVein_VEIN__Volume
# SurfaceArea_ArtVein_VEIN__SAV

group = ['control', 'pah']
feature = 'FDs_UNSEP__Fractal_Dimension'

X1_all = df_features.loc[df_features['label'].eq(group[0]), feature]
X2_all = df_features.loc[df_features['label'].eq(group[1]), feature]

# Missing values would turn the medians and both p-values into NaN, so they
# are removed here; counts below refer to the values actually analysed.
X1 = X1_all.dropna()
X2 = X2_all.dropna()

print(f'Groups:  {group[0]} vs {group[1]}')
print(f'Feature: {feature}')
print()

for name, values_all, values in zip(group, (X1_all, X2_all), (X1, X2)):
    print(f'\'{name}\'')
    print(f'Count: {len(values)}')
    n_missing = len(values_all) - len(values)
    if n_missing:
        print(f'Missing (excluded): {n_missing}')
    print(f'Median: {np.median(values):.2f} ({np.quantile(values, 0.25):.2f} - {np.quantile(values, 0.75):.2f})')
    print()

_, pvalue = stats.ttest_ind(X1, X2)
print(f't-test_{group[0]}_{group[1]}:    {pvalue}')

# Two-sided is requested explicitly so the result matches the (two-sided)
# t-test regardless of the installed SciPy version's default.
_, pvalue = stats.mannwhitneyu(X1, X2, alternative='two-sided')
print(f'wilcoxon_{group[0]}_{group[1]}:  {pvalue}')
print()

Groups:  control vs pah
Feature: FDs_UNSEP__Fractal_Dimension

'control'
Count: 37
Median: 2.37 (2.34 - 2.41)

'pah'
Count: 42
Median: 2.36 (2.33 - 2.39)

t-test_control_pah:    0.06875659958158035
wilcoxon_control_pah:  0.049918910863307984



In [24]:
# Build a summary table (one row per feature) comparing two subject groups:
# group sizes, medians, t-test p-value and Mann-Whitney U p-value.
#
# Other feature columns that can be added to the list:
# WHOLE_ART_TBV
# WHOLE_VEN_TBV
# WHOLE_TBV

features = [
    'SurfaceArea_UNSEP__SurfaceArea', 'SurfaceArea_UNSEP__Volume', 'SurfaceArea_UNSEP__SAV',
    'SurfaceArea_ArtVein_ARTERY__SurfaceArea', 'SurfaceArea_ArtVein_ARTERY__Volume', 'SurfaceArea_ArtVein_ARTERY__SAV',
    'SurfaceArea_ArtVein_VEIN__SurfaceArea', 'SurfaceArea_ArtVein_VEIN__Volume', 'SurfaceArea_ArtVein_VEIN__SAV'
]

group = ['control', 'pah']

df_list = []
for feature in features:
    # Subjects without a value for this feature are left out; otherwise the
    # medians and p-values would come out as NaN. Counts are of the values used.
    X1 = df_features.loc[df_features['label'].eq(group[0]), feature].dropna()
    X2 = df_features.loc[df_features['label'].eq(group[1]), feature].dropna()

    _, t_pvalue = stats.ttest_ind(X1, X2)
    # Two-sided is requested explicitly so the result matches the (two-sided)
    # t-test regardless of the installed SciPy version's default.
    _, wilcoxon_pvalue = stats.mannwhitneyu(X1, X2, alternative='two-sided')

    # Every column name is derived from `group`, so changing the groups above
    # cannot leave stale 'control_pah' labels on the p-value columns.
    df_dict = {
        'feature':                            feature,
        f'{group[0]}__count':                 len(X1),
        f'{group[0]}__median':                np.median(X1),
        f'{group[1]}__count':                 len(X2),
        f'{group[1]}__median':                np.median(X2),
        f't-test_{group[0]}_{group[1]}':      t_pvalue,
        f'wilcoxon_{group[0]}_{group[1]}':    wilcoxon_pvalue,
    }
    df_list.append(df_dict)

# `df` now refers to the summary table; the per-subject features remain
# available as `df_features`.
# NOTE(review): an earlier cell also writes 'test.csv'; this call overwrites it.
df = pd.DataFrame(df_list)
df.to_csv('test.csv', index=False)