In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats

import csv
import os

In [2]:
# Spreadsheet with one row per subject; the columns read here are 'SID' and 'Group_Arb2'.
filename_info = 'THE_WHEEL_12_31_2019_CHESTFINALSUBJECTS.xls'

# Numeric group code -> text label. Codes that are not listed become NaN in 'label'.
group_code_to_label = {0: 'control', 1: 'pah', 5: 'epah'}

pt_info = pd.read_excel(filename_info).assign(
    label=lambda frame: frame['Group_Arb2'].map(group_code_to_label),
    # Subject IDs are stored as strings, left-padded with zeros to at least 3 characters.
    SID=lambda frame: frame['SID'].astype(str).str.zfill(3),
)

# Lookup table: subject ID -> group label.
label_dict = {
    sid: label
    for sid, label in zip(pt_info['SID'].to_list(), pt_info['label'].to_list())
}

In [13]:
# Compile all spreadsheets in relevant folder
#
# Walks `old_folder` (relative to the current working directory) and keeps every file that
#   * has the extension `extension`,
#   * contains every `filters_include` substring in its directory path or file name,
#   * contains no `filters_exclude` substring in its directory path or file name,
#   * starts (first three characters of the file name) with a subject ID listed in `pt_info`.

old_folder = 'FDs_data'
filters_include = ['_fd']
filters_exclude = ['-checkpoint', 'Visual']
extension = 'csv'

search_dir = os.path.join(os.getcwd(), old_folder)

print(f'Performing search in:\n{search_dir}\n')

csvFile_list = []
for root, dirs_list, files_list in os.walk(search_dir):
    for file_name in files_list:
        # Compare against '.<extension>' so the check does not depend on the extension length.
        if not file_name.endswith(f'.{extension}'):
            continue
        if not all(f in root or f in file_name for f in filters_include):
            continue
        if any(f in root or f in file_name for f in filters_exclude):
            continue
        csvFile_list.append(os.path.join(root, file_name))

sid_list = pt_info['SID'].to_list()

# Set membership replaces a scan of the whole SID list for every file.
# os.path.basename is used instead of splitting on '\\' so this also works off Windows.
known_sids = set(sid_list)
csvFile_list = [csvFile for csvFile in csvFile_list
                if os.path.basename(csvFile)[:3] in known_sids]

print(f'Compiled {len(csvFile_list)} files!')

Performing search in:
C:\Users\andre\Documents\github\fd_analysis\FDs_data

Compiled 14468 files!


In [15]:
lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']
offset_list = ['', '_allOffsets']


def _select_files(candidates, lobe, artvein, offset):
    """Filter one subject's CSV paths to a single lobe / vessel class / offset setting.

    `lobe` and `artvein` are matched against the file name only. An empty `artvein`
    selects files whose name contains neither 'Artery' nor 'Vein'. `offset` is matched
    against the complete path, case-insensitively, as the substring 'offset'; an empty
    `offset` selects paths that do not contain it. Input order is preserved.
    """
    selected = []
    for path in candidates:
        base_name = os.path.basename(path)
        if lobe not in base_name:
            continue
        if artvein:
            if artvein.capitalize() not in base_name:
                continue
        elif 'Artery' in base_name or 'Vein' in base_name:
            continue
        if bool(offset) != ('offset' in path.lower()):
            continue
        selected.append(path)
    return selected


def _local_fd(sizes_log, counts_log, crossing):
    """Return minus the slope of a straight line fitted around one residual sign change.

    The fit uses the samples from one before `crossing` up to two after it. The start
    index is clamped at 0: a negative start would wrap around to the end of the array
    and leave the fit with the wrong (usually empty) window.
    """
    window = slice(max(crossing - 1, 0), crossing + 3)
    # domain=[] keeps the class default domain, so coef[1] is the slope in unscaled units.
    line = np.polynomial.polynomial.Polynomial.fit(
        x=sizes_log[window], y=counts_log[window], deg=1, domain=[])
    return -line.coef[1]


def _add_fd_features(row, path, artvein_label):
    """Read one CSV and store its features in `row` (modified in place).

    Layout read from the file: row 0, column 1 is the fractal dimension; rows 1 and 2,
    column 1 are the two coefficients handed to np.polyval (highest order first);
    rows 4 onward are integer (size, count) pairs. Row 3 is skipped (presumably a
    header line -- TODO confirm).

    Column names are prefixed with '<parent folder>_<artvein_label>'. For paths that
    contain '20Size' but not 'ArtVein', two extra columns hold the local slope around
    the last ('FD@root1') and second-to-last ('FD@root2') sign change of the residuals
    of the log-log fit. They are NaN when there are more than two sign changes or when
    the corresponding sign change does not exist.

    Raises whatever the parsing raises on a malformed file; features stored before the
    failure stay in `row`.
    """
    with open(path, newline='') as csv_file:
        csv_data = list(csv.reader(csv_file))

    folder = os.path.basename(os.path.dirname(path))
    prefix = f'{folder}_{artvein_label}'

    # Everything is parsed before the first value is stored, so a file with a malformed
    # body contributes no columns at all.
    sc_arr = np.asarray(csv_data[4:]).astype(int)
    sizes_log = np.log(sc_arr[:, 0])
    counts_log = np.log(sc_arr[:, 1])
    coeffs = np.array([csv_data[1][1], csv_data[2][1]]).astype(float)

    row[f'{prefix}__Fractal_Dimension'] = float(csv_data[0][1])

    if '20Size' not in path or 'ArtVein' in path:
        return

    residuals = counts_log - np.polyval(coeffs, sizes_log)
    zero_crossings = np.where(np.diff(np.sign(residuals)))[0]

    local_fds = [np.nan, np.nan]
    if len(zero_crossings) <= 2:
        # Reversed so that root1 belongs to the last sign change.
        fitted = [_local_fd(sizes_log, counts_log, c) for c in zero_crossings[::-1]]
        local_fds[:len(fitted)] = fitted
    row[f'{prefix}__FD@root1'], row[f'{prefix}__FD@root2'] = local_fds


# Group the paths by the three-character subject prefix once, instead of rescanning the
# complete file list for every subject / vessel class / offset combination.
files_by_sid = {}
for csvFile in csvFile_list:
    files_by_sid.setdefault(os.path.basename(csvFile)[:3], []).append(csvFile)

# One row per (lobe, subject). Rows are collected across all lobes and written once;
# resetting the list and rewriting the same file per lobe would keep only the last lobe.
df_list = []
for lobe in lobe_list:
    for sid in sid_list:
        df_dict = {
            'SID':   sid,
            'lobe':  lobe,
            'label': label_dict[sid],
        }
        for artvein in artvein_list:
            artvein_label = artvein.upper() if artvein else 'UNSEP'
            for offset in offset_list:
                for f in _select_files(files_by_sid.get(sid, []), lobe, artvein, offset):
                    try:
                        _add_fd_features(df_dict, f, artvein_label)
                    except Exception as exc:
                        # Best effort: report the file and the cause, then carry on.
                        print(f'ERROR:\n{f}\n{type(exc).__name__}: {exc}\n')

        df_list.append(df_dict)

calculated_features = pd.DataFrame(df_list)
calculated_features.to_csv('analysis_FDs_data.csv', index=False)

### Statistics

### DNN vs FACTOR Scaling

In [17]:
group = ['control', 'pah']
features = [
    'FDs_UNSEP__Fractal_Dimension',
    'FDs_DNN_SCALE_UNSEP__Fractal_Dimension',
    'FDs_FACTOR_0_5_SCALE_UNSEP__Fractal_Dimension',
    'FDs_FACTOR_1_0_SCALE_UNSEP__Fractal_Dimension',
    'FDs_FACTOR_1_5_SCALE_UNSEP__Fractal_Dimension',
]


def _compare_groups(table, feature, group):
    """Compare one feature between two labelled groups with a Mann-Whitney U test.

    Parameters:
        table:   DataFrame with a 'label' column and a column named `feature`.
        feature: name of the column to compare.
        group:   two label values; group[0] and group[1] select the two samples.

    Returns a dict holding the feature name, per-group count and median, and the
    two-sided p value. Missing values are dropped first, so the counts are the number
    of values that actually entered the test. The p value is NaN when either group has
    no values left.
    """
    X1 = table.loc[table['label'].eq(group[0]), feature].dropna()
    X2 = table.loc[table['label'].eq(group[1]), feature].dropna()

    if len(X1) and len(X2):
        # `alternative` is spelled out because its default differs between SciPy versions.
        _, pvalue = stats.mannwhitneyu(X1, X2, alternative='two-sided')
    else:
        pvalue = np.nan

    return {
        'Feature':             feature,
        f'{group[0]}_count':   len(X1),
        f'{group[0]}_median':  X1.median(),
        f'{group[1]}_count':   len(X2),
        f'{group[1]}_median':  X2.median(),
        # Mann-Whitney U is also known as the Wilcoxon rank-sum test; the column name is
        # kept unchanged for anything that reads the CSV.
        'Wilcoxon p value':    pvalue,
    }


df_stats = pd.DataFrame([_compare_groups(calculated_features, feature, group)
                         for feature in features])
df_stats.to_csv('analysis_DNN.csv', index=False)