In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import csv
import os

### Import FDs

In [3]:
filename_info = 'THE_WHEEL_12_31_2019_CHESTFINALSUBJECTS.xls'

# Load the subject sheet, then derive two columns in one pass:
#   label - text name for the numeric `Group_Arb2` code (unmapped codes become NaN)
#   SID   - subject ID as a string, left-padded with zeros to at least 3 characters
pt_info = pd.read_excel(filename_info).assign(
    label=lambda frame: frame['Group_Arb2'].map({0: 'control', 1: 'pah', 5: 'epah'}),
    SID=lambda frame: frame['SID'].astype(str).str.zfill(3),
)

# Lookup table: zero-padded subject ID -> group label.
label_dict = {
    subject_id: group_label
    for subject_id, group_label in zip(pt_info['SID'].to_list(),
                                       pt_info['label'].to_list())
}

In [5]:
# Compile all "_fd" spreadsheets found under the "FDs_data" folder

search_dir = os.path.join(os.getcwd(), 'FDs_data')

print(f'Performing search in:\n{search_dir}\n')

extension = 'csv'

# Walk the whole tree and keep files that end with `extension` and contain
# '_fd' in their name. Jupyter '-checkpoint' copies are skipped, as is every
# directory whose path contains 'Visual'.
csvFile_list = []
for root, dirs_list, files_list in os.walk(search_dir):
    if 'Visual' in root:
        continue
    for file_name in files_list:
        if (file_name.endswith(extension)
                and '_fd' in file_name
                and '-checkpoint' not in file_name):
            csvFile_list.append(os.path.join(root, file_name))

sid_list = pt_info['SID'].to_list()

# Keep only files whose name starts with a known subject ID (first 3 characters).
# os.path.basename is used instead of splitting on '\\' so the filter does not
# depend on the Windows path separator; the set makes each lookup O(1) instead
# of scanning every SID for every file.
known_sids = set(sid_list)
csvFile_list = [csvFile for csvFile in csvFile_list
                if os.path.basename(csvFile)[:3] in known_sids]

print(f'Compiled {len(csvFile_list)} files!')

Performing search in:
C:\Users\andre\Documents\GitHub\sampleNotebook\FDs_data

Compiled 14468 files!


In [4]:
# Run this for my analysis (exploreFeatures.ipynb)

# Everything in one spreadsheet:
# one row per subject / lobe / artery-or-vein / offset combination.
# NOTE(review): `offset` selects which files feed a row but is not itself
# written to the row; rows differ only through their per-folder columns.
lobe_list = ['left', 'right', 'whole']
artvein_list = ['', 'artery', 'vein']   # '' -> files tagged neither 'Artery' nor 'Vein'
offset_list = ['', 'allOffsets']        # only truthiness is used (see the offset filter)

df_list = []
for sid in sid_list:
    # Hoisted out of the inner loops: files whose name starts with this SID.
    # os.path.basename replaces split('\\') so this is not tied to Windows paths.
    sid_files = [csvFile for csvFile in csvFile_list
                 if os.path.basename(csvFile)[:3] == sid]

    for lobe in lobe_list:
        # Check for lobe matching in file name
        lobe_files = [csvFile for csvFile in sid_files
                      if lobe in os.path.basename(csvFile)]

        for artvein in artvein_list:
            # Check for artvein matching in file name
            if artvein:
                artvein_files = [csvFile for csvFile in lobe_files
                                 if artvein.capitalize() in os.path.basename(csvFile)]
            else:
                artvein_files = [csvFile for csvFile in lobe_files
                                 if 'Artery' not in os.path.basename(csvFile) and
                                    'Vein' not in os.path.basename(csvFile)]

            for offset in offset_list:
                # Check for offset matching in complete path name
                if offset:
                    files_matching = [csvFile for csvFile in artvein_files
                                      if 'offset' in csvFile.lower()]
                else:
                    files_matching = [csvFile for csvFile in artvein_files
                                      if 'offset' not in csvFile.lower()]

                df_dict = {
                    'SID':             sid,
                    'lobe':            lobe,
                    'artery_or_vein':  artvein,
                    'label':           label_dict[sid],
                    'num_files':       len(files_matching),
                    'files':           files_matching,
                }

                for f in files_matching:
                    try:
                        with open(f) as csv_file:
                            csv_data = list(csv.reader(csv_file))

                        # The parent directory name prefixes every column taken from this file.
                        folder = os.path.basename(os.path.dirname(f))

                        # Rows 4 onward are parsed as integer (size, count) pairs.
                        sc_arr = np.asarray(csv_data[4:]).astype(int)
                        sizes_log = np.log(sc_arr[:, 0])
                        counts_log = np.log(sc_arr[:, 1])

                        # Second column of rows 1 and 2: stored fit coefficients,
                        # passed to np.polyval (highest degree first).
                        coeffs = np.array([csv_data[1][1], csv_data[2][1]]).astype(float)

                        r_squared = np.corrcoef(sizes_log, counts_log)[0, 1] ** 2
                        residuals = counts_log - np.polyval(coeffs, sizes_log)

                        df_dict[f'{folder}__Fractal_Dimension'] = float(csv_data[0][1])
                        # The value at csv_data[2][1] (intercept) is currently not exported.
                        df_dict[f'{folder}__R_Squared'] = r_squared

                        if '20Size' in f and 'ArtVein' not in f:
                            # Indices where the residual of the stored fit changes sign.
                            zero_crossings = np.where(np.diff(np.sign(residuals)))[0]
                            if len(zero_crossings) > 2:
                                df_dict[f'{folder}__FD@root1'] = np.nan
                                df_dict[f'{folder}__FD@root2'] = np.nan
                                continue

                            # Local line fits around each crossing, last crossing first.
                            counts_poly = []
                            for c in zero_crossings[::-1]:
                                # Clamp the window start: c - 1 == -1 would wrap around
                                # and produce an empty slice when the crossing is at index 0.
                                start = max(c - 1, 0)
                                counts_poly.append(np.polynomial.polynomial.Polynomial.fit(
                                    x=sizes_log[start:c+3], y=counts_log[start:c+3],
                                    deg=1, domain=[]))

                            # Fewer than two crossings raises IndexError here and is
                            # reported by the handler below.
                            df_dict[f'{folder}__FD@root1'] = -counts_poly[0].coef[1]
                            df_dict[f'{folder}__FD@root2'] = -counts_poly[1].coef[1]
                    except Exception as exc:
                        # Best effort: report the file and the reason, then move on.
                        # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                        print(f'ERROR:\n{f}\n{type(exc).__name__}: {exc}\n')

                df_list.append(df_dict)
pd.DataFrame(df_list).to_csv('FDs_combined_04_14_22_analysis.csv', index=False)

In [4]:
# Run this for Nick's analysis

# One spreadsheet; each row is one subject + lobe.
# All artvein variants are in columns rather than rows
# (column names carry an UNSEP / ARTERY / VEIN tag).

lobe_list = ['left', 'right', 'whole']
artvein_list = ['', 'artery', 'vein']   # '' -> files tagged neither 'Artery' nor 'Vein'

df_list = []
for sid in sid_list:
    # Hoisted out of the inner loops: files whose name starts with this SID.
    # os.path.basename replaces split('\\') so this is not tied to Windows paths.
    sid_files = [csvFile for csvFile in csvFile_list
                 if os.path.basename(csvFile)[:3] == sid]

    for lobe in lobe_list:
        df_dict = {
            'SID':             sid,
            'lobe':            lobe,
            'label':           label_dict[sid],
        }
        lobe_files = [csvFile for csvFile in sid_files
                      if lobe in os.path.basename(csvFile)]

        for artvein in artvein_list:
            if artvein:
                files_matching = [csvFile for csvFile in lobe_files
                                  if artvein.capitalize() in os.path.basename(csvFile)]
            else:
                files_matching = [csvFile for csvFile in lobe_files
                                  if 'Artery' not in os.path.basename(csvFile) and
                                     'Vein' not in os.path.basename(csvFile)]

            artvein_label = 'UNSEP' if not artvein else artvein.upper()
            for f in files_matching:
                try:
                    with open(f) as csv_file:
                        csv_data = list(csv.reader(csv_file))

                    # Parent directory name + artvein tag prefix every column from this file.
                    folder = os.path.basename(os.path.dirname(f))
                    prefix = f'{folder}_{artvein_label}'

                    # Rows 4 onward are parsed as integer (size, count) pairs.
                    sc_arr = np.asarray(csv_data[4:]).astype(int)
                    sizes_log = np.log(sc_arr[:, 0])
                    counts_log = np.log(sc_arr[:, 1])

                    # Second column of rows 1 and 2: stored fit coefficients,
                    # passed to np.polyval (highest degree first).
                    coeffs = np.array([csv_data[1][1], csv_data[2][1]]).astype(float)

                    r_squared = np.corrcoef(sizes_log, counts_log)[0, 1] ** 2
                    residuals = counts_log - np.polyval(coeffs, sizes_log)

                    df_dict[f'{prefix}__Fractal_Dimension'] = float(csv_data[0][1])
                    # The value at csv_data[2][1] (intercept) is currently not exported.
                    df_dict[f'{prefix}__R_Squared'] = r_squared

                    if '20Size' in f and 'ArtVein' not in f:
                        # Indices where the residual of the stored fit changes sign.
                        zero_crossings = np.where(np.diff(np.sign(residuals)))[0]
                        if len(zero_crossings) > 2:
                            df_dict[f'{prefix}__FD@root1'] = np.nan
                            df_dict[f'{prefix}__FD@root2'] = np.nan
                            continue

                        # Local line fits around each crossing, last crossing first.
                        counts_poly = []
                        for c in zero_crossings[::-1]:
                            # Clamp the window start: c - 1 == -1 would wrap around
                            # and produce an empty slice when the crossing is at index 0.
                            start = max(c - 1, 0)
                            counts_poly.append(np.polynomial.polynomial.Polynomial.fit(
                                x=sizes_log[start:c+3], y=counts_log[start:c+3],
                                deg=1, domain=[]))

                        # Fewer than two crossings raises IndexError here and is
                        # reported by the handler below.
                        df_dict[f'{prefix}__FD@root1'] = -counts_poly[0].coef[1]
                        df_dict[f'{prefix}__FD@root2'] = -counts_poly[1].coef[1]
                except Exception as exc:
                    # Best effort: report the file and the reason, then move on.
                    # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                    print(f'ERROR:\n{f}\n{type(exc).__name__}: {exc}\n')

        df_list.append(df_dict)
pd.DataFrame(df_list).to_csv('FDs_combined_04_14_22.csv', index=False)

In [19]:
# Run this for Nick's analysis

# Separate spreadsheets for each lobe
# All artvein (and offset / no-offset) variants are in columns rather than rows
# --> each row is one subject

lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']   # '' -> files tagged neither 'Artery' nor 'Vein'
offset_list = ['', '_allOffsets']       # only truthiness is used (see the offset filter)

for lobe in lobe_list:
    # Hoisted out of the per-subject loop: files whose name mentions this lobe.
    # os.path.basename replaces split('\\') so this is not tied to Windows paths.
    lobe_files = [csvFile for csvFile in csvFile_list
                  if lobe in os.path.basename(csvFile)]

    df_list = []
    for sid in sid_list:
        df_dict = {
            'SID':             sid,
            'lobe':            lobe,
            'label':           label_dict[sid],
        }
        # Check for sid matching in file name
        sid_files = [csvFile for csvFile in lobe_files
                     if os.path.basename(csvFile)[:3] == sid]

        for artvein in artvein_list:
            # Check for artvein matching in file name
            if artvein:
                artvein_files = [csvFile for csvFile in sid_files
                                 if artvein.capitalize() in os.path.basename(csvFile)]
            else:
                artvein_files = [csvFile for csvFile in sid_files
                                 if 'Artery' not in os.path.basename(csvFile) and
                                    'Vein' not in os.path.basename(csvFile)]

            artvein_label = 'UNSEP' if not artvein else artvein.upper()

            for offset in offset_list:
                # Check for offset matching in complete path name
                if offset:
                    files_matching = [csvFile for csvFile in artvein_files
                                      if 'offset' in csvFile.lower()]
                else:
                    files_matching = [csvFile for csvFile in artvein_files
                                      if 'offset' not in csvFile.lower()]

                for f in files_matching:
                    try:
                        with open(f) as csv_file:
                            csv_data = list(csv.reader(csv_file))

                        # Parent directory name + artvein tag prefix every column from this file.
                        folder = os.path.basename(os.path.dirname(f))
                        prefix = f'{folder}_{artvein_label}'

                        # Rows 4 onward are parsed as integer (size, count) pairs.
                        sc_arr = np.asarray(csv_data[4:]).astype(int)
                        sizes_log = np.log(sc_arr[:, 0])
                        counts_log = np.log(sc_arr[:, 1])

                        # Second column of rows 1 and 2: stored fit coefficients,
                        # passed to np.polyval (highest degree first).
                        coeffs = np.array([csv_data[1][1], csv_data[2][1]]).astype(float)

                        r_squared = np.corrcoef(sizes_log, counts_log)[0, 1] ** 2
                        residuals = counts_log - np.polyval(coeffs, sizes_log)

                        df_dict[f'{prefix}__Fractal_Dimension'] = float(csv_data[0][1])
                        # The value at csv_data[2][1] (intercept) is currently not exported.
                        df_dict[f'{prefix}__R_Squared'] = r_squared

                        if '20Size' in f and 'ArtVein' not in f:
                            # Indices where the residual of the stored fit changes sign.
                            zero_crossings = np.where(np.diff(np.sign(residuals)))[0]
                            if len(zero_crossings) > 2:
                                df_dict[f'{prefix}__FD@root1'] = np.nan
                                df_dict[f'{prefix}__FD@root2'] = np.nan
                                continue

                            # Local line fits around each crossing, last crossing first.
                            counts_poly = []
                            for c in zero_crossings[::-1]:
                                # Clamp the window start: c - 1 == -1 would wrap around
                                # and produce an empty slice when the crossing is at index 0.
                                start = max(c - 1, 0)
                                counts_poly.append(np.polynomial.polynomial.Polynomial.fit(
                                    x=sizes_log[start:c+3], y=counts_log[start:c+3],
                                    deg=1, domain=[]))

                            # Fewer than two crossings raises IndexError here and is
                            # reported by the handler below.
                            df_dict[f'{prefix}__FD@root1'] = -counts_poly[0].coef[1]
                            df_dict[f'{prefix}__FD@root2'] = -counts_poly[1].coef[1]
                    except Exception as exc:
                        # Best effort: report the file and the reason, then move on.
                        # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                        print(f'ERROR:\n{f}\n{type(exc).__name__}: {exc}\n')

        df_list.append(df_dict)
    pd.DataFrame(df_list).to_csv(f'{lobe}_FDs_combined_04_14_22_RANK.csv', index=False)

In [None]:
# Compiles FD data using custom box size ranges as shown in poster
# includes FACTOR_SCALE, DNN_SCALE + 20Size, 20SizeBy1, ManySize variants
#
# NOTE(review): this cell does not stand on its own.
#   * `things_to_plot` is not defined anywhere in the visible notebook.
#   * `folder` is not assigned in this cell; it would be whatever value an
#     earlier cell's loop left behind - confirm which folder is intended.
#   * `df_dict` is never filled and `stats_list` is never read, so the rows
#     written to 'test.csv' carry no data.
#   * `sid` is not used inside the loop body.

# Custom bounds (for poster presentation)
# lower_bounds[k] / upper_bounds[k] are paired by index into one size range.
lower_bounds = [2, 2, 32]
upper_bounds = [16, 256, 256]

df_list = []
for sid in sid_list:
    for i in range(0, len(lower_bounds)):
        lower_bound = lower_bounds[i]
        upper_bound = upper_bounds[i]

        df_dict = {}

        # One list per axes index (0, 1, 2), indexed via thing_to_plot['axes'] below.
        stats_list = [[], [], []]
        for thing_to_plot in things_to_plot:
            df_to_plot = thing_to_plot['df']

            # Step between consecutive box sizes: 1 for 'By1' folders, otherwise 2.
            if 'By1' in folder:
                m = 1
            else:
                m = 2

            # `mask` lists the box sizes to keep, largest first.
            if '20Size' in folder:
                # Fixed range 20 down to 2, ignoring the custom bounds.
                mask = np.arange(20, 1, -m)
            elif 'ManySize' in folder:
                mask = np.arange(upper_bound, lower_bound-1, -m)
            else:
                # Powers of two between the bounds (bounds truncated to integer log2).
                lower_log = int(np.log(lower_bound) / np.log(2))
                upper_log = int(np.log(upper_bound) / np.log(2))
                mask = 2**np.arange(upper_log, lower_log-1, -1)

            FDs = []
            for _, row in df_to_plot.iterrows():
                # Locate each mask value in row['Sizes'] via searchsorted on the
                # reversed arrays, then convert back to indices in original order.
                # Assumes row['Sizes'] is a descending array - TODO confirm.
                mask_ind = len(row['Sizes']) - np.searchsorted(row['Sizes'][::-1], mask[::-1])[::-1] - 1

                sizes = row['Sizes'][mask_ind]
                counts = row['Counts'][mask_ind]

                # Handles cases where box sizes are lower than expected (<256)
                # Trims if actual box size is lower than upper_bound
                # NOTE(review): this loop reuses `i`, the outer loop variable;
                # harmless here because the bounds were already read, but confusing.
                for i in range(0, len(sizes)-1):
                    if sizes[i] < sizes[i+1]:
                        # Drop everything up to and including the out-of-order entry.
                        sizes = sizes[i+1:]
                        counts = counts[i+1:]
                        break

                sizes_log = np.log(sizes)
                counts_log = np.log(counts)

                # Degree-1 fit of log(counts) against log(sizes).
                poly = np.polynomial.polynomial.Polynomial.fit(x=sizes_log, y=counts_log, deg=1, domain=[])

                # Recorded value is the negated degree-1 coefficient of that fit.
                FDs.append(-poly.coef[1])

            stats_list[thing_to_plot['axes']].append(FDs)

        df_list.append(df_dict)

pd.DataFrame(df_list).to_csv('test.csv', index=False)