In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import csv
import os

In [7]:
# Subject information spreadsheet (read relative to the working directory)
filename_info = 'THE_WHEEL_12_31_2019_CHESTFINALSUBJECTS.xls'
pt_info = pd.read_excel(filename_info)

# Translate the numeric group code into a readable label.
# Codes that are not listed in the mapping become NaN.
pt_info['label'] = pt_info['Group_Arb2'].map({0: 'control', 1: 'pah', 5: 'epah'})

# Subject IDs as strings, left-padded with zeros to at least 3 characters
pt_info['SID'] = pt_info['SID'].astype(str).str.zfill(3)
sid_list = pt_info['SID'].to_list()

# SID -> label lookup used when assembling the per-subject rows
label_dict = dict(zip(sid_list, pt_info['label'].to_list()))
# label_dict

In [8]:
# Compile all spreadsheets in the "FDs_data" folder
#
# A file is kept when
#   * its name ends with `extension`,
#   * it is not a Jupyter '-checkpoint' copy,
#   * it does not live under a path containing 'Visual', and
#   * the first 3 characters of its name are a known subject ID (see sid_list).

search_dir = os.path.join(os.getcwd(), 'FDs_data')

print(f'Performing search in:\n{search_dir}\n')

extension = 'csv'

# Set membership keeps the per-file subject check O(1) instead of scanning
# the whole subject list for every file found.
known_sids = set(sid_list)

csvFile_list = []
for root, dirs_list, files_list in os.walk(search_dir):
    # Files directly inside a 'Visual' path are ignored; the walk still descends,
    # and sub-directories are rejected by the same test on their own root.
    if 'Visual' in root:
        continue
    for file_name in files_list:
        # file_name is already the base name, so no separator-specific
        # splitting of the full path is needed (works on Windows and POSIX).
        if (file_name.endswith(extension)
                and '-checkpoint' not in file_name
                and file_name[:3] in known_sids):
            csvFile_list.append(os.path.join(root, file_name))

print(f'Compiled {len(csvFile_list)} files!')

Performing search in:
C:\Users\andre\Documents\GitHub\sampleNotebook\FDs_data

Compiled 14741 files!


In [13]:
# Compiles FD data into one df
#
# All artvein variants are in columns rather than rows
# --> each row is one subject (per lobe; the 'lobe' column tells lobes apart)

lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']
offset_list = ['']

# Custom bounds (for poster presentation)
# lower_bounds[k] / upper_bounds[k] form one (lower, upper) box-size range.
lower_bounds = [2, 2, 32]
upper_bounds = [16, 256, 256]

if len(lower_bounds) != len(upper_bounds):
    raise ValueError('lower_bounds and upper_bounds must have the same length')


def _select_files(paths, lobe, artvein, offset):
    """Filter one subject's CSV paths by lobe, vessel type and offset flag.

    paths   -- full paths whose file name already starts with the subject ID
    lobe    -- substring that must occur in the file name
    artvein -- '' keeps files naming neither 'Artery' nor 'Vein';
               otherwise the capitalized value must occur in the file name
    offset  -- truthy keeps paths containing 'offset' (case-insensitive),
               falsy keeps paths that do not

    Returns the matching paths in their original order.
    """
    selected = []
    for path in paths:
        base_name = os.path.basename(path)
        if lobe not in base_name:
            continue
        if artvein:
            if artvein.capitalize() not in base_name:
                continue
        elif 'Artery' in base_name or 'Vein' in base_name:
            continue
        # The offset marker is looked up in the complete path, not just the file name
        if bool(offset) != ('offset' in path.lower()):
            continue
        selected.append(path)
    return selected


def _read_size_counts(path):
    """Read a box-count CSV and return (sizes, counts) as integer arrays.

    The first four rows are skipped; column 0 is taken as the box size and
    column 1 as the count.
    """
    with open(path, newline='') as csv_file:
        rows = list(csv.reader(csv_file))
    sc_arr = np.asarray(rows[4:]).astype(int)
    return sc_arr[:, 0], sc_arr[:, 1]


def _target_box_sizes(path, folder, lower_bound, upper_bound):
    """Return the descending array of box sizes to sample for one file.

    The sampling scheme is chosen from markers in the path / parent folder name:
      '20Size' in path     -> 20 down to 2
      'ManySize' in folder -> upper_bound down to lower_bound
      otherwise            -> powers of two between the bounds
    The linear schemes step by 1 when the folder name contains 'By1', else by 2.
    """
    step = 1 if 'By1' in folder else 2
    if '20Size' in path:
        return np.arange(20, 1, -step)
    if 'ManySize' in folder:
        return np.arange(upper_bound, lower_bound - 1, -step)
    # np.log2 avoids the rounding error of log(x) / log(2), which can land just
    # below an integer and be truncated one too low by int().
    lower_log = int(np.log2(lower_bound))
    upper_log = int(np.log2(upper_bound))
    return 2 ** np.arange(upper_log, lower_log - 1, -1)


def _fractal_dimension(sizes_raw, counts_raw, targets):
    """Return minus the slope of log(count) vs log(size) over the target sizes.

    NOTE(review): the index arithmetic assumes sizes_raw is sorted in descending
    order (searchsorted runs on the reversed array) -- confirm against the CSVs.
    """
    picked = len(sizes_raw) - np.searchsorted(sizes_raw[::-1], targets[::-1])[::-1] - 1
    sizes = sizes_raw[picked]
    counts = counts_raw[picked]

    # Handles cases where box sizes are lower than expected (<256):
    # a target above the largest available size yields index -1, i.e. the last
    # entry, so the selection starts low and then jumps up. Drop everything
    # before the first such jump.
    rises = np.flatnonzero(sizes[:-1] < sizes[1:])
    if rises.size:
        start = rises[0] + 1
        sizes = sizes[start:]
        counts = counts[start:]

    # domain=[] makes the fit use the class default domain, so coef[1] is the
    # slope in the unscaled log-log coordinates.
    poly = np.polynomial.polynomial.Polynomial.fit(
        x=np.log(sizes), y=np.log(counts), deg=1, domain=[])
    return -poly.coef[1]


# Group the file list by subject once instead of rescanning it for every
# subject / bounds / vessel combination.
files_by_sid = {}
for csvFile in csvFile_list:
    files_by_sid.setdefault(os.path.basename(csvFile)[:3], []).append(csvFile)

# Rows are accumulated across all lobes so that no lobe is discarded when
# lobe_list holds more than one entry.
df_list = []
for lobe in lobe_list:
    for sid in sid_list:
        df_dict = {
            'SID':             sid,
            'lobe':            lobe,
            'label':           label_dict[sid],
        }
        subject_files = files_by_sid.get(sid, [])

        # File selection and parsing do not depend on the bounds, so each file
        # is matched and read once per subject rather than once per bounds pair.
        fd_inputs = {}
        for artvein in artvein_list:
            for offset in offset_list:
                entries = []
                for f in _select_files(subject_files, lobe, artvein, offset):
                    # Only '_fd' files carry box counts; 'FACTOR' files are skipped
                    if 'FACTOR' in f or '_fd' not in f:
                        continue
                    folder = os.path.basename(os.path.dirname(f))
                    sizes_raw, counts_raw = _read_size_counts(f)
                    entries.append((f, folder, sizes_raw, counts_raw))
                fd_inputs[(artvein, offset)] = entries

        # Same nesting order as before (bounds -> artvein -> offset -> file),
        # so the resulting column order is unchanged.
        for lower_bound, upper_bound in zip(lower_bounds, upper_bounds):
            for artvein in artvein_list:
                artvein_label = 'UNSEP' if not artvein else artvein.upper()
                for offset in offset_list:
                    for f, folder, sizes_raw, counts_raw in fd_inputs[(artvein, offset)]:
                        targets = _target_box_sizes(f, folder, lower_bound, upper_bound)
                        df_dict[f'bounded_{lower_bound}_{upper_bound}__{folder}_{artvein_label}__Fractal_Dimension'] = \
                            _fractal_dimension(sizes_raw, counts_raw, targets)

        ### ADD ADDITIONAL COLUMNS OF DATA HERE ###

        # Exactly one row per (lobe, subject)
        df_list.append(df_dict)
df = pd.DataFrame(df_list)

In [14]:
# Write the compiled table next to the notebook; the row index is not saved.
df.to_csv('06_14_22_boundedFDs.csv', index=False)