In [1]:
### import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scipy.stats as stats

### Import Features

In [2]:
filename = 'FDs_combined_04_14_22_analysis.csv'
# filename = 'relevant_features/calculated_relevantFeatures_2_standardized.csv'

calculated_features = pd.read_csv(filename)
calculated_features

Unnamed: 0,SID,lobe,artery_or_vein,label,num_files,files,FDs__Fractal_Dimension,FDs__R_Squared,FDs_20Size__Fractal_Dimension,FDs_20Size__R_Squared,...,FDs_FACTOR_1_3_SCALE_ManySizeBy1_ArtVein__Fractal_Dimension,FDs_FACTOR_1_3_SCALE_ManySizeBy1_ArtVein__R_Squared,FDs_FACTOR_1_4_SCALE_ManySizeBy1_ArtVein__Fractal_Dimension,FDs_FACTOR_1_4_SCALE_ManySizeBy1_ArtVein__R_Squared,FDs_FACTOR_1_5_SCALE_ManySizeBy1_ArtVein__Fractal_Dimension,FDs_FACTOR_1_5_SCALE_ManySizeBy1_ArtVein__R_Squared,FDs_20SizeBy1_allOffsets_ArtVein__Fractal_Dimension,FDs_20SizeBy1_allOffsets_ArtVein__R_Squared,FDs_20Size_allOffsets_ArtVein__Fractal_Dimension,FDs_20Size_allOffsets_ArtVein__R_Squared
0,62,left,,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,2.386236,0.999772,2.371542,0.999145,...,,,,,,,,,,
1,62,left,,control,0,[],,,,,...,,,,,,,,,,
2,62,left,artery,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,,,,
3,62,left,artery,control,0,[],,,,,...,,,,,,,,,,
4,62,left,vein,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1633,298,whole,,epah,2,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,,,,
1634,298,whole,artery,epah,41,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,2.247235,0.998677,2.257065,0.998567,2.265681,0.998448,,,,
1635,298,whole,artery,epah,2,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,2.074771,0.999301,2.083861,0.999371
1636,298,whole,vein,epah,41,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,2.183483,0.999152,2.194562,0.999163,2.205277,0.999151,,,,


### Functions: Generate Statistics

In [3]:
def generate_nick_stats(values_df, features, save=True, filename='test.csv'):
    df = values_df[features].describe().transpose()
    df.columns = [f'total__{col}' for col in df.columns.values]
    df.index.names = ['feature']
    
    features_list = []
    for feature in features:
        # Get basic statistics for each group
        temp_df = values_df[[feature, 'label']].groupby('label').describe().stack().T
        temp_df.columns = ['__'.join(col).strip() for col in temp_df.columns.values]
        temp_df.index.names= ['feature']
        
        features_dict = temp_df.to_dict(orient='records')[0]
        
        features_dict['feature'] = feature
        
        # Skip if data are not well behaved
        skip_msg = ''
        if len(values_df[feature].unique()) < 3:
            skip_msg = 'Not enough unique values! '
        elif values_df[feature].std() < 1e-12:
            skip_msg = 'Not enough variation! '

        features_dict['test_notes'] = skip_msg
        if len(skip_msg) > 1:
            features_list.append(features_dict)
            continue
        
        groups = [['control', 'epah'], ['control', 'pah'], ['epah', 'pah']]

        for group in groups:
            X1 = values_df[values_df['label'].eq(group[0])][feature].dropna()
            X2 = values_df[values_df['label'].eq(group[1])][feature].dropna()
            _, pvalue = stats.ttest_ind(X1, X2)
            features_dict[f't-test_{group[0]}_{group[1]}'] = pvalue

            _, pvalue = stats.mannwhitneyu(X1, X2)
            features_dict[f'wilcoxon_{group[0]}_{group[1]}'] = pvalue
        
        features_list.append(features_dict)
        
    df = df.merge(pd.DataFrame(features_list), on='feature')
    
    if save:
        df.to_csv(filename, index=False)
    
    return df

### Generate Nick Stats
Generates CSV file w/ summary data + significance testing for each FD category

In [12]:
# t-test + wilcoxon by pairs
filename = 'FDs_combined_04_14_22_analysis.csv'

calculated_features = pd.read_csv(filename)
calculated_features['artery_or_vein'].fillna('', inplace=True)
features = [col for col in calculated_features.columns if 'FDs' in col]

# lobe_list = ['left', 'right', 'whole']
lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']

df_list = []
for lobe in lobe_list:
    for artvein in artvein_list:
        mask = calculated_features['lobe'].eq(lobe) & calculated_features['artery_or_vein'].eq(artvein)
        if not artvein:
            features = [col for col in calculated_features.columns if 'FDs' in col and 'ArtVein' not in col]
        else:
            features = [col for col in calculated_features.columns if 'FDs' in col and 'ArtVein' in col]
        df = generate_nick_stats(calculated_features[mask], features, save=False, filename=f'{lobe}_{artvein}_statistics.csv')
        df['lobe'] = lobe
        df['artery_or_vein'] = artvein
        df['FD_type'] = np.nan
        df.loc[df['feature'].str.contains('20SizeBy1'), 'FD_type'] = '20SizeBy1'
        df.loc[df['feature'].str.contains('ManySizeBy1'), 'FD_type'] = 'ManySizeBy1'
        df['SCALE'] = 1
#         df_list.append(df)
#         df_list.append(df[df['feature'].str.contains('FACTOR') & df['feature'].str.contains('Fractal')])
        df_list.append(df[df['feature'].str.contains('Fractal')])

df_combined = pd.concat(df_list)
col_list = df_combined.columns.tolist()
df_combined = df_combined[['feature', 'lobe', 'artery_or_vein', 'FD_type', 'SCALE'] + col_list[1:-4]]
df_combined.to_csv('test.csv', index=False)

In [6]:
groups = [['control', 'epah'], ['control', 'pah'], ['epah', 'pah']]
feature = features[0]

print(f'Feature: {feature}\n')

for group in groups:
    mask = calculated_features['lobe'].eq(lobe) & calculated_features['artery_or_vein'].eq(artvein)
    X1 = calculated_features[calculated_features['label'].eq(group[0]) & mask][feature]
    X2 = calculated_features[calculated_features['label'].eq(group[1]) & mask][feature]
    _, pvalue = stats.ttest_ind(X1, X2)
    print(f't-test_{group[0]}_{group[1]}:    {pvalue}')
    
    _, pvalue = stats.mannwhitneyu(X1, X2)
    print(f'wilcoxon_{group[0]}_{group[1]}:  {pvalue}')
    print()

Feature: FDs_20SizeBy1_ArtVein__Fractal_Dimension

t-test_control_epah:    nan
wilcoxon_control_epah:  0.05225653096780727

t-test_control_pah:    nan
wilcoxon_control_pah:  0.07934847998328132

t-test_epah_pah:    nan
wilcoxon_epah_pah:  0.28092408662175083



In [None]:
calculated_features[calculated_features['label'].eq('control') & mask][['SID', feature]]