In [1]:
### import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scipy.stats as stats

### Import Features

In [3]:
filename = 'FDs_combined_03_21_22_analysis.csv'
# filename = 'relevant_features/calculated_relevantFeatures_2_standardized.csv'

calculated_features = pd.read_csv(filename)
calculated_features

Unnamed: 0,SID,lobe,artery_or_vein,label,num_files,files,FDs__Fractal_Dimension,FDs__R_Squared,FDs_20Size__Fractal_Dimension,FDs_20Size__R_Squared,...,FDs_FACTOR_1_3_SCALE_ArtVein__Fractal_Dimension,FDs_FACTOR_1_3_SCALE_ArtVein__R_Squared,FDs_FACTOR_1_4_SCALE_ArtVein__Fractal_Dimension,FDs_FACTOR_1_4_SCALE_ArtVein__R_Squared,FDs_FACTOR_1_5_SCALE_ArtVein__Fractal_Dimension,FDs_FACTOR_1_5_SCALE_ArtVein__R_Squared,FDs_20SizeBy1_allOffsets_ArtVein__Fractal_Dimension,FDs_20SizeBy1_allOffsets_ArtVein__R_Squared,FDs_20Size_allOffsets_ArtVein__Fractal_Dimension,FDs_20Size_allOffsets_ArtVein__R_Squared
0,62,left,,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,2.386236,0.999772,2.371542,0.999145,...,,,,,,,,,,
1,62,left,,control,0,[],,,,,...,,,,,,,,,,
2,62,left,artery,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,,,,
3,62,left,artery,control,0,[],,,,,...,,,,,,,,,,
4,62,left,vein,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1633,298,whole,,epah,2,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,,,,
1634,298,whole,artery,epah,29,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,2.253593,0.999535,2.276127,0.999461,2.297247,0.999345,,,,
1635,298,whole,artery,epah,2,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,,,,,,,2.074771,0.999301,2.083861,0.999371
1636,298,whole,vein,epah,29,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,2.161593,0.999834,2.186198,0.999854,2.208934,0.999842,,,,


### Functions: Generate Statistics

In [2]:
def generate_nick_stats(values_df, features, save=True, filename='test.csv'):
    df = values_df[features].describe().transpose()
    df.columns = [f'total__{col}' for col in df.columns.values]
    df.index.names = ['feature']
    
    features_list = []
    for feature in features:
        # Get basic statistics for each group
        temp_df = values_df[[feature, 'label']].groupby('label').describe().stack().T
        temp_df.columns = ['__'.join(col).strip() for col in temp_df.columns.values]
        temp_df.index.names= ['feature']
        
        features_dict = temp_df.to_dict(orient='records')[0]
        
        features_dict['feature'] = feature
        
        # Skip if data are not well behaved
        skip_msg = ''
        if len(values_df[feature].unique()) < 3:
            skip_msg = 'Not enough unique values! '
        elif values_df[feature].std() < 1e-12:
            skip_msg = 'Not enough variation! '

        features_dict['test_notes'] = skip_msg
        if len(skip_msg) > 1:
            features_list.append(features_dict)
            continue
        
        groups = [['control', 'epah'], ['control', 'pah'], ['epah', 'pah']]

        for group in groups:
            X1 = values_df[values_df['label'].eq(group[0])][feature].dropna()
            X2 = values_df[values_df['label'].eq(group[1])][feature].dropna()
            _, pvalue = stats.ttest_ind(X1, X2)
            features_dict[f't-test_{group[0]}_{group[1]}'] = pvalue

            _, pvalue = stats.mannwhitneyu(X1, X2)
            features_dict[f'wilcoxon_{group[0]}_{group[1]}'] = pvalue
        
        features_list.append(features_dict)
        
    df = df.merge(pd.DataFrame(features_list), on='feature')
    
    if save:
        df.to_csv(filename, index=False)
    
    return df

### Generate Nick Stats

In [5]:
# t-test + wilcoxon by pairs
filename = 'FDs_combined_03_21_22_analysis.csv'

calculated_features = pd.read_csv(filename)
calculated_features['artery_or_vein'].fillna('', inplace=True)
features = [col for col in calculated_features.columns if 'FDs' in col]

# lobe_list = ['left', 'right', 'whole']
lobe_list = ['whole']
artvein_list = ['', 'artery', 'vein']

df_list = []
for lobe in lobe_list:
    for artvein in artvein_list:
        mask = calculated_features['lobe'].eq(lobe) & calculated_features['artery_or_vein'].eq(artvein)
        if not artvein:
            features = [col for col in calculated_features.columns if 'FDs' in col and 'ArtVein' not in col]
        else:
            features = [col for col in calculated_features.columns if 'FDs' in col and 'ArtVein' in col]
        df = generate_nick_stats(calculated_features[mask], features, save=False, filename=f'{lobe}_{artvein}_statistics.csv')
        df['lobe'] = lobe
        df['artery_or_vein'] = artvein
#         df_list.append(df)
#         df_list.append(df[df['feature'].str.contains('FACTOR') & df['feature'].str.contains('Fractal')])
        df_list.append(df[df['feature'].str.contains('Fractal')])

df_combined = pd.concat(df_list)
col_list = df_combined.columns.tolist()
df_combined = df_combined[['feature', 'lobe', 'artery_or_vein'] + col_list[1:-2]]
df_combined.to_csv('test.csv', index=False)

In [14]:
df

Unnamed: 0,feature,total__count,total__mean,total__std,total__min,total__25%,total__50%,total__75%,total__max,control__count,...,pah__max,test_notes,t-test_control_epah,wilcoxon_control_epah,t-test_control_pah,wilcoxon_control_pah,t-test_epah_pah,wilcoxon_epah_pah,lobe,artery_or_vein
0,FDs_20SizeBy1_ArtVein__Fractal_Dimension,91.0,1.963792,0.050278,1.783274,1.937723,1.971686,1.991596,2.065196,37.0,...,2.054619,,0.134717,0.06686,0.0002299459,0.0003199775,0.15327,0.07121,whole,vein
1,FDs_20SizeBy1_ArtVein__R_Squared,91.0,0.999056,0.000283,0.997786,0.998928,0.999104,0.999258,0.999499,37.0,...,0.99948,,0.509802,0.191643,0.4691134,0.4278866,0.95558,0.177255,whole,vein
2,FDs_20Size_ArtVein__Fractal_Dimension,91.0,1.977688,0.049232,1.801344,1.95451,1.981179,2.006218,2.072776,37.0,...,2.069159,,0.205609,0.083279,0.0004760515,0.0005829995,0.143868,0.065719,whole,vein
3,FDs_20Size_ArtVein__R_Squared,91.0,0.999054,0.000291,0.99801,0.998941,0.999091,0.999238,0.999501,37.0,...,0.999491,,0.030938,0.018725,0.05447623,0.1593316,0.99138,0.217626,whole,vein
4,FDs_ArtVein__Fractal_Dimension,91.0,2.093805,0.062277,1.86925,2.053745,2.097086,2.136432,2.227913,37.0,...,2.16717,,0.02024,0.017686,1.196006e-06,1.567058e-06,0.197148,0.11177,whole,vein
5,FDs_ArtVein__R_Squared,91.0,0.99878,0.000687,0.996684,0.998388,0.998935,0.999287,0.999861,37.0,...,0.999861,,0.181773,0.076381,0.0791059,0.05302919,0.822095,0.342473,whole,vein
6,FDs_ManySizeBy1_ArtVein__Fractal_Dimension,91.0,2.178462,0.073806,1.956982,2.142409,2.180236,2.238188,2.304803,37.0,...,2.289034,,0.006096,0.008057,0.0001536144,0.0001311799,0.743594,0.454447,whole,vein
7,FDs_ManySizeBy1_ArtVein__R_Squared,91.0,0.996875,0.001165,0.991131,0.996218,0.997152,0.997556,0.998832,37.0,...,0.99824,,0.095372,0.058284,0.3610285,0.1221656,0.297723,0.096733,whole,vein
8,FDs_ManySize_ArtVein__Fractal_Dimension,91.0,2.179391,0.074575,1.960508,2.138691,2.187276,2.235311,2.339609,37.0,...,2.276483,,0.018976,0.041629,1.984017e-05,1.45487e-05,0.66861,0.242898,whole,vein
9,FDs_ManySize_ArtVein__R_Squared,91.0,0.997407,0.001085,0.991037,0.996997,0.997509,0.998149,0.998936,37.0,...,0.998694,,0.023817,0.020959,0.8201817,0.4356105,0.042916,0.005405,whole,vein


In [6]:
groups = [['control', 'epah'], ['control', 'pah'], ['epah', 'pah']]
feature = features[0]

print(f'Feature: {feature}\n')

for group in groups:
    mask = calculated_features['lobe'].eq(lobe) & calculated_features['artery_or_vein'].eq(artvein)
    X1 = calculated_features[calculated_features['label'].eq(group[0]) & mask][feature]
    X2 = calculated_features[calculated_features['label'].eq(group[1]) & mask][feature]
    _, pvalue = stats.ttest_ind(X1, X2)
    print(f't-test_{group[0]}_{group[1]}:    {pvalue}')
    
    _, pvalue = stats.mannwhitneyu(X1, X2)
    print(f'wilcoxon_{group[0]}_{group[1]}:  {pvalue}')
    print()

Feature: FDs_20SizeBy1_ArtVein__Fractal_Dimension

t-test_control_epah:    nan
wilcoxon_control_epah:  0.05225653096780727

t-test_control_pah:    nan
wilcoxon_control_pah:  0.07934847998328132

t-test_epah_pah:    nan
wilcoxon_epah_pah:  0.28092408662175083



In [44]:
calculated_features[calculated_features['label'].eq('control') & mask][['SID', feature]]

Unnamed: 0,SID,FDs_20SizeBy1_ArtVein__Fractal_Dimension
8,62,1.983194
17,63,2.040068
26,64,1.929826
35,68,2.024104
44,70,2.065196
53,71,1.953676
62,72,1.910128
71,82,2.000261
80,87,2.022505
89,98,2.004986
