In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scipy.stats as stats

### Import Features

In [63]:
# Read the combined feature table (the FDs_* columns) and end the cell with the
# frame itself so the notebook renders a preview of it.
filename = 'FDs_combined_10_26_21.csv'
# Alternative input, kept for quick switching:
# filename = 'relevant_features/calculated_relevantFeatures_2_standardized.csv'

calculated_features = pd.read_csv(filepath_or_buffer=filename)
calculated_features

Unnamed: 0,SID,lobe,artery_or_vein,label,num_files,files,FDs__Fractal_Dimension,FDs__Intercept,FDs__R_Squared,FDs_20Size__Fractal_Dimension,...,FDs_20Size_ArtVein__R_Squared,FDs_ArtVein__Fractal_Dimension,FDs_ArtVein__Intercept,FDs_ArtVein__R_Squared,FDs_ManySizeBy1_ArtVein__Fractal_Dimension,FDs_ManySizeBy1_ArtVein__Intercept,FDs_ManySizeBy1_ArtVein__R_Squared,FDs_ManySize_ArtVein__Fractal_Dimension,FDs_ManySize_ArtVein__Intercept,FDs_ManySize_ArtVein__R_Squared
0,62,left,,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,2.386236,14.134436,0.999772,2.371542,...,,,,,,,,,,
1,62,left,artery,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,0.999413,2.213112,13.449094,0.999254,2.224695,13.481557,0.997574,2.275365,13.646408,0.998857
2,62,left,vein,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,0.999027,2.079931,12.836250,0.999112,2.142450,13.036288,0.998021,2.162110,13.098350,0.998169
3,62,right,,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,2.429700,14.457652,0.999469,2.385905,...,,,,,,,,,,
4,62,right,artery,control,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,0.998956,2.206498,13.615497,0.998188,2.219218,13.643496,0.995373,2.286145,13.864656,0.998634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,298,right,artery,epah,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,0.999347,2.156634,13.328737,0.999091,2.209603,13.463781,0.997367,2.231202,13.528052,0.998471
815,298,right,vein,epah,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,0.999030,2.055671,12.820001,0.999076,2.135850,13.059079,0.996768,2.132905,13.041425,0.997727
816,298,whole,,epah,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,2.300824,14.677996,0.999219,2.325178,...,,,,,,,,,,
817,298,whole,artery,epah,5,['C:\\Users\\andre\\Documents\\GitHub\\sampleN...,,,,,...,0.999294,2.181330,14.095030,0.999297,2.215744,14.203367,0.998776,2.222713,14.228324,0.999012


### Functions: Generate Statistics

In [64]:
def generate_nick_stats(values_df, features, save=True, filename='test.csv', groups=None):
    """Summarise features overall and per label, and compare pairs of labels.

    One output row is produced per entry of ``features`` containing:

    * ``total__<stat>``: ``describe()`` statistics over all rows of ``values_df``;
    * ``<label>__<stat>``: the same statistics for each value of the ``label`` column;
    * ``t-test_<a>_<b>``: p-value of ``scipy.stats.ttest_ind`` for each label pair;
    * ``wilcoxon_<a>_<b>``: p-value of the two-sided ``scipy.stats.mannwhitneyu``
      test for each label pair;
    * ``test_notes``: why tests were skipped for this feature ('' when nothing
      was skipped).

    Parameters
    ----------
    values_df : pandas.DataFrame
        Must contain a ``label`` column and every column named in ``features``.
    features : list of str
        Numeric columns to summarise.
    save : bool, default True
        When true, the result is written to ``filename`` as CSV (no index).
    filename : str, default 'test.csv'
        Destination used when ``save`` is true.
    groups : list of [str, str] pairs, optional
        Label pairs to compare. Defaults to control/epah, control/pah, epah/pah.

    Returns
    -------
    pandas.DataFrame
        The merged summary table described above.
    """
    if groups is None:
        groups = [['control', 'epah'], ['control', 'pah'], ['epah', 'pah']]

    df = values_df[features].describe().transpose()
    df.columns = [f'total__{col}' for col in df.columns.values]
    df.index.names = ['feature']

    features_list = []
    for feature in features:
        # Per-label describe(), flattened into a single record keyed '<label>__<stat>'.
        temp_df = values_df[[feature, 'label']].groupby('label').describe().stack().T
        temp_df.columns = ['__'.join(col).strip() for col in temp_df.columns.values]
        temp_df.index.names = ['feature']

        features_dict = temp_df.to_dict(orient='records')[0]
        features_dict['feature'] = feature

        # Skip if data are not well behaved. Missing values are excluded first so
        # that NaN is not counted as one of the "unique values".
        observed = values_df[feature].dropna()
        skip_msg = ''
        if observed.nunique() < 3:
            skip_msg = 'Not enough unique values! '
        elif observed.std() < 1e-12:
            skip_msg = 'Not enough variation! '

        features_dict['test_notes'] = skip_msg
        if skip_msg:
            features_list.append(features_dict)
            continue

        for first, second in groups:
            X1 = values_df.loc[values_df['label'].eq(first), feature].dropna()
            X2 = values_df.loc[values_df['label'].eq(second), feature].dropna()

            if X1.empty or X2.empty:
                # Neither test is defined without observations in both groups;
                # record NaN and say why instead of raising part-way through.
                features_dict[f't-test_{first}_{second}'] = float('nan')
                features_dict[f'wilcoxon_{first}_{second}'] = float('nan')
                features_dict['test_notes'] += f'No data for {first} vs {second}! '
                continue

            _, pvalue = stats.ttest_ind(X1, X2)
            features_dict[f't-test_{first}_{second}'] = pvalue

            # Pin the alternative: older SciPy releases defaulted to a one-sided
            # p-value, which is not comparable with the two-sided t-test above.
            _, pvalue = stats.mannwhitneyu(X1, X2, alternative='two-sided')
            features_dict[f'wilcoxon_{first}_{second}'] = pvalue

        features_list.append(features_dict)

    # 'feature' is the index level name on the left and a column on the right.
    df = df.merge(pd.DataFrame(features_list), on='feature')

    if save:
        df.to_csv(filename, index=False)

    return df

### Generate Nick Stats

In [62]:
# t-test + wilcoxon by pairs, computed separately for every (lobe, vessel type) subset
filename = 'FDs_combined_10_26_21.csv'

calculated_features = pd.read_csv(filename)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: under pandas Copy-on-Write the chained in-place call does not
# modify the frame, and the '' subset below would then match no rows.
calculated_features['artery_or_vein'] = calculated_features['artery_or_vein'].fillna('')

lobe_list = ['left', 'right', 'whole']
artvein_list = ['', 'artery', 'vein']

df_list = []
for lobe in lobe_list:
    for artvein in artvein_list:
        mask = calculated_features['lobe'].eq(lobe) & calculated_features['artery_or_vein'].eq(artvein)
        # Rows with an empty vessel type use the FDs columns without 'ArtVein' in
        # the name; artery/vein rows use the 'ArtVein' columns.
        if not artvein:
            features = [col for col in calculated_features.columns if 'FDs' in col and 'ArtVein' not in col]
        else:
            features = [col for col in calculated_features.columns if 'FDs' in col and 'ArtVein' in col]
        # save=False: only the combined table is written below; `filename` is kept
        # so per-subset files can be switched on by flipping `save`.
        df = generate_nick_stats(calculated_features[mask], features, save=False, filename=f'{lobe}_{artvein}_statistics.csv')
        df['lobe'] = lobe
        df['artery_or_vein'] = artvein
        df_list.append(df)

df_combined = pd.concat(df_list)
col_list = df_combined.columns.tolist()
# Move the identifying columns to the front by name rather than by position, so
# the ordering stays right even if the subsets do not share identical columns.
leading_cols = ['feature', 'lobe', 'artery_or_vein']
df_combined = df_combined[leading_cols + [col for col in col_list if col not in leading_cols]]
df_combined.to_csv('test.csv', index=False)

In [47]:
# Spot check: recompute the pairwise p-values for a single feature and print them.
# NOTE: `lobe`, `artvein` and `features` are whatever the statistics loop in the
# previous cell left behind (its last iteration), so run that cell first.
groups = [['control', 'epah'], ['control', 'pah'], ['epah', 'pah']]
feature = features[0]

print(f'Feature: {feature}\n')

# The subset mask does not depend on the label pair, so build it once.
mask = calculated_features['lobe'].eq(lobe) & calculated_features['artery_or_vein'].eq(artvein)

for group in groups:
    # Drop missing values, as generate_nick_stats does, so both paths agree.
    X1 = calculated_features.loc[calculated_features['label'].eq(group[0]) & mask, feature].dropna()
    X2 = calculated_features.loc[calculated_features['label'].eq(group[1]) & mask, feature].dropna()
    _, pvalue = stats.ttest_ind(X1, X2)
    print(f't-test_{group[0]}_{group[1]}:    {pvalue}')

    # Pin the alternative so the p-value is two-sided regardless of SciPy version.
    _, pvalue = stats.mannwhitneyu(X1, X2, alternative='two-sided')
    print(f'wilcoxon_{group[0]}_{group[1]}:  {pvalue}')
    print()

Feature: FDs_20SizeBy1_ArtVein__Fractal_Dimension

t-test_control_epah:    0.0787274440272598
wilcoxon_control_epah:  0.043303742562231684

t-test_control_pah:    0.00013553333749922756
wilcoxon_control_pah:  0.0001731091084786119

t-test_epah_pah:    0.15326966476192
wilcoxon_epah_pah:  0.07120961697531496



In [44]:
# Rows labelled 'control' within the current `mask` subset: the SID column next to
# the spot-checked `feature` (both names come from the preceding cell).
calculated_features.loc[calculated_features['label'].eq('control') & mask, ['SID', feature]]

Unnamed: 0,SID,FDs_20SizeBy1_ArtVein__Fractal_Dimension
8,62,1.983194
17,63,2.040068
26,64,1.929826
35,68,2.024104
44,70,2.065196
53,71,1.953676
62,72,1.910128
71,82,2.000261
80,87,2.022505
89,98,2.004986
