# Numeric Results Summary
This notebook summarizes and prints all the numeric results reported in Mahdavi & Siegel (2020), AS&T (Phase 1).

In [77]:
import pandas as pd
from scipy.stats import wilcoxon, ranksums, spearmanr

#### Calculation: Ranges of M_t, tCE, and dustmass

# Root folders for this analysis. Every input path below is built from these two
# constants, so relocating the project only requires editing them.
PROJECT_DIR = 'C:/PhD Research/Paper 1 - Extraction'
PROCESSED_DIR = PROJECT_DIR + '/Processed/artificial'

# K-S test for comparing 1st and 7th cycle dust recovered PSDs.
# NOTE(review): exec() runs the helper script inside this notebook's namespace; the
# statistical-test cell later prints `threshold_p_value`, which is expected to be
# defined by that script. Only run this against a trusted copy of the file.
with open(PROJECT_DIR + '/Code/artificial/Statistics/k_s_threshold.py') as ks_script:
    exec(ks_script.read())

# Recovery data (cycle by cycle)
df = pd.read_excel(PROCESSED_DIR + '/artl_dataset_summary.xlsx')

# Recovery data (aggregated over all cycles)
df_collapsed = pd.read_excel(PROCESSED_DIR + '/artl_dataset_summary_collapsed.xlsx')

# PSD modal data
df_modal = pd.read_excel(PROCESSED_DIR + '/modal/psd_modal_analysis_summary.xlsx')

# All processed data for the signed-rank test (Cycle 1 vs. Cycle 2)
df_wilcoxon = pd.read_excel(PROCESSED_DIR + '/sign_rank_all.xlsx')

# All Kolmogorov-Smirnov data
df_ks = pd.read_excel(PROCESSED_DIR + '/k_s_all.xlsx')


### Overall mass recovery and recovery efficiency
sum_ = df_collapsed[['M_t_cum', 'tCE_cum', 'dustmass']].describe()

### After-sieve / total dust recovery ratio (rows with td == 2 only)
# .copy() gives df_ash its own data, so adding a column below neither raises
# SettingWithCopyWarning nor risks writing through to `df`.
df_ash = df[df['td'] == 2].copy()
df_ash['d_t_ratio'] = (df_ash['M_d_cum'] / df_ash['M_t_cum']) * 100
ratio = df_ash['d_t_ratio'].describe()

### Recovery after the first cycle
first_cyc_rec = df[df['Cycle_N'] == 1][['M_t', 'tCE']].describe()

### Recovery from first to second cycle for filters that went through at least 2 cycles
list_ExpN = df[df['Cycle_N'] == 2]['ExpN'].unique()

df_two = df[df['ExpN'].isin(list_ExpN)]
df_two = df_two[df_two["Cycle_N"] < 3]

### CV of recovery amount and efficiency
df_cv = pd.read_excel(PROCESSED_DIR + '/artl_dataset_summary_w_cv.xlsx')


def _first_cv_percent(column):
    """Return the first non-missing value of ``df_cv[column]`` as a whole-number percent.

    The value is rounded to the nearest integer directly. The previous form,
    ``int(round(x / 100, 2) * 100)``, truncated after a floating-point multiply and
    could come out one too low (for example ``0.29 * 100 == 28.999999999999996``).
    """
    first_value = df_cv[column].dropna().iloc[0]
    return int(round(first_value))


cv_iso_a = _first_cv_percent('cv_M_t_cum_1_1')
cv_ash_a = _first_cv_percent('cv_M_t_cum_2_1')

cv_iso_e = _first_cv_percent('cv_tCE_cum_1_1')
cv_ash_e = _first_cv_percent('cv_tCE_cum_2_1')

### Rank-sum test p-values for the pairwise comparison of the extraction process efficiencies
# Order: E vs. C, C vs. D, E vs. D (index [1] of each ranksums result is the p-value).
proc_list = [ranksums(df[first], df[second])[1]
             for first, second in (('E', 'C'), ('C', 'D'), ('E', 'D'))]

###################################
### Spearman's rank correlation ###
###################################

#### Keeps only the experiments (ExpN) that reached a given extraction cycle

def Cycle_Refine(df_input, cyc, cut = False):
    """Keep only the rows of experiments that reached cycle number ``cyc``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame with at least the columns 'ExpN' and 'Cycle_N'.
    cyc : int
        Cycle number an experiment must contain to be kept.
    cut : bool, default False
        When true, rows with 'Cycle_N' greater than ``cyc`` are dropped as well.

    Returns
    -------
    pandas.DataFrame
        A filtered frame; ``df_input`` itself is not modified.
    """
    # Experiments that have at least one row recorded at the requested cycle.
    reached_cycle = df_input.loc[df_input['Cycle_N'] == cyc, 'ExpN'].unique()

    refined = df_input[df_input['ExpN'].isin(reached_cycle)]
    if cut:
        refined = refined[refined['Cycle_N'] <= cyc]

    return refined

### Correlations for Figs 10-S-A,C
# NOTE(review): the comments in this cell refer to Figs 10-S while the dictionary keys and
# variable names use "11"; confirm which figure numbering is current.

# Rows with 'dustmass' at or above this value are treated as "high load" and excluded
# from the *_low subsets.
HIGH_LOAD_DUSTMASS = 10

df_low = df[df['dustmass'] < HIGH_LOAD_DUSTMASS]
df_collapsed_low = df_collapsed[df_collapsed['dustmass'] < HIGH_LOAD_DUSTMASS]

# First-cycle rows, filtered once and reused for both the full and the low-load correlations.
first_cycle_all = df[df['Cycle_N'] == 1]
first_cycle_low = df_low[df_low['Cycle_N'] == 1]

fig_s11a_c = {'11a - all': spearmanr(first_cycle_all['dustmass'], first_cycle_all['M_t_cum']),
              '11a - all except high load': spearmanr(first_cycle_low['dustmass'], first_cycle_low['M_t_cum']),
              '11c - all': spearmanr(df_collapsed['dustmass'], df_collapsed['M_t_cum']),
              '11c - all except high load': spearmanr(df_collapsed_low['dustmass'], df_collapsed_low['M_t_cum'])}

# Each spearmanr result unpacks as (correlation coefficient, p-value).
fig_s11a_c_r_val = [result[0] for result in fig_s11a_c.values()]
fig_s11a_c_p_val = [result[1] for result in fig_s11a_c.values()]

### Correlations for Figs 10-S-B,D

df2 = Cycle_Refine(df, 2)
df_low2 = Cycle_Refine(df_low, 2, True)

second_cycle_all = df2[df2['Cycle_N'] == 2]
second_cycle_low = df_low2[df_low2['Cycle_N'] == 2]
collapsed_multi_cycle = df_collapsed[df_collapsed['Cycle_N'] != 1]

fig_s11b_d = {'11b - all': spearmanr(second_cycle_all['dustmass'], second_cycle_all['M_t_cum']),
              '11d - all after 2nd cycle': spearmanr(collapsed_multi_cycle['dustmass'], collapsed_multi_cycle['M_t_cum'])}

# These lists previously iterated over `fig_s11a_d`, a name that is never defined in this
# notebook (NameError on a fresh kernel). The dictionary built above is `fig_s11b_d`.
fig_s11b_d_r_val = [result[0] for result in fig_s11b_d.values()]
fig_s11b_d_p_val = [result[1] for result in fig_s11b_d.values()]

# Second-cycle correlation restricted to the filters below the high-load threshold.
fig_s11a_d_low = spearmanr(second_cycle_low['dustmass'], second_cycle_low['M_t_cum'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ash['d_t_ratio'] = (df_ash['M_d_cum'] / df_ash['M_t_cum'])*100


In [86]:
### Descriptive Statistics (all that are mentioned in the paper)


def _rounded_range(described, scale=1):
    """Return (min, max, median) from a ``describe()`` Series, scaled and rounded to 1 decimal.

    Values are looked up by label ('min', 'max', '50%') rather than by position:
    integer-position access on a label-indexed Series is deprecated in pandas.
    """
    return tuple(round(described[label] * scale, 1) for label in ('min', 'max', '50%'))


print('Descriptive Statistics Results:\n')
print('The total number of recovery tests from filters loaded with ISO-A2 or ASHRAE #2 test dust was N = {}.'.format(
    int(sum_.loc['count', 'M_t_cum'])))

print('The range of loaded dust mass is {}-{} g with a median of {} g.'.format(
    *_rounded_range(sum_['dustmass'])))

print('The range of total recovered dust is {}-{} g with a median of {} g.'.format(
    *_rounded_range(sum_['M_t_cum'])))

print('The range of total recovery efficiency is {}-{}% with a median of {}%.'.format(
    *_rounded_range(sum_['tCE_cum'])))

# Only the minimum and maximum of the ratio are reported.
print('The range of the ratio of the after-sieve portion to total dust recovery is {}-{}%'.format(
    *_rounded_range(ratio)[:2]))

print('The range of total recovered dust after the first cycle is {}-{} g with a median of {} g.'.format(
    *_rounded_range(first_cyc_rec['M_t'])))

print('The range of total recovery efficiency after the first cycle is {}-{}% with a median of {}%.'.format(
    *_rounded_range(first_cyc_rec['tCE'])))

# Cumulative recovery after cycle 1 and cycle 2 for the filters that reached cycle 2.
two_cyc_first = df_two[df_two["Cycle_N"] == 1]['M_t_cum'].describe()
two_cyc_second = df_two[df_two["Cycle_N"] == 2]['M_t_cum'].describe()

print('The total recovery amount raised from {}-{} g (median = {} g) to {}-{} g (median = {} g) for the {} filters that went \nthrough at least 2 cycles of extraction.'.format(
    *_rounded_range(two_cyc_first),
    *_rounded_range(two_cyc_second),
    int(two_cyc_second['count'])))

print('After the first cycle, the Coefficient of Variation (CV) for the ISO and ASHRAE dust recovered amounts was {}% and {}%, \nrespectively. These values correspond to {}% and {}% recovery efficiency, respectively.'.format(
    cv_iso_a, cv_ash_a, cv_iso_e, cv_ash_e))

# The transfer-efficiency statistics ('D') use only rows with Cycle_N <= 2.
# Its median is a percentage like its range; the message previously labelled it "g".
print('Suction, collection, and transfer efficiencies were measured as {} - {} % (median = {} %), {} - {} % (median = {} %), and {} - {} % (median = {} %), respectively.'.format(
    *_rounded_range(df['E'].describe()),
    *_rounded_range(df['C'].describe()),
    *_rounded_range(df[df['Cycle_N'] <= 2]['D'].describe())))

print('Mass closure was measured as {} - {}% with a median of {}%.'.format(
    *_rounded_range(df[df['Cycle_N'] == 1]['M_C'].describe())))

# Modal volume fractions are multiplied by 100 before printing as percentages.
# Rows with ft == 0 are excluded; td == 1 feeds the ISO messages, td == 2 the ASHRAE ones.
modal_rows = df_modal[df_modal['ft'] != 0]
modal_iso = modal_rows[modal_rows['td'] == 1]
modal_ashrae = modal_rows[modal_rows['td'] == 2]

print('The volume fraction of 4.2 µm mode from ISO-loaded filter recoveries was {} - {}% with a median of {}%.'.format(
    *_rounded_range(modal_iso['mode_2_f'].describe(), scale=100)))

print('The volume fraction of 32 µm mode from ISO loaded filter recoveries was {} - {}% with a median of {}%.'.format(
    *_rounded_range(modal_iso['mode_3_f'].describe(), scale=100)))

print('The volume fraction of 4.2 µm mode from ASHRAE-loaded filter recoveries was {} - {}% with a median of {}%.'.format(
    *_rounded_range(modal_ashrae['mode_2_f'].describe(), scale=100)))

print('The volume fraction of 32 µm mode from ASHRAE-loaded filter recoveries was {} - {}% with a median of {}%.'.format(
    *_rounded_range(modal_ashrae['mode_3_f'].describe(), scale=100)))


Descriptive Statistics Results:

The total number of recovery tests from filters loaded with ISO-A2 or ASHRAE #2 test dust was N = 20.
The range of loaded dust mass is 0.6-14.8 g with a median of 2.0 g.
The range of total recovered dust is 0.1-5.5 g with a median of 0.6 g.
The range of total recovery efficiency is 11.3-52.2% with a median of 27.7%.
The range of the ratio of the after-sieve portion to total dust recovery is 80.0-98.7%
The range of total recovered dust after the first cycle is 0.1-1.7 g with a median of 0.4 g.
The range of total recovery efficiency after the first cycle is 2.9-44.6% with a median of 21.2%.
The total recovery amount raised from 0.1-1.6 g (median = 0.5 g) to 0.3-1.9 g (median = 0.8 g) for the 11 filters that went 
through at least 2 cycles of extraction.
After the first cycle, the Coefficient of Variation (CV) for the ISO and ASHRAE dust recovered amounts was 60% and 73%, 
respectively. These values correspond to 47% and 55% recovery efficiency, respective

In [88]:
### Statistical test p-values

# Paired signed-rank p-values (1st vs. 2nd cycle), computed once and reused below.
signed_rank_amount_p = wilcoxon(df_wilcoxon['M_t_cum_1'], df_wilcoxon['M_t_cum_2'])[1]
signed_rank_efficiency_p = wilcoxon(df_wilcoxon['tCE_cum_1'], df_wilcoxon['tCE_cum_2'])[1]

print('Statistical Test Results:\n')
print('Signed-rank test for comparing the cumulative recovery amount after 1st and 2nd cycle: p-value = {}.'.format(signed_rank_amount_p))

print('Signed-rank test for comparing the cumulative recovery efficiency after 1st and 2nd cycle: p-value = {}.'.format(signed_rank_efficiency_p))
# Note: the two p-values are similar because the recovery amount and the recovery efficiency
# increase by a similar ratio (even though the absolute numbers differ).

# .max() replaces the positional describe()[7] lookup, which is deprecated in pandas.
print('Kolomogorov-Smirnov test for comparing the PSDs of test and recovered dust samples over 20 tests: maximum p-value = {}.'.format(df_ks['P Value'].max()))

print('The smallest p-value for the Kolomogorov-Smirnov test for comparing the PSDs of cumulative dust after 1st and 7th cycle up to a size of 61 µm: {} or more.'.format(threshold_p_value)) # from k_s_threshold.py

# NOTE(review): this repeats the efficiency signed-rank result printed above, with
# "or more" appended to a single p-value; confirm whether this line is still wanted.
print('Signed-rank test for comparing the cumulative recovery efficiency after the 1st and 2nd cycle: p-value = {} or more.'.format(signed_rank_efficiency_p))

print('Rank-sum test to compare extraction processes (suction, collection, transfer): p-value = {} or less.'.format(max(proc_list)))

# NOTE(review): the message speaks of recovered dust, but the compared column is 'E',
# which the descriptive-statistics cell reports as suction efficiency; confirm the column.
first_cycle_E = df[df['Cycle_N'] == 1]['E']
second_cycle_E = df[df['Cycle_N'] == 2]['E']
print('Rank-sum test to compare non-cumulative dust recovered from 1st and 2nd cycles: p-value = {}.'.format(ranksums(first_cycle_E, second_cycle_E)[1]))

print("The Spearman's correlation test results for Figure S10 A and C are: ρ = {} - {}, and p-values = {} or less.".format(round(min(fig_s11a_c_r_val),2),
                                                                                                                        round(max(fig_s11a_c_r_val),2),
                                                                                                                        round(max(fig_s11a_c_p_val),4)))

# This sentence used to be printed twice by two identical statements; it is printed once now.
print("The Spearman's correlation test results for Figure S10 B and D are: ρ = {} - {}, and p-values = {} or less.".format(round(min(fig_s11b_d_r_val),2),
                                                                                                                        round(max(fig_s11b_d_r_val),2),
                                                                                                                        round(max(fig_s11b_d_p_val),2)))

# NOTE(review): the values below come from fig_s11a_d_low, yet the label is the same
# "Figure S10 B and D" as the line above; consider a label that distinguishes the two.
print("The Spearman's correlation test results for Figure S10 B and D are: ρ = {}, and p-value = {}.".format(round(fig_s11a_d_low[0],2),
                                                                                                             round(fig_s11a_d_low[1],2)))

Statistical Test Results:

Signed-rank test for comparing the cumulative recovery amount after 1st and 2nd cycle: p-value = 0.0009765625.
Signed-rank test for comparing the cumulative recovery efficiency after 1st and 2nd cycle: p-value = 0.0009765625.
Kolomogorov-Smirnov test for comparing the PSDs of test and recovered dust samples over 20 tests: maximum p-value = 7.247352654055354e-110.
The smallest p-value for the Kolomogorov-Smirnov test for comparing the PSDs of cumulative dust after 1st and 7th cycle up to a size of 61 µm: 0.07832815177324615 or more.
Signed-rank test for comparing the cumulative recovery efficiency after the 1st and 2nd cycle: p-value = 0.0009765625 or more.
Rank-sum test to compare extraction processes (suction, collection, transfer): p-value = 0.00015785291191827808 or less.
Rank-sum test to compare non-cumulative dust recovered from 1st and 2nd cycles: p-value = 0.0007105813875889367.
The Spearman's correlation test results for Figure S10 A and C are: ρ = 0.

End of code