## Statistical Tests

In [5]:
import pandas as pd
import scipy.stats as stats
from test_wilcoxon_os import ts_wilcoxon_os as wilcox

# Load the richness-score tables (original scores plus the two groups)
original_df = pd.read_csv('Data/scores_o.csv')
group1_df = pd.read_csv('Data/scores_1.csv')
group2_df = pd.read_csv('Data/scores_2.csv')

# Drop the first (ID) column of each table in place
for frame in (group1_df, group2_df, original_df):
    frame.drop(columns=frame.columns[0], inplace=True)

# Empty results table: one column per reported statistic / p-value.
# Rows are appended later, one per score column.
result_columns = [
    'Group 1 Shapiro Statistic',
    'Group 1 Shapiro P-Value',
    'Group 2 Shapiro Statistic',
    'Group 2 Shapiro P-Value',
    'Homogeneity of Variance Statistic',
    'Homogeneity of Variance P-Value',
    'Group 1 1-Sample Wilcox Statistic',
    'Group 1 1-Sample Wilcox P-Value',
    'Group 2 1-Sample Wilcox Statistic',
    'Group 2 1-Sample Wilcox P-Value',
    '2-Sample Mann-Whitney U Test Statistic',
    '2-Sample Mann-Whitney U Test P-Value',
]
test_results = pd.DataFrame(columns=result_columns)

# Run every test once per score column; the three tables are matched by
# column position, not by column name.
for col in range(group1_df.shape[1]):

    sample_1 = group1_df.iloc[:, col]
    sample_2 = group2_df.iloc[:, col]
    reference = original_df.iloc[:, col]

    # Shapiro-Wilk normality test on each group
    shapiro_1_stat, shapiro_1_p = stats.shapiro(sample_1)
    shapiro_2_stat, shapiro_2_p = stats.shapiro(sample_2)

    # Median-centred Levene test for equal variances between the groups
    levene_stat, levene_p = stats.levene(sample_1, sample_2, center='median')

    # .item() only succeeds when the reference column holds exactly one
    # value; that value is the hypothesised median for both groups.
    hyp_median = reference.item()

    # NOTE(review): wilcox is a project helper; presumably a one-sample
    # Wilcoxon test against hypMed -- confirm in test_wilcoxon_os.
    wilcox_1 = wilcox(sample_1, hypMed=hyp_median)
    wilcox_2 = wilcox(sample_2, hypMed=hyp_median)

    # Mann-Whitney U test comparing the two groups directly
    mw_u, mw_pvalue = stats.mannwhitneyu(sample_1, sample_2)

    # Values are listed in the same order as the columns of test_results
    test_results.loc[len(test_results)] = [
        shapiro_1_stat, shapiro_1_p,
        shapiro_2_stat, shapiro_2_p,
        levene_stat, levene_p,
        wilcox_1['statistic'], wilcox_1['p-value'],
        wilcox_2['statistic'], wilcox_2['p-value'],
        mw_u, mw_pvalue,
    ]

# Row labels, one per result row. Rows were appended in score-column order,
# so the labels are applied by position and the count must match the rows.
measure_labels = [
    'TTR',
    'Root TTR',
    'Log TTR',
    'Mass TTR',
    'MSTR',
    'MATTR',
    'HDD',
    'MTLD',
    'MTLD wrap',
    'MTLD bi',
    '# words',
    'Custom AG',
]
test_results.index = measure_labels

print(test_results)

# Write the labelled results table to disk
test_results.to_csv('Data/test_results.csv')

           Group 1 Shapiro Statistic  Group 1 Shapiro P-Value  \
TTR                         0.891301                 0.363687   
Root TTR                    0.960316                 0.810182   
Log TTR                     0.933220                 0.618490   
Mass TTR                    0.958040                 0.794273   
MSTR                        0.966060                 0.849417   
MATTR                       0.906818                 0.448704   
HDD                         0.854089                 0.207793   
MTLD                        0.979741                 0.933230   
MTLD wrap                   0.977791                 0.922469   
MTLD bi                     0.933441                 0.620013   
# words                     0.893924                 0.377221   
Custom AG                   0.994748                 0.993397   

           Group 2 Shapiro Statistic  Group 2 Shapiro P-Value  \
TTR                         0.921946                 0.542568   
Root TTR                