<h1 style="text-align:center; color:black;">------------------------------------------------------------------------</h1>
<h1 style="text-align:center; color:orange;"> - Load Libraries - </h1>

In [1]:
import pandas as pd # PANDAS DATAFRAMES
from skbio.stats.distance import DistanceMatrix # DISTANCE MATRIX OBJECT FOR BETA DIVERSITY
import itertools # ACCESS DATA ITERATION COMBINATIONS
import numpy as np  # NUMPY NUMERICAL AND LINEAR ALGEBRA TOOLKIT
import scipy as sp  # SCIPY SCIENTIFIC TOOLKIT
import random       # TO GENERATE RANDOM VALUES
random.seed(54321)  # SET RANDOM SEED FOR REPRODUCIBILITY (PYTHON'S random MODULE)
np.random.seed(54321)  # ALSO SEED NUMPY: pandas .sample() DRAWS FROM NUMPY'S GLOBAL RNG, NOT FROM random


<h1 style="text-align:center; color:black;">------------------------------------------------------------------------</h1>
<h1 style="text-align:center; color:orange;"> - Beta Intra Inter Function - </h1>
<h3 style="text-align:center; color:blue;"> Compare beta diversity distances within and between categorical groups of samples. </h3>
<h4 style="text-align:left; color:black;"> Input the path to a beta diversity distance matrix (dmPath), the path to a QIIME-format mapping file (mapPath), and the name of a categorical column in the mapping file used to group samples (mapCategory). The optional subsample argument sets the maximum number of samples drawn from each group (subsample=); groups with fewer samples are used in full. </h4>

In [2]:
### BETA DIVERSITY - INTRA/INTER DISTANCE COMPARISONS ###
def beta_intra_inter(dmPath, mapPath, mapCategory, subsample=100000, random_state=None):
    """Compare beta diversity distances within and between groups of samples.

    For every unordered pair of groups (g0, g1) found in ``mapCategory`` six
    two-sided Mann-Whitney U tests are run, three on all pairwise distances
    ('all') and three on per-sample average distances ('mean'):
    intra(g0) vs intra(g1), intra(g0) vs inter(g0-g1), intra(g1) vs inter(g1-g0).

    Parameters
    ----------
    dmPath : str
        Path to a distance matrix readable by ``DistanceMatrix.read``.
    mapPath : str
        Path to a tab-separated mapping file with a '#SampleID' column.
    mapCategory : str
        Column of the mapping file used to split samples into groups.
    subsample : int, optional
        Maximum number of samples kept per group. Larger groups are randomly
        subsampled without replacement; smaller groups are used in full.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness for the subsampling. ``None`` (default) uses
        NumPy's global random state, as pandas does by default.

    Returns
    -------
    pandas.DataFrame
        One row per test with columns 'dists', 'type', 'subsample', 'depth',
        'metric', 'g0', 'g0mean', 'g0count', 'g1', 'g1mean', 'g1count',
        'pval', 'tstat'. 'depth' and 'metric' are left as NaN for the caller
        to fill in. 'pval'/'tstat' are NaN when either side has no values.
    """
    ### Import the test directly: `import scipy as sp` alone does not guarantee sp.stats is loaded ###
    from scipy.stats import mannwhitneyu

    ##### IMPORT DISTANCE MATRIX AS DATAFRAME #####
    dmIn = DistanceMatrix.read(dmPath).to_data_frame()

    ##### IMPORT MAP #####
    ### Read the map and set sample id as index ###
    mapIn = pd.read_csv(mapPath, sep='\t', low_memory=False).set_index('#SampleID')
    ### Unique values in mapping category, in order of appearance (missing values cannot form a group) ###
    mapCats = [curCat for curCat in mapIn[mapCategory].unique() if pd.notnull(curCat)]

    ##### SORTING / ORGANIZING DATA #####
    ### Add category to distance matrix (only samples present in both are kept) ###
    dmCat = pd.concat([dmIn, mapIn[mapCategory]], axis=1, join='inner')

    ##### SUBSAMPLE INDICES #####
    ### Build one generator up front so an integer seed is not reused identically for every group ###
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    ### Dictionary holding the sample ids used for each group ###
    groupIndices = {}
    for curGroup in mapCats:
        groupRows = dmCat[dmCat[mapCategory] == curGroup]
        ### If more samples than subsample depth... subsample, else use all samples ###
        if len(groupRows.index) > subsample:
            groupRows = groupRows.sample(n=subsample, axis=0, replace=False, random_state=rng)
        groupIndices[curGroup] = list(groupRows.index)

    ##### CALCULATING GROUP DISTANCES #####
    allDists = {}
    meanDists = {}
    ### For every ordered combination of groups ###
    for g0, g1 in itertools.product(mapCats, repeat=2):
        ### astype() returns a copy, so the edits below never touch dmCat ###
        pairVals = dmCat.loc[groupIndices[g0], groupIndices[g1]].values.astype(float)
        if g0 == g1:
            ### Intra group: each pair once (strict upper triangle), zero distances between distinct samples are kept ###
            allCurDists = pairVals[np.triu_indices(pairVals.shape[0], k=1)]
            ### Per-sample means use the full row but must skip the self-distance on the diagonal ###
            np.fill_diagonal(pairVals, np.nan)
        else:
            allCurDists = pairVals.flatten()
        ### Remove nan ###
        allCurDists = allCurDists[~np.isnan(allCurDists)]
        ### Average distance of each g0 sample to the g1 samples; samples without any value are dropped ###
        curMeans = pd.DataFrame(pairVals).mean(axis=1, skipna=True).dropna().values

        allDists.setdefault(g0, {})[g1] = allCurDists
        meanDists.setdefault(g0, {})[g1] = curMeans

    ##### COMPARE GROUP DISTANCES #####
    def _stats_row(distKind, compType, label0, vals0, label1, vals1):
        ### One output row: Mann-Whitney U of vals0 vs vals1 plus their means and counts ###
        if len(vals0) == 0 or len(vals1) == 0:
            stat, pval = np.nan, np.nan
        else:
            stat, pval = mannwhitneyu(vals0, vals1, use_continuity=True, alternative='two-sided')
        mean0 = np.mean(vals0) if len(vals0) else np.nan
        mean1 = np.mean(vals1) if len(vals1) else np.nan
        return [distKind, compType, subsample, np.nan, np.nan,
                label0, mean0, len(vals0), label1, mean1, len(vals1), pval, stat]

    outRows = []
    ### For each unordered combination of groups... ###
    for g0, g1 in itertools.combinations(mapCats, 2):
        ### ...first on ALL distances, then on per-sample MEAN distances ###
        for distKind, curDists in (('all', allDists), ('mean', meanDists)):
            ### INTRA-INTRA ###
            outRows.append(_stats_row(distKind, 'intra-intra', g0, curDists[g0][g0], g1, curDists[g1][g1]))
            ### INTRA-INTER from the point of view of g0 ###
            outRows.append(_stats_row(distKind, 'intra-inter', g0, curDists[g0][g0],
                                      str(g0) + '-' + str(g1), curDists[g0][g1]))
            ### INTRA-INTER from the point of view of g1 ###
            outRows.append(_stats_row(distKind, 'intra-inter', g1, curDists[g1][g1],
                                      str(g1) + '-' + str(g0), curDists[g1][g0]))

    ### Dataframe for Results (built once instead of growing row by row) ###
    dfOut = pd.DataFrame(outRows, columns=['dists','type','subsample','depth','metric','g0','g0mean','g0count','g1','g1mean','g1count','pval','tstat'])

    return dfOut

<h1 style="text-align:center; color:black;">------------------------------------------------------------------------</h1>
<h1 style="text-align:center; color:orange;"> - Beta Intra Inter Pipeline - </h1>
<h4 style="text-align:left; color:black;"> Pick a mapping category and the number of times to repeat the subsampling, then loop over rarefaction depths, beta diversity metrics, and per-group subsample sizes. </h4>

In [129]:
### BETA DIVERSITY - INTRA/INTER DISTANCE COMPARISONS ###
def beta_summarize_category(dmPath, mapPath, mapCategory):
    """Print average intra- and inter-group distances for one mapping category.

    Prints the mean distance of every intra-group and every ordered
    inter-group combination, the pooled intra and inter means and counts,
    and a two-sided Mann-Whitney U test of pooled intra vs pooled inter
    distances. Nothing is returned.

    Parameters
    ----------
    dmPath : str
        Path to a distance matrix readable by ``DistanceMatrix.read``.
    mapPath : str
        Path to a tab-separated mapping file with a '#SampleID' column.
    mapCategory : str
        Column of the mapping file used to split samples into groups.
    """
    ### Import the test directly: `import scipy as sp` alone does not guarantee sp.stats is loaded ###
    from scipy.stats import mannwhitneyu

    ##### IMPORT DISTANCE MATRIX AS DATAFRAME #####
    dmIn = DistanceMatrix.read(dmPath).to_data_frame()

    ##### IMPORT MAP #####
    ### Read the map and set sample id as index ###
    mapIn = pd.read_csv(mapPath, sep='\t', low_memory=False).set_index('#SampleID')
    ### Unique values in mapping category, in order of appearance (missing values cannot form a group) ###
    mapCats = [curCat for curCat in mapIn[mapCategory].unique() if pd.notnull(curCat)]

    ##### SORTING / ORGANIZING DATA #####
    ### Add category to distance matrix (only samples present in both are kept) ###
    dmCat = pd.concat([dmIn, mapIn[mapCategory]], axis=1, join='inner')

    ##### GROUP INDICES #####
    ### Sample ids belonging to each group (no subsampling here) ###
    groupIndices = {curGroup: list(dmCat[dmCat[mapCategory] == curGroup].index) for curGroup in mapCats}

    ##### CALCULATING GROUP DISTANCES #####
    allDists = {}
    ### For every ordered combination of groups ###
    for g0, g1 in itertools.product(mapCats, repeat=2):
        pairVals = dmCat.loc[groupIndices[g0], groupIndices[g1]].values.astype(float)
        if g0 == g1:
            ### Intra group: each pair once (strict upper triangle), zero distances between distinct samples are kept ###
            allCurDists = pairVals[np.triu_indices(pairVals.shape[0], k=1)]
        else:
            allCurDists = pairVals.flatten()
        ### Remove nan ###
        allDists.setdefault(g0, {})[g1] = allCurDists[~np.isnan(allCurDists)]

    ### Intra Group Average Distances ###
    allIntra = []
    for curGroup in mapCats:
        allIntra.extend(allDists[curGroup][curGroup])
        print('   - All Intra-Group Average '+str(curGroup)+'-'+str(curGroup)+': '+str(np.mean(allDists[curGroup][curGroup])))

    ### Inter Group Average Distances ###
    print()
    allInter = []
    for (pos0, g0), (pos1, g1) in itertools.product(enumerate(mapCats), repeat=2):
        if pos0 == pos1: continue
        ### Report both orderings, but pool each unordered pair only once: the (g1,g0) block holds   ###
        ### the same distances as (g0,g1), and pooling both would duplicate every observation in the test ###
        if pos0 < pos1: allInter.extend(allDists[g0][g1])
        print('   - All Inter-Group Average '+str(g0)+'-'+str(g1)+': '+str(np.mean(allDists[g0][g1])))
    print('\n')

    ### Calculate Average of all Intra and Inter-Group Distances ###
    print(' - All Intra-Group Average Distance: '+str(np.mean(allIntra)))
    print('   - Number of Intra Values: '+str(len(allIntra)))
    print(' - All Inter-Group Average Distance: '+str(np.mean(allInter)))
    print('   - Number of Inter Values: '+str(len(allInter)))

    ### Calculate Significance (undefined when either pool is empty) ###
    if len(allIntra) == 0 or len(allInter) == 0:
        stat, pval = np.nan, np.nan
    else:
        stat, pval = mannwhitneyu(allIntra, allInter, use_continuity=True, alternative='two-sided')
    print(' - Compare All Intra-Inter: p='+str(pval)+' stat='+str(stat))


    
### Run Results for Bray Curtis Consensus Distances ###
### Define the category here so this cell does not depend on a variable set in a later cell ###
mapCategory = 'race'
curDepth = 1000
curMetric = 'bray_curtis'
dmPath = 'test_data/ag_analysis/4_0_beta_diversity/4_0_beta_'+str(curDepth)+'_'+curMetric+'/consensus_dm.txt' 
mapPath = 'test_data/ag_analysis/1_1_qc_'+str(curDepth)+'_map.txt'

beta_summarize_category(dmPath, mapPath, mapCategory)

   - All Intra-Group Average Caucasian-Caucasian: 0.805934447523
   - All Intra-Group Average Hispanic-Hispanic: 0.754242972973
   - All Intra-Group Average Asian or Pacific Islander-Asian or Pacific Islander: 0.829118986416
   - All Intra-Group Average African American-African American: 0.812741666667

   - All Inter-Group Average Caucasian-Hispanic: 0.788113244554
   - All Inter-Group Average Caucasian-Asian or Pacific Islander: 0.823720624403
   - All Inter-Group Average Caucasian-African American: 0.817233919532
   - All Inter-Group Average Hispanic-Caucasian: 0.788113244554
   - All Inter-Group Average Hispanic-Asian or Pacific Islander: 0.809960307125
   - All Inter-Group Average Hispanic-African American: 0.798771954262
   - All Inter-Group Average Asian or Pacific Islander-Caucasian: 0.823720624403
   - All Inter-Group Average Asian or Pacific Islander-Hispanic: 0.809960307125
   - All Inter-Group Average Asian or Pacific Islander-African American: 0.8306
   - All Inter-Group A

In [3]:
### Map Category ###
mapCategory = 'race'

### Number of times to repeatedly subsample ###
numSubSample = 10
### Store Resulting Dataframes in List ###
xList = []

### For each Rarefaction Depth ###
for curDepth in [1000,10000]:

    ### The mapping file only depends on the rarefaction depth ###
    curMapPath = 'test_data/ag_analysis/1_1_qc_'+str(curDepth)+'_map.txt'

    ### For each beta diversity metric ###
    for curMetric in ['bray_curtis','weighted_unifrac','binary_jaccard','unweighted_unifrac']:

        ### Consensus distance matrix for this depth and metric ###
        curDmPath = 'test_data/ag_analysis/4_0_beta_diversity/4_0_beta_'+str(curDepth)+'_'+curMetric+'/consensus_dm.txt'

        ### For each desired depth of subsampling ###
        for subsample in [10,15,20,25,30,35,40,45,50,75,100,250,500,1000,2000]:

            print(str(curDepth)+' - '+curMetric+' - '+str(subsample))
            ### For the Number of Repeated Subsamples ###
            for numSub in range(numSubSample):
                ### Perform the beta intra / inter analysis ###
                xOut = beta_intra_inter(curDmPath, curMapPath, mapCategory, subsample=subsample)
                ### Set metric and depth in output ###
                xOut['metric'] = curMetric
                xOut['depth'] = curDepth
                ### Store Results ###
                xList.append(xOut)

### Concatenate Results ###
xFinal = pd.concat(xList, axis=0)
### Average the repeated subsamples for each comparison ###
xFinal = xFinal.groupby(['dists','type','subsample','g0','g1','metric','depth']).mean().T
### Save all statistics ###
xFinal.to_csv('test_data/ag_analysis/4_1_beta_intra_inter_stats.txt',sep='\t')

##### Table with P-values stacked and other stats removed #####
### Let's make another copy to stack the pvalues by subsample level ###
xManipulate = pd.concat(xList, axis=0)
xStacked = None
### For each subsample depth ###
subsUnique = xManipulate['subsample'].unique()
for subS in subsUnique:
    curRows = xManipulate[xManipulate['subsample'] == subS]
    ### Initialize from the rows of the first subsample depth ###
    if xStacked is None: xStacked = curRows.copy()
    ### And always add a column named after the subsample depth                          ###
    ### (positional: relies on every subsample depth producing its rows in the same order) ###
    xStacked[subS] = list(curRows['pval'])

### Remove Non-pvalue columns (axis passed by keyword: the positional form was removed in pandas 2.0) ###
xStacked = xStacked.drop(['g0mean','g1mean','subsample','pval','tstat','g0count','g1count'], axis=1)
### Groupby ###
xStacked = xStacked.groupby(['dists','type','g0','g1','metric','depth']).mean().T  
### Save ###
xStacked.to_csv('test_data/ag_analysis/4_1_beta_intra_inter_pstacked.txt',sep='\t')

1000 - bray_curtis - 10
1000 - bray_curtis - 15
1000 - bray_curtis - 20
1000 - bray_curtis - 25
1000 - bray_curtis - 30
1000 - bray_curtis - 35
1000 - bray_curtis - 40
1000 - bray_curtis - 45
1000 - bray_curtis - 50
1000 - bray_curtis - 75
1000 - bray_curtis - 100
1000 - bray_curtis - 250
1000 - bray_curtis - 500
1000 - bray_curtis - 1000
1000 - bray_curtis - 2000
1000 - weighted_unifrac - 10
1000 - weighted_unifrac - 15
1000 - weighted_unifrac - 20
1000 - weighted_unifrac - 25
1000 - weighted_unifrac - 30
1000 - weighted_unifrac - 35
1000 - weighted_unifrac - 40
1000 - weighted_unifrac - 45
1000 - weighted_unifrac - 50
1000 - weighted_unifrac - 75
1000 - weighted_unifrac - 100
1000 - weighted_unifrac - 250
1000 - weighted_unifrac - 500
1000 - weighted_unifrac - 1000
1000 - weighted_unifrac - 2000
1000 - binary_jaccard - 10
1000 - binary_jaccard - 15
1000 - binary_jaccard - 20
1000 - binary_jaccard - 25
1000 - binary_jaccard - 30
1000 - binary_jaccard - 35
1000 - binary_jaccard - 40
10