<h1 style="text-align:center; color:orange;">- BIOM Analysis -</h1>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>
<h4 style="text-align:center; color:blue;">Andrew W. Brooks</h4>
<h4 style="text-align:center; color:blue;">Vanderbilt Genetics Institute</h4>
<h4 style="text-align:center; color:blue;">andrew.w.brooks(at)vanderbilt.edu</h4>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>

<h1 style="text-align:center; color:orange;"> - User Input - </h1>

In [1]:
%matplotlib inline

import glob
import os
import pandas as pd
from biom.table import Table
from biom import load_table
import os
import numpy as np
import random
from skbio.stats import subsample
import matplotlib.pyplot as plt


######################################################################################
##### USER INPUT #####################################################################
# Relative paths below are interpreted against the notebook's working directory.

### PATH TO MAPPING FILE ###
# Tab-separated sample metadata; load_map() reads it and indexes it on "#SampleID".
mapPath = "../../Data/3_Custom_Filtering/1_1_qc_1000_map.txt"

### PATH TO BIOM TABLE ###
# Passed unchanged to biom's load_table().
biomPath = "../../Data/3_Custom_Filtering/1_0_qc_1000_table/1_0_qc_1000_table.txt"

### PATH TO FOLDER FOR OUTPUT ###
# File names are appended with plain string concatenation, so keep the trailing "/".
dirPath = "Output/"

### MAPPING CATEGORY ###
# NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed.
mapCat = 'race'

ImportError: No module named matplotlib

<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>
<h1 style="text-align:center; color:orange;"> - Data Import and Basic Functions - </h1>

In [None]:
##### MAKE OUTPUT DIRECTORY #####
### IF OUTPUT DIRECTORY DOESN'T EXIST... THEN MAKE ###
# print(...) with a single argument behaves identically on Python 2 and Python 3.
print(" - Making Output Directory - ")
if not os.path.isdir(dirPath):
    os.makedirs(dirPath)
    
##### IMPORT MAPPING FILE #####
def load_map(mapPathIn):
    """Read a tab-separated mapping file into a DataFrame indexed by sample ID.

    Parameters:
        mapPathIn: path of the mapping file; it must contain a "#SampleID" column.

    Returns:
        pandas.DataFrame whose index is the "#SampleID" column.

    Raises:
        KeyError: if the file has no "#SampleID" column.
    """
    print(" - Importing Mapping File - ")
    # index_col=None and skiprows=0 were pandas defaults; verbose= is deprecated
    # in recent pandas releases, so none of them is passed explicitly any more.
    mapDfIn = pd.read_csv(mapPathIn, sep='\t')
    return mapDfIn.set_index("#SampleID")
mapDf = load_map(mapPath)

##### ADDING METADATA TO BIOM TABLE #####
#print " - Adding Metadata to BIOM - "
#print os.system("biom add-metadata -i " + dirPath + "0_" + biomPath + ".txt" + ".biom -o " + dirPath + "0_" + biomPath + ".meta.biom -m "+ mapPath)
#biomPathMD = dirPath + "0_" + biomPath + ".meta.biom"

##### IMPORT BIOM TABLE #####
# print(...) with a single argument behaves identically on Python 2 and Python 3.
print(" - Importing BIOM Table - ")
biomTable = load_table(biomPath)

### WRITE OTU TABLE TO TSV ###
def write_table(tableWrite, fileName, toJson=True):
    """Write a BIOM table as TSV and optionally convert it to JSON BIOM.

    Parameters:
        tableWrite: table object providing to_tsv().
        fileName:   path of the TSV file to create (overwritten if present).
        toJson:     when true, run `biom convert` to produce fileName + ".biom"
                    and print the converter's exit status.

    Returns:
        None.
    """
    # Local import: the notebook's import cell does not bring in subprocess.
    import subprocess

    print(" - Writing BIOM Table TSV - " + fileName)
    # The context manager closes the handle even if to_tsv() or write() raises.
    with open(fileName, 'w') as f:
        f.write(tableWrite.to_tsv())
    if toJson:
        print(" - Converting TSV BIOM to JSON BIOM - " + fileName + ".biom")
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        cmd = ["biom", "convert",
               "-i", fileName,
               "-o", fileName + ".biom",
               "--table-type=OTU table",
               "--to-json"]
        try:
            status = subprocess.call(cmd)
        except OSError as err:
            # The biom executable could not be started; stay best-effort and
            # report a non-zero status instead of aborting the notebook.
            print(" - Could not run biom convert: " + str(err))
            status = 127
        print(status)
#write_table(biomTable, dirPath + "0_" + biomPath+".txt")

#########################################################################################
##### BASIC TABLE ANALYSIS #####

### Print info about table ###
def get_table_info(bt, writePath=None, printOut = True):
    """Summarise a BIOM table on stdout and/or in a text file.

    Parameters:
        bt:        table providing ids(axis=...), sum(), nnz and get_table_density().
        writePath: if not None, the summary is written to this path (overwritten).
        printOut:  when true, the summary is printed.

    Returns:
        None.
    """
    # Build the report once so the printed and the written text cannot drift apart.
    summary = [
        " - Table Info - ",
        "   Total Observations: " + str(len(bt.ids(axis='observation'))),
        "   Total Samples: " + str(len(bt.ids(axis='sample'))),
        "   Total Counts: " + str(bt.sum()),
        "   Non-Zero Entries: " + str(bt.nnz),
        "   Table Density: " + str(bt.get_table_density()),
    ]
    if printOut:
        for line in summary:
            print(line)
    if writePath is not None:
        # No newline after the last line, matching the historical file layout.
        with open(writePath, 'w') as fOut:
            fOut.write("\n".join(summary))
# print("") emits the blank line on both Python 2 and 3; a bare `print` is a
# silent no-op expression under Python 3.
print("")
get_table_info(biomTable, writePath=dirPath+"0_input_table_summary.txt")
print("")

### Return List of Observations ###
def get_observations(bt):
    """Return the observation IDs of `bt`, i.e. bt.ids(axis='observation')."""
    print(" - Getting List of Observations -> otus")
    return bt.ids(axis='observation')
otus = get_observations(biomTable)

### Return List of Observation Total Counts ###
def counts_observations(bt):
    """Return the per-observation totals of `bt`, i.e. bt.sum('observation')."""
    print(" - Getting List of Observation Counts -> otusCounts")
    return bt.sum('observation')
otusCounts = counts_observations(biomTable)

### Return List of Samples ###
def get_samples(bt):
    """Return the sample IDs of `bt`, i.e. bt.ids(axis='sample')."""
    print(" - Getting List of Samples -> samples")
    return bt.ids(axis='sample')
samples = get_samples(biomTable)

### Return List of Counts for Each Sample ###
def counts_samples(bt):
    """Return the per-sample totals of `bt`, i.e. bt.sum('sample')."""
    print(" - Getting List of Sample Counts -> samplesCounts")
    return bt.sum('sample')
samplesCounts = counts_samples(biomTable)

### GET METADATA CATEGORIES ###
def get_table_metadata_categories(mappingFileDataframe):
    """Return the column names of the mapping DataFrame as a plain list, in column order."""
    print(" - Getting Metadata Categories -> metaCats")
    return list(mappingFileDataframe.columns)
metaCats = get_table_metadata_categories(mapDf)

### ADD METADATA TO BIOMTABLE ###
def add_metadata(bt, mapDfIn, metaCatsIn):
    """Attach per-sample metadata from the mapping DataFrame to a BIOM table.

    Parameters:
        bt:         BIOM table; it is modified in place.
        mapDfIn:    mapping DataFrame indexed by sample ID.
        metaCatsIn: metadata category names, in the column order of mapDfIn.

    Returns:
        The same table `bt`, so `biomTable = add_metadata(...)` keeps working.

    Raises:
        KeyError: if a sample of `bt` is missing from `mapDfIn`.
    """
    print(" - Add Sample Metadata from Map to Biom Table - ")
    metaDict = {}
    # Loop through samples #
    for sampleId in bt.ids(axis='sample'):
        # Pair each category name with the value in this sample's mapping row.
        metaDict[sampleId] = dict(zip(metaCatsIn, mapDfIn.loc[sampleId]))
    # Table.add_metadata takes {id: {category: value}}; add_group_metadata (used
    # previously) is a different API and is not meant for per-sample metadata.
    bt.add_metadata(metaDict, axis='sample')
    return bt
#biomTable = add_metadata(biomTable, mapDf, metaCats)

#########################################################################################
##### TABLE FILTERING #####

### FILTER OTU MINCOUNT ### - remove otus with < mincount
def filter_otu_mincount(bt, mincount):
    """Return a filtered copy of `bt` keeping observations whose total count is >= mincount."""
    def keep_observation(counts, obs_id, obs_md):
        return sum(counts) >= mincount
    return bt.filter(keep_observation, axis='observation', inplace=False)

### FILTER OTU MAXCOUNT ### - remove otus with > maxcount
def filter_otu_maxcount(bt, maxcount):
    """Return a filtered copy of `bt` keeping observations whose total count is <= maxcount."""
    def keep_observation(counts, obs_id, obs_md):
        return sum(counts) <= maxcount
    return bt.filter(keep_observation, axis='observation', inplace=False)

### FILTER SAMPLE MINCOUNT ### - remove samples with < mincount
def filter_sample_mincount(bt, mincount):
    """Return a filtered copy of `bt` keeping samples whose total count is >= mincount."""
    def keep_sample(counts, sample_id, sample_md):
        return sum(counts) >= mincount
    return bt.filter(keep_sample, axis='sample', inplace=False)

### FILTER SAMPLE MAXCOUNT ### - remove samples with > maxcount
def filter_sample_maxcount(bt, maxcount):
    """Return a filtered copy of `bt` keeping samples whose total count is <= maxcount."""
    def keep_sample(counts, sample_id, sample_md):
        return sum(counts) <= maxcount
    return bt.filter(keep_sample, axis='sample', inplace=False)

### FILTER OTU LISTKEEP ### - remove all otus from table not in listkeep
def filter_otu_listkeep(bt, list_to_keep):
    """Return a filtered copy of `bt` keeping only observations whose ID is in list_to_keep.

    Parameters:
        bt:           BIOM table (left unmodified; filter runs with inplace=False).
        list_to_keep: iterable of observation IDs to retain.
    """
    # Hash the IDs once so each membership test is O(1) instead of a scan of
    # the whole list; fall back to the given container if an entry is unhashable.
    try:
        wanted = frozenset(list_to_keep)
    except TypeError:
        wanted = list_to_keep

    def keep_observation(counts, obs_id, obs_md):
        return obs_id in wanted
    return bt.filter(keep_observation, axis='observation', inplace=False)

### FILTER OTU LISTREMOVE ### - give a list of otu's to remove from table
def filter_otu_listremove(bt, list_to_remove):
    """Return a filtered copy of `bt` without the observations whose ID is in list_to_remove.

    Parameters:
        bt:             BIOM table (left unmodified; filter runs with inplace=False).
        list_to_remove: iterable of observation IDs to drop.
    """
    # Hash the IDs once so each membership test is O(1) instead of a scan of
    # the whole list; fall back to the given container if an entry is unhashable.
    try:
        unwanted = frozenset(list_to_remove)
    except TypeError:
        unwanted = list_to_remove

    def keep_observation(counts, obs_id, obs_md):
        return obs_id not in unwanted
    return bt.filter(keep_observation, axis='observation', inplace=False)

### FILTER SAMPLE LISTKEEP ### - remove all samples from table not in listkeep
def filter_sample_listkeep(bt, list_to_keep):
    """Return a filtered copy of `bt` keeping only samples whose ID is in list_to_keep.

    Parameters:
        bt:           BIOM table (left unmodified; filter runs with inplace=False).
        list_to_keep: iterable of sample IDs to retain.
    """
    # Hash the IDs once so each membership test is O(1) instead of a scan of
    # the whole list; fall back to the given container if an entry is unhashable.
    try:
        wanted = frozenset(list_to_keep)
    except TypeError:
        wanted = list_to_keep

    def keep_sample(counts, sample_id, sample_md):
        return sample_id in wanted
    return bt.filter(keep_sample, axis='sample', inplace=False)

### FILTER SAMPLE LISTREMOVE ### - give a list of samples to remove from table
def filter_sample_listremove(bt, list_to_remove):
    """Return a filtered copy of `bt` without the samples whose ID is in list_to_remove.

    Parameters:
        bt:             BIOM table (left unmodified; filter runs with inplace=False).
        list_to_remove: iterable of sample IDs to drop.
    """
    # Hash the IDs once so each membership test is O(1) instead of a scan of
    # the whole list; fall back to the given container if an entry is unhashable.
    try:
        unwanted = frozenset(list_to_remove)
    except TypeError:
        unwanted = list_to_remove

    def keep_sample(counts, sample_id, sample_md):
        return sample_id not in unwanted
    return bt.filter(keep_sample, axis='sample', inplace=False)

### FILTER TO SAMPLES IN METADATA CATEGORY:GROUP ### - keep only samples in the specified group
def filter_metadata_contain(bt, metadata_category, metadata_group):
    """Return a filtered copy of `bt` keeping samples whose metadata[metadata_category] equals metadata_group."""
    def in_group(counts, sample_id, sample_md):
        return sample_md[metadata_category] == metadata_group
    return bt.filter(in_group, axis='sample', inplace=False)

### FILTER TO SAMPLES NOT IN METADATA CATEGORY:GROUP ### - keep only samples not in the specified group
def filter_metadata_exclude(bt, metadata_category, metadata_group):
    """Return a filtered copy of `bt` keeping samples whose metadata[metadata_category] differs from metadata_group."""
    def outside_group(counts, sample_id, sample_md):
        return sample_md[metadata_category] != metadata_group
    return bt.filter(outside_group, axis='sample', inplace=False)

#########################################################################################
##### TABLE MANIPULATION #####

### RAREFACTION ###
# Take in BIOM Table and Subsample each Sample to samplingDepth
# Returns Rarefied BIOM Table Object
def rarefaction(bt, samplingDepth = 1000, warnSkippedSamples=False):
    """Subsample every sufficiently deep sample of `bt` to exactly samplingDepth counts.

    Parameters:
        bt:                 BIOM table to rarefy (not modified).
        samplingDepth:      number of counts drawn per sample, without replacement.
        warnSkippedSamples: when true, print the ID of every sample that has
                            fewer than samplingDepth counts.

    Returns:
        A new Table built from the subsampled counts. Samples below the depth
        are left out, and only the data and the IDs are passed to the Table
        constructor, so no metadata is carried over.
    """
    # NOTE(review): no random seed is set in this function, so results differ
    # between runs unless the caller seeds the generator - confirm intent.
    # NOTE(review): newer scikit-bio releases appear to expose this routine as
    # subsample_counts - confirm against the installed version.
    dataCounts = []
    samplesDataCounts = []
    # Iterate over All of the Samples #
    for values, sampleId, _metadata in bt.iter(axis='sample'):
        # Subsample to the Specified Depth #
        if values.sum() >= samplingDepth:
            # Store Subsampled Counts for Sample #
            dataCounts.append(subsample(values.astype(int), samplingDepth, replace=False))
            # Store Sample ID #
            samplesDataCounts.append(sampleId)
        # If Sample has < Counts than samplingDepth: Print Warning #
        elif warnSkippedSamples:
            print("   Warning Skipped Sample :  " + str(sampleId))
    # Return BIOM Table Object #
    # Rows of dataCounts are samples; transpose to observations x samples.
    return Table(np.matrix(dataCounts).T, bt.ids(axis='observation'), samplesDataCounts)

### RETURN RELATIVE ABUNDANCE TABLE ###
def get_table_relative(bt):
    """Return the result of bt.norm(axis='sample', inplace=False): a per-sample normalised copy."""
    return bt.norm(axis='sample', inplace=False)

### BIOM COUNT ARRAY ### ! MAY RUN SLOW !
# Takes a biom table and converts the counts to a numpy array
# counts[sample][observation]
def count_array(bt):
    """Return the counts of `bt` as a numpy array with one row per sample.

    Rows follow the order of bt.iter(axis='sample'); each row is that sample's
    vector of values.
    """
    print(" - Getting Numpy Array of Counts - ")
    # Collect each sample's vector directly instead of pre-allocating a list
    # and filling it by position.
    return np.array([values for values, _sampleId, _metadata in bt.iter(axis='sample')])

### COLLAPSE SAMPLES BY METADATA CATEGORY ###
def collapse_meta(bt, metadata_category):
    """Collapse the samples of `bt` that share the same value of a metadata category.

    Parameters:
        bt:                BIOM table whose samples carry metadata.
        metadata_category: metadata key used to group the samples.

    Returns:
        The result of bt.collapse(..., axis='sample').
    """
    def collapse_f(id_, md):
        value = md[metadata_category]
        # Only list-like values are joined; joining a plain string such as
        # 'Caucasian' would interleave '; ' between its characters.
        if isinstance(value, (list, tuple)):
            return '; '.join(value)
        return value
    return bt.collapse(collapse_f, axis='sample')

### COLLAPSE OTUs AT TAXONOMIC LEVEL ###
# tax_level: 0 = Kingdom | 1 = Phylum | 2 = Class | 3 = Order | 4 = Family | 5 = Genus | 6 = Species
def collapse_taxonomy(bt, tax_level):
    """Collapse observations sharing the same taxonomy prefix down to tax_level (inclusive).

    The group key is the first tax_level + 1 entries of each observation's
    'taxonomy' metadata joined with '; '; collapse is called with norm=False.
    """
    def taxon_key(obs_id, obs_md):
        lineage = obs_md['taxonomy']
        return '; '.join(lineage[:tax_level + 1])
    return bt.collapse(taxon_key, axis='observation', norm=False)

### CONVERT TO PRESENCE / ABSENCE TABLE ###
# Table contains 1 if count >0 or 0
def presence_absence(bt):
    """Return a presence/absence version of `bt`.

    pa() is called with inplace=False so the input table keeps its counts,
    consistent with the other helpers in this notebook, which all request
    inplace=False.
    """
    return bt.pa(inplace=False)


#########################################################################################
##### USER GUIDE: DATA STRUCTURES, HELPER FUNCTIONS AND ITERATION EXAMPLES #####
# Only prints a reference card; nothing inside the string below is executed.
# print(...) with a single argument behaves identically on Python 2 and Python 3.

print("""

##### USER GUIDE ####################################################

### DATA STRUCTURES ###
 - Output Directory             -> dirPath
 - Mapping File                 -> mapDf
 - BIOM Table                   -> biomTable
 - List of Observations         -> otus
 - List of Observation Counts   -> otusCounts
 - List of Samples              -> samples
 - List of Sample Counts        -> samplesCounts
 - Metadata Categories          -> metaCats
 
### I/O ###
 - *load_map(mapPath)                                           -> mapDf 
 - *load_table(biomPath)                                        -> biomTable
 - write_table(tableWrite, fileName, toJson=True)
 
### BASIC INFO ###
 - *get_table_info(bt, writePath=None, printOut = True)
 - *get_observations(bt)                                        -> otus
 - *counts_observations(bt)                                     -> otusCounts
 - *get_samples(bt)                                             -> samples
 - *counts_samples(bt)                                          -> samplesCounts
 - *get_table_metadata_categories(mappingFileDataframe)         -> metaCats
 
### FILTER TABLE ###
 - filter_otu_mincount(bt, mincount)
 - filter_otu_maxcount(bt, maxcount)
 - filter_sample_mincount(bt, mincount)
 - filter_sample_maxcount(bt, maxcount)
 - filter_otu_listkeep(bt, list_to_keep)
 - filter_otu_listremove(bt, list_to_remove)
 - filter_sample_listkeep(bt, list_to_keep)
 - filter_sample_listremove(bt, list_to_remove)
 - filter_metadata_contain(bt, metadata_category, metadata_group)
 - filter_metadata_exclude(bt, metadata_category, metadata_group)

### CONVERT TABLE ###
 - rarefaction(bt, samplingDepth = 1000, warnSkippedSamples=False)
 - get_table_relative(bt)
 - count_array(bt)
 - collapse_meta(bt, metadata_category)
 - collapse_taxonomy(bt, tax_level)
 - presence_absence(bt)

### ITERATE OVER OTUS ###
iterOTUs = biomTable.iter(axis='observation')
for values, id, metadata in iterOTUs:
    print id
    biomTable.data(id, axis='observation')
    
### ITERATE OVER SAMPLES ###
iterSamples = biomTable.iter(axis='sample')
for values, id, metadata in iterSamples:
    print id
    biomTable.data(id, axis='sample')

### ACCESS BY VALUES ###
table.get_value_by_ids('OBSERVATION_ID', 'SAMPLE_ID')

* ALREADY RUN

""")

<h3 style="text-align:center; color:black;">------------------------------------------------------------------------------</h3>
<h3 style="text-align:center; color:orange;"> - IMPORT FAMILY LEVEL RELATIVE ABUNDANCE BIOM TABLE - </h3>

In [None]:
# Family-level table, loaded with the same load_table() used for the main table.
# NOTE(review): hard-coded absolute path into one user's home directory; unlike the
# paths in the USER INPUT cell it will not resolve on another machine - confirm/relocate.
famPath = "/Users/brooks/Dropbox/American_Gut/Data/21_0_summarize_taxa/20_2_filter_country_L5.txt"
famTable = load_table(famPath)

<h3 style="text-align:center; color:orange;"> - EXTRACT CHRISTENSENELLACEAE & ADD TO MAPPING FILE - </h3>

In [None]:
### ITERATE OVER OTUS - EXTRACT CHRISTENSENELLACEAE ###
christId = "k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Christensenellaceae"
iterOTUs = famTable.iter(axis='observation')
for values, id, metadata in iterOTUs:
    if id == christId:
        print(id)
        break
else:
    # Without this guard `values` would silently hold the LAST row of the table
    # and every downstream statistic would describe the wrong taxon.
    raise ValueError("Observation not found in family table: " + christId)

# `values` is the Christensenellaceae row (one entry per sample of famTable).
# NOTE(review): axis=1 concat aligns on sample ID as an outer join, so samples
# present in only one of the two inputs get NaN in the other's columns - confirm.
mapChrist = pd.concat([mapDf, pd.DataFrame(data=values, index=get_samples(famTable), columns=['Christ'])], axis=1)

<h3 style="text-align:center; color:orange;"> - AVERAGE BMI WITH AND WITHOUT CHRISTENSENELLACEAE - </h3>

In [None]:
### AVERAGE BMI'S ###

mapAllWith = mapChrist[mapChrist['Christ'] > 0]
mapAllWithout = mapChrist[mapChrist['Christ'] == 0]

# One report per group: all individuals (no title), then those with and those
# without Christensenellaceae. print(...) with one argument is valid on Python 2 and 3.
for groupTitle, groupMap in [(None, mapChrist),
                             ("ALL WITH CHRISTENSENELLACEAE", mapAllWith),
                             ("ALL WITHOUT CHRISTENSENELLACEAE", mapAllWithout)]:
    if groupTitle is not None:
        print(groupTitle)
    print("Number of Individuals :" + str(len(groupMap)))
    print("Average BMI: " + str(groupMap['bmi'].mean()))
    print("")

##################################################################################
### Function - Mann-Whitney U Test - Nonparametric rank test of lists
# Null hypothesis: two samples from the same population
# Alternative hypothesis: one population tends to have larger values than the other [Wikipedia]
# N samples is > 20 and you have 2 independent samples of ranks (can be unequal lengths) [Scipy]
# The two-tailed p-value is requested from SciPy explicitly; the one-tailed value is half of it
import scipy.stats as sp
# IN: Two independent lists of floats
# OUT: Mann Whitney Test Statistic and P-Value
def list_mannwhitney(l1, l2, outFile=None, bonferroniComparisons=1):
    """Run a Mann-Whitney U test on two samples and report it.

    Parameters:
        l1, l2:                the two independent samples.
        outFile:               optional open, writable text handle; every
                               report line is also written to it.
        bonferroniComparisons: factor applied to the two-tailed p-value for
                               the Bonferroni-corrected line.

    Returns:
        The object returned by scipy.stats.mannwhitneyu (statistic first,
        p-value second). With a SciPy that accepts `alternative`, that
        p-value is the two-sided one; the printed report labels each value.
    """
    # use_continuity = Whether a continuity correction (1/2.) should be taken into account. Default is True. [Scipy]
    try:
        # Ask for the two-sided test explicitly: the default alternative, and
        # therefore the meaning of the returned p-value, differs between
        # SciPy releases.
        outMann = sp.mannwhitneyu(l1, l2, use_continuity=True, alternative='two-sided')
        pTwo = outMann[1]
    except TypeError:
        # Old SciPy without `alternative`: the returned p-value is one-sided.
        outMann = sp.mannwhitneyu(l1, l2, use_continuity=True)
        pTwo = outMann[1] * 2
    # Probabilities are capped at 1; the p-value is the first argument of min()
    # so that a NaN result stays NaN instead of being reported as 1.0.
    pTwo = min(pTwo, 1.0)
    pOne = pTwo / 2.0
    pBonferroni = min(pTwo * bonferroniComparisons, 1.0)

    report = [
        "Mann Whitney U - Nonparametric Rank Test",
        "    List #1 Length: "+str(len(l1))+" | List #2 Length: "+str(len(l2)),
        "    Test Statistic: "+str(outMann[0]),
        "    P-Value (onetailed): "+str(pOne),
        "    P-Value (twotailed): "+str(pTwo),
        "    P-Value Bonferroni Corrected (twotailed): "+str(pBonferroni),
    ]
    for line in report:
        print(line)
        if outFile is not None:
            outFile.write(line + "\n")
    print("")
    return outMann
##################################################################################

### COMPARE THOSE WITH TO THOSE WITHOUT USING LIST_MANNWHITNEY ###
list_mannwhitney(mapAllWith['bmi'], mapAllWithout['bmi'])

<h3 style="text-align:center; color:orange;"> - UBIQUITY OF CHRISTENSENELLACEAE - </h3>

In [None]:
### GET UBIQUITY BY RACE - ISOLATE RACES AS SEPARATE TABLES ###
# One boolean-mask selection per race, unpacked in the order AA, AS, CA, HI.
mapAA, mapAS, mapCA, mapHI = [
    mapChrist[mapChrist['race'] == raceName]
    for raceName in ("African American",
                     "Asian or Pacific Islander",
                     "Caucasian",
                     "Hispanic")
]

#### UBIQUITY FUNCTION ####
def ubiquityCount(dfIn):
    """Print how many individuals in `dfIn` carry Christensenellaceae.

    Parameters:
        dfIn: DataFrame with a numeric 'Christ' column.

    Prints the number of individuals with Christ > 0, the number of
    individuals with a (non-missing) Christ value, and their ratio.
    """
    christ = dfIn['Christ']
    # `> 0` treats a missing (NaN) measurement as "not detected"; the previous
    # astype(bool) turned NaN into True and counted it as present.
    nonZero = (christ > 0).sum()
    # count() ignores missing values, so unmeasured individuals do not dilute
    # the denominator; without NaN it equals len(christ).
    total = christ.count()
    # An empty subset yields nan instead of raising ZeroDivisionError.
    ubiquity = float(nonZero) / float(total) if total else float('nan')
    print("Non-Zero Individuals      : " + str(nonZero))
    print("Total Individuals         : " + str(total))
    print("Ubiquity                  : " + str(ubiquity))
    print("")

# CALCULATE UBIQUITY #
# Label, then report, for each race subset in the order AA, AS, CA, HI.
for raceCode, raceMap in [("AA", mapAA), ("AS", mapAS), ("CA", mapCA), ("HI", mapHI)]:
    print(raceCode)
    ubiquityCount(raceMap)


<h3 style="text-align:center; color:orange;"> - AVERAGE BMI AND CHRISTENSENELLACEAE BY RACE - </h3>
<h4 style="text-align:center; color:orange;"> - This includes the boxplots generated for figure 4 - </h4>
<h4 style="text-align:center; color:orange;"> - and the statistics of BMI for those with and without Christensenellaceae used in figure 4 - </h4>
<h4 style="text-align:center; color:orange;"> - Also includes some additional stats and an additional regression figure not used in manuscript - </h4>

In [None]:
### GET AVERAGE BMI AND CHRISTENSENELLACEAE FOR EACH RACE - INCLUDING FOR THOSE WITH AND WITHOUT PRESENCE SUBSET ###
# Get log10 transformation of counts
# NOTE: rows with Christ == 0 become -inf here (numpy also warns about dividing by zero).
mapChrist["ChristLog"] = np.log10(mapChrist["Christ"])
# Get arcsin sqrt transformation of counts (see Structure, Function, and Diversity of the Healthy Human Microbiome)
mapChrist["ChristArcSin"] = np.arcsin(np.sqrt(mapChrist["Christ"]))

# Report order and the two-letter code printed in front of each race.
_raceCodes = [('AA', 'African American'),
              ('AS', 'Asian or Pacific Islander'),
              ('CA', 'Caucasian'),
              ('HI', 'Hispanic')]

def _print_race_means(dfIn):
    """Print mean BMI and mean Christensenellaceae per race, blank line between races."""
    for position, (raceCode, raceName) in enumerate(_raceCodes):
        if position:
            print("")
        raceRows = dfIn[dfIn['race'] == raceName]
        print(raceCode + ' Mean BMI                : ' + str(np.mean(raceRows['bmi'])))
        print(raceCode + ' Mean Christensenellaceae: ' + str(np.mean(raceRows['Christ'])))

print("GET AVERAGE BMI AND CHRISTENSENELLACEAE FOR EACH RACE - INCLUDING FOR THOSE WITH AND WITHOUT PRESENCE SUBSET")
# FOR ALL INDIVIDUALS
_print_race_means(mapChrist)

# Remove samples w/ no Christ
# `> 0` (rather than `!= 0`) also drops rows whose Christ value is missing, so
# this subset matches the `Christ > 0` definition used for mapAllWith.
mapChristR = mapChrist[mapChrist["Christ"] > 0]
# FOR ONLY INDIVIDUALS WITH CHRISTENSENELLACEAE #
print("")
print('-----------------------------------------------')
_print_race_means(mapChristR)

# Remove samples w/ Christ
mapChristRWO = mapChrist[mapChrist["Christ"] == 0]
# FOR ONLY INDIVIDUALS WITHOUT CHRISTENSENELLACEAE #
print("")
print('-----------------------------------------------')
_print_race_means(mapChristRWO)

##################################################################################
##################################################################################
##################################################################################

### COMPARE WITH AND WITHOUT BMIs - EXTRACT THOSE WITH AND WITHOUT CHRISTENSENELLACEAE FOR EACH RACE ###
# EACH RACE WITH CHRISTENSENELLACEAE # (unpacked in the order AA, AS, CA, HI)
mapAAr, mapASr, mapCAr, mapHIr = [
    mapChristR[mapChristR['race'] == raceName]
    for raceName in ("African American",
                     "Asian or Pacific Islander",
                     "Caucasian",
                     "Hispanic")
]
# EACH RACE WITHOUT CHRISTENSENELLACEAE # (same order)
mapAArwo, mapASrwo, mapCArwo, mapHIrwo = [
    mapChristRWO[mapChristRWO['race'] == raceName]
    for raceName in ("African American",
                     "Asian or Pacific Islander",
                     "Caucasian",
                     "Hispanic")
]

##################################################################################
### Function - Mann-Whitney U Test - Nonparametric rank test of lists
# Null hypothesis: two samples from the same population
# Alternative hypothesis: one population tends to have larger values than the other [Wikipedia]
# N samples is > 20 and you have 2 independent samples of ranks (can be unequal lengths) [Scipy]
# The two-tailed p-value is requested from SciPy explicitly; the one-tailed value is half of it
import scipy.stats as sp
# IN: Two independent lists of floats
# OUT: Mann Whitney Test Statistic and P-Value
def list_mannwhitney(l1, l2, outFile=None, bonferroniComparisons=1):
    """Run a Mann-Whitney U test on two samples and report it.

    Parameters:
        l1, l2:                the two independent samples.
        outFile:               optional open, writable text handle; every
                               report line is also written to it.
        bonferroniComparisons: factor applied to the two-tailed p-value for
                               the Bonferroni-corrected line.

    Returns:
        The object returned by scipy.stats.mannwhitneyu (statistic first,
        p-value second). With a SciPy that accepts `alternative`, that
        p-value is the two-sided one; the printed report labels each value.
    """
    # use_continuity = Whether a continuity correction (1/2.) should be taken into account. Default is True. [Scipy]
    try:
        # Ask for the two-sided test explicitly: the default alternative, and
        # therefore the meaning of the returned p-value, differs between
        # SciPy releases.
        outMann = sp.mannwhitneyu(l1, l2, use_continuity=True, alternative='two-sided')
        pTwo = outMann[1]
    except TypeError:
        # Old SciPy without `alternative`: the returned p-value is one-sided.
        outMann = sp.mannwhitneyu(l1, l2, use_continuity=True)
        pTwo = outMann[1] * 2
    # Probabilities are capped at 1; the p-value is the first argument of min()
    # so that a NaN result stays NaN instead of being reported as 1.0.
    pTwo = min(pTwo, 1.0)
    pOne = pTwo / 2.0
    pBonferroni = min(pTwo * bonferroniComparisons, 1.0)

    report = [
        "Mann Whitney U - Nonparametric Rank Test",
        "    List #1 Length: "+str(len(l1))+" | List #2 Length: "+str(len(l2)),
        "    Test Statistic: "+str(outMann[0]),
        "    P-Value (onetailed): "+str(pOne),
        "    P-Value (twotailed): "+str(pTwo),
        "    P-Value Bonferroni Corrected (twotailed): "+str(pBonferroni),
    ]
    for line in report:
        print(line)
        if outFile is not None:
            outFile.write(line + "\n")
    print("")
    return outMann
##################################################################################

### COMPARE BMIS OF THOSE WITH CHRISTENSENELLACEAE TO THOSE WITHOUT FOR EACH RACE USING MANN WHITNEY U TEST ###
print("COMPARE BMIS OF THOSE WITH CHRISTENSENELLACEAE TO THOSE WITHOUT FOR EACH RACE USING MANN WHITNEY U TEST")
print("COMPARE EACH RACES BMI WITH AND WITHOUT CHRISTENSENELLACEAE #")
# Blank line, race code, then the test of BMI with vs. without, per race.
for raceCode, raceWith, raceWithout in [("AA", mapAAr, mapAArwo),
                                        ("AS", mapASr, mapASrwo),
                                        ("CA", mapCAr, mapCArwo),
                                        ("HI", mapHIr, mapHIrwo)]:
    print("")
    print(raceCode)
    list_mannwhitney(raceWith['bmi'], raceWithout['bmi'])
print("")

##################################################################################
##################################################################################
##################################################################################

import scipy
#### PLOT OF AVERAGES - ONE POINT PER RACE PER TABLE: X = MEAN BMI, Y = MEAN CHRISTENSENELLACEAE ####
#### NOT USED IN MANUSCRIPT - THE REGRESSION OUTPUT IS ONLY INTERESTING FOR THE SLOPES (P-VALS USELESS) ####
plt.figure(figsize=[10,5])
print("PLOT OF AVERAGES - THIS IS A SCATTER PLOT OF THE AVERAGE BMI BY THE AVERAGE CHRISTENSENELLACEAE FOR THOSE WITH AND WITHOUT THE PRESENECE OF CHRISTENSENELLACEAE")
print("NOT USED IN MANUSCRIPT - BUT HAS REGRESSIONS LINES AND REGRESSIONS STATISTICS OUTPUT WHICH ARE REALLY ONLY INTERESTING TO EVALUATE SLOPE (P-VALS USELESS)")

# (race value in the 'race' column, single-letter matplotlib colour) in plotting order.
avgRaceStyles = [
    ('African American', 'y'),
    ('Asian or Pacific Islander', 'b'),
    ('Caucasian', 'r'),
    ('Hispanic', 'g'),
]

# (table, marker): mapChrist = all individuals, mapChristR = individuals with,
# mapChristRWO = presumably individuals without (judging by the name and the plot title).
avgTableMarkers = [(mapChrist, '^'), (mapChristR, '>'), (mapChristRWO, '*')]


def avgByRace(tableIn, raceName, columnName):
    # Mean of one column over the rows of tableIn whose 'race' equals raceName.
    return np.mean(tableIn[tableIn['race'] == raceName][columnName])


def reportFit(titleIn, slopeIn, rValueIn, pValueIn):
    # Blank line, a title line, then slope / R^2 / p-value of a linear fit.
    print("")
    print(titleIn)
    print("Slope: " + str(slopeIn))
    print("R^2  : " + str(rValueIn*rValueIn))
    print("p-val: " + str(pValueIn))


# SCATTER: for every table, one marker per race at (mean BMI, mean Christ) #
for avgTable, avgMarker in avgTableMarkers:
    for raceName, raceColor in avgRaceStyles:
        plt.scatter(avgByRace(avgTable, raceName, 'bmi'),
                    avgByRace(avgTable, raceName, 'Christ'),
                    c=raceColor, s=100, marker=avgMarker)

# REGRESSION ACROSS THE FOUR RACE MEANS OF INDIVIDUALS WITH (mapChristR) #
withBmiMeans = [avgByRace(mapChristR, raceName, 'bmi') for raceName, raceColor in avgRaceStyles]
withChristMeans = [avgByRace(mapChristR, raceName, 'Christ') for raceName, raceColor in avgRaceStyles]

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(withBmiMeans, withChristMeans)
# The fitted line is evaluated at every BMI in mapChrist so it spans the data range.
line = slope*mapChrist["bmi"]+intercept
plt.plot(mapChrist["bmi"],line, 'k--')
reportFit("All With Regression", slope, r_value, p_value)

# SAME FOUR POINTS WITH THE AXES SWAPPED (Christ as x, BMI as y) - STATISTICS ONLY, NO LINE DRAWN #
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(withChristMeans, withBmiMeans)
reportFit("All With Regression Transformed", slope, r_value, p_value)

# PER-RACE REGRESSION THROUGH THAT RACE'S THREE POINTS (mapChrist, mapChristR, mapChristRWO) #
for raceName, raceColor in avgRaceStyles:
    raceBmiMeans = [avgByRace(avgTable, raceName, 'bmi') for avgTable, avgMarker in avgTableMarkers]
    raceChristMeans = [avgByRace(avgTable, raceName, 'Christ') for avgTable, avgMarker in avgTableMarkers]
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(raceBmiMeans, raceChristMeans)
    line = slope*mapChrist["bmi"]+intercept
    # Dashed line in the same colour as the race's markers.
    plt.plot(mapChrist["bmi"],line, raceColor + '--')
    reportFit(raceName + " Regression", slope, r_value, p_value)

# LABELS, LAYOUT, SAVE AND SHOW #
plt.xlabel("BMI")
plt.ylabel("Relative Christensenellaceae")
plt.title("Christensenellaceae by BMI for Each Race - All Individuals & with / without")
plt.tight_layout()
plt.ylim(-0.001, 0.007)
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/AVG_PLOT2.pdf")
plt.show()


### PER-RACE REGRESSION WITH THE AXES SWAPPED: MEAN CHRISTENSENELLACEAE AS X, MEAN BMI AS Y ###
### Statistics are printed only - nothing is drawn in this section. ###
print("NOT USED IN MANUSCRIPT - REGRESSIONS STATISTICS FOR TRANSFORMED CHRISTENSENELLACEAE ABUNDANCE OUTPUT WHICH ARE REALLY ONLY INTERESTING TO EVALUATE SLOPE (P-VALS USELESS)")

for raceName in ['African American', 'Asian or Pacific Islander', 'Caucasian', 'Hispanic']:
    # Three points per race, one from each table: mapChrist, mapChristR, mapChristRWO.
    christMeans = []
    bmiMeans = []
    for avgTable in (mapChrist, mapChristR, mapChristRWO):
        raceRows = avgTable[avgTable['race'] == raceName]
        christMeans.append(np.mean(raceRows['Christ']))
        bmiMeans.append(np.mean(raceRows['bmi']))

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(christMeans, bmiMeans)

    print("")
    print(raceName + " Regression")
    print("Slope: " + str(slope))
    print("R^2  : " + str(r_value*r_value))
    print("p-val: " + str(p_value))


### BOXPLOTS BY RACE OF BMI AND OF TRANSFORMED CHRISTENSENELLACEAE ABUNDANCE - EACH FIGURE IS SAVED AS A PDF ###
### showmeans=True adds a marker for the mean of each box. ###
print("BOXPLOTS OF BMIS FOR INDIVDIDUALS WITH AND WITHOUT CHRISTENSENELLACEAE USED IN MANUSCRIPT - MEANS ARE INCLUDED AS RED BOXES")

# NOTE: in every stanza the axis limits are set BEFORE plt.savefig so that the
# saved PDF has the same y-range as the figure shown in the notebook.

# BMI - table mapChrist
ax1 = mapChrist.boxplot(column='bmi', by='race', figsize=[15,8], showmeans=True)
plt.title('BMI of All Individuals')
ax1.set_ylim(0,60)
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/bmi_all.pdf")

# BMI - table mapChristR
ax2 = mapChristR.boxplot(column='bmi', by='race', figsize=[15,8], showmeans=True)
plt.title('BMI With Christensenellaceae')
ax2.set_ylim(0,60)
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/bmi_with.pdf")

# BMI - table mapChristRWO
ax3 = mapChristRWO.boxplot(column='bmi', by='race', figsize=[15,8], showmeans=True)
plt.title('BMI Without Christensenellaceae')
ax3.set_ylim(0,60)
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/bmi_without.pdf")

# Log10 abundance - table mapChristR
# (limits were previously applied after the file had already been written)
ax4 = mapChristR.boxplot(column='ChristLog', by='race', figsize=[15,8], showmeans=True)
plt.title('Christensenellaceae Log10 With Only')
ax4.set_ylim(-5,0)
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/log_abund.pdf")

# Log10 abundance - table mapChrist
# (same ordering fix as above; ax5 still ends up bound to this plot)
ax5 = mapChrist.boxplot(column='ChristLog', by='race', figsize=[15,8], showmeans=True)
plt.title('Christensenellaceae Log10 All')
ax5.set_ylim(-5,0)
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/log_abund_all.pdf")

# ArcSin sqrt abundance - table mapChristR (default axis limits)
ax6 = mapChristR.boxplot(column='ChristArcSin', by='race', figsize=[15,8], showmeans=True)
plt.title('Christensenellaceae ArcSin Sqrt With Only')
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/arcsin_sqrt_abund.pdf")


<h3 style="text-align:center; color:orange;"> - REGRESSION OF BMI AGAINST CHRISTENSENELLACEAE ABUNDANCE - </h3>
<h4 style="text-align:center; color:orange;"> - These are regressions only including individuals with Christensenellaceae - </h4>

In [None]:
# Regress transformed Christensenellaceae abundance on BMI using only the samples
# in which Christensenellaceae is present; one figure per transformation.

# Get log10 transformation of counts
mapChrist["ChristLog"] = np.log10(mapChrist["Christ"])
# Get arcsin sqrt transformation of counts (see Structure, Function, and Diversity of the Healthy Human Microbiome)
mapChrist["ChristArcSin"] = np.arcsin(np.sqrt(mapChrist["Christ"]))

# Set Colors
colors = {'African American':'red', 'Asian or Pacific Islander':'blue', 'Caucasian':'green', 'Hispanic':'yellow'}

# Remove samples w/ no Christ.
# The subset gets its own name and is an explicit copy: rebinding mapChrist here
# would silently drop the zero-abundance samples for every cell run afterwards,
# and assigning a column on an un-copied slice is ambiguous in pandas.
mapChristWith = mapChrist[mapChrist["Christ"] != 0].copy()

# Floor any -inf log value at -10. The sentinel must be the float -np.inf; the
# string '-inf' never matches values in a float column. With the zero rows
# removed above this is normally a no-op and is kept only as a guard.
mapChristWith["ChristLog"] = mapChristWith["ChristLog"].replace(to_replace=-np.inf, value=-10)

# One colour per sample, looked up from its race.
withColors = mapChristWith['race'].apply(lambda x: colors[x])

# Plot ArcSin
mapChristWith.plot(kind='scatter', x='bmi', y='ChristArcSin', c=withColors, figsize=[10,10])
plt.title("ArcSin sqrt Transformation")

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(mapChristWith["bmi"],mapChristWith["ChristArcSin"])
line = slope*mapChristWith["bmi"]+intercept
plt.plot(mapChristWith["bmi"],line, 'r--')
print("Slope: " + str(slope))
print("R^2  : " + str(r_value*r_value))
print("p-val: " + str(p_value))
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/reg_ArcSin_Chr_only.pdf")
plt.show()

# Plot Log10
mapChristWith.plot(kind='scatter', x='bmi', y='ChristLog', c=withColors, figsize=[10,10])
plt.title("Log10 Transformation")

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(mapChristWith["bmi"],mapChristWith["ChristLog"])
line = slope*mapChristWith["bmi"]+intercept
plt.plot(mapChristWith["bmi"],line, 'r--')
print("Slope: " + str(slope))
print("R^2  : " + str(r_value*r_value))
print("p-val: " + str(p_value))
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/reg_Log_Chr_only.pdf")
plt.show()

<h3 style="text-align:center; color:orange;"> - REGRESSION OF BMI AGAINST CHRISTENSENELLACEAE ABUNDANCE - </h3>
<h4 style="text-align:center; color:orange;"> - These are regressions including individuals WITH & WITHOUT Christensenellaceae - </h4>

In [None]:
# Regress transformed Christensenellaceae abundance on BMI without filtering out
# any sample of mapChrist; one figure per transformation.

# Get log10 transformation of counts (a count of 0 gives -inf)
mapChrist["ChristLog"] = np.log10(mapChrist["Christ"])
# Get arcsin sqrt transformation of counts (see Structure, Function, and Diversity of the Healthy Human Microbiome)
mapChrist["ChristArcSin"] = np.arcsin(np.sqrt(mapChrist["Christ"]))

# Set Colors
colors = {'African American':'red', 'Asian or Pacific Islander':'blue', 'Caucasian':'green', 'Hispanic':'yellow'}

# Samples w/ no Christ are deliberately NOT removed in this cell.

# One colour per sample, looked up from its race.
allColors = mapChrist['race'].apply(lambda x: colors[x])

# Plot ArcSin
mapChrist.plot(kind='scatter', x='bmi', y='ChristArcSin', c=allColors, figsize=[10,10])
plt.title("ArcSin sqrt Transformation")

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(mapChrist["bmi"],mapChrist["ChristArcSin"])
line = slope*mapChrist["bmi"]+intercept
plt.plot(mapChrist["bmi"],line, 'r--')
print("Slope: " + str(slope))
print("R^2  : " + str(r_value*r_value))
print("p-val: " + str(p_value))
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/reg_ArcSin_All.pdf")
plt.show()

# Plot Log10
# Substitute the floor value -20 for -inf BEFORE plotting and fitting, so the
# scatter and the regression use the same finite values. The sentinel must be the
# float -np.inf: the string '-inf' never matches values in a float column, which
# left -inf in the data passed to linregress.
mapChrist["ChristLog"] = mapChrist["ChristLog"].replace(to_replace=-np.inf, value=-20)

mapChrist.plot(kind='scatter', x='bmi', y='ChristLog', c=allColors, figsize=[10,10])
plt.title("Log10 Transformation")

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(mapChrist["bmi"],mapChrist["ChristLog"])
line = slope*mapChrist["bmi"]+intercept
plt.plot(mapChrist["bmi"],line, 'r--')
print("Slope: " + str(slope))
print("R^2  : " + str(r_value*r_value))
print("p-val: " + str(p_value))
plt.savefig("/Users/brooks/Dropbox/American_Gut/Figures/S5_HMP_Overlap/reg_Log_All_-20.pdf")
plt.show()

<h3 style="text-align:center; color:orange;"> - REGRESSION OF BMI AGAINST CHRISTENSENELLACEAE ABUNDANCE SUBSET BY RACE - NOT USED IN MANUSCRIPT - </h3>
<h4 style="text-align:center; color:orange;"> - These are regressions only including individuals with Christensenellaceae - </h4>

In [None]:
import scipy

# (Re)compute both transformed abundance columns from the 'Christ' column.
christValues = mapChrist["Christ"]
# log10 transformation of counts
mapChrist["ChristLog"] = np.log10(christValues)
# arcsin sqrt transformation of counts (see Structure, Function, and Diversity of the Healthy Human Microbiome)
mapChrist["ChristArcSin"] = np.arcsin(np.sqrt(christValues))


def plotChrist(mapTableChrist, raceIn, colorPattern):
    """Plot arcsin-sqrt Christensenellaceae abundance against BMI for one race, with a linear fit.

    mapTableChrist : table with 'race', 'bmi', 'Christ' and 'ChristArcSin' columns
    raceIn         : value of the 'race' column selecting the rows to use
    colorPattern   : matplotlib format string for the regression line (e.g. 'r--')

    Prints the race followed by the slope, R^2 and p-value of the fit, then shows
    the figure. Nothing is returned.
    """
    # Subset the table that was passed in. (The argument used to be overwritten
    # with a subset of the global mapChrist, so it was silently ignored.)
    raceTable = mapTableChrist[mapTableChrist['race'] == raceIn]

    # Exclude Individuals w/o Christ
    raceTable = raceTable[raceTable["Christ"] != 0]

    raceTable.plot(kind='scatter', x='bmi', y='ChristArcSin', figsize=[10,10])

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(raceTable["bmi"],raceTable["ChristArcSin"])

    # Fitted line evaluated at the observed BMI values, drawn over the scatter.
    line = slope*raceTable["bmi"]+intercept
    plt.plot(raceTable["bmi"],line, colorPattern)
    plt.title(raceIn, size=30)

    print("")
    print(raceIn)
    print("Slope: " + str(slope))
    print("R^2  : " + str(r_value*r_value))
    print("p-val: " + str(p_value))
    plt.show()

# One plot / regression per race, each with a red dashed fit line.
for raceToPlot in ["African American", "Asian or Pacific Islander", "Caucasian", "Hispanic"]:
    plotChrist(mapChrist, raceToPlot, 'r--')

<h3 style="text-align:center; color:orange;"> - REGRESSION OF BMI AGAINST CHRISTENSENELLACEAE ABUNDANCE SUBSET BY RACE - NOT USED IN MANUSCRIPT - </h3>
<h4 style="text-align:center; color:orange;"> - These are regressions including individuals WITH & WITHOUT Christensenellaceae - </h4>

In [None]:
import scipy


# (Re)compute both transformed abundance columns from the 'Christ' column.
christValues = mapChrist["Christ"]
# log10 transformation of counts
mapChrist["ChristLog"] = np.log10(christValues)
# arcsin sqrt transformation of counts (see Structure, Function, and Diversity of the Healthy Human Microbiome)
mapChrist["ChristArcSin"] = np.arcsin(np.sqrt(christValues))


def plotChrist(mapTableChrist, raceIn, colorPattern):
    """Plot log10 Christensenellaceae abundance against BMI for one race, with a linear fit.

    mapTableChrist : table with 'race', 'bmi', 'Christ' and 'ChristLog' columns
    raceIn         : value of the 'race' column selecting the rows to use
    colorPattern   : matplotlib format string for the regression line (e.g. 'r--')

    Prints the race followed by the slope, R^2 and p-value of the fit, then shows
    the figure. Nothing is returned.
    """
    # Subset the table that was passed in. (The argument used to be overwritten
    # with a subset of the global mapChrist, so it was silently ignored.)
    raceTable = mapTableChrist[mapTableChrist['race'] == raceIn]

    # Exclude Individuals w/o Christ
    # NOTE(review): the heading of this cell says individuals WITH & WITHOUT are
    # included, but zero-abundance rows are dropped here - confirm which is intended.
    # To include them instead, drop this filter and replace the float -np.inf
    # (not the string '-inf') in 'ChristLog' with a finite floor value.
    raceTable = raceTable[raceTable["Christ"] != 0]

    # Optional variant that is currently disabled: exclude outliers w/ bmi > 35.

    raceTable.plot(kind='scatter', x='bmi', y='ChristLog', figsize=[10,10])

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(raceTable["bmi"],raceTable["ChristLog"])

    # Fitted line evaluated at the observed BMI values, drawn over the scatter.
    line = slope*raceTable["bmi"]+intercept
    plt.plot(raceTable["bmi"],line, colorPattern)
    plt.title(raceIn, size=30)

    print("")
    print(raceIn)
    print("Slope: " + str(slope))
    print("R^2  : " + str(r_value*r_value))
    print("p-val: " + str(p_value))
    plt.show()

# One plot / regression per race, each with a red dashed fit line.
for raceToPlot in ["African American", "Asian or Pacific Islander", "Caucasian", "Hispanic"]:
    plotChrist(mapChrist, raceToPlot, 'r--')