<h1 style="text-align:Center; color:orange;">- BIOM Analysis -</h1>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>
<h4 style="text-align:center; color:blue;">Andrew W. Brooks</h4>
<h4 style="text-align:center; color:blue;">Vanderbilt Genetics Institute</h4>
<h4 style="text-align:center; color:blue;">andrew.w.brooks(at)vanderbilt.edu</h4>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>

In [1]:
import glob
import os
import pandas as pd
from biom import load_table
import os
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy import stats

######################################################################################
##### USER INPUT #####################################################################


In [2]:
dfIn = pd.read_csv("22_2_collate_alpha_1000_81000_5000/shannon.txt", sep='\t', index_col=None,engine='python', verbose=False)

In [3]:
df1 = dfIn[dfIn["sequences per sample"] == 1000]

In [4]:
df2 = df1[df1.columns[3:]]

In [8]:
df2.convert_objects(convert_numeric=True).columns

Index([u'10317.000002503', u'10317.000013618', u'10317.000031796',
       u'10317.000026504', u'10317.000007104', u'10317.000007077',
       u'10317.000016408', u'10317.000005802', u'10317.000006952',
       u'10317.000013023', 
       ...
       u'10317.000001654', u'10317.000011375', u'10317.000042586',
       u'10317.000041282', u'10317.000001208', u'10317.000027729',
       u'10317.000013573', u'10317.000010544', u'10317.000005878',
       u'10317.000012987'],
      dtype='object', length=1375)

In [9]:
x = pd.DataFrame(columns=df2.columns, index=["Mean"])
for i in df2.convert_objects(convert_numeric=True).columns:
    #print i, df2[i].astype(float).mean()
    x[i] = df2[i].astype(float).mean()

In [10]:
x

Unnamed: 0,10317.000002503,10317.000013618,10317.000031796,10317.000026504,10317.000007104,10317.000007077,10317.000016408,10317.000005802,10317.000006952,10317.000013023,...,10317.000001654,10317.000011375,10317.000042586,10317.000041282,10317.000001208,10317.000027729,10317.000013573,10317.000010544,10317.000005878,10317.000012987
Mean,5.234787,6.366268,6.180711,5.858434,5.598638,5.633328,6.161,4.366205,5.656045,5.763796,...,3.258558,2.527899,6.107339,4.795678,4.060061,5.719675,5.950429,2.183108,0.72349,4.569603


In [11]:
### PATH TO MAPPING FILE ###
mapPath = "30_0_map_trimmed.txt"

##### IMPORT MAPPING FILE #####
def load_map(mapPathIn):
    print " - Importing Mapping File - "
    mapDfIn = pd.read_csv(mapPathIn, sep='\t', index_col=None, skiprows=0, verbose=False)
    return mapDfIn.set_index("#SampleID")
mapDf = load_map(mapPath)

 - Importing Mapping File - 


<h4 style="text-align:center; color:blue;">Without Rarefaction Inflation</h4>

In [34]:
alpha = {}
for i in x:
    raceIn = mapDf["race"][str(i)]
    if raceIn not in alpha.keys(): alpha[raceIn] = []
    alpha[raceIn].extend(x[i].astype('float'))
for i in alpha.keys():
    print i
    print "  Count        : " + str(len(alpha[i]))
    print "  Average Alpha: " + str(np.mean(alpha[i]))
print "\nTest Stat      -    P-value"
print stats.kruskal(alpha["Asian or Pacific Islander"], alpha["African American"], alpha["Caucasian"], alpha["Hispanic"])

Asian or Pacific Islander
  Count        : 88
  Average Alpha: 4.87992270229
African American
  Count        : 13
  Average Alpha: 4.87024597081
Caucasian
  Count        : 1237
  Average Alpha: 5.31661337805
Hispanic
  Count        : 37
  Average Alpha: 5.61113089289

Test Stat      -    P-value
(37.98717006987772, 2.8442081191384018e-08)


In [33]:

##################################################################################
### Function - Mann-Whitney U Test - Nonparametric rank test of lists
# Null hypothesis: two samples from the same population 
# Alternative hypothesis: one population tends to have larger values than the other [Wikipedia]
# N samples is > 20 and you have 2 independent samples of ranks (can be unequal lengths) [Scipy]
# For two tailed test multiply P-Value*2
import scipy.stats as sp
# IN: Two independent lists of floats
# OUT: Mann Whitney Test Statistic and P-Value
def list_mannwhitney(l1, l2, outFile=None):
    # use_continuity = Whether a continuity correction (1/2.) should be taken into account. Default is True. [Scipy]
    outMann = sp.mannwhitneyu(l1, l2, use_continuity=True)
    print "Mann Whitney U - Nonparametric Rank Test"
    if outFile != None: outFile.write("Mann Whitney U - Nonparametric Rank Test" + "\n")
    print "    List #1 Length: "+str(len(l1))+" | List #2 Length: "+str(len(l2))
    print "    Values 1:5 in list #1: " + str(l1[0:4])
    print "    Values 1:5 in list #2: " + str(l2[0:4])
    if outFile != None: outFile.write("    List #1 Length: "+str(len(l1))+" | List #2 Length: "+str(len(l2)) + "\n")
    print "    Test Statistic: "+str(outMann[0])
    if outFile != None: outFile.write("    Test Statistic: "+str(outMann[0]) + "\n")
    print "    P-Value (onetailed): "+str(outMann[1])
    if outFile != None: outFile.write("    P-Value (onetailed): "+str(outMann[1]) + "\n")
    print "    P-Value (twotailed): "+str(outMann[1]*2)
    if outFile != None: outFile.write("    P-Value (twotailed): "+str(outMann[1]*2) + "\n")
    return outMann
##################################################################################
#alpha["Asian or Pacific Islander"], alpha["African American"], alpha["Caucasian"], alpha["Hispanic"]

print "##############################################################################################"
print "###################### WITHOUT ROUNDING TO ONE DECIMAL PLACE #################################"
print "##############################################################################################"

print "Asian or Pacific Islander" + " - " + "African American"
list_mannwhitney(alpha["Asian or Pacific Islander"],alpha["African American"])
print
print "Asian or Pacific Islander" + " - " + "Caucasian"
list_mannwhitney(alpha["Asian or Pacific Islander"],alpha["Caucasian"])
print
print "Asian or Pacific Islander" + " - " + "Hispanic"
list_mannwhitney(alpha["Asian or Pacific Islander"],alpha["Hispanic"])
print
print "African American" + " - " + "Caucasian"
list_mannwhitney(alpha["African American"],alpha["Caucasian"])
print
print "African American" + " - " + "Hispanic"
list_mannwhitney(alpha["African American"],alpha["Hispanic"])
print
print "Caucasian" + " - " + "Hispanic"
list_mannwhitney(alpha["Caucasian"],alpha["Hispanic"])
print

print "##############################################################################################"
print "######################### ROUNDING TO ONE DECIMAL PLACE ######################################"
print "##############################################################################################"

alpha2 = {}
alpha2["Asian or Pacific Islander"] = [ round(elem, 1) for elem in alpha["Asian or Pacific Islander"] ]
alpha2["African American"] = [ round(elem, 1) for elem in alpha["African American"] ]
alpha2["Caucasian"] = [ round(elem, 1) for elem in alpha["Caucasian"] ]
alpha2["Hispanic"] = [ round(elem, 1) for elem in alpha["Hispanic"] ]

print "Asian or Pacific Islander" + " - " + "African American"
list_mannwhitney(alpha2["Asian or Pacific Islander"],alpha2["African American"])
print
print "Asian or Pacific Islander" + " - " + "Caucasian"
list_mannwhitney(alpha2["Asian or Pacific Islander"],alpha2["Caucasian"])
print
print "Asian or Pacific Islander" + " - " + "Hispanic"
list_mannwhitney(alpha2["Asian or Pacific Islander"],alpha2["Hispanic"])
print
print "African American" + " - " + "Caucasian"
list_mannwhitney(alpha2["African American"],alpha2["Caucasian"])
print
print "African American" + " - " + "Hispanic"
list_mannwhitney(alpha2["African American"],alpha2["Hispanic"])
print
print "Caucasian" + " - " + "Hispanic"
list_mannwhitney(alpha2["Caucasian"],alpha2["Hispanic"])
print

##############################################################################################
###################### WITHOUT ROUNDING TO ONE DECIMAL PLACE #################################
##############################################################################################
Asian or Pacific Islander - African American
Mann Whitney U - Nonparametric Rank Test
    List #1 Length: 88 | List #2 Length: 13
    Values 1:5 in list #1: [4.3399837082191999, 5.4690898845573006, 5.6441114337517995, 4.5985001611831002]
    Values 1:5 in list #2: [4.4401350361455005, 6.0528632337629995, 4.9297746196245003, 5.3152234834489995]
    Test Statistic: 562.0
    P-Value (onetailed): 0.461625757224
    P-Value (twotailed): 0.923251514448

Asian or Pacific Islander - Caucasian
Mann Whitney U - Nonparametric Rank Test
    List #1 Length: 88 | List #2 Length: 1237
    Values 1:5 in list #1: [4.3399837082191999, 5.4690898845573006, 5.6441114337517995, 4.5985001611831002]
    Values 1:5 in list #2: [5

In [98]:
fileOut = open('22_2_alpha_kw_hispanic.txt', 'w')
for item in alpha["Hispanic"]:
    fileOut.write("%s\n" % item)
fileOut.close()

In [99]:
fileOut = open('22_2_alpha_kw_african_american.txt', 'w')
for item in alpha["African American"]:
    fileOut.write("%s\n" % item)
fileOut.close()

In [100]:
fileOut = open('22_2_alpha_kw_asian.txt', 'w')
for item in alpha["Asian or Pacific Islander"]:
    fileOut.write("%s\n" % item)
fileOut.close()

In [101]:
fileOut = open('22_2_alpha_kw_caucasian.txt', 'w')
for item in alpha["Caucasian"]:
    fileOut.write("%s\n" % item)
fileOut.close()

<h4 style="text-align:center; color:blue;">With Rarefaction Inflation</h4>

In [89]:
alpha = {}
for i in df2:
    raceIn = mapDf["race"][str(i)]
    if raceIn not in alpha.keys(): alpha[raceIn] = []
    alpha[raceIn].extend(df2[i].astype('float'))
for i in alpha.keys():
    print i
    print "  Count        : " + str(len(alpha[i]))
    print "  Average Alpha: " + str(np.mean(alpha[i]))
print "\nTest Stat      -     P-value"
print stats.kruskal(alpha["Asian or Pacific Islander"], alpha["African American"], alpha["Caucasian"], alpha["Hispanic"])

Asian or Pacific Islander
  Count        : 8800
  Average Alpha: 4.87992270229
African American
  Count        : 1300
  Average Alpha: 4.87024597081
Caucasian
  Count        : 123700
  Average Alpha: 5.31661337805
Hispanic
  Count        : 3700
  Average Alpha: 5.61113089289

Test Stat      -     P-value
(3770.3667625389644, 0.0)
