In [1]:
#!/usr/bin/env python

### PRIMARY TOOLKITS ###
import numpy as np # Numpy Numerical Toolkit
import pandas as pd # Pandas Dataframes
import scipy as sp  # Scipy Scientific Toolkit

### PANDAS TOOLKITS ###
import patsy # Regression Formatting
import statsmodels.api as sm # StatsModels Statsistics Toolkit
from statsmodels.sandbox.stats.multicomp import multipletests # Statsmodels p correction

### PLOTTING ###
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### FUNCTIONALITY ###
import os # Tool for terminal and operating system type calls
import glob # Tool to Regex Search for Files
import itertools # Iterate through data
import time # Time and Date Tools
import random # Generate Random Values
import copy # Generate copy and deepcopy of objects
random.seed(54321)  # Set Random Seed for Reproducibility

### NOTEBOOK ###
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell # Set Notebook to Evaulate Variables on All Lines (not just last line)
InteractiveShell.ast_node_interactivity = "all"
from IPython.utils import io # Used to capture the output of cells and write to file

### BROOKS TOOLS ###
from brooks import *

### PRINT DATE AND TIME ###
print (' - Date: '+time.strftime("%d/%m/%Y")+' '+time.strftime("%H:%M:%S")+' - ')

  from pandas.core import datetools


 - Date: 03/01/2018 16:31:47 - 


<h1 style="text-align:Center; color:orange;">- Brooks Notebook -</h1>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>
<h4 style="text-align:center; color:blue;">Andrew W. Brooks</h4>
<h4 style="text-align:center; color:blue;">Vanderbilt Genetics Institute</h4>
<h4 style="text-align:center; color:blue;">andrew.w.brooks(at)vanderbilt.edu</h4>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>
<h4 style="text-align:center; color:black;">Released under MIT License</h4>
<h4 style="text-align:center; color:black;">Copyright (c) 2017 Andrew W. Brooks</h4>
<h4 style="text-align:center; color:black;">Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. The software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the software.</h4>
<h4 style="text-align:center; color:red;"></h4>
<h1 style="text-align:center; color:black;">------------------------------------------------------------------------------</h1>

<h1 style="text-align:center; color:orange;"> - Playplace -</h1>
<h4 style="text-align:center; color:orange;"> Where all of your wildest dreams come true!</h4>

<h3 style="text-align:center; color:blue;"> - Fst Analysis - </h3>
<h4 style="text-align:center; color:black;"> Test if Fst for microbe associated SNPs is significantly higher than background. </h4>

<h4 style="text-align:center; color:blue;"> - Input Data and Organize Analysis - </h4>

In [2]:
### Number of Random SNPs to Subsample for Pvalue Calculation ###
nSubsample = 10000
### Number of Times to Subsample SNPs for Pvalue Calculation ###
repSubsample = 10

### If so, then read in the SNPs of Interest & Set Index to Position ###
snpsIn = pd.read_csv(('round_2_snps_vep.txt'), sep='\t',header=0)

In [3]:
### ADD POPULATIONS TO DATAFRAME ###
for pops in itertools.combinations(['AFR', 'AMR', 'EAS', 'EUR', 'SAS'], 2):
    kwargs = {pops[0]+'_'+pops[1]+'_fst':np.zeros(len(snpsIn))}
    snpsIn = snpsIn.assign(**kwargs)
    kwargs = {pops[0]+'_'+pops[1]+'_p':np.zeros(len(snpsIn))}
    snpsIn = snpsIn.assign(**kwargs)

In [4]:
snpsIn

Unnamed: 0,taxa,chr,pos,rs,v1,v2,info,AFR_AMR_fst,AFR_AMR_p,AFR_EAS_fst,...,AMR_EUR_fst,AMR_EUR_p,AMR_SAS_fst,AMR_SAS_p,EAS_EUR_fst,EAS_EUR_p,EAS_SAS_fst,EAS_SAS_p,EUR_SAS_fst,EUR_SAS_p
0,Clostridiaceae,1,66166499,rs7524581,T,C,CSQ=C|intergenic_variant|MODIFIER|||||||||||||...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Rikenellaceae,1,76787267,rs17098734,A,G,CSQ=G|intron_variant|MODIFIER|ST6GALNAC3|ENSG0...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Rikenellaceae,1,119944761,rs147600757,G,A,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Clostridiaceae,1,165470814,rs185902,T,C,CSQ=C|non_coding_transcript_exon_variant|MODIF...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Clostridiaceae,1,175670134,rs7527642,C,A,CSQ=A|intron_variant|MODIFIER|TNR|ENSG00000116...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Peptococcaceae,1,246909388,rs143179968,C,T,CSQ=T|intron_variant|MODIFIER|SCCPDH|ENSG00000...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Clostridiaceae,10,32669237,rs2505338,T,C,CSQ=C|upstream_gene_variant|MODIFIER|EPC1|ENSG...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Clostridiaceae,11,79689054,rs641527,G,A,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Clostridiaceae,11,93259965,rs2248020,A,C,CSQ=C|intron_variant|MODIFIER|SMCO4|ENSG000001...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Clostridiaceae,12,105761662,rs7302174,G,C,CSQ=C|intron_variant|MODIFIER|C12orf75|ENSG000...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h4 style="text-align:center; color:blue;"> - Get SNP Fst's (and Get Unused PValue with Permutation Analysis) - </h4>

In [5]:
cur = 0
snpsOut = pd.DataFrame(columns=['snp', 'chr', 'pos', 'pop1', 'pop2', 'fst', 'pval', 'pstd'])

### LOOP THROUGH EACH SNP ###
for idx in snpsIn.index:
    
    ### PRINT SNP INFO ###
    print('SNP: ' + str(snpsIn.loc[idx,'rs']) + ' - ' + str(snpsIn.loc[idx,'chr']) + ':' + str(snpsIn.loc[idx,'pos']))
    
    ### FOR FST FILES AT CHROMOSOME ###
    for pops in itertools.combinations(['AFR', 'AMR', 'EAS', 'EUR', 'SAS'], 2):
        if (pops[0] == 'EAS') and (pops[1] == 'SAS'): continue
        
        
        curFile = '1_1_Fst_superpopulations/out_'+str(snpsIn.loc[idx,'chr'])+'_'+pops[0]+'_'+pops[1]+'.weir.fst'
        
        ### GET INFO ABOUT POPULATIONS ###
        pop1 = curFile.split('_')[5]
        pop2 = curFile.split('_')[6].split('.')[0]
        print(curFile+'_'+pop1+'_'+pop2)
        
        ### READ SNP FILE ###
        fstIn = pd.read_csv(curFile, sep='\t')
        
        ### GET FST ###
        curfst = fstIn[fstIn['POS'] == snpsIn.loc[idx,'pos']]['WEIR_AND_COCKERHAM_FST']
        print('   Fst: '+str(float(curfst)))
        
        ### CALCULATE P-VALUE ###
        pvals = []
        for x in np.arange(repSubsample): # FOR THE NUMBER OF TIMES TO SUBSAMPLE #
            ### SUBSAMPLE THE TABLE ###
            fstSubsample = fstIn.sample(n=nSubsample)
            ### STORE P ITERATION ###
            pvals.append( len(fstSubsample[fstSubsample['WEIR_AND_COCKERHAM_FST'] > float(curfst)])/nSubsample )
        
        ### PRINT & STORE RESULTS ###
        print('   P-mean: '+str(np.mean(pvals)))
        print('   P-std : '+str(np.std(pvals)))
        snpsIn.loc[idx, (pop1+'_'+pop2+'_p')] = np.mean(pvals)
        snpsIn.loc[idx, (pop1+'_'+pop2+'_fst')] = float(curfst)
        ### STORE RESULTS FOR BY SNP ANALYSIS ###
        snpsOut.loc[cur] = [str(snpsIn.loc[idx,'rs']), str(snpsIn.loc[idx,'chr']),str(snpsIn.loc[idx,'pos']),pop1,pop2,float(curfst),np.mean(pvals),np.std(pvals)]
        cur+=1
    print()

SNP: rs7524581 - 1:66166499
1_1_Fst_superpopulations/out_1_AFR_AMR.weir.fst_AFR_AMR
   Fst: 0.0334988
   P-mean: 0.5729
   P-std : 0.00547375556634
1_1_Fst_superpopulations/out_1_AFR_EAS.weir.fst_AFR_EAS
   Fst: 0.375077
   P-mean: 0.04221
   P-std : 0.00165858373319
1_1_Fst_superpopulations/out_1_AFR_EUR.weir.fst_AFR_EUR
   Fst: -0.000667398
   P-mean: 0.96642
   P-std : 0.00116516093309
1_1_Fst_superpopulations/out_1_AFR_SAS.weir.fst_AFR_SAS
   Fst: 0.0310765
   P-mean: 0.67639
   P-std : 0.00368658378448
1_1_Fst_superpopulations/out_1_AMR_EAS.weir.fst_AMR_EAS
   Fst: 0.24458200000000002
   P-mean: 0.02071
   P-std : 0.00129649527573
1_1_Fst_superpopulations/out_1_AMR_EUR.weir.fst_AMR_EUR
   Fst: 0.028480400000000003
   P-mean: 0.16962
   P-std : 0.00310348191553
1_1_Fst_superpopulations/out_1_AMR_SAS.weir.fst_AMR_SAS
   Fst: 0.126352
   P-mean: 0.0204
   P-std : 0.00122392810246
1_1_Fst_superpopulations/out_1_EAS_EUR.weir.fst_EAS_EUR
   Fst: 0.371016
   P-mean: 0.01507
   P-std : 0.

<h4 style="text-align:center; color:blue;"> - Output Unused Permutation Analysis - </h4>

In [6]:
### OUTPUT RAW TABLE ###
snpsIn.to_csv('/Users/brooks/Dropbox/Vanderbilt/1000_Genomes/Data/1_1_Fst_superpopulations_results.txt', sep='\t')

### OUTPUT BY SNP WITH CORRECTION ###
for curSnp in snpsOut['snp'].unique():
    #snpsOut
    xCur = snpsOut[snpsOut['snp'] == curSnp] 
    xCur['pfdr'] = multipletests(xCur['pval'], alpha=0.05, method='fdr_bh')[1]
    xCur[xCur['pfdr'] < 0.1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
1,rs7524581,1,66166499,AFR,EAS,0.375077,0.04221,0.001659,0.094973
4,rs7524581,1,66166499,AMR,EAS,0.244582,0.02071,0.001296,0.06213
6,rs7524581,1,66166499,AMR,SAS,0.126352,0.0204,0.001224,0.06213
7,rs7524581,1,66166499,EAS,EUR,0.371016,0.01507,0.001096,0.06213


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
64,rs641527,11,79689054,AFR,EAS,0.483081,0.0163,0.001073,0.086085
70,rs641527,11,79689054,EAS,EUR,0.316903,0.01913,0.001306,0.086085


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
77,rs2248020,11,93259965,AMR,EUR,0.163805,0.00642,0.000494,0.05778


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
184,rs9938742,16,62460017,AMR,EAS,0.302367,0.00874,0.000784,0.07866


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
252,rs7587067,2,197259243,AFR,AMR,0.441126,0.00927,0.000626,0.03996
253,rs7587067,2,197259243,AFR,EAS,0.54445,0.01332,0.000572,0.03996
254,rs7587067,2,197259243,AFR,EUR,0.449014,0.01302,0.000858,0.03996


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
348,rs2269706,6,30652872,AMR,SAS,0.136853,0.01731,0.000999,0.077895
350,rs2269706,6,30652872,EUR,SAS,0.149668,0.00969,0.000986,0.077895


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
420,rs6999713,8,30499522,AMR,SAS,0.171279,0.00779,0.001305,0.07011


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr


Unnamed: 0,snp,chr,pos,pop1,pop2,fst,pval,pstd,pfdr
437,rs2170226,9,116464160,AMR,EUR,0.18524,0.00386,0.000546,0.03474


In [7]:
### FOR POPULATION COMBINATIONS ###
for pops in itertools.combinations(['AFR', 'AMR', 'EAS', 'EUR', 'SAS'], 2):
    
    ### SKIP EAST AND SOUTH ASIAN COMBINATION ###
    if (pops[0] == 'EAS') and (pops[1] == 'SAS'): continue
    
    ### MAKE COLUMN TO STORE NUMBER OF SNPs WITH HIGHER Fst ###
    snpsIn[pops[0]+'_'+pops[1]+'_higher'] = 0
    snpsIn[pops[0]+'_'+pops[1]+'_chromosome_higher'] = 0
    snpsIn[pops[0]+'_'+pops[1]+'_chromosome_total'] = 0
    snpsIn[pops[0]+'_'+pops[1]+'_chromosome_percent'] = 0.0
    
    ### STORE TOTAL NUMBER OF SNPs ###
    totSNPs = 0
    
    ### FOR EACH CHROMOSOME ###
    for curChrom in np.arange(1,23):
        print(pops[0]+'_'+pops[1]+'_'+str(curChrom))
         
        ### READ SNP FILE ###
        curFile = '1_1_Fst_superpopulations/out_'+str(curChrom)+'_'+pops[0]+'_'+pops[1]+'.weir.fst'
        fstIn = pd.read_csv(curFile, sep='\t')
        
        ### ADD TO TOTAL NUMBER OF SNPS ###
        totSNPs += len(fstIn)
        
        ### FOR EACH SNP ###
        for curSnp in snpsIn.index:
            
            ### GET FST ###
            snpFst = snpsIn.loc[curSnp, pops[0]+'_'+pops[1]+'_fst']
            
            ### ADD NUMBER OF SNPs WITH HIGHER FST ###
            snpsIn.loc[curSnp, pops[0]+'_'+pops[1]+'_higher'] += len(fstIn[fstIn['WEIR_AND_COCKERHAM_FST']>snpFst])
            
            ### CHECK IF ON CHROMOSOME ###
            if snpsIn.loc[curSnp, 'chr'] == curChrom:
                
                ### ADD CHROMOSOME TOTAL AND HIGHER ###
                snpsIn.loc[curSnp,pops[0]+'_'+pops[1]+'_chromosome_higher'] = len(fstIn[fstIn['WEIR_AND_COCKERHAM_FST']>snpFst])
                snpsIn.loc[curSnp,pops[0]+'_'+pops[1]+'_chromosome_total'] = len(fstIn)
                snpsIn.loc[curSnp,pops[0]+'_'+pops[1]+'_chromosome_percent'] = len(fstIn[fstIn['WEIR_AND_COCKERHAM_FST']>snpFst]) / len(fstIn) 
    ### STORE TOTAL ###
    snpsIn[pops[0]+'_'+pops[1]+'_total'] = totSNPs
    snpsIn[pops[0]+'_'+pops[1]+'_percent'] = snpsIn[pops[0]+'_'+pops[1]+'_higher'] / snpsIn[pops[0]+'_'+pops[1]+'_total']
    
    ### DROP P-values from previous analysis ###
    snpsIn.drop(pops[0]+'_'+pops[1]+'_p',axis=1, inplace=True)
    

AFR_AMR_1
AFR_AMR_2
AFR_AMR_3
AFR_AMR_4
AFR_AMR_5
AFR_AMR_6
AFR_AMR_7
AFR_AMR_8
AFR_AMR_9
AFR_AMR_10
AFR_AMR_11
AFR_AMR_12
AFR_AMR_13
AFR_AMR_14
AFR_AMR_15
AFR_AMR_16
AFR_AMR_17
AFR_AMR_18
AFR_AMR_19
AFR_AMR_20
AFR_AMR_21
AFR_AMR_22
AFR_EAS_1
AFR_EAS_2
AFR_EAS_3
AFR_EAS_4
AFR_EAS_5
AFR_EAS_6
AFR_EAS_7
AFR_EAS_8
AFR_EAS_9
AFR_EAS_10
AFR_EAS_11
AFR_EAS_12
AFR_EAS_13
AFR_EAS_14
AFR_EAS_15
AFR_EAS_16
AFR_EAS_17
AFR_EAS_18
AFR_EAS_19
AFR_EAS_20
AFR_EAS_21
AFR_EAS_22
AFR_EUR_1
AFR_EUR_2
AFR_EUR_3
AFR_EUR_4
AFR_EUR_5
AFR_EUR_6
AFR_EUR_7
AFR_EUR_8
AFR_EUR_9
AFR_EUR_10
AFR_EUR_11
AFR_EUR_12
AFR_EUR_13
AFR_EUR_14
AFR_EUR_15
AFR_EUR_16
AFR_EUR_17
AFR_EUR_18
AFR_EUR_19
AFR_EUR_20
AFR_EUR_21
AFR_EUR_22
AFR_SAS_1
AFR_SAS_2
AFR_SAS_3
AFR_SAS_4
AFR_SAS_5
AFR_SAS_6
AFR_SAS_7
AFR_SAS_8
AFR_SAS_9
AFR_SAS_10
AFR_SAS_11
AFR_SAS_12
AFR_SAS_13
AFR_SAS_14
AFR_SAS_15
AFR_SAS_16
AFR_SAS_17
AFR_SAS_18
AFR_SAS_19
AFR_SAS_20
AFR_SAS_21
AFR_SAS_22
AMR_EAS_1
AMR_EAS_2
AMR_EAS_3
AMR_EAS_4
AMR_EAS_5
AMR_EAS_6
AMR_EAS_

In [8]:
snpsIn = snpsIn.sort_index(axis=1)

In [10]:
snpsIn.to_csv('superpopulation_fst_final_results.txt', sep='\t')