In [91]:
import pandas as pd
import os
import glob
from collections import defaultdict
# Work from the notebooks directory so that the relative paths used by the
# cells below ('../data_tables/...', '../data/all/...') resolve correctly.
# NOTE(review): this is a hardcoded absolute path, so the notebook only runs
# on a machine with this exact directory layout — consider making it configurable.
os.chdir('/data6/Angelo/alexcc/AngeloStrainsPaper/manuscript_materials/soil_popgen/notebooks')

In [92]:
# Column-oriented accumulator: each key becomes one column of the summary
# table, and the later cells add one value per genome in the same order as
# final_table['genomes'].
final_table = defaultdict(list)

# Genome names. For every line after the first, the first whitespace-separated
# field is the genome identifier; the second field is split on "_" and its
# first piece is stored as the phylum, its last piece as the label.
with open('../data_tables/names.txt') as names_file:
    next(names_file, None)  # skip the first line (it is not parsed as a genome)
    for line in names_file:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at the end of the file).
            continue
        # Split before appending anything so a malformed line cannot leave
        # the three columns with different lengths.
        annotation_parts = fields[1].split("_")
        final_table['genomes'].append(fields[0])
        final_table['phylum'].append(annotation_parts[0])
        final_table['label'].append(annotation_parts[-1])


In [93]:
## Get relative abundance information

# Column-wise totals of the bin read table; the resulting Series is indexed
# by column name and looked up by genome name below.
abunds = pd.read_csv('../data_tables/bin_reads.txt', sep="\t").sum()
total_reads = pd.read_csv('../data_tables/bin_total_reads.tsv', sep="\t")
total = int(total_reads['reads'].sum())

# Percentage of all reads attributed to each genome. A genome with no column
# in the read table gets NaN so this column stays aligned with
# final_table['genomes'] instead of silently coming out shorter.
abundance_percent = []
for name in final_table['genomes']:
    genome_reads = abunds.get(name)
    if genome_reads is None:
        abundance_percent.append(float('nan'))
    else:
        abundance_percent.append(genome_reads / total * 100)

# Assign (rather than append) so re-running this cell does not duplicate rows.
final_table['abundance'] = abundance_percent

In [94]:
## Get SNP #'s (aa SNVs)

# List the SNP tables once; sorted so the "first match" below is deterministic.
snp_files = sorted(glob.glob('../data/all/snps/*.tsv'))

snp_counts, ns_ratios, n_freqs, s_freqs = [], [], [], []
for name in final_table['genomes']:
    # First SNP table whose path contains the genome name. Fail loudly when
    # there is none instead of falling through to an unrelated genome's file.
    fn = next((path for path in snp_files if name in path), None)
    if fn is None:
        raise FileNotFoundError(
            "no SNP table matching genome {!r} in ../data/all/snps/".format(name))

    snps = pd.read_csv(fn, sep="\t")
    # Frequency of the variant base: the count in the column named by
    # `varBase` divided by the summed A/C/T/G counts of that row. An explicit
    # lookup replaces eval() so file contents are never executed as code.
    snps['varFreq'] = [
        {'A': A, 'C': C, 'T': T, 'G': G}[v] / (A + C + T + G)
        for A, C, T, G, v in zip(snps['A'], snps['C'], snps['T'], snps['G'], snps['varBase'])
    ]

    # count total
    snp_counts.append(len(snps.index))

    # count N and S (presumably nonsynonymous / synonymous — confirm against
    # the tool that produced `mutation_type`)
    snps_s = snps[snps.mutation_type == 'S']
    snps_n = snps[snps.mutation_type == 'N']
    n_count = len(snps_n.index)
    s_count = len(snps_s.index)
    # NaN rather than ZeroDivisionError when a genome has no S SNPs.
    ns_ratios.append(float(n_count) / s_count if s_count else float('nan'))

    # mean variant frequency over N SNPs
    n_freqs.append(snps_n['varFreq'].mean())

    # mean variant frequency over S SNPs
    s_freqs.append(snps_s['varFreq'].mean())

# Assign (rather than append) so re-running this cell does not duplicate rows;
# the key order here fixes the column order of the final table.
final_table['SNPs'] = snp_counts
final_table['N:S'] = ns_ratios
final_table['N_freq'] = n_freqs
final_table['S_freq'] = s_freqs
    

In [95]:
## Get mean linkages (for both N and S)

# List the linkage tables once; sorted so the "first match" below is deterministic.
linkage_files = sorted(glob.glob('../data/all/linkage/*.tsv'))

# Output columns in the order they should appear in the final table.
linkage_columns = [
    'r2', 'd_prime', 'r2_normalized', 'd_prime_normalized',
    'r2_n', 'd_prime_n', 'r2_s', 'd_prime_s',
]
linkage_means = dict((column, []) for column in linkage_columns)

for name in final_table['genomes']:
    # First linkage table whose path contains the genome name. Fail loudly
    # when there is none instead of falling through to an unrelated file.
    fn = next((path for path in linkage_files if name in path), None)
    if fn is None:
        raise FileNotFoundError(
            "no linkage table matching genome {!r} in ../data/all/linkage/".format(name))

    snps = pd.read_csv(fn, sep="\t")

    # Rows where both members of the pair share the same mutation type.
    snps_s = snps[(snps.mutation_type_A == 'S') & (snps.mutation_type_B == 'S')]
    snps_n = snps[(snps.mutation_type_A == 'N') & (snps.mutation_type_B == 'N')]

    # Means over every row in the table.
    linkage_means['r2'].append(snps['r2'].mean())
    linkage_means['d_prime'].append(snps['d_prime'].mean())
    linkage_means['r2_normalized'].append(snps['r2_normalized'].mean())
    linkage_means['d_prime_normalized'].append(snps['d_prime_normalized'].mean())

    # Means restricted to N-N rows.
    linkage_means['r2_n'].append(snps_n['r2'].mean())
    linkage_means['d_prime_n'].append(snps_n['d_prime'].mean())

    # Means restricted to S-S rows.
    linkage_means['r2_s'].append(snps_s['r2'].mean())
    linkage_means['d_prime_s'].append(snps_s['d_prime'].mean())

# Assign (rather than append) so re-running this cell does not duplicate rows.
for column in linkage_columns:
    final_table[column] = linkage_means[column]


In [96]:
## Get mean microdiversity

# List the diversity tables once; sorted so the "first match" below is deterministic.
diversity_files = sorted(glob.glob('../data/all/nuc_diversity/*.tsv'))

mean_pi = []
for name in final_table['genomes']:
    # First diversity table whose path contains the genome name. Fail loudly
    # when there is none instead of falling through to an unrelated file.
    fn = next((path for path in diversity_files if name in path), None)
    if fn is None:
        raise FileNotFoundError(
            "no nucleotide diversity table matching genome {!r} in ../data/all/nuc_diversity/".format(name))

    snps = pd.read_csv(fn, sep="\t")
    # Mean of the `pi` column over every row of this genome's table.
    mean_pi.append(snps['pi'].mean())

# Assign (rather than append) so re-running this cell does not duplicate rows.
final_table['pi'] = mean_pi

In [98]:
## get genome sizes

# Map genome name (first whitespace-separated field) to its size (second
# field, parsed as an integer). Every line is parsed; no header is skipped.
sizes = {}
with open('../data_tables/genome_sizes.txt') as sizes_file:
    for line in sizes_file:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at the end of the file).
            continue
        sizes[fields[0]] = int(fields[1])

# A genome missing from the sizes file raises KeyError here, which is the
# desired loud failure. Assigning (rather than appending) keeps the cell
# safe to re-run.
final_table['genome_size'] = [sizes[name] for name in final_table['genomes']]

In [99]:
## Output final table
# Turn the per-genome column lists into a single DataFrame (one row per genome).
final = pd.DataFrame(data=final_table)
# Save as comma-separated text in the current working directory; the row
# index is written as the first, unnamed column.
final.to_csv(path_or_buf='genome_summaries.txt')
# Bare trailing expression so the notebook renders the table.
final

Unnamed: 0,genomes,phylum,label,abundance,SNPs,N:S,N_freq,S_freq,r2,d_prime,r2_normalized,d_prime_normalized,r2_n,d_prime_n,r2_s,d_prime_s,pi,genome_size
0,14_0903_05_40cm_Acidobacteria_477_57_13,Acidobacteria,477,0.095333,41659,0.81583,0.171854,0.175381,0.250134,0.908727,0.282791,0.939872,0.242541,0.933953,0.277267,0.89525,0.00347,5473360
1,14_0903_09_40cm_Bacteria_750_66_8,Dormibacteraeota,750,0.035341,88984,0.349681,0.225513,0.243178,0.250094,0.89674,0.279687,0.931087,0.245752,0.921544,0.25298,0.886085,0.01255,2943230
2,14_0927_12_40cm_Bacteria_6527_55_8,Verrucomicrobia,6527,0.051157,55032,0.694443,0.183401,0.179492,0.237439,0.889738,0.273482,0.930403,0.227948,0.904264,0.264095,0.879425,0.007492,2915259
3,14_0929_12_30cm_Bacteria_7383_54_8,Verrucomicrobia,7383,0.102463,79796,0.550088,0.175723,0.173406,0.230256,0.88638,0.265732,0.928773,0.221518,0.898584,0.252686,0.885381,0.009577,3182575
4,14_0927_05_20cm_Bacteria_2135_68_10,Chloroflexi,2135,0.054429,59047,0.391368,0.191591,0.205685,0.197282,0.853163,0.226723,0.902467,0.200115,0.884921,0.202626,0.841779,0.00726,2993566
5,14_0903_12_20cm_Proteobacteria_884_68_14,Gammaproteobacteria,884,0.059495,92819,0.562269,0.203951,0.205105,0.29387,0.925962,0.331056,0.953788,0.316552,0.935124,0.283817,0.920588,0.009579,3619965
6,14_1009_02_30cm_Bacteria_3696_55_11,Verrucomicrobia,3696,0.06462,77282,0.635501,0.168618,0.174394,0.21028,0.906281,0.243808,0.941318,0.196079,0.92669,0.226043,0.891729,0.010673,2939151
7,14_0929_02_40cm_Rokubacteria_2807_70_29,Rokubacteria,2807,0.140153,93781,0.583001,0.162173,0.164737,0.165582,0.870571,0.195566,0.916097,0.174629,0.893195,0.167603,0.863118,0.005956,5623218
8,14_0927_05_20cm_Gemmatimonadetes_2150_69_20,Gemmatimonadetes,2150,0.20909,145222,0.391522,0.179465,0.182481,0.168366,0.915349,0.199016,0.946756,0.174004,0.933831,0.16649,0.909262,0.015573,3359616
9,14_1009_02_30cm_Gemmatimonadetes_3684_69_12,Gemmatimonadetes,3684,0.110383,39112,0.485598,0.184313,0.193694,0.23514,0.858713,0.267992,0.909571,0.231702,0.876632,0.245317,0.854317,0.004369,3448421
