# Roux 2017: fastmultigather results vs. sample compositions

In [1]:
import pandas as pd
import sourmash
import glob
import os
import re

In [2]:
# Path to one fastmultigather result CSV (k=21, scaled=10, threshold=0 per the filename).
# NOTE(review): not referenced by the cells visible in this section — confirm it is still needed.
fmg_csv = "../output.roux2017/fastmultigather/roux2017-x-refseq-v219.dna.k21-sc10.t0.gather.csv"

## Load gather CSVs for multiple parameter sets at once

In [3]:
# Collect every gather CSV for this query/database pair, across all ksize (k*)
# and scaled (sc*) values, at threshold t0.
# NOTE(review): glob does not promise a sorted order, so the row order of the
# combined frame may differ between machines — wrap in sorted() if that matters.
fmg_csvs = glob.glob("../output.roux2017/fastmultigather/roux2017-x-refseq-v219.dna.k*-sc*.t0.gather.csv")

def extract_parameters_from_filename(filename):
    """Parse ksize, scaled and threshold out of a gather CSV filename.

    The name is searched for a fragment of the form
    ``k<ksize>-sc<scaled>.t<threshold>`` (for example
    ``...dna.k21-sc10.t0.gather.csv``).

    Parameters
    ----------
    filename : str
        Path or filename to search.

    Returns
    -------
    tuple
        ``(ksize, scaled, threshold)`` as ints, or ``(None, None, None)``
        when the fragment is not present.
    """
    # The dot before "t" is escaped so it matches only a literal ".".
    # Unescaped, it matched any character, which let the regex backtrack and
    # mis-parse e.g. "k21-sc100t0" as scaled=10.
    match = re.search(r'k(\d+)-sc(\d+)\.t(\d+)', filename)
    if match is None:
        return None, None, None
    ksize, scaled, threshold = (int(group) for group in match.groups())
    return ksize, scaled, threshold

In [4]:
# Read each gather CSV and tag its rows with the parameters parsed from the
# filename, so results from different runs can be told apart after stacking.
fmg_dfs = []
for inf in fmg_csvs:
    ksize, scaled, threshold = extract_parameters_from_filename(inf)
    inD = pd.read_csv(inf).assign(ksize=ksize, scaled=scaled, threshold=threshold)
    fmg_dfs.append(inD)

# Stack the per-file frames; each file's original row index is kept as-is.
fmg = pd.concat(fmg_dfs)

In [5]:
# Preview the combined gather results, including the ksize/scaled/threshold
# columns added while loading.
fmg.head()

Unnamed: 0,query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp,ksize,scaled,threshold
0,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|479336462|ref|NC_021061.1| Mycobacterium ph...,fe4925d714f82bb52c847ac7ec487b1d,1.0,43700,21,100,0
1,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|273809542|ref|NC_013597.1| Aggregatibacter ...,7eb3626e5efc71a3a6a4f0310f3f1113,0.975446,43700,21,100,0
2,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|1070619425|ref|NC_031241.1| Staphylococcus ...,74bc58e1c10927dc5e6c4eb92f5b72f9,1.0,42200,21,100,0
3,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|157311135|ref|NC_009819.1| Streptococcus ph...,f0c5b5931b0c7d993b4e2ec278df773f,1.0,41300,21,100,0
4,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|197935853|ref|NC_011201.1| Ralstonia phage ...,a023a5fa98a68ce2d90884efcad473e7,1.0,41100,21,100,0


## Read in the sample composition TSVs

In [6]:
# List the per-sample composition tables. The pattern '*tsv' matches any file
# name ending in "tsv"; results are returned in whatever order glob yields them.
samplecomp_tsvs = glob.glob('../roux2017-samplecomp/*tsv')
samplecomp_tsvs

['../roux2017-samplecomp/Sample_10_comp.tsv',
 '../roux2017-samplecomp/Sample_11_comp.tsv',
 '../roux2017-samplecomp/Sample_7_comp.tsv',
 '../roux2017-samplecomp/Sample_6_comp.tsv',
 '../roux2017-samplecomp/Sample_1_comp.tsv',
 '../roux2017-samplecomp/Sample_4_comp.tsv',
 '../roux2017-samplecomp/Sample_5_comp.tsv',
 '../roux2017-samplecomp/Sample_13_comp.tsv',
 '../roux2017-samplecomp/Sample_12_comp.tsv',
 '../roux2017-samplecomp/Sample_3_comp.tsv',
 '../roux2017-samplecomp/Sample_2_comp.tsv',
 '../roux2017-samplecomp/Sample_14_comp.tsv',
 '../roux2017-samplecomp/Sample_9_comp.tsv',
 '../roux2017-samplecomp/Sample_8_comp.tsv']

In [7]:
# Load each composition table, label its rows with the sample name taken from
# the filename (the text before '_comp.tsv'), stack everything under a fresh
# 0..n-1 index, and rename the '## Virus' header to plain 'Virus'.
cmpDF = pd.concat(
    (
        pd.read_csv(f, sep='\t').assign(Sample=os.path.basename(f).split('_comp.tsv')[0])
        for f in samplecomp_tsvs
    ),
    ignore_index=True,
).rename(columns={'## Virus': 'Virus'})
cmpDF.head()

Unnamed: 0,Virus,Name,Coverage,Sample
0,22855216,Pseudomonas phage phi-12 segment S,0.058544,Sample_10
1,134288688,Burkholderia phage phiE12-2 chromosome,0.054693,Sample_10
2,530787156,Bacillus phage Troll,0.05109,Sample_10
3,269838908,Morganella phage MmP1,0.047719,Sample_10
4,557307743,Mycobacterium phage Fredward,0.044565,Sample_10


In [8]:
# Derive two lookup keys from match_name, which looks like
# "gi|479336462|ref|NC_021061.1| Mycobacterium phage Butters, complete genome".
fmg = fmg.assign(
    # Everything after the first space: the free-text description.
    Name=lambda df: df['match_name'].str.split(' ', n=1, expand=True)[1],
    # Second '|'-delimited field: the gi number, kept as a string.
    gi=lambda df: df['match_name'].str.split('|', expand=True)[1],
)
fmg.head()

Unnamed: 0,query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp,ksize,scaled,threshold,Name,gi
0,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|479336462|ref|NC_021061.1| Mycobacterium ph...,fe4925d714f82bb52c847ac7ec487b1d,1.0,43700,21,100,0,"Mycobacterium phage Butters, complete genome",479336462
1,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|273809542|ref|NC_013597.1| Aggregatibacter ...,7eb3626e5efc71a3a6a4f0310f3f1113,0.975446,43700,21,100,0,"Aggregatibacter phage S1249, complete sequence",273809542
2,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|1070619425|ref|NC_031241.1| Staphylococcus ...,74bc58e1c10927dc5e6c4eb92f5b72f9,1.0,42200,21,100,0,"Staphylococcus phage CNPx, complete genome",1070619425
3,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|157311135|ref|NC_009819.1| Streptococcus ph...,f0c5b5931b0c7d993b4e2ec278df773f,1.0,41300,21,100,0,"Streptococcus phage P9, complete genome",157311135
4,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|197935853|ref|NC_011201.1| Ralstonia phage ...,a023a5fa98a68ce2d90884efcad473e7,1.0,41100,21,100,0,"Ralstonia phage RSB1, complete genome",197935853


## Start with a single sample (Sample_5)

In [9]:
# Keep only the gather rows whose query is Sample_5 (all parameter sets included).
sample5_fmg = fmg.loc[fmg['query_name'].eq("Sample_5")]
sample5_fmg.shape

(4016, 11)

In [10]:
# Preview the Sample_5 subset of the gather results.
sample5_fmg.head()

Unnamed: 0,query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp,ksize,scaled,threshold,Name,gi
0,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|479336462|ref|NC_021061.1| Mycobacterium ph...,fe4925d714f82bb52c847ac7ec487b1d,1.0,43700,21,100,0,"Mycobacterium phage Butters, complete genome",479336462
1,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|273809542|ref|NC_013597.1| Aggregatibacter ...,7eb3626e5efc71a3a6a4f0310f3f1113,0.975446,43700,21,100,0,"Aggregatibacter phage S1249, complete sequence",273809542
2,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|1070619425|ref|NC_031241.1| Staphylococcus ...,74bc58e1c10927dc5e6c4eb92f5b72f9,1.0,42200,21,100,0,"Staphylococcus phage CNPx, complete genome",1070619425
3,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|157311135|ref|NC_009819.1| Streptococcus ph...,f0c5b5931b0c7d993b4e2ec278df773f,1.0,41300,21,100,0,"Streptococcus phage P9, complete genome",157311135
4,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|197935853|ref|NC_011201.1| Ralstonia phage ...,a023a5fa98a68ce2d90884efcad473e7,1.0,41100,21,100,0,"Ralstonia phage RSB1, complete genome",197935853


In [11]:
# Keep only the composition rows that belong to Sample_5.
sample5_cmp = cmpDF.loc[cmpDF['Sample'].eq('Sample_5')]
sample5_cmp.shape

(508, 4)

In [12]:
# Preview the Sample_5 composition rows; the index values carry over from cmpDF.
sample5_cmp.head()

Unnamed: 0,Virus,Name,Coverage,Sample
4581,31044225,Enterobacteria phage BP-4795,0.093009,Sample_5
4582,725915949,Staphylococcus phage phiSa119,0.084515,Sample_5
4583,431810540,Helicobacter phage KHP30 DNA,0.076783,Sample_5
4584,9632893,Streptococcus phage Sfi19,0.069745,Sample_5
4585,744692834,Salmonella phage LSPA1,0.06334,Sample_5


In [13]:
# Spot-check: Sample_5 gather matches whose description mentions "Enterobacteria".
sample5_fmg.loc[sample5_fmg['Name'].str.contains('Enterobacteria')].head()

Unnamed: 0,query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp,ksize,scaled,threshold,Name,gi
18,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|89888607|ref|NC_007817.1| Enterobacteria ph...,48498256fd4f45e4162266d34b89f453,0.114754,1100,21,100,0,"Enterobacteria phage ID2 Moscow/ID/2001, compl...",89888607
41,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|422934269|ref|NC_019501.1| Enterobacteria p...,e3a257498956e99ae2b98f0b9a706ba2,0.974026,38500,21,100,0,"Enterobacteria phage IME10, complete genome",422934269
50,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|169936017|ref|NC_010463.1| Enterobacteria p...,905b44449d2c33255c59804fba142447,0.012308,11800,21,100,0,"Enterobacteria phage Fels-2, complete genome",169936017
73,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|29366675|ref|NC_000866.4| Enterobacteria ph...,f7691cb10a71235d9ef18cfab0b9a642,0.999398,166000,21,100,0,"Enterobacteria phage T4, complete genome",29366675
75,Sample_5,cc604cf6c7bbd9634d06112a8bcacdb3,gi|238695156|ref|NC_012741.1| Enterobacteria p...,ac7fa71aa2b91d3b69c39405f934af5e,0.012881,8300,21,100,0,"Enterobacteria phage JS10, complete genome",238695156


In [14]:
# Count the distinct match names across all loaded gather results.
fmg['match_name'].nunique()

3620

## Merge based on gi number match