In [1]:
import os
import sys
import glob
import scipy
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from shutil import copyfile

from tqdm import tqdm

%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
pd.set_option('display.max_columns', 100)

sys.path.append('/home/mattolm/Bio_scripts/')

import shutil
from IPython.display import display, HTML

from Bio import SeqIO

https://github.com/alexcritschristoph/wastewater_sarscov2

## Links to input data

In [2]:
PROBLEMATIC_SITES_LOCATION = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/public_data/problematic_sites_sarsCov2.vcf'
EXLUDE_STRAINS_LOCATION = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/public_data/exclude.txt'

IS_PROFILE_FOLDER_LOCATION = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/data/wastewater/'

## Links to non-github input data

In [3]:
MSA_LOCATION = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/combined_good/patient_aug/msa_0824.fasta'
METADATA_LOCATION = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/combined_good/patient_aug/metadata_2020-08-24_12-12.tsv'

## Links to where data will be saved

In [4]:
INTRAPATIENT_SNVS_SAVE_LOCATION = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/tables/Interpatient_SNVs_v2.csv.gz'
INTRAPATIENT_SNVS_SNPLEVEL_SAVE_LOCATION ='/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/tables/Interpatient_SNVs_SNPlevel_v2.csv.gz'

WASTEWATER_SNVS_SAVELOC = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/tables/Wastewater_SNVs_v2.csv'
WASTEWATER_ABUND_SAVELOC = '/groups/banfield/projects/industrial/wastewater_bayarea/2020/GithubRepo/wastewater_sarscov2/tables/Wastewater_abundance_v2.csv'

# Load interpatient SNPs

In [5]:
def load_SNPs_from_MSA(msa_loc, reference_id='hCoV-19/Wuhan/WIV04/2019|EPI_ISL_402124|2019-12-30|China', ignore=set()):
    """
    Load SNPs from a multiple sequence alignment
    """
    table = defaultdict(list)
    
    # Load the referece sequence
    reference = None
    for record in SeqIO.parse(msa_loc, 'fasta'):
        if record.id == reference_id:
            reference = record
            break
    if reference is None:
        print("Cannot find sequence {0}".format(reference_id))
        raise Exception()
        
    # Load the rest of the sequences into RAM
    record_ids = []
    record_sequences = []
    record_SNPs = defaultdict(list)
    for record in SeqIO.parse(msa_loc, 'fasta'):
        record_ids.append(record.description)
        record_sequences.append(str(record.seq).upper())
        
    # Call SNPs
    print("There are {0} columns".format(len(str(reference.seq).upper())))
    reference_position = 0
    #for bases in tqdm(zip(str(reference.seq).upper(), *record_sequences)):
    for column, ref_base in tqdm(enumerate(str(reference.seq).upper())):
        if ref_base in ['A', 'C', 'G', 'T']:
            if reference_position in ignore:
                reference_position += 1
                continue
            
            bases = [r[column] for r in record_sequences]
            for i, base in enumerate(bases): # loop through the records and this column
                if base in ['A', 'C', 'G', 'T']:
                    if base != ref_base:
                        record_SNPs[record_ids[i]].append((reference_position, column, base))
                else:
                    record_SNPs[record_ids[i]].append((reference_position, column, 'NA'))
            reference_position += 1
        else:
            pass
        
    # Make table
    table = defaultdict(list)
    for record in record_ids:
        SNPs = record_SNPs[record]
        table['record'].append(record)
        table['SNP_count'].append(len([True for s in SNPs if s[2] != 'NA']))
        table['missing_count'].append(len([True for s in SNPs if s[2] == 'NA']))
        table['SNP_IDs'].append(set(SNPs))
        
    ISdb = pd.DataFrame(table)

    return ISdb

# Load problem sites
problem_sites = pd.read_csv(PROBLEMATIC_SITES_LOCATION, comment="#", sep="\t")
filter_sites = problem_sites.query("FILTER=='mask'")
filter_sites.loc[:,'POS'] = filter_sites['POS'] - 1
filter_sites = set(filter_sites['POS'].tolist())

IPdb = load_SNPs_from_MSA(MSA_LOCATION, ignore=filter_sites)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
168it [00:00, 1047.62it/s]

There are 30861 columns


30861it [11:28, 44.82it/s]


# Incorporate metadata

In [6]:
MEdb = pd.read_csv(METADATA_LOCATION, sep='\t')

exclude = []
f = open(EXLUDE_STRAINS_LOCATION)
for line in f.readlines():
    if not line.startswith("#") and line.strip() != '':
        exclude.append(line.strip())

IPMdb = IPdb.copy()
IPMdb['strain'] = [x.replace('hCoV-19/', '').split('|')[0] for x in IPMdb['record']]
IPMdb = pd.merge(IPMdb, MEdb, on='strain', how='outer')
IPMdb = IPMdb[IPMdb['host'] == 'Human']
IPMdb = IPMdb[~IPMdb['strain'].isin(exclude)]

print("{0:,} strains are in the metadata but not in the MSA".format(len(IPMdb[IPMdb['record'].isna()])))
print("{0:,} strains are in the MSA but not in the metadata".format(len(IPMdb[IPMdb['region'].isna()])))

Idb = IPMdb[~IPMdb['record'].isna()]


9,064 strains are in the metadata but not in the MSA
0 strains are in the MSA but not in the metadata


## Save

In [7]:
Idb.to_csv(INTRAPATIENT_SNVS_SAVE_LOCATION, index=False)

# Make a SNV-level table

In [8]:
ALL_SNPS = set(x for x in set.union(*Idb['SNP_IDs'].tolist()) if x[2] != 'NA')
print("There are {0:,} unique SNPs in this dataset".format(len(ALL_SNPS)))

There are 21,554 unique SNPs in this dataset


In [9]:
table = defaultdict(list)
for SNP in tqdm(ALL_SNPS):
    db = Idb[[SNP in x for x in Idb['SNP_IDs']]]
    
    table['SNP'].append(str(SNP))
    table['number_samples'].append(len(db))
    table['strains'].append(set(db['strain'].tolist()))

Vdb = pd.DataFrame(table)

100%|██████████| 21554/21554 [08:37<00:00, 41.65it/s]


## Save

In [10]:
Vdb.to_csv(INTRAPATIENT_SNVS_SNPLEVEL_SAVE_LOCATION, index=False)

# Make wastewater tables

In [11]:
Adb = []
for fn in glob.glob(IS_PROFILE_FOLDER_LOCATION + '*/output/*genomeWide_scaffold_info.tsv'):
    adb = pd.read_csv(fn, sep='\t')
    adb['sample'] = os.path.basename(fn).split('_genomeWide')[0]
    Adb.append(adb)
Adb = pd.concat(Adb).reset_index(drop=True)
Adb

Unnamed: 0,genome,detected_scaffolds,true_scaffolds,true_length,SNPs,BiAllelic_SNPs,MultiAllelic_SNPs,consensus_SNPs,population_SNPs,breadth,coverage,std_cov,mean_clonality,rarefied_mean_microdiversity,conANI,popANI,unmaskedBreadth,expected_breadth,sample
0,all_scaffolds,1,1,29840,70,64,0,7,6,0.999129,33.796247,8.19115,0.997073,0.002109,0.999765,0.999799,0.999062,1.0,6_30_S_MOS
1,all_scaffolds,1,1,29840,42,33,0,14,9,0.999765,88.467225,16.853628,0.997316,0.002547,0.999531,0.999698,0.999296,1.0,MR_7_1_MOS
2,all_scaffolds,1,1,29840,37,32,0,6,5,0.995912,8.407775,5.71535,0.995928,0.0,0.999786,0.999822,0.940885,0.999403,5_28_S
3,all_scaffolds,1,1,29840,42,34,0,11,8,0.999397,32.567828,7.868926,0.997842,0.001941,0.999631,0.999732,0.999162,1.0,6_30_S_COL
4,all_scaffolds,1,1,29840,16,10,0,8,6,0.999497,107.051642,28.055301,0.99838,0.001509,0.999732,0.999799,0.998727,1.0,5_19_S
5,all_scaffolds,1,1,29840,73,65,0,14,8,0.999229,13.0375,8.045137,0.997008,0.007359,0.999525,0.999729,0.988673,0.99999,SQ_7_1_MOS
6,all_scaffolds,1,1,29840,31,27,0,6,4,0.999129,10.832574,4.921677,0.997967,0.0,0.999796,0.999864,0.985757,0.99993,5_19_S_2
7,all_scaffolds,1,1,29840,25,23,0,5,2,0.994437,7.075737,4.30769,0.996682,0.0,0.999813,0.999925,0.89742,0.998065,6_09_S


In [12]:
al = []
for fn in glob.glob(IS_PROFILE_FOLDER_LOCATION + '*/output/*SNVs.tsv'):
    adb = pd.read_csv(fn, sep='\t')
    adb['sample'] = os.path.basename(fn).split('_SNVs')[0]
    al.append(adb)
Sdb = pd.concat(al).reset_index(drop=True)
Sdb

Unnamed: 0,scaffold,position,refBase,A,C,T,G,conBase,varBase,allele_count,cryptic,baseCoverage,varFreq,conFreq,refFreq,sample
0,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,33,A,6,0,2,0,A,T,2,False,8,0.250000,0.750000,0.750000,6_30_S_MOS
1,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,34,A,5,0,2,0,A,T,2,False,7,0.285714,0.714286,0.714286,6_30_S_MOS
2,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,35,C,0,7,2,0,C,T,2,False,9,0.222222,0.777778,0.777778,6_30_S_MOS
3,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,240,C,0,1,31,0,T,C,1,False,32,0.031250,0.968750,0.031250,6_30_S_MOS
4,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,1025,C,0,24,2,0,C,T,2,False,26,0.076923,0.923077,0.923077,6_30_S_MOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,28377,G,0,0,8,20,G,T,2,False,28,0.285714,0.714286,0.714286,6_09_S
332,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,28880,G,18,0,0,7,A,G,2,False,25,0.280000,0.720000,0.280000,6_09_S
333,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,28881,G,18,0,0,7,A,G,2,False,25,0.280000,0.720000,0.280000,6_09_S
334,hCoV-19_Wuhan_WIV04_2019_EPI_ISL_402124_2019-1...,28882,G,0,18,0,7,C,G,2,False,25,0.280000,0.720000,0.280000,6_09_S


## Parse

In [13]:
def parse_SNP(row):
    if row['allele_count'] == 1:
        return (row['position'], row['conBase'])
    elif row['allele_count'] == 2:
        if row['conBase'] != row['refBase']:
            return (row['position'], row['conBase'])
        else:
            assert row['varBase'] != row['refBase'], str(row)
            return (row['position'], row['varBase'])
    else:
        raise Exception()
        
def key_freq(row):
    return row[row['key'][1]] / row['baseCoverage']

Sdb['key'] = Sdb.apply(parse_SNP, axis=1)
Sdb['key_freq'] = Sdb.apply(key_freq, axis=1)

## Save

In [14]:
Sdb.to_csv(WASTEWATER_SNVS_SAVELOC, index=False)
Adb.to_csv(WASTEWATER_ABUND_SAVELOC, index=False)