In [22]:
import os
import sys
import glob
import scipy
import skbio
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
sns.set_style('whitegrid')
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42

from IPython.display import display, HTML
import tRep

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 100

In [23]:
Mdb = pd.read_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/metadata/sample_metadata.tsv',sep='\t')
Mdb = Mdb.rename(columns=lambda x: x.strip())
Mdb.to_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/metadata/sample_metadata_v1_original.csv',index=False)


In [25]:
Mdb2 = Mdb.copy()
Mdb2['sample'] = [x.strip() for x in Mdb2['sample']]
Mdb2['core'] = [x.strip() for x in Mdb2['core']]
Mdb2.at[11,'sample'] = '6_09_S1'
Mdb2

Unnamed: 0,sample,core,location,method,date,mean_ct,ct_from_replicate,sequencing
0,5_13_A_S1,MIGS,Oakland,amicon,05/13/20,30.7,1,enriched
1,5_13_C_S2,MIGS,Oakland,amicon,05/13/20,30.7,0,enriched
2,5_19_F_S3,MIGS,Oakland,amicon,05/19/20,29.5,0,enriched
3,5_19_A_S4,MIGS,Oakland,amicon,05/19/20,29.5,1,enriched
4,5_28_I,MIGS,Oakland,amicon,05/28/20,31.37,0,enriched
5,5_28_E,MIGS,Berkeley,amicon,05/28/20,33.8,0,enriched
6,5_28_A,MIGS,BerkeleyHills,amicon,05/28/20,35.47,0,enriched
7,6_02_Nc,MIGS,Berkeley,amicon,06/02/20,33.37,0,enriched
8,6_02_Sc,MIGS,Oakland,amicon,06/02/20,30.37,0,enriched
9,6_02_Ac,MIGS,BerkeleyHills,amicon,06/02/20,36.23,0,enriched


In [26]:
# check if sample naming is all the same

Rdb = pd.read_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/waster_water_readDepths_formatted.csv')
Rdb

Unnamed: 0,sample,read_bases,reads,read_length,Illumina_enrichment
0,5_13_A_S1,6107540,82174,74.324482,unenriched
1,5_13_C_S2,1796238,24144,74.396869,unenriched
2,5_19_A_S4,25431914,343252,74.091088,unenriched
3,5_19_F_S3,24851304,334748,74.238842,unenriched
4,5_28_A_S28,71023874,978422,72.590226,unenriched
5,5_28_E_S26,72838764,1009198,72.174899,unenriched
6,5_28_I_S25,89771642,1248062,71.928832,unenriched
7,630-N,4177966696,54973246,76.0,unenriched
8,630-S1,2563889792,33735392,76.0,unenriched
9,630-S2,4420453816,58163866,76.0,unenriched


In [27]:
inconsistent_sample_name = list(set(Mdb2['sample'])-set(Rdb['sample']))
print(len(inconsistent_sample_name))
inconsistent_sample_name

12


['MR_1_7_1_MOS',
 'SQ_3_7_1_MOS',
 '6_02_Sc',
 '6_02_Nc',
 'S_3_6_30_MOS',
 '6_09_N1',
 '6_09_S1',
 '5_28_E',
 'S_3_6_30_COL',
 '5_28_A',
 '6_02_Ac',
 '5_28_I']

In [28]:
Mdb3 = Mdb2.copy()

for i,row in Mdb3.iterrows():
    if row['sample'] in inconsistent_sample_name:
        for i2,row2 in Rdb.iterrows():
            if row['sample'] in row2['sample']:
                Mdb3.at[i,'sample'] = row2['sample']
print(set(Mdb3['sample'])-set(Rdb['sample']))
print(set(Rdb['sample'])-set(Mdb3['sample']))
Mdb3

set()
set()


Unnamed: 0,sample,core,location,method,date,mean_ct,ct_from_replicate,sequencing
0,5_13_A_S1,MIGS,Oakland,amicon,05/13/20,30.7,1,enriched
1,5_13_C_S2,MIGS,Oakland,amicon,05/13/20,30.7,0,enriched
2,5_19_F_S3,MIGS,Oakland,amicon,05/19/20,29.5,0,enriched
3,5_19_A_S4,MIGS,Oakland,amicon,05/19/20,29.5,1,enriched
4,5_28_I_S25,MIGS,Oakland,amicon,05/28/20,31.37,0,enriched
5,5_28_E_S26,MIGS,Berkeley,amicon,05/28/20,33.8,0,enriched
6,5_28_A_S28,MIGS,BerkeleyHills,amicon,05/28/20,35.47,0,enriched
7,6_02_Nc_S27,MIGS,Berkeley,amicon,06/02/20,33.37,0,enriched
8,6_02_Sc_S29,MIGS,Oakland,amicon,06/02/20,30.37,0,enriched
9,6_02_Ac_S31,MIGS,BerkeleyHills,amicon,06/02/20,36.23,0,enriched


In [29]:
Mdb3['sample_complete_description'] = ["_".join([x,y]) for x,y in zip(Mdb3['sample'],Mdb3['core'])]
Mdb3

Unnamed: 0,sample,core,location,method,date,mean_ct,ct_from_replicate,sequencing,sample_complete_description
0,5_13_A_S1,MIGS,Oakland,amicon,05/13/20,30.7,1,enriched,5_13_A_S1_MIGS
1,5_13_C_S2,MIGS,Oakland,amicon,05/13/20,30.7,0,enriched,5_13_C_S2_MIGS
2,5_19_F_S3,MIGS,Oakland,amicon,05/19/20,29.5,0,enriched,5_19_F_S3_MIGS
3,5_19_A_S4,MIGS,Oakland,amicon,05/19/20,29.5,1,enriched,5_19_A_S4_MIGS
4,5_28_I_S25,MIGS,Oakland,amicon,05/28/20,31.37,0,enriched,5_28_I_S25_MIGS
5,5_28_E_S26,MIGS,Berkeley,amicon,05/28/20,33.8,0,enriched,5_28_E_S26_MIGS
6,5_28_A_S28,MIGS,BerkeleyHills,amicon,05/28/20,35.47,0,enriched,5_28_A_S28_MIGS
7,6_02_Nc_S27,MIGS,Berkeley,amicon,06/02/20,33.37,0,enriched,6_02_Nc_S27_MIGS
8,6_02_Sc_S29,MIGS,Oakland,amicon,06/02/20,30.37,0,enriched,6_02_Sc_S29_MIGS
9,6_02_Ac_S31,MIGS,BerkeleyHills,amicon,06/02/20,36.23,0,enriched,6_02_Ac_S31_MIGS


In [30]:
Mdb3.to_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/metadata/sample_metadata_v2_sample_name_adjusted.csv',index=False)
