In [1]:
import glob
import os
import shutil
import sys
sys.path.append("../src")

from ncbi_upload import convert_metadata, merge_metadata, recover_vaccine_date, _convert_str_date_to_timestamp
import pandas as pd

In [2]:
def check_name(name, reference):
    """Report whether ``name`` occurs in ``reference`` as a "Yes"/"No" flag.

    Parameters
    ----------
    name : object
        Value to look up.
    reference : container
        Any object that supports the ``in`` operator (list, set, str, ...).

    Returns
    -------
    str
        "Yes" when ``name in reference`` is true, "No" otherwise.
    """
    return "Yes" if name in reference else "No"

In [3]:
# Keep only the lines of the nuccore export that contain "Severe".
# Filtering directly in the comprehension replaces the earlier approach of
# substituting a numeric sentinel for non-matching lines and then removing it.
# Lines keep their trailing newline, exactly as readlines() would return them.
with open("../data/ncbi_upload/nuccore_result.txt", 'r') as infile:
    relevant_lines = [line for line in infile if "Severe" in line]

In [4]:
# From every retained nuccore line take the 9th space-separated token and
# keep the 4th "/"-separated piece of it as the sequence name.
uploaded_sequences = [
    tokens[8].split("/")[3]
    for tokens in (entry.split(" ") for entry in relevant_lines)
]

In [5]:
# Number of sequence names parsed from the filtered nuccore lines.
len(uploaded_sequences)

1034

In [6]:
# Read the SRA result export and keep its "Library Name" column as a plain list.
sra_result = pd.read_csv("../data/ncbi_upload/sra_result.csv")
uploaded_bam_files = sra_result["Library Name"].tolist()

In [7]:
# Number of "Library Name" entries read from the SRA result export.
len(uploaded_bam_files)

965

In [8]:
# Each line of the wastewater listing is reduced to the text between the first
# "/" and the following "." (i.e. the second path component without extension).
with open("../data/ncbi_upload/wastewater_samples.txt", "r") as infile:
    wastewater_samples_list = [
        entry.split("/")[1].split(".")[0] for entry in infile
    ]
len(wastewater_samples_list)

215

In [9]:
# Consensus FASTA files are read from a local checkout of HCoV-19-Genomics.
# NOTE(review): this absolute path is machine-specific; point it at your own
# checkout before running the notebook elsewhere.
all_files = glob.glob("/Users/karthikramesh/src/HCoV-19-Genomics/consensus_sequences/*.fasta")
# Sample name = file name up to the first ".". basename() gives the same result
# as indexing the 7th "/"-separated part for the path above, but keeps working
# if the directory depth changes.
all_names = [os.path.basename(path).split(".")[0] for path in all_files]
# Sets give constant-time membership checks instead of scanning each reference
# list once per sample.
uploaded_sequence_set = set(uploaded_sequences)
uploaded_bam_set = set(uploaded_bam_files)
wastewater_sample_set = set(wastewater_samples_list)
# build a dictionary from this: sample name -> {status flag: "Yes"/"No"}
mapping_dict = {
    name: {
        "sequence_uploaded": check_name(name, uploaded_sequence_set),
        "bam_uploaded": check_name(name, uploaded_bam_set),
        "wastewater": check_name(name, wastewater_sample_set),
    }
    for name in all_names
}

In [10]:
# One row per sample name (the dict keys), one column per status flag.
status_df = pd.DataFrame.from_dict(data=mapping_dict, orient="index")
status_df

Unnamed: 0,sequence_uploaded,bam_uploaded,wastewater
STM-0000072-E03,No,No,No
STM-0000104-B04,No,No,No
SEARCH-5347-SAN,No,No,No
SEARCH-102584,No,No,No
BCN-SEARCH-102962,No,No,No
...,...,...,...
IS-SEARCH-108029,No,No,No
STM-0000038-A09,No,No,No
CA-SEARCH-46481,No,No,No
SEARCH-14138,No,No,No


In [11]:
# load and merge metadata
# Two metadata tables are read from CSV and combined by the project helper
# merge_metadata (imported from ncbi_upload). How rows are matched is decided
# inside that helper and is not visible in this notebook.
online_metadata = pd.read_csv("../data/ncbi_upload/metadata_online.csv")
repo_metadata = pd.read_csv("../data/ncbi_upload/metadata.csv")
metadata = merge_metadata(repo_metadata, online_metadata)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
# Display the merged metadata table for inspection.
metadata

Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,fasta_hdr,...,Sequencing technology,Assembly method,Coverage,Originating lab,Address,Sample ID given by the sample provider,Submitting lab,Address.1,Sample ID given by the submitting laboratory,Authors
0,MG0987,MT598172,EPI_ISL_416457,3/18/20,USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_MG0987,...,,,,,,,,,,
1,PC00101P,MT192765,EPI_ISL_414648,3/11/20,USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_PC00101P_threshold_0_quality_20,...,,,,,,,,,,
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,3/21/20,USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady Children’s Hospital,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...,...,,,,,,,,,,
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,3/24/20,USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...,...,,,,,,,,,,
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,3/24/20,USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30698,SEARCH-108990,,EPI_ISL_4412901,8/16/21,USA/California/San Diego,99.9796,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-108990/2021,...,Illumina,iVar 1.3.1,100.0,San Diego County Public Health Laboratory,"3851 Rosecrans St., San Diego, CA, 92110",SEARCH-108990,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108990,SEARCH Alliance San Diego with Ashleigh Murphy...
30699,SEARCH-108991,,EPI_ISL_4412904,8/17/21,USA/California/San Diego,99.8062,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-108991/2021,...,Illumina,iVar 1.3.1,99.8,San Diego County Public Health Laboratory,"3851 Rosecrans St., San Diego, CA, 92110",SEARCH-108991,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108991,SEARCH Alliance San Diego with Ashleigh Murphy...
30700,SEARCH-108993,,EPI_ISL_4412907,8/30/21,USA/California/San Diego,99.8912,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-108993/2021,...,Illumina,iVar 1.3.1,99.9,San Diego County Public Health Laboratory,"3851 Rosecrans St., San Diego, CA, 92110",SEARCH-108993,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108993,SEARCH Alliance San Diego with Ashleigh Murphy...
30701,SEARCH-108996,,EPI_ISL_4412909,8/16/21,USA/California/San Diego,99.8232,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-108996/2021,...,Illumina,iVar 1.3.1,99.8,San Diego County Public Health Laboratory,"3851 Rosecrans St., San Diego, CA, 92110",SEARCH-108996,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108996,SEARCH Alliance San Diego with Ashleigh Murphy...


In [13]:
# Prefix every location with its continent: rows whose location mentions
# Pakistan or Jordan get "Asia/", all remaining rows get "North America/".
# Rows with a missing location count as "not Asia" (na=False).
is_asia = metadata["location"].str.contains("Pakistan|Jordan", na=False)

# Explicit copies make the column assignments below operate on independent
# frames, which avoids pandas' SettingWithCopyWarning.
metadata_asia = metadata[is_asia].copy()
metadata_na = metadata[~is_asia].copy()
metadata_asia["location"] = "Asia" + "/" + metadata_asia["location"]
metadata_na["location"] = "North America" + "/" + metadata_na["location"]

# NOTE: this rebinds `metadata`, so running the cell a second time without
# re-running the cell that loads `metadata` would add the prefix twice.
metadata = pd.concat([metadata_na, metadata_asia])
metadata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_asia["location"] = "Asia" + "/" + metadata_asia["location"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_na["location"] = "North America" + "/" + metadata_na["location"]


Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,fasta_hdr,...,Sequencing technology,Assembly method,Coverage,Originating lab,Address,Sample ID given by the sample provider,Submitting lab,Address.1,Sample ID given by the submitting laboratory,Authors
0,MG0987,MT598172,EPI_ISL_416457,3/18/20,North America/USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_MG0987,...,,,,,,,,,,
1,PC00101P,MT192765,EPI_ISL_414648,3/11/20,North America/USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_PC00101P_threshold_0_quality_20,...,,,,,,,,,,
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,3/21/20,North America/USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady Children’s Hospital,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...,...,,,,,,,,,,
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,3/24/20,North America/USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...,...,,,,,,,,,,
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,3/24/20,North America/USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30135,SEARCH-108236,,EPI_ISL_4269128,3/15/21,Asia/Pakistan/Punjab/Lahore,99.9320,,SEARCH Alliance with Shifa Tameer-e-Millat Uni...,Islamabad Diagnostic Center,hCoV-19/Pakistan/PB-SEARCH-108236/2021,...,Illumina,iVar 1.3.1,99.9,Islamabad Diagnostic Center,"13-A Kohistan Rd., F-8 Markaz, Islamabad",SEARCH-108236,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108236,SEARCH Alliance with Shifa Tameer-e-Millat Uni...
30136,SEARCH-108239,,EPI_ISL_4269129,5/10/21,Asia/Pakistan/Punjab/Lahore,100.0000,,SEARCH Alliance with Shifa Tameer-e-Millat Uni...,Islamabad Diagnostic Center,hCoV-19/Pakistan/PB-SEARCH-108239/2021,...,Illumina,iVar 1.3.1,100.0,Islamabad Diagnostic Center,"13-A Kohistan Rd., F-8 Markaz, Islamabad",SEARCH-108239,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108239,SEARCH Alliance with Shifa Tameer-e-Millat Uni...
30137,SEARCH-108241,,EPI_ISL_4269130,5/17/21,Asia/Pakistan/Punjab/Lahore,100.0000,,SEARCH Alliance with Shifa Tameer-e-Millat Uni...,Islamabad Diagnostic Center,hCoV-19/Pakistan/PB-SEARCH-108241/2021,...,Illumina,iVar 1.3.1,100.0,Islamabad Diagnostic Center,"13-A Kohistan Rd., F-8 Markaz, Islamabad",SEARCH-108241,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108241,SEARCH Alliance with Shifa Tameer-e-Millat Uni...
30138,SEARCH-108243,,EPI_ISL_4269131,5/8/21,Asia/Pakistan/Punjab/Lahore,100.0000,,SEARCH Alliance with Shifa Tameer-e-Millat Uni...,Islamabad Diagnostic Center,hCoV-19/Pakistan/PB-SEARCH-108243/2021,...,Illumina,iVar 1.3.1,100.0,Islamabad Diagnostic Center,"13-A Kohistan Rd., F-8 Markaz, Islamabad",SEARCH-108243,Andersen lab at Scripps Research,"10550 North Torrey Pines Road, La Jolla, CA 92037",SEARCH-108243,SEARCH Alliance with Shifa Tameer-e-Millat Uni...


In [14]:
# Convert the metadata of every sample that is neither flagged as wastewater
# nor flagged as already having an uploaded sequence.
is_wastewater = status_df["wastewater"] == "Yes"
is_sequence_uploaded = status_df["sequence_uploaded"] == "Yes"
uploaded_or_wastewater = list(status_df[is_wastewater | is_sequence_uploaded].index)

is_excluded = metadata["ID"].isin(uploaded_or_wastewater)
non_wastewater_metadata = metadata[~is_excluded]

non_wastewater_converted_data = convert_metadata(
    non_wastewater_metadata,
    "../data/ncbi_upload/metadata_column_mapping.json",
    "../data/ncbi_upload/metadata_constant_mapping.json",
    "../data/ncbi_upload/author_conversions.csv",
    False,
)
non_wastewater_converted_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_both_fields["collected_by"] = has_both_fields["collected_by_1"] + " with the help of " + has_both_fields["collected_by_2"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_second_field["collected_by"] = has_second_field["collected_by_2"]


Unnamed: 0,sample_name,collection_date,geo_loc_name,isolate,isolation_source,collection_method,gisaid_accession,gisaid_virus_name,last_vaccinated_raw,bioproject_accession,organism,host,host_disease,collected_by
0,SEARCH-0063-NBG,3/31/20,North America:USA:Louisiana:New Orleans,,,Nasopharyngeal swab,EPI_ISL_434520,hCoV-19/USA/SEARCH-0063-NBG/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"Allison Smither, Gilberto Sabino-Santos, Patri..."
1,SEARCH-0096-SAN,3/19/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,,hCoV-19/USA/SEARCH-0096-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
2,SEARCH-0106-SAN,3/14/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_445102,hCoV-19/USA/SEARCH-0106-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
3,SEARCH-0107-SAN,3/13/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_445103,hCoV-19/USA/SEARCH-0107-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
4,SEARCH-0114-SAN,3/19/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,,hCoV-19/USA/SEARCH-0114-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17600,SEARCH-49984,8/11/21,North America:USA:California:San Diego,,,,EPI_ISL_3908899,hCoV-19/USA/CA-SEARCH-49984/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17601,SEARCH-49989,8/3/21,North America:USA:California:San Diego,,,,EPI_ISL_3908901,hCoV-19/USA/CA-SEARCH-49989/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17602,SEARCH-49991,8/4/21,North America:USA:California:San Diego,,,,EPI_ISL_3908902,hCoV-19/USA/CA-SEARCH-49991/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17605,SEARCH-49996,8/9/21,North America:USA:California:San Diego,,,,EPI_ISL_3908906,hCoV-19/USA/CA-SEARCH-49996/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab


In [15]:
# Preview the rows whose collector string does not mention "Helix".
# na=False treats a missing collector as "not Helix"; without it a NaN in
# collected_by would reach the `~` operator and raise.
is_helix = non_wastewater_converted_data["collected_by"].str.contains("Helix", na=False)
non_wastewater_converted_data[~is_helix]

Unnamed: 0,sample_name,collection_date,geo_loc_name,isolate,isolation_source,collection_method,gisaid_accession,gisaid_virus_name,last_vaccinated_raw,bioproject_accession,organism,host,host_disease,collected_by
0,SEARCH-0063-NBG,3/31/20,North America:USA:Louisiana:New Orleans,,,Nasopharyngeal swab,EPI_ISL_434520,hCoV-19/USA/SEARCH-0063-NBG/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"Allison Smither, Gilberto Sabino-Santos, Patri..."
1,SEARCH-0096-SAN,3/19/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,,hCoV-19/USA/SEARCH-0096-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
2,SEARCH-0106-SAN,3/14/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_445102,hCoV-19/USA/SEARCH-0106-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
3,SEARCH-0107-SAN,3/13/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_445103,hCoV-19/USA/SEARCH-0107-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
4,SEARCH-0114-SAN,3/19/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,,hCoV-19/USA/SEARCH-0114-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17600,SEARCH-49984,8/11/21,North America:USA:California:San Diego,,,,EPI_ISL_3908899,hCoV-19/USA/CA-SEARCH-49984/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17601,SEARCH-49989,8/3/21,North America:USA:California:San Diego,,,,EPI_ISL_3908901,hCoV-19/USA/CA-SEARCH-49989/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17602,SEARCH-49991,8/4/21,North America:USA:California:San Diego,,,,EPI_ISL_3908902,hCoV-19/USA/CA-SEARCH-49991/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17605,SEARCH-49996,8/9/21,North America:USA:California:San Diego,,,,EPI_ISL_3908906,hCoV-19/USA/CA-SEARCH-49996/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab


In [17]:
# Keep the rows that (a) were not collected by Helix and (b) have a GISAID
# accession. na=False treats a missing collector as "not Helix"; without it a
# NaN in collected_by would reach the `~` operator and raise.
is_helix = non_wastewater_converted_data["collected_by"].str.contains("Helix", na=False)
has_gisaid_accession = non_wastewater_converted_data["gisaid_accession"].notna()
need_to_upload_non_ww = non_wastewater_converted_data[~is_helix & has_gisaid_accession]
need_to_upload_non_ww

Unnamed: 0,sample_name,collection_date,geo_loc_name,isolate,isolation_source,collection_method,gisaid_accession,gisaid_virus_name,last_vaccinated_raw,bioproject_accession,organism,host,host_disease,collected_by
0,SEARCH-0063-NBG,3/31/20,North America:USA:Louisiana:New Orleans,,,Nasopharyngeal swab,EPI_ISL_434520,hCoV-19/USA/SEARCH-0063-NBG/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"Allison Smither, Gilberto Sabino-Santos, Patri..."
2,SEARCH-0106-SAN,3/14/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_445102,hCoV-19/USA/SEARCH-0106-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
3,SEARCH-0107-SAN,3/13/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_445103,hCoV-19/USA/SEARCH-0107-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"SEARCH Alliance San Diego with David Pride, Ji..."
9,SEARCH-0215-NBG,4/3/20,North America:USA:Louisiana:New Orleans,,,Nasopharyngeal swab,EPI_ISL_437541,hCoV-19/USA/SEARCH-0215-NBG/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,"Allison Smither, Gilberto Sabino-Santos, Patri..."
10,SEARCH-0244-SAN,3/28/20,North America:USA:California:San Diego,,,Nasopharyngeal swab,EPI_ISL_437565,hCoV-19/USA/SEARCH-0244-SAN/2020,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17600,SEARCH-49984,8/11/21,North America:USA:California:San Diego,,,,EPI_ISL_3908899,hCoV-19/USA/CA-SEARCH-49984/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17601,SEARCH-49989,8/3/21,North America:USA:California:San Diego,,,,EPI_ISL_3908901,hCoV-19/USA/CA-SEARCH-49989/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17602,SEARCH-49991,8/4/21,North America:USA:California:San Diego,,,,EPI_ISL_3908902,hCoV-19/USA/CA-SEARCH-49991/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab
17605,SEARCH-49996,8/9/21,North America:USA:California:San Diego,,,,EPI_ISL_3908906,hCoV-19/USA/CA-SEARCH-49996/2021,,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab


In [18]:
# Run the project helper recover_vaccine_date, then drop the two vaccination
# date columns from its result.
# NOTE: `need_to_upload_non_ww` is rebound here without "last_vaccinated_raw",
# so re-run the cell that builds it before running this cell again.
with_vaccine_info = recover_vaccine_date(need_to_upload_non_ww)
need_to_upload_non_ww = with_vaccine_info.drop(
    columns=["last_vaccinated_raw", "date_of_sars_cov_2_vaccination"]
)
need_to_upload_non_ww

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  needs_to_be_vaccine_timestamped["date_of_sars_cov_2_vaccination"], needs_to_be_vaccine_timestamped["vaccine_received"] = _convert_str_date_to_timestamp(needs_to_be_vaccine_timestamped["last_vaccinated_raw"], needs_to_be_vaccine_timestamped["collection_date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  needs_to_be_vaccine_timestamped["date_of_sars_cov_2_vaccination"] = needs_to_be_vaccine_timestamped["date_of_sars_cov_2_vaccination"].dt.strftime('%Y-%m-%d')


Unnamed: 0,sample_name,collection_date,geo_loc_name,isolate,isolation_source,collection_method,gisaid_accession,gisaid_virus_name,bioproject_accession,organism,host,host_disease,collected_by,vaccine_received
4500,SEARCH-8927,3/31/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794738,hCoV-19/USA/SEARCH-8927/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,vaccine
4501,SEARCH-8928,3/31/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794739,hCoV-19/USA/SEARCH-8928/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Pfizer
4502,SEARCH-8930,3/29/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794740,hCoV-19/USA/SEARCH-8930/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Pfizer
4503,SEARCH-8931,3/30/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794741,hCoV-19/USA/SEARCH-8931/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Pfizer
4701,SEARCH-9191,4/6/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794934,hCoV-19/USA/SEARCH-9191/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Moderna
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17600,SEARCH-49984,8/11/21,North America:USA:California:San Diego,,,,EPI_ISL_3908899,hCoV-19/USA/CA-SEARCH-49984/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
17601,SEARCH-49989,8/3/21,North America:USA:California:San Diego,,,,EPI_ISL_3908901,hCoV-19/USA/CA-SEARCH-49989/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
17602,SEARCH-49991,8/4/21,North America:USA:California:San Diego,,,,EPI_ISL_3908902,hCoV-19/USA/CA-SEARCH-49991/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
17605,SEARCH-49996,8/9/21,North America:USA:California:San Diego,,,,EPI_ISL_3908906,hCoV-19/USA/CA-SEARCH-49996/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,


In [21]:
# Final cleanup: replace missing values with empty strings, keep the first row
# for each sample_name, then write the table to CSV without the index column.
blank_filled = need_to_upload_non_ww.fillna("")
final_file = blank_filled.drop_duplicates(subset=["sample_name"])
final_file.to_csv("../data/ncbi_upload/non_ww_upload_ready.csv", index=False)
final_file

Unnamed: 0,sample_name,collection_date,geo_loc_name,isolate,isolation_source,collection_method,gisaid_accession,gisaid_virus_name,bioproject_accession,organism,host,host_disease,collected_by,vaccine_received
4500,SEARCH-8927,3/31/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794738,hCoV-19/USA/SEARCH-8927/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,vaccine
4501,SEARCH-8928,3/31/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794739,hCoV-19/USA/SEARCH-8928/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Pfizer
4502,SEARCH-8930,3/29/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794740,hCoV-19/USA/SEARCH-8930/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Pfizer
4503,SEARCH-8931,3/30/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794741,hCoV-19/USA/SEARCH-8931/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Pfizer
4701,SEARCH-9191,4/6/21,North America:USA:California:San Diego,,,Nasal swab,EPI_ISL_1794934,hCoV-19/USA/SEARCH-9191/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,SEARCH Alliance San Diego with Michael Quigley...,Moderna
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17600,SEARCH-49984,8/11/21,North America:USA:California:San Diego,,,,EPI_ISL_3908899,hCoV-19/USA/CA-SEARCH-49984/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
17601,SEARCH-49989,8/3/21,North America:USA:California:San Diego,,,,EPI_ISL_3908901,hCoV-19/USA/CA-SEARCH-49989/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
17602,SEARCH-49991,8/4/21,North America:USA:California:San Diego,,,,EPI_ISL_3908902,hCoV-19/USA/CA-SEARCH-49991/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
17605,SEARCH-49996,8/9/21,North America:USA:California:San Diego,,,,EPI_ISL_3908906,hCoV-19/USA/CA-SEARCH-49996/2021,PRJNA612578,SARS-CoV-2,Homo Sapiens,COVID-19,EXCITE Lab,
