# Import New Flowcell Data from TSV
## Author: Jonn Smith
## Date: 2023/03/08

Adds a new set of flowcell data from a given TSV file to the flowcell data table in Terra.

The given TSV file may only contain columns that already exist in the data table in Terra; any table columns it lacks are filled in with empty values before upload.
This is enforced before any import occurs.

This is **step 3** in data import.

- **Step 1** is running `create_tsvs_and_script_for_gp_data_import.py` on the GP Metrics Excel spreadsheet.
- **Step 2** is running `20230314_STEP_2_Malaria_Senegal_Sample_Import.ipynb` on the gcloud sheet from step 1.


***

In [15]:
# Path of the TSV holding the new rows to add to the Terra data table.
# Only the uncommented assignment at the bottom takes effect.  The
# commented-out paths appear to be inputs from earlier import runs that were
# kept for reference.
# new_data_file_gs_path = "gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/manual_data_upload/20230315_2022_Broad_Senegal_Data.sample_bams.ready_for_terra.DBL.NDO_PP_PS.tsv"
# new_data_file_gs_path = "gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/manual_data_upload/PDO-30031_LCSET_Metrics.gcloud_paths.ready_for_terra.tsv"
# new_data_file_gs_path = "gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/manual_data_upload/2022_Senegal_malaria.new_samples.gcloud_paths.ready_for_terra.filtered.tsv"
# new_data_file_gs_path = "gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/manual_data_upload/2020_Senegal_Malaria.gcloud_paths.ready_for_terra.tsv"
# new_data_file_gs_path = "gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/manual_data_upload/20231205_new_data_PDO_32762_32763_32790_32791.gcloud_paths.uniq.ready_for_terra.tsv"

new_data_file_gs_path = "gs://broad-dsp-lrma-sr-malaria/falciparum/senegal/2020_samples/2020_master_samples_to_add.tsv"

***

In [16]:
import os
import sys
import json
import math

import firecloud.api as fapi
import pandas as pd
import numpy as np

from IPython.core.display import HTML

from tqdm.notebook import tqdm

from collections import defaultdict

# Workspace coordinates are read from the environment; a missing variable
# raises KeyError right here.
bucket, workspace, namespace = (
    os.environ[env_key]
    for env_key in ("WORKSPACE_BUCKET", "WORKSPACE_NAME", "WORKSPACE_NAMESPACE")
)

# Name of the Terra data table this notebook reads from and uploads to.
flowcell_table = "sample"

#################################################################

# Echo the configuration so the cell output records what was targeted.
print(
    "\n".join(
        [
            f"Namespace: {namespace}",
            f"Workspace: {workspace}",
            f"Bucket:    {bucket}",
            "",
            f"Flowcell table: {flowcell_table}",
        ]
    )
)

Namespace: broad-firecloud-dsde-methods
Workspace: sr-malaria
Bucket:    gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd

Flowcell table: sample


***
## Load Existing Data

In [17]:
def load_table(namespace, workspace, table_name, store_membership=False, membership_column="samples"):
    """Fetch a Terra data table and return it as an all-string DataFrame.

    The entity name of every row is stored in an ``entity:<table_name>_id``
    column, which is moved to the front of the frame.

    Args:
        namespace: Terra workspace namespace.
        workspace: Terra workspace name.
        table_name: Name of the entity table to fetch.
        store_membership: When true, pull the `membership_column` out of the
            table and return it separately as a list of sets of entity names.
        membership_column: Column holding the membership items; only read
            when `store_membership` is true.

    Returns:
        A ``(table, membership)`` tuple.  `table` is None when the table has
        no entities; `membership` is None unless `store_membership` is true
        and the table has entities.
    """
    entities = fapi.get_entities(namespace, workspace, table_name).json()
    if len(entities) == 0:
        return None, None

    id_column = f"entity:{table_name}_id"
    table = pd.DataFrame([entity['attributes'] for entity in entities])
    table[id_column] = [entity['name'] for entity in entities]

    membership = None
    if store_membership:
        # Each cell lists its member entities under 'items'; keep only the names.
        membership = [
            {item['entityName'] for item in cell['items']}
            for cell in table[membership_column]
        ]
        del table[membership_column]

    # Put the ID column first and leave the rest in their existing order.
    ordered_columns = [id_column] + [col for col in table.columns if col != id_column]
    return table[ordered_columns].astype(str), membership

def fix_nans(df, quiet=True):
    """Replace NaN placeholders in `df` with empty strings.

    Two kinds of placeholder are handled: the literal string "nan" (what a
    missing value turns into after a frame is cast to str) and real missing
    values (NaN).

    Cells holding the string "nan" are overwritten in `df` itself.  Real
    missing values are replaced only in the returned frame, so callers should
    always use the return value.

    Args:
        df: DataFrame to clean.
        quiet: When false, print a per-column count of replaced values.

    Returns:
        A new DataFrame with all placeholders replaced by "".
    """
    if not quiet:
        print("Replacing all `nan` values with empty strings: ")

    for c in df.columns.values:
        column = df[c]
        is_nan_text = column == "nan"

        # NaN never compares equal to anything, so real missing values have
        # to be counted with isna() rather than with an equality test.
        num_denaned = int(is_nan_text.sum()) + int(column.isna().sum())

        if is_nan_text.any():
            df.loc[is_nan_text, c] = ""

        if num_denaned > 0 and not quiet:
            print(f"\t{c}: {num_denaned}")

    if not quiet:
        print("Replacing numpy nan values...")
    cleaned = df.replace(np.nan, "")
    if not quiet:
        print("Done.")
    return cleaned

In [18]:
# Pull the current contents of the flowcell table from Terra.  Membership
# information is not requested, so the second return value is discarded.
print("Loading Flowcell Table...\t", end="")
tbl_fc, _ = load_table(
    namespace=namespace,
    workspace=workspace,
    table_name=flowcell_table,
)
# NOTE: load_table returns None for a table with no entities, and fix_nans
# cannot handle that; this cell assumes the table already has rows.
tbl_fc = fix_nans(df=tbl_fc)

# Row count of the table before anything new is imported.
original_num_rows = len(tbl_fc)
print("Done!")

# Last expression of the cell, so the notebook renders the table.
tbl_fc

Loading Flowcell Table...	Done!


Unnamed: 0,entity:sample_id,collection_site,country,input_fq_end2,insert_size_standard_deviation,unaligned_bam,num_reads,num_reads_Q10,study,read_length,...,raw_est_fold_cov,aligned_bam,read_qual_median,pct_properly_paired_reads,collection_site_longitude,ENA,aligned_bai,average_identity,fq1,input_bai
0,FP0008-C,Hodh el Gharbi,Mauritania,gs://broad-dsp-lrma-pf7/inputs/ERR1081237/ERR1...,81.5,,8916670.0,8914561.0,1147-PF-MR-CONWAY,100.0,...,38.21510961439369,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,29.2,83.6,-9.832345292,ERR1081237,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,99.2836862161324,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
1,FP0009-C,Hodh el Gharbi,Mauritania,gs://broad-dsp-lrma-pf7/inputs/ERR1081238/ERR1...,84.9,,28198096.0,28192895.0,1147-PF-MR-CONWAY,100.0,...,120.85154318340773,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,29.3,91.4,-9.832345292,ERR1081238,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,99.26806151770327,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
2,FP0010-CW,Hodh el Gharbi,Mauritania,gs://broad-dsp-lrma-pf7/inputs/ERR2889621/ERR2...,1063.5,,47021604.0,47004148.0,1147-PF-MR-CONWAY,151.0,...,304.3033984848565,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,27.350993377483444,89.2,-9.832345292,ERR2889621,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,97.63157406395946,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
3,FP0011-CW,Hodh el Gharbi,Mauritania,gs://broad-dsp-lrma-pf7/inputs/ERR2889624/ERR2...,404.4,,38554147.0,38537926.0,1147-PF-MR-CONWAY,151.0,...,249.50569439921134,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,27.66887417218543,91.1,-9.832345292,ERR2889624,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,97.68691485653939,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
4,FP0012-CW,Hodh el Gharbi,Mauritania,gs://broad-dsp-lrma-pf7/inputs/ERR2889627/ERR2...,278.8,,43682989.0,43663214.0,1147-PF-MR-CONWAY,151.0,...,282.6973322449103,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,27.821192052980134,86.3,-9.832345292,ERR2889627,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,97.74077030364568,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22680,SEN_2022_SES.PD3_0180,Sessine,Senegal,,150.6,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,12948009.0,12948008.0,,151.0,...,83.79389061913983,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,26.821192052980134,90.3,-16.358667,,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,97.50162838762182,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...
22681,SEN_2022_SES.PD3_0181,Sessine,Senegal,,220.7,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,25538109.0,25538109.0,,151.0,...,165.27154963868733,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,27.94701986754967,89.9,-16.358667,,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,98.05146365119019,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...
22682,SEN_2022_SES.PD3_0182,Sessine,Senegal,,1519.3,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,30919936.0,30919876.0,,151.0,...,200.10039652697213,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,20.105960264900663,65.1,-16.358667,,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,94.98883680963937,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...
22683,SEN_2022_SES.PD3_0193,Sessine,Senegal,,130.1,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,38879625.0,38879623.0,,151.0,...,251.6120466523598,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,26.596026490066226,42.6,-16.358667,,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,98.0523517442276,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...


***
## Load New Data from TSV

In [19]:
# Read the new rows from the TSV at the configured path.  No dtype is given,
# so pandas infers each column's type from the file contents.
print(f"Reading in new data from tsv: {new_data_file_gs_path}")
new_data_df = pd.read_csv(new_data_file_gs_path, sep="\t")
print()
# Blank out NaN placeholders, the same cleanup applied to the existing table.
new_data_df = fix_nans(new_data_df)

def assert_new_data_are_valid(existing_df, new_df, entity_id_col):
    """Validate that `new_df` can be safely added to the existing Terra table.

    Three checks are made, in order:
      1. Every column of `new_df` already exists in `existing_df`.
      2. No entity ID occurs more than once within `new_df` itself.
      3. No entity ID in `new_df` is already present in `existing_df`.

    Args:
        existing_df: DataFrame holding the current contents of the table.
        new_df: DataFrame holding the rows to be added.
        entity_id_col: Name of the entity ID column in both frames.

    Returns:
        True when all checks pass.

    Raises:
        RuntimeError: If any check fails.  The offending columns / IDs are
            printed to stderr first.
    """
    # Now we have to validate that all the columns are in our Terra data:
    known_cols = set(existing_df.columns.values)
    extra_cols = [c for c in new_df.columns if c not in known_cols]
    if extra_cols:
        print("ERROR: New data has extra columns that are not present in the Terra data tables:", file=sys.stderr)
        for c in extra_cols:
            print(f"\t{c}", file=sys.stderr)
        print(file=sys.stderr)
        raise RuntimeError("USER ERROR IN NEW DATASET.  SEE ABOVE.")

    new_ids = new_df[entity_id_col]

    # Now make sure the new data itself doesn't have duplicated sample_id values.
    # The IDs must be compared against the new data, not the existing table.
    dupe_ids = set(new_ids[new_ids.duplicated(keep=False)])
    if dupe_ids:
        print(f"ERROR: New data has {len(dupe_ids)} duplicated entity IDs ({entity_id_col}):", file=sys.stderr)
        for i in sorted(dupe_ids, key=str):
            print(f"\t{i}", file=sys.stderr)
        print(file=sys.stderr)
        raise RuntimeError("USER ERROR IN NEW DATASET.  SEE ABOVE.")

    # Now we have to make sure that our sample IDs don't exist already:
    dupe_ids = set(new_ids[new_ids.isin(existing_df[entity_id_col])])
    if dupe_ids:
        print(f"ERROR: New data has entity IDs ({entity_id_col}) that already exist!", file=sys.stderr)
        for i in sorted(dupe_ids, key=str):
            print(f"\t{i}", file=sys.stderr)
        print(file=sys.stderr)
        raise RuntimeError("USER ERROR IN NEW DATASET.  SEE ABOVE.")

    return True
    
print("Validating new data... ")
# Derive the ID column from the table name, the same way load_table builds it,
# so the check keeps working if `flowcell_table` is pointed at another table.
assert_new_data_are_valid(tbl_fc, new_data_df, f"entity:{flowcell_table}_id")
print("Done")

# If we've come this far, show our results to the user:
display(new_data_df)

Reading in new data from tsv: gs://broad-dsp-lrma-sr-malaria/falciparum/senegal/2020_samples/2020_master_samples_to_add.tsv

Validating new data... 
Done


Unnamed: 0,entity:sample_id,country,country_latitude,country_longitude,collection_site,collection_site_latitude,collection_site_longitude,dataset_provenance,participant,year,input_bam,input_bai,study,sample_type,population,ENA,old_sample_name,additional_metadata
0,SEN_2020_SES.PPS_0188,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0188,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_SES_PS_188_2020,
1,SEN_2020_SES.PPS_0111,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0111,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_SES_PS_111_2020,
2,SEN_2020_SES.PPS_0143,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0143,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_SES_PS_143_2020,
3,SEN_2020_SES.PPS_0151,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0151,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_SES_PS_151_2020,
4,SEN_2020_SES.PPS_0159,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0159,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_SES_PS_159_2020,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,SEN_2020_NDO.PD2_0054,Senegal,14.359109,-14.470363,,13.753000,-14.102000,Broad 2020 Senegal Dataset,SEN_2020_NDO.PD2_0054,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_NDO_DM2_054_2020,
612,SEN_2116_SES.PPS_0198,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2116_SES.PPS_0198,2116,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_SES_PS_198_2116,
613,SEN_2020_NDO.PD3_0141,Senegal,14.359109,-14.470363,,13.753000,-14.102000,Broad 2020 Senegal Dataset,SEN_2020_NDO.PD3_0141,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_NDO_DM3_141_2020,
614,SEN_2020_NDO.PD3_0130,Senegal,14.359109,-14.470363,,13.753000,-14.102000,Broad 2020 Senegal Dataset,SEN_2020_NDO.PD3_0130,2020,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,gs://broad-dsp-lrma-sr-malaria/falciparum/sene...,,,,,SEN_NDO_DM3_130_2020,


In [20]:
# Now make sure the new data has all the right columns for our existing data table:
missing_columns = set(tbl_fc.columns) - set(new_data_df)
extra_columns = set(new_data_df) - set(tbl_fc.columns)

if extra_columns:
    msg = "New data has extra columns in it.  It is undefined what to do with extra columns.  Failing."
    print(f"Error: {msg}", file=sys.stderr)
    # Name the offending columns so the TSV can be fixed without guesswork.
    for extra_column in sorted(extra_columns, key=str):
        print(f"\t{extra_column}", file=sys.stderr)
    raise RuntimeError(msg)

print(f"New data is missing {len(missing_columns)} columns.  This is OK.  We will add them in before we upload.")
print()

# Add the missing columns in the existing table's column order.  Iterating the
# set directly would give a different column order from run to run.
for c in tbl_fc.columns:
    if c in missing_columns:
        new_data_df[c] = ""

print("New data with all required columns:")
display(new_data_df)

New data is missing 33 columns.  This is OK.  We will add them in before we upload.

New data with all required columns:


Unnamed: 0,entity:sample_id,country,country_latitude,country_longitude,collection_site,collection_site_latitude,collection_site_longitude,dataset_provenance,participant,year,...,num_reads_Q5,num_bases,num_reads,aligned_read_length,insert_size_standard_deviation,input_fq_end1,input_fq_end2,num_reads_Q7,average_identity,pct_callable
0,SEN_2020_SES.PPS_0188,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0188,2020,...,,,,,,,,,,
1,SEN_2020_SES.PPS_0111,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0111,2020,...,,,,,,,,,,
2,SEN_2020_SES.PPS_0143,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0143,2020,...,,,,,,,,,,
3,SEN_2020_SES.PPS_0151,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0151,2020,...,,,,,,,,,,
4,SEN_2020_SES.PPS_0159,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2020_SES.PPS_0159,2020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,SEN_2020_NDO.PD2_0054,Senegal,14.359109,-14.470363,,13.753000,-14.102000,Broad 2020 Senegal Dataset,SEN_2020_NDO.PD2_0054,2020,...,,,,,,,,,,
612,SEN_2116_SES.PPS_0198,Senegal,14.359109,-14.470363,,14.920131,-16.358667,Broad 2020 Senegal Dataset,SEN_2116_SES.PPS_0198,2116,...,,,,,,,,,,
613,SEN_2020_NDO.PD3_0141,Senegal,14.359109,-14.470363,,13.753000,-14.102000,Broad 2020 Senegal Dataset,SEN_2020_NDO.PD3_0141,2020,...,,,,,,,,,,
614,SEN_2020_NDO.PD3_0130,Senegal,14.359109,-14.470363,,13.753000,-14.102000,Broad 2020 Senegal Dataset,SEN_2020_NDO.PD3_0130,2020,...,,,,,,,,,,


## Add our new data to the flowcell table:

In [21]:
def upload_table(namespace, workspace, table, label):
    """Upload `table` to a Terra workspace as a flexible-model entity TSV.

    Progress and the outcome are printed; nothing is returned and no exception
    is raised for a failed upload.

    Args:
        namespace: Terra workspace namespace.
        workspace: Terra workspace name.
        table: DataFrame to upload; it is serialized as a TSV without the index.
        label: Human-readable name for the data, used only in the progress message.
    """
    # upload new samples
    print(f"Uploading data from {label} ... ", end="")
    a = fapi.upload_entities(namespace, workspace, entity_data=table.to_csv(index=False, sep="\t"), model='flexible')

    if a.status_code == 200:
        print(f'Uploaded {len(table)} rows successfully.')
        return

    # An error response is not guaranteed to carry a JSON body; fall back to the
    # raw text so the real failure is reported instead of a JSON decode error.
    try:
        details = a.json()
    except ValueError:
        details = a.text
    print(f"FAILED (HTTP {a.status_code}):")
    print(details)

In [22]:
# Push the validated, column-complete new rows to the flowcell table in Terra.
upload_table(
    namespace=namespace,
    workspace=workspace,
    table=new_data_df,
    label=f"Flowcell Table ({flowcell_table})",
)

Uploading data from Flowcell Table (sample) ... Uploaded 616 rows successfully.


In [23]:
# Two blank lines of spacing, then a large red reminder about the follow-up step.
print("\n")
display(
    HTML(
        '<H1><span style="color:red">OK - NOW DON\'T FORGET TO INDEX YOUR BAM FILES!<br />(read: run `SRIndexBam`)</span></H1>'
    )
)





***
***
***