# Pre/post Discovery Sample Assessment

Goal: Determine if/how many samples where:

* two samples are paired from the same patient
* of the two samples, one must be pre-colonoscopy and one must by post-colonoscopy
* the pre-colonoscopy sample must be unresected; the post-colonoscopy sample must be resected 
* samples should have some disease - i.e. there's likely no signal in NEG

Gotchas

* there's usually one diagnosis for each sample
* resection status is in lims, and not calzone
* samples with resected leisons have been marked "Unclassifiable"


In [None]:
!pip install --user google-cloud-bigquery[bqstorage,pandas]

In [4]:
%load_ext autoreload 
%autoreload 2

In [1]:
query = """WITH samples_at_2020_02_09 AS (
  SELECT sm_tbl.*
  FROM (
    SELECT id, MAX(timestamp) as maxtime
    FROM `freenome-computational.calzone.sample_metadata_table`
    WHERE timestamp <= TIMESTAMP '2020-02-09'
    GROUP BY id
  ) AS sm_id_to_max_time
  INNER JOIN `freenome-computational.calzone.sample_metadata_table` AS sm_tbl
  ON sm_tbl.id = sm_id_to_max_time.id AND sm_tbl.timestamp = sm_id_to_max_time.maxtime
), datasets_at_2020_02_09 AS (
  SELECT ds_tbl.*
  FROM (
    SELECT id, MAX(timestamp) as maxtime
    FROM `freenome-computational.calzone.dataset_metadata_table`
    WHERE timestamp <= TIMESTAMP '2020-02-09'
    GROUP BY id
  ) AS ds_id_to_max_time
  INNER JOIN `freenome-computational.calzone.dataset_metadata_table` AS ds_tbl
  ON ds_tbl.id = ds_id_to_max_time.id AND ds_tbl.timestamp = ds_id_to_max_time.maxtime
), alpha_manifest AS (
-- schema: Study:STRING,WGS_Processing_Batch:STRING,Sample_ID:INTEGER,Sample_Secondary_ID:STRING,Patient_ID:STRING,
-- Patient_Secondary_ID:STRING,Discovery_Study:STRING,Pre_Post_Study:STRING,Analysis_Batch:STRING
  SELECT
    *
  FROM
    `freenome-computational.discovery_studies.permanent_study_manifest_20200108`
  WHERE
    Study LIKE '%DC2%'
    OR Study LIKE '%DC1%' )

SELECT
-- Dataset fields
dataset.id as dataset_id,
dataset.raw_data_path,
dataset.assay,
dataset.assay_version,
dataset.analyte,
dataset.passed_latest_qc,
lims_batch.name as lims_batch,

-- Sample fields
lims_sample.secondary_id,
lims_sample.id as sample_id,
lims_sample.age_at_blood_draw,
lims_sample.blood_draw_relative_to_tx,

lims_patient.id as patient_id,
lims_patient.secondary_id as patient_secondary_id,
lims_patient.sex,
lims_patient.age,
lims_top_diagnosis.cancer_type,
lims_crc_builtin.colorectal_resection,

-- Manifest Fields
Study,
WGS_Processing_Batch,
Sample_ID as manifest_sample_id,
Sample_Secondary_ID,
Patient_ID as manifest_patient_id,
Patient_Secondary_ID as manifest_Patient_Secondary_ID,
Discovery_Study,
Pre_Post_Study,
Analysis_Batch
FROM datasets_at_2020_02_09
RIGHT OUTER JOIN alpha_manifest
ON datasets_at_2020_02_09.dataset.sample_id = alpha_manifest.Sample_ID
INNER JOIN samples_at_2020_02_09
ON datasets_at_2020_02_09.dataset.sample_id = samples_at_2020_02_09.id
"""

import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

# Explicitly create a credentials object and the clients.
credentials, your_project_id = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
bqclient = bigquery.Client(
    credentials=credentials,
    project=your_project_id,
)
bqstorageclient = bigquery_storage.BigQueryStorageClient(
    credentials=credentials
)



# Return a dataframe containing all the records.
dataframe = (
    bqclient.query(query)
        .result()
        .to_dataframe(bqstorage_client=bqstorageclient)
)
dataframe

Unnamed: 0,dataset_id,raw_data_path,assay,assay_version,analyte,passed_latest_qc,lims_batch,secondary_id,sample_id,age_at_blood_draw,...,colorectal_resection,Study,WGS_Processing_Batch,manifest_sample_id,Sample_Secondary_ID,manifest_patient_id,manifest_Patient_Secondary_ID,Discovery_Study,Pre_Post_Study,Analysis_Batch
0,SD-00004FAA,gs://aab-data-us-1/dc2_ingest_ready/41_IgG_ori...,aab-protein-array,1,Protein,,DC2_BG1_CDI_1,118317-post,22808,53,...,,DC2 BG1,DC2_BG1_WGS_1,22808,118317-post,24564,118317,,Y,
1,SD-00004FDA,gs://aab-data-us-1/dc2_ingest_ready/41_IgG+IgM...,aab-protein-array,1,Protein,,DC2_BG1_CDI_1,118317-post,22808,53,...,,DC2 BG1,DC2_BG1_WGS_1,22808,118317-post,24564,118317,,Y,
2,SD-00004F79,gs://seq-data-us-1/bam/20190906_HMJVVDSXX_HMJH...,WGS,2,cfDNA,True,DC2_BG1_WGS_1,118317-post,22808,53,...,,DC2 BG1,DC2_BG1_WGS_1,22808,118317-post,24564,118317,,Y,
3,SD-000061C2,gs://discovery-phase-2-qc/Flow Cytometry/11831...,WGS,2,cfDNA,,DC2_BG1_WGS_1,118317-post,22808,53,...,,DC2 BG1,DC2_BG1_WGS_1,22808,118317-post,24564,118317,,Y,
4,SD-00006522,gs://discovery-phase-2-qc/Hemoglobin/118317-po...,WGS,2,cfDNA,,DC2_BG1_WGS_1,118317-post,22808,53,...,,DC2 BG1,DC2_BG1_WGS_1,22808,118317-post,24564,118317,,Y,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15700,SD-000067FC,gs://discovery-phase-2-qc/Hemoglobin/126717-pr...,WGS,2,cfDNA,,DC2_BG1_WGS_9,126717-pre,23606,66,...,,DC2 BG1,DC2_BG1_WGS_9,23606,126717-pre,26134,126717,Y,N,gamma
15701,SD-00006B38,gs://discovery-phase-2-qc/LINE-1 qPCR/126717-p...,WGS,2,cfDNA,,DC2_BG1_WGS_9,126717-pre,23606,66,...,,DC2 BG1,DC2_BG1_WGS_9,23606,126717-pre,26134,126717,Y,N,gamma
15702,SD-00006EBE,gs://discovery-phase-2-qc/Lunatic/126717-pre-1...,WGS,2,cfDNA,,DC2_BG1_WGS_9,126717-pre,23606,66,...,,DC2 BG1,DC2_BG1_WGS_9,23606,126717-pre,26134,126717,Y,N,gamma
15703,SD-00009154,gs://aab-data-us-1/dc2_ingest_ready/771_IgG+Ig...,aab-protein-array,1,Protein,,DC2_BG1_CDI_17,126717-pre,23606,66,...,,DC2 BG1,DC2_BG1_WGS_9,23606,126717-pre,26134,126717,Y,N,gamma


In [2]:
print(dataframe.columns)
print(dataframe.shape)

Index(['dataset_id', 'raw_data_path', 'assay', 'assay_version', 'analyte',
       'passed_latest_qc', 'lims_batch', 'secondary_id', 'sample_id',
       'age_at_blood_draw', 'blood_draw_relative_to_tx', 'patient_id',
       'patient_secondary_id', 'sex', 'age', 'cancer_type',
       'colorectal_resection', 'Study', 'WGS_Processing_Batch',
       'manifest_sample_id', 'Sample_Secondary_ID', 'manifest_patient_id',
       'manifest_Patient_Secondary_ID', 'Discovery_Study', 'Pre_Post_Study',
       'Analysis_Batch'],
      dtype='object')
(15705, 26)


In [18]:
import pandas as pd
pd.set_option('display.max_rows', 500)

# how many unique samples (tubes)?
all_samples = dataframe[['assay', 'lims_batch', 'sample_id', 'patient_id', 'blood_draw_relative_to_tx', 'cancer_type', 'Study','Analysis_Batch','passed_latest_qc']].drop_duplicates()
all_samples

Unnamed: 0,assay,lims_batch,sample_id,patient_id,blood_draw_relative_to_tx,cancer_type,Study,Analysis_Batch,passed_latest_qc
0,aab-protein-array,DC2_BG1_CDI_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,,
2,WGS,DC2_BG1_WGS_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,,True
3,WGS,DC2_BG1_WGS_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,,
7,targeted-panel,DC2_BG1_Olink_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,,True
8,TEM-Seq,DC2_BG1_TEMSeq_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,,True
...,...,...,...,...,...,...,...,...,...
15697,targeted-panel,DC2_BG1_Olink_9,20989,25004,Post-Colonoscopy,Colorectal Cancer,DC2 BG1,gamma,False
15698,WGS,DC2_BG1_WGS_9,23606,26134,Pre-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,gamma,True
15699,WGS,DC2_BG1_WGS_9,23606,26134,Pre-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,gamma,
15703,aab-protein-array,DC2_BG1_CDI_17,23606,26134,Pre-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,gamma,


In [19]:
# eliminate samples that did not pass QC
all_samples = all_samples[all_samples["passed_latest_qc"]!=False]
print(all_samples.shape)

(9011, 9)


In [20]:
# number of unique patients
all_samples.patient_id.nunique()
all_samples = all_samples.fillna("unknown")

In [21]:
# protein samples
#print(set(all_samples["lims_batch"]))
all_olink_samples = all_samples[all_samples["lims_batch"].str.contains('Olink', na=False)]
print(set(all_olink_samples["lims_batch"]))
all_olink_samples.shape

{'DC2_BG1_Olink_1', 'DC2_BG1_Olink_7', 'Discovery1_Olink_Plate7', 'Olink_Study2_CRC-Training_Plate2', 'DC2_BG1_Olink_2', 'DC2_BG1_Olink_8', 'DC2_BG1_Olink_9', 'DC2_BG1_Olink_3', 'DC2_BG2_Olink_4', 'Discovery1_Olink_Plate9', 'Olink_Study2_CRC-Training_Plate1', 'DC2_BG2_Olink_2', 'DC2_BG2_Olink_5', 'DC2_BG2_Olink_1', 'Olink_Study2_CRC-Training_Plate3', 'DC2_BG1_Olink_6', 'DC2_BG2_Olink_3', 'DC2_BG1_Olink_5', 'Discovery1_Olink_Plate4', 'Olink_Study3_CRC-Evaluation_Plate1', 'Discovery1_Olink_Plate3', 'Discovery1_Olink_Plate5', 'Discovery1_Olink_Plate6', 'DC2_BG1_Olink_4', 'Discovery1_Olink_Plate8', 'Discovery1_Olink_Plate1', 'DC2_BG1_Olink_10', 'Discovery1_Olink_Plate2'}


(1924, 9)

### All healthy pre/post colo samples

In [22]:
all_olink_samples = all_olink_samples.fillna("unknown")
#all_olink_samples.groupby(["cancer_type", "blood_draw_relative_to_tx"])[["sample_id"]].count()

In [220]:
# healthies = all_olink_samples[all_olink_samples["cancer_type"]=="Colorectal Normal / Healthy"]
# print(healthies.shape)
# healthies.groupby(["blood_draw_relative_to_tx", "Analysis_Batch"])[["sample_id"]].count()

(1134, 9)


Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
blood_draw_relative_to_tx,Analysis_Batch,Unnamed: 2_level_1
Post-Colonoscopy,PON,14
Post-Colonoscopy,alpha,2
Post-Colonoscopy,beta,1
Post-Colonoscopy,delta,55
Post-Colonoscopy,gamma,5
Post-Colonoscopy,unknown,40
Pre-Colonoscopy,DC1,531
Pre-Colonoscopy,alpha,129
Pre-Colonoscopy,beta,125
Pre-Colonoscopy,delta,112


### Paired pre/post colo samples

In [23]:
# number of olink patients with more than 2 samples
paired_olink_pids = all_olink_samples[['patient_id', 'sample_id']].drop_duplicates().patient_id.value_counts().pipe(lambda x: x[x>1])
paired_olink_pids
# the index is now the patient id

24580    2
25880    2
26410    2
23788    2
24287    2
26314    2
24956    2
26288    2
22362    2
26272    2
25251    2
24921    2
25245    2
26254    2
25012    2
25869    2
24909    2
26289    2
24919    2
26277    2
26864    2
23791    2
24924    2
25299    2
26471    2
24422    2
24669    2
22340    2
24705    2
25385    2
23753    2
25347    2
24388    2
24384    2
26429    2
26390    2
21730    2
23437    2
25789    2
26563    2
23756    2
26275    2
24923    2
22302    2
24067    2
24078    2
24077    2
25146    2
25145    2
25807    2
24068    2
25258    2
24066    2
25254    2
24925    2
25975    2
25248    2
25168    2
25061    2
24045    2
25180    2
25125    2
26047    2
25963    2
26469    2
24995    2
25846    2
25912    2
26255    2
25916    2
25907    2
25915    2
24955    2
25737    2
25100    2
24957    2
24961    2
25175    2
25970    2
25112    2
26162    2
25974    2
26432    2
25161    2
24393    2
24922    2
26833    2
26540    2
24608    2
24481    2
26528    2

In [24]:
# for patients with 2 samples, see distribution of patients between dc1 and dc2
df_paired_olinks = all_olink_samples[[x in paired_olink_pids for x in all_olink_samples["patient_id"]]]

print(df_paired_olinks.shape)
df_paired_olinks.head()

#a[['patient_id', 'Study']].drop_duplicates().query('patient_id.isin(@paired_olink_pids.index)')['Study'].value_counts(dropna=False)


(252, 9)


Unnamed: 0,assay,lims_batch,sample_id,patient_id,blood_draw_relative_to_tx,cancer_type,Study,Analysis_Batch,passed_latest_qc
7,targeted-panel,DC2_BG1_Olink_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,unknown,True
16,targeted-panel,DC2_BG1_Olink_1,23394,25603,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True
26,targeted-panel,DC2_BG1_Olink_1,21163,24435,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,unknown,True
35,targeted-panel,DC2_BG1_Olink_1,20983,24445,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True
45,targeted-panel,DC2_BG1_Olink_1,21655,24705,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True


In [26]:
#df_paired_olinks[['patient_id', 'blood_draw_relative_to_tx']].blood_draw_relative_to_tx.value_counts()

Pre-Colonoscopy     126
Post-Colonoscopy    126
Name: blood_draw_relative_to_tx, dtype: int64

In [25]:
# get resection status from lims

from lims_api import Client
c = Client("http://intercluster")
patient_ids = [int(x) for x in df_paired_olinks.patient_id]
filters = [dict(name='patient_id', op='in', val=patient_ids)]
procedures = list(c.search_collection('procedure', filters))

print(filters)



[{'name': 'patient_id', 'op': 'in', 'val': [24564, 25603, 24435, 24445, 24705, 24961, 25168, 25564, 25125, 24668, 24659, 24068, 25175, 25061, 25112, 25299, 24925, 25161, 25915, 25100, 24287, 24606, 25880, 25248, 26314, 26255, 24955, 25254, 25180, 25012, 25963, 24585, 25975, 25912, 24461, 24909, 24923, 24924, 26272, 24422, 25638, 24956, 25807, 26289, 25683, 26429, 24388, 25789, 25974, 24921, 24669, 25846, 25146, 26254, 24468, 25245, 24995, 24608, 24045, 25722, 25685, 26471, 26410, 25737, 26366, 24582, 25567, 24481, 24919, 25145, 24580, 25970, 24957, 25869, 26275, 24922, 25642, 25487, 26521, 26540, 26424, 25347, 25251, 26390, 25385, 25907, 26277, 24440, 26546, 26825, 24393, 26563, 25485, 25717, 26047, 26469, 26288, 26833, 25258, 26864, 26432, 26525, 26561, 24599, 26566, 25916, 26528, 25562, 26162, 24066, 23788, 24078, 22302, 23656, 23791, 24078, 24066, 23756, 22340, 23679, 23788, 23437, 23539, 22302, 24384, 21730, 24077, 23753, 24067, 22362, 24067, 23539, 22362, 23437, 24384, 23753, 2173

In [110]:
len(procedures)
procedures[0]

{'anaesthesia_type': None,
 'id': 185,
 'notes': "[('Colon polyp', 2, 'Cecum', 'Completely Resected'), ('Colon polyp', 15, 'Cecum', 'Completely Resected')]",
 'patient': None,
 'patient_id': 24045,
 'procedure_date': '2018-11-28',
 'procedure_name': 'Colonoscopy',
 'procedure_type': None,
 'tissue_disposition': 'Completely Resected',
 'treatment': None,
 'treatment_id': None}

In [26]:
df_procedures = pd.DataFrame(procedures)
#print(df_procedures.head())
df_procedures.shape

# so all patient_ids are assoc with a single tissue dispo
for i, df in df_procedures.groupby(["patient_id", "tissue_disposition"]):
    if len(df)>1:
        print(df)
        break

pid_to_tiss_dispo = dict(zip (df_procedures["patient_id"], df_procedures["tissue_disposition"].fillna("unknown")))
#pid_to_tiss_dispo
#print(df_procedures["tissue_disposition"].value_counts(dropna=False))



In [27]:
## THIS IS IMPORTANT, all the groupby and count functions will exclude where value == None
df_paired_olinks = df_paired_olinks.fillna("unknown")

In [28]:
# merge resection tissue disposition data with paired sample info

# df_paired_olinks = df_paired_olinks.merge(pd.DataFrame(procedures)[['patient_id', 'tissue_disposition']], on='patient_id') # it doesn't seem this is really working

df_paired_olinks["tissue_disposition"] = [pid_to_tiss_dispo[pid] for pid in df_paired_olinks["patient_id"]]

#print(df_paired_olinks.groupby(["patient_id","tissue_disposition"])[["sample_id"]].count())
#print(set(df_paired_olinks["tissue_disposition"]))

In [29]:
# for the Paired samples, replace all the tissue dispo values for pre-colo samples to 'pre-colo'

df_paired_olinks.groupby(["tissue_disposition"])[["sample_id"]].count()

# there should be 126 pre-colo tissue_dipsos and a total of 126 other dispos
new_tiss_dispo = [ "pre-colo" if colo_stat=="Pre-Colonoscopy" else tiss_disp for colo_stat,tiss_disp in zip(df_paired_olinks["blood_draw_relative_to_tx"],df_paired_olinks["tissue_disposition"])]
df_paired_olinks["tissue_disposition"] = new_tiss_dispo
#df_paired_olinks.head()
df_paired_olinks.groupby(["tissue_disposition"])[["sample_id"]].count()

# so we have 126 pre-resection (pre-colo) samples and 126 post-resection samples

Unnamed: 0_level_0,sample_id
tissue_disposition,Unnamed: 1_level_1
Biopsied,4
Completely Resected,59
pre-colo,126
unknown,63


## Write paired data set

In [30]:
df_paired_olinks = df_paired_olinks.fillna("unknown")
df_paired_olinks.to_csv(open("./df_paired_olinks.csv", "w"))

## Impute some ambig and unknown vals

In [14]:
# a subset of about 8 patients have different cancer type labels depending on if the sample was pre or post colo

for name, df in df_paired_olinks.groupby(["patient_id"]):
    if list(df.cancer_type)[0] != list(df.cancer_type)[1]:
        print(name)
        print(df["cancer_type"])

24078
730            Unclassifiable
2654    Colorectal Pre-Cancer
Name: cancer_type, dtype: object
24919
630             Unclassifiable
14846    Colorectal Pre-Cancer
Name: cancer_type, dtype: object
25100
185      Colorectal Normal / Healthy
12809          Colorectal Pre-Cancer
Name: cancer_type, dtype: object
25245
527          Colorectal Benign
14160    Colorectal Pre-Cancer
Name: cancer_type, dtype: object
25807
410         Colorectal Benign
9864    Colorectal Pre-Cancer
Name: cancer_type, dtype: object
25846
491          Colorectal Benign
10596    Colorectal Pre-Cancer
Name: cancer_type, dtype: object
26275
672          Colorectal Benign
15285    Colorectal Pre-Cancer
Name: cancer_type, dtype: object
26561
721      Colorectal Pre-Cancer
13615        Colorectal Benign
Name: cancer_type, dtype: object


In [31]:
# change the ambig cancer types to the post_colo types
df_post = df_paired_olinks[df_paired_olinks["blood_draw_relative_to_tx"]=="Post-Colonoscopy"]
pid_to_ctype = dict(zip( df_post["patient_id"], df_post["cancer_type"]))

corrected_types = [pid_to_ctype[pid] for pid in df_paired_olinks["patient_id"]]
df_paired_olinks_ctype_corrected = df_paired_olinks
df_paired_olinks_ctype_corrected["cancer_type"] = corrected_types

In [32]:
# now check there is an even number of samples for each cancer type
df_paired_olinks_ctype_corrected.cancer_type.value_counts()

Colorectal Normal / Healthy    104
Colorectal Benign               76
Colorectal Pre-Cancer           64
Unclassifiable                   8
Name: cancer_type, dtype: int64

In [33]:
# for paired samples, how many do we have in the resected and non-resected categories, in each disease type
df = df_paired_olinks_ctype_corrected[df_paired_olinks_ctype_corrected["tissue_disposition"]!="pre-colo"]
df.groupby(["tissue_disposition","cancer_type"])[["sample_id"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
tissue_disposition,cancer_type,Unnamed: 2_level_1
Biopsied,Colorectal Benign,1
Biopsied,Colorectal Normal / Healthy,1
Biopsied,Colorectal Pre-Cancer,1
Biopsied,Unclassifiable,1
Completely Resected,Colorectal Benign,25
Completely Resected,Colorectal Normal / Healthy,8
Completely Resected,Colorectal Pre-Cancer,24
Completely Resected,Unclassifiable,2
unknown,Colorectal Benign,12
unknown,Colorectal Normal / Healthy,43


In [34]:
df_paired_olinks_ctype_corrected.groupby(["Analysis_Batch", "tissue_disposition"])[["sample_id"]].count()
#print(set(df_paired_olinks_ctype_corrected["tissue_disposition"]))

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
Analysis_Batch,tissue_disposition,Unnamed: 2_level_1
DC1,pre-colo,17
,Completely Resected,4
PON,Completely Resected,2
PON,unknown,11
alpha,pre-colo,26
beta,pre-colo,26
delta,pre-colo,31
gamma,pre-colo,26
unknown,Biopsied,4
unknown,Completely Resected,53


## Pull sample metadata directly from Calzone

#### https://github.com/freenome/featureio/tree/master/calzone

In [35]:
from calzone import Calzone, CalzoneSampleMetadata

In [36]:
df_paired_olinks_ctype_corrected.head()

Unnamed: 0,assay,lims_batch,sample_id,patient_id,blood_draw_relative_to_tx,cancer_type,Study,Analysis_Batch,passed_latest_qc,tissue_disposition
7,targeted-panel,DC2_BG1_Olink_1,22808,24564,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,unknown,True,Completely Resected
16,targeted-panel,DC2_BG1_Olink_1,23394,25603,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True,unknown
26,targeted-panel,DC2_BG1_Olink_1,21163,24435,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,unknown,True,Completely Resected
35,targeted-panel,DC2_BG1_Olink_1,20983,24445,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True,Biopsied
45,targeted-panel,DC2_BG1_Olink_1,21655,24705,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True,Completely Resected


In [37]:
# get the dataset IDS for the sample ids we're interested in (Calzone only queries fromt these)

dsids = []
for sid in df_paired_olinks_ctype_corrected.sample_id:
    df = dataframe[dataframe["sample_id"]==sid]
    ids = df.loc[df["lims_batch"].str.contains("Olink"), "dataset_id"].values
    assert len(ids)==1
    dsids.append(ids[0])

df_paired_olinks_ctype_corrected["olink_dsid"] = dsids
print(len(dsids))

252


In [38]:
metadata = Calzone.create_dataset_metadata_with_ids(
    dsids,
    load_options=[CalzoneSampleMetadata])
print(len(metadata))

252


In [51]:
sids = [d.raw_sample_metadata["id"] for d in metadata]
path_type = [d.raw_sample_metadata["lims_top_diagnosis"]["pathologic_type"] for d in metadata]
path_sub_type = [d.raw_sample_metadata["lims_top_diagnosis"]["pathologic_sub_type"] for d in metadata]

In [55]:
df_paired_olinks_ctype_corrected["path_type"] = path_type
df_paired_olinks_ctype_corrected["path_sub_type"] = path_sub_type
print(Counter(df_paired_olinks_ctype_corrected["path_type"]))

# fill in the 'None' path_types
df_paired_olinks_ctype_corrected = df_paired_olinks_ctype_corrected.fillna("unknown")
print(Counter(df_paired_olinks_ctype_corrected["path_type"]))

Counter({'Negative by colonoscopy (no histopathology)': 79, 'NAA': 73, 'AA': 70, 'Negative by histopathology': 23, None: 7})
Counter({'Negative by colonoscopy (no histopathology)': 79, 'NAA': 73, 'AA': 70, 'Negative by histopathology': 23, 'unknown': 7})


In [None]:
from collections import Counter

In [57]:
# path_type changes pre/post colonoscopy for some samples
for name, df in df_paired_olinks_ctype_corrected.groupby(["patient_id"]):
    if list(df.path_type)[0] != list(df.path_type)[1]:
        print(f"Sample ID: {name}")
        print(df[["blood_draw_relative_to_tx", "cancer_type","path_type"]])

Sample ID: 24078
     blood_draw_relative_to_tx     cancer_type path_type
730           Post-Colonoscopy  Unclassifiable   unknown
2654           Pre-Colonoscopy  Unclassifiable        AA
Sample ID: 24919
      blood_draw_relative_to_tx     cancer_type path_type
630            Post-Colonoscopy  Unclassifiable   unknown
14846           Pre-Colonoscopy  Unclassifiable        AA
Sample ID: 25100
      blood_draw_relative_to_tx                  cancer_type  \
185            Post-Colonoscopy  Colorectal Normal / Healthy   
12809           Pre-Colonoscopy  Colorectal Normal / Healthy   

                        path_type  
185    Negative by histopathology  
12809                          AA  
Sample ID: 25245
      blood_draw_relative_to_tx        cancer_type path_type
527            Post-Colonoscopy  Colorectal Benign       NAA
14160           Pre-Colonoscopy  Colorectal Benign        AA
Sample ID: 25807
     blood_draw_relative_to_tx        cancer_type path_type
410           Post-Colonos

In [58]:
# change the ambig pathology types to the post_colo types

df_post = df_paired_olinks_ctype_corrected[df_paired_olinks_ctype_corrected["blood_draw_relative_to_tx"]=="Post-Colonoscopy"]
pid_to_path_type = dict(zip( df_post["patient_id"], df_post["path_type"]))

corrected_types = [pid_to_path_type[pid] for pid in df_paired_olinks_ctype_corrected["patient_id"]]
df_paired_olinks_corrected2 = df_paired_olinks_ctype_corrected
df_paired_olinks_corrected2["path_type"] = corrected_types

print(df_paired_olinks_corrected2.shape)
df_olink_paired_corrected2 = df_paired_olinks_corrected2.fillna("unknown")
df_paired_olinks_corrected2.groupby(["path_type","tissue_disposition"])[["sample_id"]].count()

(252, 13)


Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
path_type,tissue_disposition,Unnamed: 2_level_1
AA,Biopsied,1
AA,Completely Resected,24
AA,pre-colo,32
AA,unknown,7
NAA,Biopsied,1
NAA,Completely Resected,25
NAA,pre-colo,38
NAA,unknown,12
Negative by colonoscopy (no histopathology),pre-colo,40
Negative by colonoscopy (no histopathology),unknown,40


In [60]:
# there should be even numbers of each path type, 2 per patient
df_paired_olinks_corrected2.groupby(["path_type"])[["sample_id"]].count()

Unnamed: 0_level_0,sample_id
path_type,Unnamed: 1_level_1
AA,64
NAA,76
Negative by colonoscopy (no histopathology),80
Negative by histopathology,24
unknown,8


## Write imputed paired data

In [61]:
df_paired_olinks_corrected2.to_csv(open("./df_paired_olinks_pathol_corrected.csv","w"), index=False)

In [62]:
# how are patients (paired) distributed by cancer type and resection status
df = df_paired_olinks_corrected2[df_paired_olinks_corrected2["blood_draw_relative_to_tx"]=="Post-Colonoscopy"]
df.groupby(["path_type","tissue_disposition"])[["patient_id"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,patient_id
path_type,tissue_disposition,Unnamed: 2_level_1
AA,Biopsied,1
AA,Completely Resected,24
AA,unknown,7
NAA,Biopsied,1
NAA,Completely Resected,25
NAA,unknown,12
Negative by colonoscopy (no histopathology),unknown,40
Negative by histopathology,Biopsied,1
Negative by histopathology,Completely Resected,8
Negative by histopathology,unknown,3


In [None]:
# how many patients have both pre and post samples present within DC1 and DC2a? >> NONE

df_DC12a = df_paired_olinks_corrected[(df_paired_olinks_corrected["Analysis_Batch"]=="DC1") | (df_paired_olinks_corrected["Analysis_Batch"]=="alpha")]
print(df_DC12a.shape)

df_DC12a.groupby(["patient_id"])[["sample_id"]].count()
#df_paired_olinks_corrected.groupby(["Analysis_Batch", "path_type","tissue_disposition"])[["sample_id"]].count()

In [63]:

df_paired_olinks_corrected2.groupby(["tissue_disposition","Analysis_Batch"])[["sample_id"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
tissue_disposition,Analysis_Batch,Unnamed: 2_level_1
Biopsied,unknown,4
Completely Resected,,4
Completely Resected,PON,2
Completely Resected,unknown,53
pre-colo,DC1,17
pre-colo,alpha,26
pre-colo,beta,26
pre-colo,delta,31
pre-colo,gamma,26
unknown,PON,11


In [64]:
df_paired_olinks[df_paired_olinks["tissue_disposition"]=="Biopsied"]

Unnamed: 0,assay,lims_batch,sample_id,patient_id,blood_draw_relative_to_tx,cancer_type,Study,Analysis_Batch,passed_latest_qc,tissue_disposition,olink_dsid,path_type,path_sub_type
35,targeted-panel,DC2_BG1_Olink_1,20983,24445,Post-Colonoscopy,Colorectal Normal / Healthy,DC2 BG1,unknown,True,Biopsied,SD-000092C4,Negative by histopathology,
283,targeted-panel,DC2_BG1_Olink_4,21249,25012,Post-Colonoscopy,Colorectal Benign,DC2 BG1,unknown,True,Biopsied,SD-000093DE,NAA,NAA3.2
473,targeted-panel,DC2_BG1_Olink_6,21165,24921,Post-Colonoscopy,Colorectal Pre-Cancer,DC2 BG1,unknown,True,Biopsied,SD-00009476,AA,AA2.3
630,targeted-panel,DC2_BG1_Olink_8,21236,24919,Post-Colonoscopy,Unclassifiable,DC2 BG1,unknown,True,Biopsied,SD-0000954B,,
