In [None]:
import numpy as np
import pandas as pd

# --- Load inputs -------------------------------------------------------------
# Each table is de-duplicated on full-row equality right after it is read.
cw_pid = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__chipwell_participant.csv').drop_duplicates()
cw_groups = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__chipwell_groups.csv').drop_duplicates()
cw_fp = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__fingerprinting.csv').drop_duplicates()

# The fingerprinting participant_id also includes sample-id information.
# Keep the untouched value in participant_sample_id, then reduce participant_id
# by removing a leading "NBB_" prefix and, after that, everything from the
# first remaining "_" to the end of the string.
cw_fp['participant_sample_id'] = cw_fp['participant_id']
cw_fp['participant_id'] = (
    cw_fp['participant_id']
    .str.replace('^NBB_', '', regex=True)
    .str.replace('_.*$', '', regex=True)
)

manifest = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__terra_manifest.csv').drop_duplicates()
calico_participant_metadata = pd.read_csv('~/sc-online/notebook_data/calico_participant_metadata.csv').drop_duplicates()
gtex_participant_metadata = pd.read_csv('~/sc-online/notebook_data/gtex_participant_metadata.csv').drop_duplicates()

# Drop a trailing "_1" from the barcode columns of cw_groups and manifest
# (the later merges key on chip_well_barcode).
cw_groups['chip_well_barcode'] = cw_groups['chip_well_barcode'].str.replace('_1$', '', regex=True)
cw_groups['Representative'] = cw_groups['Representative'].str.replace('_1$', '', regex=True)
manifest['chip_well_barcode'] = manifest['chip_well_barcode'].str.replace('_1$', '', regex=True)

# Sanity check of the loaded table sizes. This is printed explicitly because a
# bare expression that is not the last statement of a cell is never displayed.
print((
    cw_pid.shape,
    cw_groups.shape,
    cw_fp.shape,
    manifest.shape,
    calico_participant_metadata.shape,
    gtex_participant_metadata.shape,
))

# Rename the Calico metadata columns to the names used by the rest of the notebook.
calico_participant_metadata = calico_participant_metadata.rename(columns={
    'Collaborator Participant ID': 'participant_id',
    'Age': 'age',
    "Gender": "sex",
    "Primary Disease": "case_control"
})

# Collapse case_control to two labels: "Control" becomes 'ctr', and every value
# that is not 'ctr' afterwards (missing values included) becomes 'pd'.
# The writes go through .loc on the frame itself. The chained form
# `frame.col[mask] = value` assigns into an intermediate Series, which triggers
# chained-assignment warnings and does not update the frame under pandas
# Copy-on-Write.
calico_participant_metadata.loc[
    calico_participant_metadata['case_control'] == "Control", 'case_control'
] = 'ctr'
calico_participant_metadata.loc[
    calico_participant_metadata['case_control'] != "ctr", 'case_control'
] = 'pd'

# Rename the GTEx metadata columns to the names used by the rest of the notebook.
gtex_participant_metadata = gtex_participant_metadata.rename(columns={
    "SUBJID": "participant_id",
    "SEX": "sex",
    "AGE": "age"
})

# Every GTEx participant is labelled 'ctr' except the ids listed in gtex_cases,
# which are labelled 'pd'.
gtex_cases = ["GTEX-14PQA", "GTEX-1HBPH", "GTEX-1IDJV"]
gtex_participant_metadata['case_control'] = 'ctr'
# .loc + isin replaces the former chained assignment
# (`frame['col'][mask] = value`), which does not write back to the frame under
# pandas Copy-on-Write.
gtex_participant_metadata.loc[
    gtex_participant_metadata['participant_id'].isin(gtex_cases), 'case_control'
] = 'pd'

# Recode sex: 1 -> 'Male', 2 -> 'Female'; any other value is left unchanged.
# replace() builds a new column instead of writing strings into a numeric one
# element by element.
gtex_participant_metadata['sex'] = gtex_participant_metadata['sex'].replace(
    {1: 'Male', 2: 'Female'}
)

# Columns kept from both cohorts; all other columns are dropped before stacking.
participant_metadata_cols = ['participant_id', 'age', 'sex', 'case_control']
gtex_participant_metadata = gtex_participant_metadata.loc[:, participant_metadata_cols]
calico_participant_metadata = calico_participant_metadata.loc[:, participant_metadata_cols]

# Stack Calico on top of GTEx, discard rows without a participant_id, remove
# exact duplicate rows, and finally drop every row whose sex is "Unknown".
# NOTE(review): the last step removes all "Unknown" rows, not only those that
# duplicate another row for the same participant - confirm this is intended.
participant_metadata = (
    pd.concat([calico_participant_metadata, gtex_participant_metadata])
    .dropna(subset=['participant_id'])
    .drop_duplicates()
    .loc[lambda stacked: stacked['sex'] != "Unknown"]
)



In [None]:
# Build the barcode map: keep manifest rows whose chip_well_barcode is present
# in cw_groups (inner join), then attach fingerprinting columns where available
# (left join). Overlapping column names get the suffixes "_og" / "_fp".
cw_map = (
    manifest
    .merge(cw_groups, how='inner', on='chip_well_barcode')
    .merge(cw_fp, how='left', on='chip_well_barcode', suffixes=('_og', '_fp'))
)
# Printed explicitly: a bare `cw_map.shape` in the middle of a cell is not displayed.
print(cw_map.shape)

# Rows whose chip_well_barcode appears more than once in cw_map.
# Author's observation: each duplicated barcode maps to the same participant as
# its duplicates, with slightly different ids corresponding to different
# anatomical samples.
dupes = cw_map[cw_map.duplicated(subset='chip_well_barcode', keep=False)]
print(dupes)

# Keep only the first row for each chip_well_barcode.
cw_map = cw_map.drop_duplicates(subset=['chip_well_barcode'])

In [None]:
# Give the id columns names that say which source they came from.
cw_map = cw_map.rename(
    columns={
        "collaborator_participant_id": "manifest_participant_id",
        "participant_id": "fingerprinting_participant_id",
        "Representative": "chip_well_barcode_in_whitelist"})

# Coalesce the participant id: use the fingerprinting id when present and fall
# back to the manifest id otherwise (the value stays missing if both are).
# fillna replaces the former chained assignment (`frame["col"][mask] = ...`),
# which does not write back to the frame under pandas Copy-on-Write.
cw_map["coalesced_participant_id"] = cw_map["fingerprinting_participant_id"].fillna(
    cw_map["manifest_participant_id"]
)


In [None]:
# Attach the participant_metadata columns to cw_map with a left join keyed on
# the coalesced participant id. The shape is printed before and after so that
# any change in the row count caused by the join is visible.
print(cw_map.shape)
cw_map = pd.merge(
    cw_map,
    participant_metadata,
    left_on='coalesced_participant_id',
    right_on='participant_id',
    how='left',
)
print(cw_map.shape)

In [None]:
# Finally, make chip_well_barcode and chip_well_barcode_in_whitelist end in
# "_1" to match what's on the vireo donor list.
# Any existing trailing "_1" is stripped first, so re-running this cell does
# not turn the suffix into "_1_1".
cw_map['chip_well_barcode_in_whitelist'] = (
    cw_map['chip_well_barcode_in_whitelist'].str.replace('_1$', '', regex=True) + '_1'
)
cw_map['chip_well_barcode'] = (
    cw_map['chip_well_barcode'].str.replace('_1$', '', regex=True) + '_1'
)

In [None]:
# Spot check: show every row whose whitelist barcode is this one value.
cw_map.loc[cw_map['chip_well_barcode_in_whitelist'] == "206954930011_R11C01_1"]

In [None]:
# Count the distinct participant_id values for each whitelist barcode and keep
# only the barcodes that have more than one.
# NOTE(review): nunique() ignores missing values, so rows that did not match
# participant_metadata (missing participant_id) are not counted - confirm that
# checking coalesced_participant_id is not what was intended.
df_with_multiple_participants = (
    cw_map
    .groupby('chip_well_barcode_in_whitelist')['participant_id']
    .nunique()
    .loc[lambda n_participants: n_participants > 1]
)
df_with_multiple_participants

In [None]:
# Persist the final barcode -> participant table. The row index is written as
# the first column (pandas' default), spelled out here to make that visible.
cw_map.to_csv(
    path_or_buf='~/sc-online/notebook_data/chip_well_barcode-to-participant_id__fp_coalesced.csv',
    index=True,
)