In [1]:
import numpy as np
import pandas as pd

# cw_groups maps chip well barcodes to those used in the all-donor whitelist
cw_groups = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__chipwell_groups.csv').drop_duplicates()
cw_groups['chip_well_barcode'] = cw_groups['chip_well_barcode'].str.replace('_1$', '', regex=True)
cw_groups['Representative'] = cw_groups['Representative'].str.replace('_1$', '', regex=True)

# cw_fp maps chip well barcodes to participant ids derived from fingerprinting
# participant_id of fingerprinting is includes sample id info. Strip this
cw_fp = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__fingerprinting.csv').drop_duplicates()
cw_fp['participant_sample_id'] = cw_fp['participant_id']
# remove ^NBB_ prefixes from participant_id, then remove _.*$ suffixes
cw_fp['participant_id'] = cw_fp['participant_id'].str.replace('^NBB_', '', regex=True)
cw_fp['participant_id'] = cw_fp['participant_id'].str.replace('_.*$', '', regex=True)

# manifest was our original mapping of chip well barcodes to participant ids (can be overrruled by fingerprinting)
manifest = pd.read_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__terra_manifest.csv').drop_duplicates()
manifest['chip_well_barcode'] = manifest['chip_well_barcode'].str.replace('_1$', '', regex=True)



In [17]:
# participant metadata maps participant id to age, sex, case-control status, etc.
calico_participant_metadata = pd.read_csv('~/sc-online/notebook_data/calico_participant_metadata.csv').drop_duplicates()
gtex_participant_metadata = pd.read_csv('~/sc-online/notebook_data/gtex_participant_metadata.csv').drop_duplicates()


# rename some columns + restructure case_control data format
calico_participant_metadata = calico_participant_metadata.rename(columns={
    'Collaborator Participant ID': 'participant_id',
    'Age': 'age',
    "Gender": "sex", 
    "Primary Disease": "case_control"
})
calico_participant_metadata.case_control[calico_participant_metadata.case_control=="Control"] = 'ctr'
calico_participant_metadata.case_control[calico_participant_metadata.case_control!="ctr"] = 'pd'

gtex_participant_metadata = gtex_participant_metadata.rename(columns={
    "SUBJID": "participant_id",
    "SEX": "sex",
    "AGE": "age"
})

# all GTEX are ctr except for these cases
gtex_cases = ["GTEX-14PQA", "GTEX-1HBPH", "GTEX-1IDJV"]
gtex_participant_metadata['case_control'] = 'ctr'
gtex_participant_metadata['case_control'][[(i in gtex_cases) for i in gtex_participant_metadata.participant_id]] = 'pd'

gtex_participant_metadata.sex[gtex_participant_metadata.sex == 1] = 'Male'
gtex_participant_metadata.sex[gtex_participant_metadata.sex == 2] = 'Female'

participant_metadata_cols = [
    'participant_id',
    'age',
    'sex',
    'case_control'
]
gtex_participant_metadata = gtex_participant_metadata[participant_metadata_cols]
calico_participant_metadata = calico_participant_metadata[participant_metadata_cols]
participant_metadata = pd.concat([calico_participant_metadata, gtex_participant_metadata])

# strip spaces from participant_id
participant_metadata['participant_id'] = participant_metadata['participant_id'].str.strip()

# remove duplicated rows where and sex is labeled "Unknown"
participant_metadata = participant_metadata.dropna(subset=['participant_id']).drop_duplicates()
participant_metadata = participant_metadata[participant_metadata.sex != "Unknown"]

print(participant_metadata.head())
participant_metadata.to_csv("~/sc-online/notebook_data/participant_metadata.csv")

  participant_id   age     sex case_control
0       1995-078  80.0  Female          ctr
1       1997-115  89.0  Female          ctr
2       1998-016  82.0  Female          ctr
3       1998-051  94.0  Female          ctr
4       2002-086  74.0  Female          ctr


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calico_participant_metadata.case_control[calico_participant_metadata.case_control=="Control"] = 'ctr'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calico_participant_metadata.case_control[calico_participant_metadata.case_control!="ctr"] = 'pd'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gtex_participant_metadata['case_control'][[(i in gtex_cases) for i in gtex_participant_metadata.participant_id]] = 'pd'
A value is trying to be set on a copy of a slice from a D

In [3]:
cw_groups.head()

Unnamed: 0,chip_well_barcode,Representative
0,206954930010_R04C01,206954930093_R05C02
1,206954930093_R05C02,206954930093_R05C02
2,207762950055_R07C02,206954930093_R05C02
3,206954930060_R02C01,207762950086_R04C01
4,207762950086_R04C01,207762950086_R04C01


In [4]:
cw_map = (manifest
    .merge(cw_groups, how='inner', on='chip_well_barcode') # for every chip_well_barcode, get the Representative barcode in the whitelist
    .merge(cw_fp, how='left', on='chip_well_barcode', suffixes=('_og', '_fp')) # for every chip_well_barcode, get the participant_id from fingerprinting if it exists
)
print(cw_map.shape)

# find chip_well_barcode values appearing more than once in cw_map
# looks like each duped barcode has the same participant as their dupes,
# with slightly different ids corresponding to different anatomical samples
dupes = cw_map[cw_map.duplicated(subset='chip_well_barcode', keep=False)]
print(dupes)

# drop duplicate rows of chip_well_barcode
cw_map = cw_map.drop_duplicates(subset=['chip_well_barcode'])

print(cw_map.shape)

(125, 5)
       chip_well_barcode collaborator_participant_id       Representative  \
7    206954930010_R05C02                      PDC091  206954930010_R05C02   
8    206954930010_R05C02                      PDC091  206954930010_R05C02   
20   206954930011_R04C01                      PD0976  206954930011_R04C01   
21   206954930011_R04C01                      PD0976  206954930011_R04C01   
23   206954930011_R05C01                      PD0833  206954930011_R05C01   
24   206954930011_R05C01                      PD0833  206954930011_R05C01   
67   206954930093_R07C01                      PD0906  206954930093_R07C01   
68   206954930093_R07C01                      PD0906  206954930093_R07C01   
102  207762950108_R02C01                      PDC094  207762950108_R02C01   
103  207762950108_R02C01                      PDC094  207762950108_R02C01   

    participant_id participant_sample_id  
7           PDC110           PDC110_Pons  
8           PDC110             PDC110_SN  
20          PD

In [5]:
cw_map = cw_map.rename(
    columns={
        "collaborator_participant_id": "manifest_participant_id",
        "participant_id": "fingerprinting_participant_id",
        "Representative": "chip_well_barcode_in_whitelist"})

#coalesce participant_id from fingerprinting, then the original manifest
cw_map["coalesced_participant_id"] = cw_map.fingerprinting_participant_id 
cw_map["coalesced_participant_id"][cw_map.fingerprinting_participant_id.isna()] = cw_map.manifest_participant_id[cw_map.fingerprinting_participant_id.isna()]

# Now, for every chip_well barcode, we have (a) the chip_well_barcode that appears in the whitelist, and (b) the coalesced participant_id

# remove samples without a coalesced_participant_id
cw_map = cw_map.dropna(subset=['coalesced_participant_id'])
print(cw_map.shape)



(120, 6)


In [6]:
cw_map.head()

Unnamed: 0,chip_well_barcode,manifest_participant_id,chip_well_barcode_in_whitelist,fingerprinting_participant_id,participant_sample_id,coalesced_participant_id
0,206954930010_R01C01,PDC090,206954930010_R01C01,PDC090,PDC090_SN,PDC090
1,206954930010_R01C02,PD0717,206954930010_R01C02,,,PD0717
2,206954930010_R02C01,PDC139,206954930010_R02C01,PDC139,PDC139_SN,PDC139
3,206954930010_R03C01,PD0730,206954930010_R03C01,PD0730,PD0730_SN,PD0730
4,206954930010_R03C02,PD0940,206954930010_R03C02,PD0940,PD0940_SN,PD0940


In [7]:
# now join with participant_metadata using the coalesced participant_id
cw_map = cw_map.merge(
    participant_metadata, 
    how='left', 
    left_on='coalesced_participant_id',
    right_on='participant_id')
print(cw_map.shape)
cw_map.head()

(120, 10)


Unnamed: 0,chip_well_barcode,manifest_participant_id,chip_well_barcode_in_whitelist,fingerprinting_participant_id,participant_sample_id,coalesced_participant_id,participant_id,age,sex,case_control
0,206954930010_R01C01,PDC090,206954930010_R01C01,PDC090,PDC090_SN,PDC090,PDC090,83.0,Male,ctr
1,206954930010_R01C02,PD0717,206954930010_R01C02,,,PD0717,PD0717,68.0,Female,pd
2,206954930010_R02C01,PDC139,206954930010_R02C01,PDC139,PDC139_SN,PDC139,PDC139,90.0,Male,ctr
3,206954930010_R03C01,PD0730,206954930010_R03C01,PD0730,PD0730_SN,PD0730,PD0730,77.0,Female,pd
4,206954930010_R03C02,PD0940,206954930010_R03C02,PD0940,PD0940_SN,PD0940,PD0940,81.0,Female,pd


In [10]:
#Finally, add _1 to chip_well_barcode, chip_well_barcode_in_whitelist to match what's on the vireo donor list
cw_map['chip_well_barcode_in_whitelist'] = cw_map['chip_well_barcode_in_whitelist'] + '_1'
cw_map['chip_well_barcode'] = cw_map['chip_well_barcode'] + '_1'

In [12]:
# TODO
# one chip_well_barcode in the whitelist has duplicated participant ids (awaiting fingerprinting)
cw_map[cw_map.chip_well_barcode_in_whitelist == "206954930011_R11C01_1"]

Unnamed: 0,chip_well_barcode,manifest_participant_id,chip_well_barcode_in_whitelist,fingerprinting_participant_id,participant_sample_id,coalesced_participant_id,participant_id,age,sex,case_control
13,206954930010_R11C01_1,PD0927,206954930011_R11C01_1,,,PD0927,PD0927,78.0,Male,pd
32,206954930011_R11C01_1,PD0901,206954930011_R11C01_1,,,PD0901,PD0901,86.0,Male,pd


In [14]:
# find chip_well_barcode with more than one participant_id
df_with_multiple_participants = cw_map.groupby('chip_well_barcode')['coalesced_participant_id'].nunique()
any(df_with_multiple_participants>1)

False

In [20]:
cw_map.shape

(120, 10)

In [None]:
# add in GTEX participant metadata

In [32]:
gtex_participant_metadata["chip_well_barcode"] = gtex_participant_metadata.participant_id
gtex_participant_metadata["chip_well_barcode_in_whitelist"] = gtex_participant_metadata.participant_id
gtex_participant_metadata["coalesced_participant_id"] = gtex_participant_metadata.participant_id
gtex_participant_metadata["participant_sample_id"] = gtex_participant_metadata.participant_id
gtex_participant_metadata["fingerprinting_participant_id"] = pd.NA 
gtex_participant_metadata["manifest_participant_id"] = gtex_participant_metadata.participant_id

gtex_participant_metadata = gtex_participant_metadata[cw_map.columns]

In [36]:
cw_map = pd.concat([cw_map, gtex_participant_metadata])

In [None]:
cw_map.to_csv('~/sc-online/notebook_data/chip_well_barcode-to-participant_id__fp_coalesced.csv')