# Create Pipeline Metadata

## Purpose

* Load & Engineer File + Clinical Metadata 
* Output File Metadata
* Assign Cell-Type Markers 
* Output Panel Metadata

## Packages and Options

In [1]:
import glob
import os

import pandas as pd
from numpy import nan

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one
# (several cells below end with two expressions and rely on both being displayed).
InteractiveShell.ast_node_interactivity = "all"

## Load & Engineer Metadata

In [2]:
# Clinical metadata table; the preview below shows the Sample, CAD and Stim columns.
metadata = pd.read_csv("../../../metadata/metadata.csv")
metadata.head()

Unnamed: 0,Sample,CAD,Stim
0,S121_stim,LO,stim
1,S121_us,LO,us
2,S125_stim,HI,stim
3,S125_us,HI,us
4,S127_stim,LO,stim


In [3]:
# Reduce the per-sample table to one row per patient.
# "S121_stim" -> "121": drop everything after the underscore, then the leading "S".
sample_ids = metadata["Sample"].str.split("_").str[0].str.split("S").str[1]
metadata["Sample"] = sample_ids
# Keep only patient ID and CAD group, collapse the repeated rows per patient,
# and index by patient ID so the table can be joined on it later.
metadata = metadata[["Sample", "CAD"]].drop_duplicates().set_index("Sample")

metadata

Unnamed: 0_level_0,CAD
Sample,Unnamed: 1_level_1
121,LO
125,HI
127,LO
143,HI
160,LO
168,LO
181,LO
321,HI
324,HI
330,LO


In [4]:
# Collect the FCS files written by the CSV-to-FCS step, keep only the file
# names (no directory part), and sort them so the order is reproducible.
# NOTE(review): absolute, user-specific path -- consider moving it to a config cell.
FCS_DIR = "/home/jve4pt/B-Cells-In-Atherosclerosis/analysis/1_CSV_to_FCS/output"
# Match only *.fcs so stray files in the output directory cannot leak into the
# metadata; later cells assume every entry ends in ".fcs".
files = sorted(os.path.basename(path) for path in glob.glob(os.path.join(FCS_DIR, "*.fcs")))

len(files)
files

20

['export_DB__121 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__125 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__127 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__143 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__160 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__168 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__181 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__321 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__324 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__330 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__331 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__334 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__336 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__349 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__351 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__359 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__372 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__375 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__394 us APC_CD45+_CD14-CD16-.fcs',
 'export_DB__413 us APC_CD45+_CD14-CD16-.fcs']

## Engineer Pipeline Metadata File

In [5]:
# Start the pipeline metadata file as an empty frame that fixes its two initial columns.
pipeline_columns = ["file_name", "patient_id"]
pipeline_metadata = pd.DataFrame(columns=pipeline_columns)
pipeline_metadata

Unnamed: 0,file_name,patient_id


In [6]:
# Fill the "file_name" column with the discovered FCS file names (one row per file);
# "patient_id" stays empty until it is parsed from the file name.
pipeline_metadata = pipeline_metadata.assign(file_name=files)
pipeline_metadata.head()

Unnamed: 0,file_name,patient_id
0,export_DB__121 us APC_CD45+_CD14-CD16-.fcs,
1,export_DB__125 us APC_CD45+_CD14-CD16-.fcs,
2,export_DB__127 us APC_CD45+_CD14-CD16-.fcs,
3,export_DB__143 us APC_CD45+_CD14-CD16-.fcs,
4,export_DB__160 us APC_CD45+_CD14-CD16-.fcs,


In [7]:
# Pull the patient ID out of the file name: "export_DB__121 us ..." -> "121".
# The ID is kept as a string of digits, which is also how the clinical metadata index is built.
after_prefix = pipeline_metadata["file_name"].str.split("__").str[1]
pipeline_metadata["patient_id"] = after_prefix.str.split(" ").str[0]
# Index by patient ID so the clinical metadata can be joined on it.
pipeline_metadata = pipeline_metadata.set_index("patient_id")

pipeline_metadata.head()

Unnamed: 0_level_0,file_name
patient_id,Unnamed: 1_level_1
121,export_DB__121 us APC_CD45+_CD14-CD16-.fcs
125,export_DB__125 us APC_CD45+_CD14-CD16-.fcs
127,export_DB__127 us APC_CD45+_CD14-CD16-.fcs
143,export_DB__143 us APC_CD45+_CD14-CD16-.fcs
160,export_DB__160 us APC_CD45+_CD14-CD16-.fcs


In [8]:
# Left join on the patient ID index: every file row is kept, and files whose
# patient has no clinical record get NaN in the CAD column.
pipeline_metadata = pipeline_metadata.join(metadata, how="left")

pipeline_metadata.head()

Unnamed: 0_level_0,file_name,CAD
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
121,export_DB__121 us APC_CD45+_CD14-CD16-.fcs,LO
125,export_DB__125 us APC_CD45+_CD14-CD16-.fcs,HI
127,export_DB__127 us APC_CD45+_CD14-CD16-.fcs,LO
143,export_DB__143 us APC_CD45+_CD14-CD16-.fcs,HI
160,export_DB__160 us APC_CD45+_CD14-CD16-.fcs,LO


In [9]:
# Rename the clinical column to the name used in the pipeline metadata file.
pipeline_metadata = pipeline_metadata.rename(columns={"CAD": "condition"})
# Recode only the condition column (LO -> CTRL, HI -> CASE); a frame-wide
# replace would also rewrite any other cell that happened to equal "LO" or "HI".
pipeline_metadata["condition"] = pipeline_metadata["condition"].replace({"LO": "CTRL", "HI": "CASE"})
# Move patient_id back to a regular column and fix the output column order.
pipeline_metadata = pipeline_metadata.reset_index()[["file_name", "condition", "patient_id"]]

# The clinical metadata was left-joined, so a file without a clinical record has
# NaN as its condition. Report those instead of letting them pass silently.
missing_condition = pipeline_metadata["condition"].isna()
if missing_condition.any():
    print("No clinical metadata for patient_id(s): {}".format(
        ", ".join(pipeline_metadata.loc[missing_condition, "patient_id"].astype(str))))

pipeline_metadata.head()

Unnamed: 0,file_name,condition,patient_id
0,export_DB__121 us APC_CD45+_CD14-CD16-.fcs,CTRL,121
1,export_DB__125 us APC_CD45+_CD14-CD16-.fcs,CASE,125
2,export_DB__127 us APC_CD45+_CD14-CD16-.fcs,CTRL,127
3,export_DB__143 us APC_CD45+_CD14-CD16-.fcs,CASE,143
4,export_DB__160 us APC_CD45+_CD14-CD16-.fcs,CTRL,160


In [10]:
# Write the pipeline metadata table to disk without the row index.
metafile_path = "../output/metafile_B_cell.csv"
pipeline_metadata.to_csv(metafile_path, index=False)

## Feature Validation

#### Validate that the same features are in all files

In [11]:
# Use the CSV export that corresponds to the first FCS file as the reference
# for the expected column names.
reference_csv = "/home/jve4pt/Aditi_APC_Panel/APC_Bcells_CD19_CSV/{}".format(files[0].replace(".fcs", ".csv"))
with open(reference_csv) as in_file:
    # only the header line is needed
    header_line = in_file.readline()

# Drop the newline and any quote characters, then split the header on commas.
for unwanted in ("\n", "'", '"'):
    header_line = header_line.replace(unwanted, "")
reference_features = sorted(header_line.split(","))

len(reference_features)
reference_features

66

['102Pd',
 '104Pd',
 '105Pd',
 '106Pd',
 '108Pd',
 '110Pd',
 '113In',
 '127I',
 '131Xe',
 '138Ba',
 '140Ce',
 '141Pr_CD70',
 '142Ce',
 '142Nd_CD19',
 '143Nd_CD123',
 '144Nd_CD11b',
 '145Nd',
 '146Nd_IgD',
 '147Sm_CD11c',
 '148Nd_PD-L1',
 '149Sm_CD200',
 '150Nd_CD43',
 '151Eu_CD14',
 '152Sm_CD95',
 '153Eu_TIM-3',
 '154Sm',
 '155Gd_BAFFR',
 '156Gd_CD86',
 '158Gd_CD137L',
 '159Tb_CD22',
 '160Dy',
 '160Gd_CD200R',
 '161Dy',
 '162Dy_CD80',
 '163Dy_CD95L',
 '164Dy_GITRL',
 '165Ho_CD40',
 '166Er_SIGLEC_10',
 '167Er_CD27',
 '168Er_OX40L',
 '169Tm_CD24',
 '170Er_CD3',
 '171Yb_CD20',
 '172Yb_IgM',
 '173Yb_CD137',
 '174Yb_HLA-DR',
 '175Lu_PD-1',
 '176Lu',
 '176Yb_CD56',
 '190BCKG',
 '191Ir',
 '193Ir',
 '194Pt',
 '195Pt',
 '198Pt',
 '208Pb',
 '209Bi_CD16',
 '89Y_CD45',
 'Center',
 'Event_length',
 'Offset',
 'Residual',
 'Time',
 'bc_separation_dist',
 'beadDist',
 'mahalanobis_dist']

In [12]:
# Check that every file's CSV export has the same set of column names as the reference.

# Build the reference set once instead of on every iteration.
reference_set = set(reference_features)
misaligned_files = []

for file in files:
    with open("/home/jve4pt/Aditi_APC_Panel/APC_Bcells_CD19_CSV/{}".format(file.replace(".fcs", ".csv"))) as in_file:
        # Only the header line is read: strip the newline and quote characters, split on commas.
        features = sorted(in_file.readline().replace("\n","").replace("'", "").replace('"', "").split(","))

    # Set comparison: column order and repeated names are ignored on purpose.
    feature_set = set(features)
    if feature_set != reference_set:
        misaligned_files.append(file)
        print("Features misaligned for file: {}".format(file))
        # Say which columns differ so the mismatch can be acted on.
        print("    missing: {}".format(sorted(reference_set - feature_set)))
        print("    unexpected: {}".format(sorted(feature_set - reference_set)))

# An empty output cell is indistinguishable from a cell that never ran,
# so confirm success explicitly.
if not misaligned_files:
    print("All {} files match the reference feature set ({} features)".format(len(files), len(reference_set)))


## Engineer Panel File

In [13]:
# Start the panel file as an empty frame that fixes its two initial columns.
panel_columns = ["fcs_colname", "antigen"]
panel = pd.DataFrame(columns=panel_columns)
panel

Unnamed: 0,fcs_colname,antigen


In [14]:
# Keep only the metal channels: their names start with a digit (e.g. "142Nd_CD19"),
# while the other columns ("Time", "Center", ...) do not.
# feature[:1] instead of feature[0]: an empty column name (trailing comma or empty
# header line) is skipped rather than raising IndexError.
panel["fcs_colname"] = [feature for feature in reference_features if feature[:1].isdigit()]
# The antigen label starts out identical to the channel name.
panel["antigen"] = panel["fcs_colname"]
# Index by channel name so the marker classes can be joined on it.
panel = panel.set_index("fcs_colname")

panel.head()

Unnamed: 0_level_0,antigen
fcs_colname,Unnamed: 1_level_1
102Pd,102Pd
104Pd,104Pd
105Pd,105Pd
106Pd,106Pd
108Pd,108Pd


#### Assign Markers

In [15]:
# Load the marker list from oom_markers.txt (read without a header row),
# tag every listed channel as a "type" marker, and index by the channel
# name (column 0) so it can be joined onto the panel.
markers = (
    pd.read_table("../../../metadata/oom_markers.txt", header=None)
    .assign(marker_class="type")
    .set_index(0)
)

markers.head()

Unnamed: 0_level_0,marker_class
0,Unnamed: 1_level_1
162Dy_CD80,type
166Er_SIGLEC_10,type
168Er_OX40L,type
155Gd_BAFFR,type
156Gd_CD86,type


#### Make Final Table

In [16]:
# Attach the marker class to every channel (left join on the channel name index).
panel_with_class = panel.join(markers, how="left")
# Turn the index back into the fcs_colname column and label channels that are
# not in the marker list with the string "none" instead of NaN.
panel = panel_with_class.reset_index().replace(nan, "none")

panel

Unnamed: 0,fcs_colname,antigen,marker_class
0,102Pd,102Pd,none
1,104Pd,104Pd,none
2,105Pd,105Pd,none
3,106Pd,106Pd,none
4,108Pd,108Pd,none
5,110Pd,110Pd,none
6,113In,113In,none
7,127I,127I,none
8,131Xe,131Xe,none
9,138Ba,138Ba,none


In [17]:
# Write the panel table to disk without the row index.
panelfile_path = "../output/panelfile_B_cell.csv"
panel.to_csv(panelfile_path, index=False)