In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import seaborn as sns
import anndata
import scanpy as sc
import genetools
from covid_serology import config

# Load data

In [2]:
# Participant demographics table; rows that are entirely empty are discarded.
vaccine_participants = pd.read_csv(f"{config.paths.data_dir}/pfizer_demographics.csv")
vaccine_participants = vaccine_participants.dropna(how="all")
vaccine_participants

Unnamed: 0,PID,COVID Positive Ever?
0,pfizer00,No
1,pfizer01,No
2,pfizer02,No
3,pfizer03,No
4,pfizer04,No
5,pfizer05,No
6,pfizer06,No
7,pfizer07,No
8,pfizer08,No
9,pfizer09,No


In [3]:
# Serology measurements in wide format: identifier columns (study_id, tp, plate)
# followed by one column per assay readout, including replicate columns
# (e.g. "CoV2_S_IgG", "CoV2_S_IgG.1") and their "*_Mean" columns.
vaccine_df = pd.read_csv(f"{config.paths.data_dir}/pfizer_data_final_ed_final.csv")
vaccine_df

Unnamed: 0,study_id,tp,plate,CoV2_S_IgG,CoV2_S_IgG.1,CoV2_S_IgG_Mean,NL63_S_IgG,NL63_S_IgG.1,NL63_S_IgG_Mean,CoV2_N_IgG,...,P9_RBD_ACE_B.1.429_percent,P9_RBD_ACE_B.1.351_percent,P9_RBD_ACE_B.1.243_percent,P9_RBD_ACE_P.1_percent,P9_RBD_ACE_B.1.526.2_percent,P9_RBD_ACE_B.1.1.7_percent,P9_RBD_ACE_B.1.1.7_E484_percent,P9_RBD_ACE_B.1.617_percent,P9_RBD_ACE_B.1.214_percent,P9_CoV2 _RBD_ACE_percent
0,pfizer00,D0,1,698.328,667.418,682.8730,6913.225,6517.880,6715.5525,881.258,...,13.725107,-77.090366,-11.747820,-4.809098,2.388070,4.780092,-4.851285,-3.914414,-3.179730,12.738288
1,pfizer01,D0,1,34.175,34.416,34.2955,707.973,694.801,701.3870,109.893,...,9.695624,-57.853730,-22.009318,-6.278746,0.269049,2.957650,-10.803043,-9.749725,-6.754975,0.127979
2,pfizer02,D0,1,55.502,59.408,57.4550,1475.624,1483.167,1479.3955,92.850,...,6.906935,-90.757272,-22.336469,-10.862575,-5.499141,-0.081304,-22.540702,-10.965084,-13.285669,-1.653736
3,pfizer03,D0,1,22.215,29.853,26.0340,4315.792,4523.093,4419.4425,36.477,...,4.808532,-67.201807,-25.932013,-10.461341,1.089484,0.636182,-17.882878,-14.317886,-6.529133,-3.052361
4,pfizer04,D0,1,17.951,23.404,20.6775,972.941,1039.918,1006.4295,23.440,...,10.327609,-71.728128,-16.488504,-3.533596,-3.646481,-2.752426,-20.316026,-6.476440,-5.125715,5.697594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,pfizer57,D42,9,215554.461,224977.465,220265.9630,6559.721,6096.448,6328.0845,186.007,...,99.813571,91.810914,99.710513,96.322237,99.663171,98.506102,95.947713,99.766737,99.704093,99.848034
309,pfizer58,D0,9,54.800,57.220,56.0100,2018.522,2037.909,2028.2155,50.843,...,1.395875,-36.233175,-2.671736,-0.159208,-0.164996,-7.442042,-19.109093,4.861639,7.782129,-2.103302
310,pfizer58,D7,9,57.220,50.926,54.0730,2090.358,2115.281,2102.8195,44.541,...,19.162172,-1.432594,13.262700,12.931888,18.802561,14.343946,13.432648,16.714440,19.046663,17.758508
311,pfizer58,D21,9,4247.902,4379.121,4313.5115,1816.664,1853.637,1835.1505,29.923,...,18.268883,-17.419456,15.834752,11.776376,18.018992,13.704999,4.231354,20.106839,22.986129,21.518551


# Reshape vaccine data

In [4]:
vaccine_df.columns

Index(['study_id', 'tp', 'plate', 'CoV2_S_IgG', 'CoV2_S_IgG.1',
       'CoV2_S_IgG_Mean', 'NL63_S_IgG', 'NL63_S_IgG.1', 'NL63_S_IgG_Mean',
       'CoV2_N_IgG',
       ...
       'P9_RBD_ACE_B.1.429_percent', 'P9_RBD_ACE_B.1.351_percent',
       'P9_RBD_ACE_B.1.243_percent', 'P9_RBD_ACE_P.1_percent',
       'P9_RBD_ACE_B.1.526.2_percent', 'P9_RBD_ACE_B.1.1.7_percent',
       'P9_RBD_ACE_B.1.1.7_E484_percent', 'P9_RBD_ACE_B.1.617_percent',
       'P9_RBD_ACE_B.1.214_percent', 'P9_CoV2 _RBD_ACE_percent'],
      dtype='object', length=210)

In [5]:
# Wide -> long: every non-identifier column becomes a (measurement, value) row,
# keyed by study_id / tp / plate.
vaccine_df = vaccine_df.melt(
    id_vars=["study_id", "tp", "plate"],
    var_name="measurement",
)
vaccine_df

Unnamed: 0,study_id,tp,plate,measurement,value
0,pfizer00,D0,1,CoV2_S_IgG,698.328
1,pfizer01,D0,1,CoV2_S_IgG,34.175
2,pfizer02,D0,1,CoV2_S_IgG,55.502
3,pfizer03,D0,1,CoV2_S_IgG,22.215
4,pfizer04,D0,1,CoV2_S_IgG,17.951
...,...,...,...,...,...
64786,pfizer57,D42,9,P9_CoV2 _RBD_ACE_percent,99.848034
64787,pfizer58,D0,9,P9_CoV2 _RBD_ACE_percent,-2.103302
64788,pfizer58,D7,9,P9_CoV2 _RBD_ACE_percent,17.758508
64789,pfizer58,D21,9,P9_CoV2 _RBD_ACE_percent,21.518551


In [6]:
vaccine_df.dtypes

study_id       object
tp             object
plate           int64
measurement    object
value          object
dtype: object

In [7]:
# Count entries of "value" that np.isreal does not consider real numbers.
# String entries are flagged False even when they hold a parseable number, so this
# over-counts truly non-numeric values (compare with the NaN count after
# pd.to_numeric below).
vaccine_df["value"].apply(np.isreal).value_counts()

True     57910
False     6881
Name: value, dtype: int64

In [8]:
"#VALUE!" in vaccine_df["value"].values, "#DIV/0!" in vaccine_df["value"].values

(True, True)

In [9]:
# Coerce "value" to float. Numeric strings parse normally; anything that cannot be
# parsed (such as the "#VALUE!" and "#DIV/0!" tokens found above) becomes NaN.
vaccine_df = vaccine_df.assign(
    value=pd.to_numeric(vaccine_df["value"], errors="coerce")
)
vaccine_df["value"].isna().value_counts()

False    64741
True        50
Name: value, dtype: int64

In [10]:
vaccine_df["plate"].value_counts()

4    7659
5    7659
6    7659
7    7659
1    7452
2    7452
3    7452
8    6417
9    5382
Name: plate, dtype: int64

In [11]:
vaccine_df["measurement"].unique()

array(['CoV2_S_IgG', 'CoV2_S_IgG.1', 'CoV2_S_IgG_Mean', 'NL63_S_IgG',
       'NL63_S_IgG.1', 'NL63_S_IgG_Mean', 'CoV2_N_IgG', 'CoV2_N_IgG.1',
       'CoV2_N_IgG_Mean', 'CoV1_S_IgG', 'CoV1_S_IgG.1', 'CoV1_S_IgG_Mean',
       'CoV2_NTD_IgG', 'CoV2_NTD_IgG.1', 'CoV2_NTD_IgG_Mean',
       'HKU1_S_IgG', 'HKU1_S_IgG.1', 'HKU1_S_IgG_Mean', 'OC43_S_IgG',
       'OC43_S_IgG.1', 'OC43_S_IgG_Mean', 'h229E_S_IgG', 'h229E_S_IgG.1',
       'h229E_S_IgG_Mean', 'CoV2_RBD_IgG', 'CoV2_RBD_IgG.1',
       'CoV2_RBD_IgG_Mean', 'CoV2_S_IgM', 'CoV2_S_IgM.1',
       'CoV2_S_IgM_Mean', 'NL63_S_IgM', 'NL63_S_IgM.1', 'NL63_S_IgM_Mean',
       'CoV2_N_IgM', 'CoV2_N_IgM.1', 'CoV2_N_IgM_Mean', 'CoV1_S_IgM',
       'CoV1_S_IgM.1', 'CoV1_S_IgM_Mean', 'CoV2_NTD_IgM',
       'CoV2_NTD_IgM.1', 'CoV2_NTD_IgM_Mean', 'HKU1_S_IgM',
       'HKU1_S_IgM.1', 'HKU1_S_IgM_Mean', 'OC43_S_IgM', 'OC43_S_IgM.1',
       'OC43_S_IgM_Mean', 'h229E_S_IgM', 'h229E_S_IgM.1',
       'h229E_S_IgM_Mean', 'CoV2_RBD_IgM', 'CoV2_RBD_IgM.1',
    

In [12]:
# Keep only measurements whose name mentions an antibody isotype (IgG, IgA or IgM).
is_isotype_measurement = (
    vaccine_df["measurement"].str.contains("IgG")
    | vaccine_df["measurement"].str.contains("IgA")
    | vaccine_df["measurement"].str.contains("IgM")
)
vaccine_df = vaccine_df.loc[is_isotype_measurement]
vaccine_df

Unnamed: 0,study_id,tp,plate,measurement,value
0,pfizer00,D0,1,CoV2_S_IgG,698.3280
1,pfizer01,D0,1,CoV2_S_IgG,34.1750
2,pfizer02,D0,1,CoV2_S_IgG,55.5020
3,pfizer03,D0,1,CoV2_S_IgG,22.2150
4,pfizer04,D0,1,CoV2_S_IgG,17.9510
...,...,...,...,...,...
25348,pfizer57,D42,9,CoV2_RBD_IgA_Mean,1685.2345
25349,pfizer58,D0,9,CoV2_RBD_IgA_Mean,46.6320
25350,pfizer58,D7,9,CoV2_RBD_IgA_Mean,44.0780
25351,pfizer58,D21,9,CoV2_RBD_IgA_Mean,102.0070


In [13]:
# Keep only the "mean" measurement columns (case-insensitive match on the name).
# Copy so that later column assignments operate on an independent frame.
is_mean_measurement = vaccine_df["measurement"].str.lower().str.contains("mean")
vaccine_df = vaccine_df.loc[is_mean_measurement].copy()
vaccine_df

Unnamed: 0,study_id,tp,plate,measurement,value
626,pfizer00,D0,1,CoV2_S_IgG_Mean,682.8730
627,pfizer01,D0,1,CoV2_S_IgG_Mean,34.2955
628,pfizer02,D0,1,CoV2_S_IgG_Mean,57.4550
629,pfizer03,D0,1,CoV2_S_IgG_Mean,26.0340
630,pfizer04,D0,1,CoV2_S_IgG_Mean,20.6775
...,...,...,...,...,...
25348,pfizer57,D42,9,CoV2_RBD_IgA_Mean,1685.2345
25349,pfizer58,D0,9,CoV2_RBD_IgA_Mean,46.6320
25350,pfizer58,D7,9,CoV2_RBD_IgA_Mean,44.0780
25351,pfizer58,D21,9,CoV2_RBD_IgA_Mean,102.0070


In [14]:
vaccine_df["measurement"].value_counts()

CoV2_NTD_IgA_Mean    313
h229E_S_IgG_Mean     313
CoV2_RBD_IgM_Mean    313
CoV2_S_IgG_Mean      313
NL63_S_IgG_Mean      313
CoV2_RBD_IgG_Mean    313
CoV2_S_IgA_Mean      313
CoV2_NTD_IgM_Mean    313
CoV1_S_IgG_Mean      313
OC43_S_IgM_Mean      313
CoV2_N_IgM_Mean      313
OC43_S_IgG_Mean      313
CoV1_S_IgA_Mean      313
CoV2_S_IgM_Mean      313
HKU1_S_IgA_Mean      313
CoV2_RBD_IgA_Mean    313
h229E_S_IgM_Mean     313
NL63_S_IgA_Mean      313
NL63_S_IgM_Mean      313
CoV2_N_IgG_Mean      313
CoV2_NTD_IgG_Mean    313
OC43_S_IgA_Mean      313
CoV2_N_IgA_Mean      313
HKU1_S_IgG_Mean      313
CoV1_S_IgM_Mean      313
h229E_S_IgA_Mean     313
HKU1_S_IgM_Mean      313
Name: measurement, dtype: int64

In [15]:
# Preserve the source column name before "measurement" is rewritten below.
vaccine_df["measurement_original_column_name"] = vaccine_df["measurement"].copy()

In [16]:
# Strip the trailing "_Mean" suffix so that "measurement" holds only the
# "_"-separated name parts that are split into columns afterwards.
# - case-insensitive, to agree with the case-insensitive "mean" filter used when
#   the mean columns were selected;
# - anchored to the end of the name, so only the suffix is removed;
# - regex=True is explicit because the pandas default for `regex` differs
#   between major versions.
vaccine_df["measurement"] = vaccine_df["measurement"].str.replace(
    r"_mean$", "", case=False, regex=True
)
vaccine_df["measurement"].value_counts()

NL63_S_IgM      313
h229E_S_IgM     313
HKU1_S_IgG      313
CoV2_NTD_IgA    313
CoV1_S_IgM      313
OC43_S_IgA      313
CoV1_S_IgG      313
CoV2_NTD_IgG    313
CoV2_N_IgG      313
CoV2_N_IgM      313
h229E_S_IgG     313
CoV2_NTD_IgM    313
CoV1_S_IgA      313
CoV2_RBD_IgA    313
CoV2_S_IgM      313
CoV2_RBD_IgM    313
CoV2_RBD_IgG    313
NL63_S_IgA      313
HKU1_S_IgM      313
CoV2_S_IgA      313
CoV2_S_IgG      313
NL63_S_IgG      313
h229E_S_IgA     313
OC43_S_IgM      313
HKU1_S_IgA      313
CoV2_N_IgA      313
OC43_S_IgG      313
Name: measurement, dtype: int64

In [17]:
# extract measurement column info
# Each measurement name is expected to have exactly three "_"-separated parts:
# virus, target, antibody.
measurement_parts = vaccine_df["measurement"].str.split("_", expand=True)

# Fail loudly on unexpected names: extra parts would otherwise survive as
# integer-named columns, and short names would be padded with missing values.
assert (
    measurement_parts.shape[1] == 3
), f"expected 3 name parts per measurement, got {measurement_parts.shape[1]}"
assert (
    measurement_parts.notna().all().all()
), "some measurement names have fewer than 3 '_'-separated parts"

measurement_parts = (
    measurement_parts.rename(columns={0: "virus", 1: "target", 2: "antibody"})
    .assign(variant_plate_type="Wuhan")
    # remove stray whitespace around each part
    .apply(lambda col: col.str.strip())
)
measurement_parts

Unnamed: 0,virus,target,antibody,variant_plate_type
626,CoV2,S,IgG,Wuhan
627,CoV2,S,IgG,Wuhan
628,CoV2,S,IgG,Wuhan
629,CoV2,S,IgG,Wuhan
630,CoV2,S,IgG,Wuhan
...,...,...,...,...
25348,CoV2,RBD,IgA,Wuhan
25349,CoV2,RBD,IgA,Wuhan
25350,CoV2,RBD,IgA,Wuhan
25351,CoV2,RBD,IgA,Wuhan


In [18]:
# Relabel the "CoV2" virus entries as "Wuhan"; all other virus labels are left as-is.
measurement_parts["virus"] = measurement_parts["virus"].replace({"CoV2": "Wuhan"})

In [19]:
measurement_parts["virus"].value_counts()

Wuhan    3756
h229E     939
NL63      939
OC43      939
CoV1      939
HKU1      939
Name: virus, dtype: int64

In [20]:
measurement_parts["target"].value_counts()

S      5634
N       939
RBD     939
NTD     939
Name: target, dtype: int64

In [21]:
measurement_parts["antibody"].value_counts()

IgM    2817
IgA    2817
IgG    2817
Name: antibody, dtype: int64

In [22]:
# Column-wise concat aligns on the row index; measurement_parts was derived from
# vaccine_df["measurement"], so both frames share the same index.
vaccine_df = pd.concat([vaccine_df, measurement_parts], axis=1)
vaccine_df

Unnamed: 0,study_id,tp,plate,measurement,value,measurement_original_column_name,virus,target,antibody,variant_plate_type
626,pfizer00,D0,1,CoV2_S_IgG,682.8730,CoV2_S_IgG_Mean,Wuhan,S,IgG,Wuhan
627,pfizer01,D0,1,CoV2_S_IgG,34.2955,CoV2_S_IgG_Mean,Wuhan,S,IgG,Wuhan
628,pfizer02,D0,1,CoV2_S_IgG,57.4550,CoV2_S_IgG_Mean,Wuhan,S,IgG,Wuhan
629,pfizer03,D0,1,CoV2_S_IgG,26.0340,CoV2_S_IgG_Mean,Wuhan,S,IgG,Wuhan
630,pfizer04,D0,1,CoV2_S_IgG,20.6775,CoV2_S_IgG_Mean,Wuhan,S,IgG,Wuhan
...,...,...,...,...,...,...,...,...,...,...
25348,pfizer57,D42,9,CoV2_RBD_IgA,1685.2345,CoV2_RBD_IgA_Mean,Wuhan,RBD,IgA,Wuhan
25349,pfizer58,D0,9,CoV2_RBD_IgA,46.6320,CoV2_RBD_IgA_Mean,Wuhan,RBD,IgA,Wuhan
25350,pfizer58,D7,9,CoV2_RBD_IgA,44.0780,CoV2_RBD_IgA_Mean,Wuhan,RBD,IgA,Wuhan
25351,pfizer58,D21,9,CoV2_RBD_IgA,102.0070,CoV2_RBD_IgA_Mean,Wuhan,RBD,IgA,Wuhan


In [23]:
vaccine_df["tp"].value_counts()

D21    1593
D0     1566
D42    1485
D28    1404
D7     1296
D90    1107
Name: tp, dtype: int64

In [24]:
vaccine_df["plate"].value_counts()

4    999
5    999
6    999
7    999
1    972
2    972
3    972
8    837
9    702
Name: plate, dtype: int64

In [25]:
def process_vaccine_timepoint(df_partial, timepoint):
    """Pivot one timepoint's long-format measurements into a patient-by-feature table.

    Parameters
    ----------
    df_partial : pandas.DataFrame
        Long-format rows for a single timepoint. Must contain the columns
        ``study_id``, ``plate``, ``virus``, ``target``, ``antibody``,
        ``variant_plate_type``, ``measurement_original_column_name`` and ``value``.
    timepoint : str
        Label stored in the ``timepoint`` column of the feature metadata and
        appended to every feature name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(vaccine_df_pivot, variable_info)``. The first frame is indexed by
        ``study_id`` with one column per feature and contains only patients that
        have a value for every feature. The second frame is indexed by
        ``combined_name`` (the feature name) and describes each feature.

    Raises
    ------
    AssertionError
        If a patient has more than one row for the same
        virus/target/antibody/variant_plate_type combination, or appears on more
        than one plate.
    """
    # Exactly one measurement per patient-virus-target-antibody combination at
    # this timepoint (i.e. nothing is repeated across plates).
    combo_keys = ["study_id", "virus", "target", "antibody", "variant_plate_type"]
    rows_per_combo = df_partial.groupby(combo_keys).size()
    assert (rows_per_combo == 1).all()

    # Patients are distributed across plates: each patient sits on a single plate.
    plates_per_patient = df_partial.groupby("study_id")["plate"].nunique()
    assert (plates_per_patient == 1).all()

    # Combine measurements across plates into one row per patient.
    feature_levels = [
        "virus",
        "target",
        "variant_plate_type",
        "antibody",
        "measurement_original_column_name",
    ]
    wide = df_partial.pivot(index="study_id", columns=feature_levels, values="value")

    # Feature metadata: one row per pivoted column, in column order.
    variable_info = wide.columns.to_frame().reset_index(drop=True)
    variable_info["timepoint"] = timepoint

    # The feature name joins every metadata field except the original column name.
    name_parts = variable_info.drop(columns="measurement_original_column_name")
    variable_info["combined_name"] = name_parts.apply("_".join, axis=1)
    variable_info = variable_info.set_index("combined_name")

    # Replace the multi-level column labels with the flat feature names.
    wide.columns = variable_info.index.copy()

    # Drop patients with any NaNs.
    wide = wide.dropna(how="any")
    assert not wide.isna().any().any()

    return wide, variable_info

In [26]:
vaccine_df["tp"].value_counts()

D21    1593
D0     1566
D42    1485
D28    1404
D7     1296
D90    1107
Name: tp, dtype: int64

In [27]:
# timepoint label map: vaccine study timepoint code -> global timepoint label
map_vaccine_to_global_timepoint_labels = {
    "D0": "day 0 / pre-pandemic",
    "D7": "day 7 / week 1",
    "D21": "day 21 / weeks 2&3",
    "D28": "day 28 / week 4",
    "D42": "day 42 / weeks 5&6",
    "D90": "week 7 and later",
}
# every timepoint code present in the data must have a label
assert set(vaccine_df["tp"].unique()).issubset(
    map_vaccine_to_global_timepoint_labels.keys()
)

In [28]:
# Build one patient-by-feature block (plus its feature metadata) per vaccine
# timepoint, then combine the blocks.
X_partial = []
var_partial = []
for vaccine_timepoint in vaccine_df["tp"].unique():
    global_label = map_vaccine_to_global_timepoint_labels[vaccine_timepoint]
    print(f"{vaccine_timepoint} -> {global_label}")
    timepoint_rows = vaccine_df.loc[vaccine_df["tp"] == vaccine_timepoint]
    timepoint_pivot, timepoint_variable_info = process_vaccine_timepoint(
        timepoint_rows, global_label
    )
    X_partial.append(timepoint_pivot)
    var_partial.append(timepoint_variable_info)

# Blocks are placed side by side, aligned on study_id; feature metadata is stacked.
vaccine_df_pivot = pd.concat(X_partial, axis=1)
variable_info = pd.concat(var_partial, axis=0)

D0 -> day 0 / pre-pandemic
D7 -> day 7 / week 1
D42 -> day 42 / weeks 5&6
D21 -> day 21 / weeks 2&3
D28 -> day 28 / week 4
D90 -> week 7 and later


In [29]:
# note: there are NaNs - patients don't have entries for all timepoints
vaccine_df_pivot

combined_name,Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic,NL63_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic,CoV1_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic,HKU1_S_Wuhan_IgG_day 0 / pre-pandemic,OC43_S_Wuhan_IgG_day 0 / pre-pandemic,h229E_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan_RBD_Wuhan_IgG_day 0 / pre-pandemic,Wuhan_S_Wuhan_IgM_day 0 / pre-pandemic,...,Wuhan_RBD_Wuhan_IgM_week 7 and later,Wuhan_S_Wuhan_IgA_week 7 and later,NL63_S_Wuhan_IgA_week 7 and later,Wuhan_N_Wuhan_IgA_week 7 and later,CoV1_S_Wuhan_IgA_week 7 and later,Wuhan_NTD_Wuhan_IgA_week 7 and later,HKU1_S_Wuhan_IgA_week 7 and later,OC43_S_Wuhan_IgA_week 7 and later,h229E_S_Wuhan_IgA_week 7 and later,Wuhan_RBD_Wuhan_IgA_week 7 and later
study_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pfizer00,682.873,6715.5525,868.66,1047.2245,83.289,23255.856,76538.4185,67044.9865,1635.212,125.7255,...,738.293,2451.9795,3575.114,1482.2745,315.202,688.6205,3176.8125,4639.2735,6983.933,1921.265
pfizer01,34.2955,701.387,109.608,41.2405,2.741,3047.023,14337.638,22816.1735,101.3445,27.3405,...,305.65,1436.2765,137.884,104.4405,134.1925,272.704,527.75,3669.457,2794.198,744.878
pfizer02,57.455,1479.3955,94.408,30.187,2.367,2754.034,18616.267,16538.5145,93.827,136.5535,...,603.353,5036.0035,655.2465,104.8235,412.053,1872.2105,3525.606,9635.7725,2061.163,1967.118
pfizer03,26.034,4419.4425,37.7095,104.049,4.2295,4661.562,8355.9325,28847.0875,142.7325,132.9435,...,,,,,,,,,,
pfizer04,20.6775,1006.4295,37.95,75.308,9.556,69115.7905,65751.5645,45806.156,283.422,59.132,...,118.762,2879.694,1607.336,114.1405,144.763,127.1325,4280.2025,3307.3135,2278.9975,390.6285
pfizer05,1123.284,2847.137,45.6835,1199.98,11.071,40578.486,23135.993,101683.9995,778.71,165.7595,...,,,,,,,,,,
pfizer06,19.641,1669.333,44.876,80.3855,4.334,4743.604,24943.2335,16381.7135,96.1865,164.9535,...,215.721,434.947,2418.7435,281.255,61.9825,135.164,450.7205,676.941,3883.904,343.797
pfizer07,159.3835,617.9735,41.691,43.8285,5.329,7970.1065,18396.3235,25714.976,209.2095,400.729,...,1450.207,3079.0375,4169.0345,88.789,476.7685,893.795,1490.601,2691.8405,13482.8515,1819.363
pfizer08,59.6205,5423.119,2240.0555,95.1185,1.183,20453.3065,57279.435,5553.335,45.08,50.7455,...,693.968,408.127,1073.6115,89.751,47.593,164.6755,2953.023,4467.596,18444.9745,327.741
pfizer09,31.149,650.883,161.3655,51.546,3.1935,6520.051,8252.2035,5756.2085,84.159,267.6635,...,254.863,1254.2955,792.994,360.239,253.936,135.957,768.326,3350.315,8832.5995,894.1975


In [30]:
variable_info

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,measurement_original_column_name,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,S,Wuhan,IgG,CoV2_S_IgG_Mean,day 0 / pre-pandemic
NL63_S_Wuhan_IgG_day 0 / pre-pandemic,NL63,S,Wuhan,IgG,NL63_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,N,Wuhan,IgG,CoV2_N_IgG_Mean,day 0 / pre-pandemic
CoV1_S_Wuhan_IgG_day 0 / pre-pandemic,CoV1,S,Wuhan,IgG,CoV1_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,NTD,Wuhan,IgG,CoV2_NTD_IgG_Mean,day 0 / pre-pandemic
...,...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgA_week 7 and later,Wuhan,NTD,Wuhan,IgA,CoV2_NTD_IgA_Mean,week 7 and later
HKU1_S_Wuhan_IgA_week 7 and later,HKU1,S,Wuhan,IgA,HKU1_S_IgA_Mean,week 7 and later
OC43_S_Wuhan_IgA_week 7 and later,OC43,S,Wuhan,IgA,OC43_S_IgA_Mean,week 7 and later
h229E_S_Wuhan_IgA_week 7 and later,h229E,S,Wuhan,IgA,h229E_S_IgA_Mean,week 7 and later


In [31]:
# attach status
# "PID" is a column only on the first run; afterwards it is already the index.
# Guarding the re-index keeps this cell safe to re-run.
if "PID" in vaccine_participants.columns:
    vaccine_participants = vaccine_participants.set_index("PID")
vaccine_participants = vaccine_participants.assign(Status="Vaccinee")

# Every patient in the measurement matrix needs a demographics row; report the
# missing IDs explicitly instead of failing inside the .loc lookup below.
missing_participants = vaccine_df_pivot.index.difference(vaccine_participants.index)
assert (
    missing_participants.empty
), f"participants without demographics: {list(missing_participants)}"

# reorder to match the row order of the measurement matrix
vaccine_participants = vaccine_participants.loc[vaccine_df_pivot.index]
# duplicated participant IDs would yield extra rows here and misalign obs with X
assert vaccine_participants.index.equals(vaccine_df_pivot.index)
vaccine_participants

Unnamed: 0_level_0,COVID Positive Ever?,Status
study_id,Unnamed: 1_level_1,Unnamed: 2_level_1
pfizer00,No,Vaccinee
pfizer01,No,Vaccinee
pfizer02,No,Vaccinee
pfizer03,No,Vaccinee
pfizer04,No,Vaccinee
pfizer05,No,Vaccinee
pfizer06,No,Vaccinee
pfizer07,No,Vaccinee
pfizer08,No,Vaccinee
pfizer09,No,Vaccinee


In [32]:
# anndata requires string indices
for indexed_frame in (vaccine_participants, vaccine_df_pivot):
    indexed_frame.index = indexed_frame.index.astype(str)

In [33]:
# Assemble the AnnData object: X is the patient-by-feature matrix (it contains NaNs
# for patients lacking a timepoint), obs is the participant table reordered above to
# match X's rows, and var is the per-feature metadata indexed by combined_name.
adata_vaccine = anndata.AnnData(
    X=vaccine_df_pivot, obs=vaccine_participants, var=variable_info
)
adata_vaccine

AnnData object with n_obs × n_vars = 59 × 162
    obs: 'COVID Positive Ever?', 'Status'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'measurement_original_column_name', 'timepoint'

In [34]:
adata_vaccine.var

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,measurement_original_column_name,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,S,Wuhan,IgG,CoV2_S_IgG_Mean,day 0 / pre-pandemic
NL63_S_Wuhan_IgG_day 0 / pre-pandemic,NL63,S,Wuhan,IgG,NL63_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,N,Wuhan,IgG,CoV2_N_IgG_Mean,day 0 / pre-pandemic
CoV1_S_Wuhan_IgG_day 0 / pre-pandemic,CoV1,S,Wuhan,IgG,CoV1_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,NTD,Wuhan,IgG,CoV2_NTD_IgG_Mean,day 0 / pre-pandemic
...,...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgA_week 7 and later,Wuhan,NTD,Wuhan,IgA,CoV2_NTD_IgA_Mean,week 7 and later
HKU1_S_Wuhan_IgA_week 7 and later,HKU1,S,Wuhan,IgA,HKU1_S_IgA_Mean,week 7 and later
OC43_S_Wuhan_IgA_week 7 and later,OC43,S,Wuhan,IgA,OC43_S_IgA_Mean,week 7 and later
h229E_S_Wuhan_IgA_week 7 and later,h229E,S,Wuhan,IgA,h229E_S_IgA_Mean,week 7 and later


In [35]:
adata_vaccine.obs["Status"].value_counts()

Vaccinee    59
Name: Status, dtype: int64

In [36]:
# Persist the assembled object to the generated-data directory. On write, the string
# columns of obs and var are stored as categoricals (see the log lines this emits).
# NOTE(review): the file uses a ".h5" extension rather than ".h5ad"; downstream
# readers presumably expect this exact name - confirm before renaming.
adata_vaccine.write(
    f"{config.paths.generated_data_dir}/partial.pfizer_vaccine.coronavirus_plate.original.h5"
)

... storing 'COVID Positive Ever?' as categorical


... storing 'Status' as categorical


... storing 'virus' as categorical


... storing 'target' as categorical


... storing 'variant_plate_type' as categorical


... storing 'antibody' as categorical


... storing 'measurement_original_column_name' as categorical


... storing 'timepoint' as categorical
