In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import seaborn as sns
import anndata
import scanpy as sc
import genetools
from covid_serology import config
from numpy.testing import assert_array_equal

# Load data

In [2]:
# Load the participant demographics table; rows in which every field is
# missing are discarded.
demographics_csv = f"{config.paths.data_dir}/pfizer_demographics.csv"
vaccine_participants = pd.read_csv(demographics_csv).dropna(how="all")
vaccine_participants

Unnamed: 0,PID,COVID Positive Ever?
0,pfizer00,No
1,pfizer01,No
2,pfizer02,No
3,pfizer03,No
4,pfizer04,No
5,pfizer05,No
6,pfizer06,No
7,pfizer07,No
8,pfizer08,No
9,pfizer09,No


In [3]:
# Load the coronavirus plate measurements (one row per participant/timepoint,
# one column per measured antigen).
coronavirus_plate_csv = f"{config.paths.data_dir}/pfizer_coronavirus.csv"
vaccine_df = pd.read_csv(coronavirus_plate_csv)
vaccine_df

Unnamed: 0,participant,timepoint,Wuhan_S_AU,NL63_S_AU,Wuhan_N_AU,CoV_S_AU,Wuhan_NTD_AU,HKU1_S_AU,OC43_S_AU,h229E_S_AU,Wuhan_RBD_AU
0,pfizer01,boostD1/2,13024.606820,777.295701,140.872774,2062.607562,313.991165,3945.585095,17301.86201,20378.41066,4690.960207
1,pfizer01,boostD7/8,866042.530800,788.501818,210.102603,179294.543100,23268.040320,28430.056760,96357.30685,19560.92339,347203.782900
2,pfizer02,boostD1/2,23597.614860,1651.146945,74.422681,3383.081474,603.800822,3775.718673,22373.67391,15618.59734,9726.310077
3,pfizer02,boostD7/8,593669.910400,1973.689145,121.396195,125502.432300,14097.737930,8318.044619,44433.84804,17350.64436,315836.125200
4,pfizer04,boostD1/2,14715.513360,909.931083,28.874592,2072.400417,237.931139,47458.355310,53553.56524,32826.84670,6096.580961
...,...,...,...,...,...,...,...,...,...,...,...
384,pfizer42,D210,13349.064570,3794.105836,7030.735363,1888.613487,317.414103,12672.472710,38933.08625,35163.20045,5457.409734
385,pfizer43,D210,6484.640349,1763.311165,55.391124,1212.182440,240.678884,5904.718543,16390.78487,10323.42453,2517.016196
386,pfizer45,D210,11866.800270,1877.302078,93.225933,973.719246,164.584676,21648.812040,64267.42992,15582.46392,4123.343836
387,pfizer46,D210,5533.840659,10226.669370,233.147245,1065.196248,123.362074,3523.259554,33930.91563,28372.25322,3086.963182


In [4]:
# Sanity check: every (participant, timepoint) pair occurs exactly once.
samples_per_participant_timepoint = vaccine_df.groupby(
    ["participant", "timepoint"]
).size()
assert (samples_per_participant_timepoint == 1).all()

# Reshape vaccine data

In [5]:
# Inspect the wide-format column names before melting.
vaccine_df.columns

Index(['participant', 'timepoint', 'Wuhan_S_AU', 'NL63_S_AU', 'Wuhan_N_AU',
       'CoV_S_AU', 'Wuhan_NTD_AU', 'HKU1_S_AU', 'OC43_S_AU', 'h229E_S_AU',
       'Wuhan_RBD_AU'],
      dtype='object')

In [6]:
# Wide -> long: keep participant/timepoint as identifiers and stack every other
# column into (measurement, value) pairs.
# NOTE: this rebinds `vaccine_df`, so re-running this cell without re-running
# the load cell would melt the already-melted frame.
vaccine_df = vaccine_df.melt(
    id_vars=["participant", "timepoint"],
    var_name="measurement",
)
vaccine_df

Unnamed: 0,participant,timepoint,measurement,value
0,pfizer01,boostD1/2,Wuhan_S_AU,13024.606820
1,pfizer01,boostD7/8,Wuhan_S_AU,866042.530800
2,pfizer02,boostD1/2,Wuhan_S_AU,23597.614860
3,pfizer02,boostD7/8,Wuhan_S_AU,593669.910400
4,pfizer04,boostD1/2,Wuhan_S_AU,14715.513360
...,...,...,...,...
3496,pfizer42,D210,Wuhan_RBD_AU,5457.409734
3497,pfizer43,D210,Wuhan_RBD_AU,2517.016196
3498,pfizer45,D210,Wuhan_RBD_AU,4123.343836
3499,pfizer46,D210,Wuhan_RBD_AU,3086.963182


In [7]:
# Defensive conversion: force the value column to numeric; entries that cannot
# be parsed become NaN instead of raising.
coerced_values = pd.to_numeric(vaccine_df["value"], errors="coerce")
vaccine_df["value"] = coerced_values
vaccine_df.dtypes

participant     object
timepoint       object
measurement     object
value          float64
dtype: object

In [8]:
# Count how many measurements are missing (True) vs present (False).
vaccine_df["value"].isna().value_counts()

False    3498
True        3
Name: value, dtype: int64

In [9]:
# Show the rows whose value is missing.
vaccine_df[vaccine_df["value"].isna()]

Unnamed: 0,participant,timepoint,measurement,value
788,pfizer09,boostD1/2,Wuhan_N_AU,
1670,pfizer17,D0,Wuhan_NTD_AU,
2026,pfizer12,D0,HKU1_S_AU,


In [10]:
# List the distinct measurement names produced by the melt.
vaccine_df["measurement"].unique()

array(['Wuhan_S_AU', 'NL63_S_AU', 'Wuhan_N_AU', 'CoV_S_AU',
       'Wuhan_NTD_AU', 'HKU1_S_AU', 'OC43_S_AU', 'h229E_S_AU',
       'Wuhan_RBD_AU'], dtype=object)

In [11]:
# Remove the trailing "_AU" suffix from each measurement name.
# The pattern is anchored with `$` so that only a suffix is stripped (the
# unanchored form would also delete "_AU" from the middle of a name), and
# `regex=True` is passed explicitly because the default of `str.replace`
# differs between pandas versions.
vaccine_df["measurement"] = vaccine_df["measurement"].str.replace(
    r"_AU$", "", regex=True
)
vaccine_df["measurement"].value_counts()

Wuhan_RBD    389
NL63_S       389
Wuhan_S      389
h229E_S      389
Wuhan_NTD    389
Wuhan_N      389
OC43_S       389
CoV_S        389
HKU1_S       389
Name: measurement, dtype: int64

In [12]:
# Split each measurement name on "_" into its components: the first piece is
# the virus, the second the target.
split_names = vaccine_df["measurement"].str.split("_", expand=True)
split_names = split_names.rename(columns={0: "virus", 1: "target"})

# All coronavirus plate measurements are IgG only, on the Wuhan plate type.
split_names["variant_plate_type"] = "Wuhan"
split_names["antibody"] = "IgG"

# Trim stray whitespace from every (string) column.
measurement_parts = split_names.apply(lambda column: column.str.strip())
measurement_parts

Unnamed: 0,virus,target,variant_plate_type,antibody
0,Wuhan,S,Wuhan,IgG
1,Wuhan,S,Wuhan,IgG
2,Wuhan,S,Wuhan,IgG
3,Wuhan,S,Wuhan,IgG
4,Wuhan,S,Wuhan,IgG
...,...,...,...,...
3496,Wuhan,RBD,Wuhan,IgG
3497,Wuhan,RBD,Wuhan,IgG
3498,Wuhan,RBD,Wuhan,IgG
3499,Wuhan,RBD,Wuhan,IgG


In [13]:
# Row counts per extracted virus label.
measurement_parts["virus"].value_counts()

Wuhan    1556
h229E     389
CoV       389
OC43      389
NL63      389
HKU1      389
Name: virus, dtype: int64

In [14]:
# Row counts per extracted target label.
measurement_parts["target"].value_counts()

S      2334
NTD     389
RBD     389
N       389
Name: target, dtype: int64

In [15]:
# Row counts per plate type (set to the constant "Wuhan" when measurement_parts was built).
measurement_parts["variant_plate_type"].value_counts()

Wuhan    3501
Name: variant_plate_type, dtype: int64

In [16]:
# Append the parsed measurement components as extra columns, aligned on the
# row index.
# NOTE: this rebinds `vaccine_df`; re-running the cell appends the same columns
# a second time.
frames_side_by_side = [vaccine_df, measurement_parts]
vaccine_df = pd.concat(frames_side_by_side, axis="columns")
vaccine_df

Unnamed: 0,participant,timepoint,measurement,value,virus,target,variant_plate_type,antibody
0,pfizer01,boostD1/2,Wuhan_S,13024.606820,Wuhan,S,Wuhan,IgG
1,pfizer01,boostD7/8,Wuhan_S,866042.530800,Wuhan,S,Wuhan,IgG
2,pfizer02,boostD1/2,Wuhan_S,23597.614860,Wuhan,S,Wuhan,IgG
3,pfizer02,boostD7/8,Wuhan_S,593669.910400,Wuhan,S,Wuhan,IgG
4,pfizer04,boostD1/2,Wuhan_S,14715.513360,Wuhan,S,Wuhan,IgG
...,...,...,...,...,...,...,...,...
3496,pfizer42,D210,Wuhan_RBD,5457.409734,Wuhan,RBD,Wuhan,IgG
3497,pfizer43,D210,Wuhan_RBD,2517.016196,Wuhan,RBD,Wuhan,IgG
3498,pfizer45,D210,Wuhan_RBD,4123.343836,Wuhan,RBD,Wuhan,IgG
3499,pfizer46,D210,Wuhan_RBD,3086.963182,Wuhan,RBD,Wuhan,IgG


In [17]:
# Number of long-format rows available at each timepoint.
vaccine_df["timepoint"].value_counts()

D21          531
D0           522
D42          495
D28          468
D7           441
D90          369
D210         315
boostD1/2    198
boostD7/8    135
boostD21      27
Name: timepoint, dtype: int64

In [18]:
# Translation from the vaccine dataset's timepoint codes to the timepoint
# labels used for variable names; booster timepoints keep their own code.
map_vaccine_to_global_timepoint_labels = {
    "D0": "day 0 / pre-pandemic",
    "D7": "day 7 / week 1",
    "D21": "day 21 / weeks 2&3",
    "D28": "day 28 / week 4",
    "D42": "day 42 / weeks 5&6",
    "D90": "week 7 and later / 3 months",
    "D210": "day 210 / 7 months",
    "boostD1/2": "boostD1/2",
    "boostD7/8": "boostD7/8",
    "boostD21": "boostD21",
}

# Every timepoint present in the data must have an entry in the map.
unmapped_timepoints = [
    tp
    for tp in vaccine_df["timepoint"].unique()
    if tp not in map_vaccine_to_global_timepoint_labels
]
assert not unmapped_timepoints

In [19]:
def process_vaccine_timepoint(df_partial, timepoint):
    """Pivot one timepoint's long-format measurements into a participant table.

    Parameters
    ----------
    df_partial : pandas.DataFrame
        Long-format rows with the columns "participant", "virus", "target",
        "variant_plate_type", "antibody" and "value".
    timepoint : str
        Label stored in the variable metadata and appended to every
        generated column name.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * the pivoted table: one row per participant, one column per
          measurement, restricted to participants with no missing value;
        * the variable metadata, indexed by "combined_name" (all metadata
          fields joined with "_"), one row per pivoted column.

    Raises
    ------
    AssertionError
        If any participant/measurement combination occurs more than once.
    """
    feature_keys = ["virus", "target", "variant_plate_type", "antibody"]

    # Each participant may contribute at most one value per measurement.
    combo_counts = df_partial.groupby(["participant"] + feature_keys).size()
    assert (combo_counts == 1).all()

    # Long -> wide: participants on rows, measurements on columns.
    wide = df_partial.pivot(index="participant", columns=feature_keys, values="value")

    # Describe each column: its key fields plus the timepoint, and a flat name
    # built by joining all of those fields with underscores.
    variable_info = wide.columns.to_frame().reset_index(drop=True)
    variable_info["timepoint"] = timepoint
    variable_info["combined_name"] = variable_info.apply("_".join, axis=1)
    variable_info = variable_info.set_index("combined_name")

    # Replace the multi-level column labels with the flat names.
    wide.columns = variable_info.index.copy()

    # Keep only participants that are complete at this timepoint.
    wide = wide.dropna(how="any")
    assert not wide.isna().any().any()

    return wide, variable_info

In [20]:
# Pivot each timepoint on its own, then combine: the measurement tables are
# placed side by side (aligned on participant) and the variable metadata
# tables are stacked.
X_partial, var_partial = [], []
for raw_timepoint in vaccine_df["timepoint"].unique():
    global_label = map_vaccine_to_global_timepoint_labels[raw_timepoint]
    print(raw_timepoint, "->", global_label)

    at_this_timepoint = vaccine_df["timepoint"] == raw_timepoint
    pivot_at_timepoint, var_at_timepoint = process_vaccine_timepoint(
        vaccine_df.loc[at_this_timepoint], global_label
    )

    X_partial.append(pivot_at_timepoint)
    var_partial.append(var_at_timepoint)

vaccine_df_pivot = pd.concat(X_partial, axis="columns")
variable_info = pd.concat(var_partial, axis="index")

boostD1/2 -> boostD1/2
boostD7/8 -> boostD7/8
boostD21 -> boostD21
D0 -> day 0 / pre-pandemic
D7 -> day 7 / week 1
D42 -> day 42 / weeks 5&6
D21 -> day 21 / weeks 2&3
D28 -> day 28 / week 4
D90 -> week 7 and later / 3 months
D210 -> day 210 / 7 months


In [21]:
# Display the combined table (participants x per-timepoint measurements).
# note: there are NaNs - patients don't have entries for all timepoints
vaccine_df_pivot

combined_name,Wuhan_S_Wuhan_IgG_boostD1/2,NL63_S_Wuhan_IgG_boostD1/2,Wuhan_N_Wuhan_IgG_boostD1/2,CoV_S_Wuhan_IgG_boostD1/2,Wuhan_NTD_Wuhan_IgG_boostD1/2,HKU1_S_Wuhan_IgG_boostD1/2,OC43_S_Wuhan_IgG_boostD1/2,h229E_S_Wuhan_IgG_boostD1/2,Wuhan_RBD_Wuhan_IgG_boostD1/2,Wuhan_S_Wuhan_IgG_boostD7/8,...,Wuhan_RBD_Wuhan_IgG_week 7 and later / 3 months,Wuhan_S_Wuhan_IgG_day 210 / 7 months,NL63_S_Wuhan_IgG_day 210 / 7 months,Wuhan_N_Wuhan_IgG_day 210 / 7 months,CoV_S_Wuhan_IgG_day 210 / 7 months,Wuhan_NTD_Wuhan_IgG_day 210 / 7 months,HKU1_S_Wuhan_IgG_day 210 / 7 months,OC43_S_Wuhan_IgG_day 210 / 7 months,h229E_S_Wuhan_IgG_day 210 / 7 months,Wuhan_RBD_Wuhan_IgG_day 210 / 7 months
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pfizer01,13024.60682,777.295701,140.872774,2062.607562,313.991165,3945.585095,17301.86201,20378.41066,4690.960207,866042.5308,...,35663.9485,,,,,,,,,
pfizer02,23597.61486,1651.146945,74.422681,3383.081474,603.800822,3775.718673,22373.67391,15618.59734,9726.310077,593669.9104,...,62742.4345,,,,,,,,,
pfizer04,14715.51336,909.931083,28.874592,2072.400417,237.931139,47458.35531,53553.56524,32826.8467,6096.580961,494663.9364,...,68891.2145,46368.35582,1035.028782,30.930252,4023.564051,748.47892,53666.42036,61062.19153,37785.41475,17640.40514
pfizer06,11603.9558,1764.573828,42.927934,2007.127267,355.211225,5432.205784,26171.72256,14369.25605,4542.135836,633563.7726,...,30695.5805,25890.35444,2885.216385,76.399471,3080.216325,755.766165,7595.767357,33152.87903,23106.24475,9763.875976
pfizer07,10262.01286,810.933733,56.99418,1628.356167,269.130474,8037.521706,19109.72486,34683.20557,3830.72736,,...,10148.5595,12371.46854,832.362203,69.743534,1615.378625,355.51128,7814.3706,19171.26147,31617.35816,4799.948595
pfizer10,12664.52631,2961.836655,25.531058,4149.336191,268.473446,9079.018693,32259.94328,37747.77699,5766.672416,324339.2581,...,,18609.208,3218.789626,56.790667,5963.678024,395.003098,9087.051591,31580.97778,40435.10012,10792.52511
pfizer13,10290.0972,3588.562709,89.178828,1594.202103,245.756464,12742.1011,23365.70441,13495.27309,4532.835995,312964.6832,...,19486.785,18424.63201,4192.094816,105.153348,2286.500366,483.946814,11078.78055,25181.13704,16971.19334,7267.111653
pfizer14,2303.834527,1508.092936,226.243773,758.169351,65.04036,3732.511087,24216.54312,3301.944167,1276.576739,193319.8821,...,,4430.975699,2621.168447,410.55363,1133.15675,134.775728,5751.972822,35996.52556,5526.20653,2117.68694
pfizer24,2122.167958,3426.351197,70.399613,110.238726,14.637253,4336.915468,14114.47201,7364.154416,632.835109,89363.01321,...,,3379.426,4681.743307,115.682757,210.723368,40.71748,5870.636559,16667.52822,10147.86801,1311.305916
pfizer26,11618.83509,1628.03764,52.304235,1160.945445,225.840074,4619.505684,19590.19278,15947.23548,4391.038478,,...,,,,,,,,,,


In [22]:
# Display the stacked variable metadata (one row per column of vaccine_df_pivot).
variable_info

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Wuhan_S_Wuhan_IgG_boostD1/2,Wuhan,S,Wuhan,IgG,boostD1/2
NL63_S_Wuhan_IgG_boostD1/2,NL63,S,Wuhan,IgG,boostD1/2
Wuhan_N_Wuhan_IgG_boostD1/2,Wuhan,N,Wuhan,IgG,boostD1/2
CoV_S_Wuhan_IgG_boostD1/2,CoV,S,Wuhan,IgG,boostD1/2
Wuhan_NTD_Wuhan_IgG_boostD1/2,Wuhan,NTD,Wuhan,IgG,boostD1/2
...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgG_day 210 / 7 months,Wuhan,NTD,Wuhan,IgG,day 210 / 7 months
HKU1_S_Wuhan_IgG_day 210 / 7 months,HKU1,S,Wuhan,IgG,day 210 / 7 months
OC43_S_Wuhan_IgG_day 210 / 7 months,OC43,S,Wuhan,IgG,day 210 / 7 months
h229E_S_Wuhan_IgG_day 210 / 7 months,h229E,S,Wuhan,IgG,day 210 / 7 months


In [23]:
# Build the per-participant metadata: index by participant ID, tag everyone as
# a vaccinee, and order rows to match the measurement table.
# NOTE: `set_index("PID")` consumes the PID column, so this cell cannot be
# re-run on its own without reloading `vaccine_participants`.
vaccine_participants = (
    vaccine_participants.set_index("PID")
    .assign(Status="Vaccinee")
    .loc[vaccine_df_pivot.index]
)

# The two tables must now list participants in exactly the same order.
assert_array_equal(vaccine_participants.index, vaccine_df_pivot.index)

vaccine_participants

Unnamed: 0_level_0,COVID Positive Ever?,Status
participant,Unnamed: 1_level_1,Unnamed: 2_level_1
pfizer01,No,Vaccinee
pfizer02,No,Vaccinee
pfizer04,No,Vaccinee
pfizer06,No,Vaccinee
pfizer07,No,Vaccinee
pfizer10,No,Vaccinee
pfizer13,No,Vaccinee
pfizer14,No,Vaccinee
pfizer24,No,Vaccinee
pfizer26,,Vaccinee


In [24]:
# anndata requires string indices, so cast both row indices to str
for indexed_frame in (vaccine_participants, vaccine_df_pivot):
    indexed_frame.index = indexed_frame.index.astype(str)

In [25]:
# Assemble the AnnData container: X is the pivoted measurement table, obs the
# participant metadata (same row order, checked by the earlier
# assert_array_equal), var the per-column variable metadata.
adata_vaccine = anndata.AnnData(
    X=vaccine_df_pivot, obs=vaccine_participants, var=variable_info
)
adata_vaccine

AnnData object with n_obs × n_vars = 60 × 90
    obs: 'COVID Positive Ever?', 'Status'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'timepoint'

In [26]:
# Confirm the variable metadata was carried into the AnnData object.
adata_vaccine.var

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Wuhan_S_Wuhan_IgG_boostD1/2,Wuhan,S,Wuhan,IgG,boostD1/2
NL63_S_Wuhan_IgG_boostD1/2,NL63,S,Wuhan,IgG,boostD1/2
Wuhan_N_Wuhan_IgG_boostD1/2,Wuhan,N,Wuhan,IgG,boostD1/2
CoV_S_Wuhan_IgG_boostD1/2,CoV,S,Wuhan,IgG,boostD1/2
Wuhan_NTD_Wuhan_IgG_boostD1/2,Wuhan,NTD,Wuhan,IgG,boostD1/2
...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgG_day 210 / 7 months,Wuhan,NTD,Wuhan,IgG,day 210 / 7 months
HKU1_S_Wuhan_IgG_day 210 / 7 months,HKU1,S,Wuhan,IgG,day 210 / 7 months
OC43_S_Wuhan_IgG_day 210 / 7 months,OC43,S,Wuhan,IgG,day 210 / 7 months
h229E_S_Wuhan_IgG_day 210 / 7 months,h229E,S,Wuhan,IgG,day 210 / 7 months


In [27]:
# Count observations per Status value.
adata_vaccine.obs["Status"].value_counts()

Vaccinee    60
Name: Status, dtype: int64

In [28]:
# Persist the assembled AnnData object under the generated-data directory.
# NOTE(review): the file is saved with a ".h5" extension; confirm that
# downstream readers expect this name rather than ".h5ad".
adata_vaccine.write(
    f"{config.paths.generated_data_dir}/partial.pfizer_vaccine.coronavirus_plate.h5"
)

... storing 'COVID Positive Ever?' as categorical


... storing 'Status' as categorical


... storing 'virus' as categorical


... storing 'target' as categorical


... storing 'variant_plate_type' as categorical


... storing 'antibody' as categorical


... storing 'timepoint' as categorical
