In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import seaborn as sns
import anndata
import scanpy as sc
import genetools
from covid_serology import config, helpers

# Load data

In [2]:
infection_df = pd.read_csv(
    f"{config.paths.data_dir}/variant_infections_seropositives.plate11.csv"
)
infection_df.head()

Unnamed: 0,ID,variant,vaccine,vaccinated,mAb_cocktail,Epsilon_ECL,Epsilon_AU,Beta_ECL,Beta_AU,Iota_ECL,...,Alpha_ECL,Alpha_AU,P.3_ECL,P.3_AU,Kappa_ECL,Kappa_AU,Delta_ECL,Delta_AU,Wuhan_ECL,Wuhan_AU
0,77,delta,Not vaccinated,,yes,1746523,6171326.0,804106,3451997.0,1366582,...,1394052,4925397.0,756470,4400233.0,1750546,7561206.0,1602908,4913945.0,1790094,6236417.0
1,101,delta,Not vaccinated,,yes,183877,572287.3,74056,265625.2,127515,...,127502,386110.9,74434,339201.2,156077,581529.4,169561,462327.6,182724,565356.2
2,2,alpha,Not vaccinated,,yes,425442,1436242.0,182480,535403.1,324485,...,329195,1020293.0,201313,687862.4,405399,1588069.0,405975,1102106.0,422598,1379488.0
3,7,alpha,Not vaccinated,no,no,23775,8571.427,7342,2238.512,7236,...,96398,33028.93,9838,4497.432,13885,5766.121,18436,5271.091,37426,13726.63
4,8,alpha,Not vaccinated,no,no,330037,112664.4,148888,46352.08,110146,...,694244,235288.3,89004,40260.55,173923,71511.29,284913,76371.92,382711,133299.2


In [3]:
# prefix so these IDs are unique from other datasets
infection_df["ID"] = "VariantInfection_" + infection_df["ID"].astype(str)

In [4]:
infection_df.groupby(["vaccinated", "vaccine"]).size()

vaccinated  vaccine       
no          Not vaccinated    25
yes         Janssen            2
            Moderna            6
            Pfizer             8
dtype: int64

In [5]:
infection_df["mAb_cocktail"].fillna("no", inplace=True)

In [6]:
# Remove mAb-treated patients - the external antibodies may interfere with our measurements
infection_df = infection_df[
    (infection_df["vaccinated"] != "mAb") & (infection_df["mAb_cocktail"] != "yes")
].copy()

In [7]:
assert not infection_df["vaccinated"].isna().any()
assert not infection_df["vaccine"].isna().any()
assert not infection_df["mAb_cocktail"].isna().any()

In [8]:
infection_df.groupby(["vaccinated", "vaccine", "mAb_cocktail"]).size()

vaccinated  vaccine         mAb_cocktail
no          Not vaccinated  no              25
yes         Janssen         no               2
            Moderna         no               6
            Pfizer          no               8
dtype: int64

In [9]:
infection_df.head()

Unnamed: 0,ID,variant,vaccine,vaccinated,mAb_cocktail,Epsilon_ECL,Epsilon_AU,Beta_ECL,Beta_AU,Iota_ECL,...,Alpha_ECL,Alpha_AU,P.3_ECL,P.3_AU,Kappa_ECL,Kappa_AU,Delta_ECL,Delta_AU,Wuhan_ECL,Wuhan_AU
3,VariantInfection_7,alpha,Not vaccinated,no,no,23775,8571.427475,7342,2238.511553,7236,...,96398,33028.9313,9838,4497.432041,13885,5766.120823,18436,5271.090797,37426,13726.62929
4,VariantInfection_8,alpha,Not vaccinated,no,no,330037,112664.4263,148888,46352.07644,110146,...,694244,235288.3479,89004,40260.55206,173923,71511.29152,284913,76371.91856,382711,133299.2366
5,VariantInfection_10,alpha,Not vaccinated,no,no,136856,47598.26179,87724,27236.87662,95013,...,198950,67881.72989,88086,39847.5838,119593,49030.68098,136376,37215.30122,171850,60942.65391
6,VariantInfection_11,delta,Not vaccinated,no,no,78325,27564.09763,13134,4030.672198,20757,...,31604,10891.23966,10128,4629.803618,53180,21840.3357,137252,37448.53258,45518,16624.60414
9,VariantInfection_33,delta,Not vaccinated,no,no,11467,4188.274494,526,136.977506,2262,...,1214,403.513183,440,176.86918,7633,3181.325633,25926,7357.724867,3928,1494.242724


In [10]:
infection_df.columns

Index(['ID', 'variant', 'vaccine', 'vaccinated', 'mAb_cocktail', 'Epsilon_ECL',
       'Epsilon_AU', 'Beta_ECL', 'Beta_AU', 'Iota_ECL', 'Iota_AU', 'Gamma_ECL',
       'Gamma_AU', 'B.1.526.2_ECL', 'B.1.526.2_AU', 'Alpha_ECL', 'Alpha_AU',
       'P.3_ECL', 'P.3_AU', 'Kappa_ECL', 'Kappa_AU', 'Delta_ECL', 'Delta_AU',
       'Wuhan_ECL', 'Wuhan_AU'],
      dtype='object')

In [11]:
infection_df["variant"].value_counts()

delta    34
alpha     7
Name: variant, dtype: int64

In [12]:
infection_df["variant"] = infection_df["variant"].str.title()

In [13]:
infection_df["variant"].value_counts()

Delta    34
Alpha     7
Name: variant, dtype: int64

In [14]:
infection_df["vaccine"].value_counts()

Not vaccinated    25
Pfizer             8
Moderna            6
Janssen            2
Name: vaccine, dtype: int64

In [15]:
infection_df["vaccine"] = infection_df["vaccine"].replace(
    {"Pfizer": "mRNA", "Moderna": "mRNA"}
)

In [16]:
infection_df["vaccine"].value_counts()

Not vaccinated    25
mRNA              14
Janssen            2
Name: vaccine, dtype: int64

In [17]:
measurement_cols = infection_df.columns[infection_df.columns.str.contains("AU")]
helpers.confirm_all_measurement_columns_are_present(measurement_cols)
measurement_cols

Index(['Epsilon_AU', 'Beta_AU', 'Iota_AU', 'Gamma_AU', 'B.1.526.2_AU',
       'Alpha_AU', 'P.3_AU', 'Kappa_AU', 'Delta_AU', 'Wuhan_AU'],
      dtype='object')

In [18]:
# only one sample per patient
assert all(infection_df["ID"].value_counts() == 1)

# reformat

In [19]:
# Separate into groups by variant and by vaccination status if any

infection_df_obs = infection_df[
    [
        "ID",
        "variant",
        "vaccine",
    ]
].copy()

infection_df_obs["Status"] = "Variant Infection" + " - " + infection_df_obs["variant"]
# add vaccine info suffix if vaccinated
infection_df_obs.loc[infection_df_obs["vaccine"] != "Not vaccinated", "Status"] += (
    " - "
    + infection_df_obs.loc[infection_df_obs["vaccine"] != "Not vaccinated", "vaccine"]
    + " vaccinated"
)

# capitalize columns for consistency - only the first letter
# and set index
infection_df_obs = infection_df_obs.rename(
    columns=lambda s: s[0].upper() + s[1:]
).set_index("ID")

# anndata wants string index
infection_df_obs.index = infection_df_obs.index.astype(str)

infection_df_obs

Unnamed: 0_level_0,Variant,Vaccine,Status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
VariantInfection_7,Alpha,Not vaccinated,Variant Infection - Alpha
VariantInfection_8,Alpha,Not vaccinated,Variant Infection - Alpha
VariantInfection_10,Alpha,Not vaccinated,Variant Infection - Alpha
VariantInfection_11,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_33,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_34,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_38,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_43,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_45,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_55,Delta,Not vaccinated,Variant Infection - Delta


In [20]:
# extract AU measurement cols, and set index to match obs
infection_df_X = (
    infection_df[measurement_cols]
    .rename(columns=lambda col: col.replace("_AU", ""))
    .set_index(infection_df_obs.index)
)

infection_df_X

Unnamed: 0_level_0,Epsilon,Beta,Iota,Gamma,B.1.526.2,Alpha,P.3,Kappa,Delta,Wuhan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
VariantInfection_7,8571.427,2238.512,2916.765,4671.838,12094.55,33028.93,4497.432,5766.121,5271.091,13726.63
VariantInfection_8,112664.4,46352.08,43715.07,87093.62,140077.4,235288.3,40260.55,71511.29,76371.92,133299.2
VariantInfection_10,47598.26,27236.88,37756.6,41785.09,52743.43,67881.73,39847.58,49030.68,37215.3,60942.65
VariantInfection_11,27564.1,4030.672,8344.526,6929.228,13347.04,10891.24,4629.804,21840.34,37448.53,16624.6
VariantInfection_33,4188.274,136.9775,897.9137,241.9405,1248.2,403.5132,176.8692,3181.326,7357.725,1494.243
VariantInfection_34,129754.4,18746.87,37201.03,39347.89,60276.08,46975.83,15268.49,115669.8,200135.4,65938.48
VariantInfection_38,8033.188,627.5274,1610.966,1015.129,2550.453,1511.53,635.727,6505.54,14928.81,2688.44
VariantInfection_43,5595.303,791.3871,1213.401,1265.423,1951.49,1494.494,646.3956,3986.886,8078.818,2201.217
VariantInfection_45,11980.95,1586.022,3273.222,3273.99,6307.536,2310.618,660.3088,14109.56,57659.1,6330.453
VariantInfection_55,219364.0,29207.34,109485.7,58273.35,134737.6,89635.93,43328.94,244274.6,248749.9,145430.0


In [21]:
adata = anndata.AnnData(X=infection_df_X, obs=infection_df_obs)
adata

AnnData object with n_obs × n_vars = 41 × 10
    obs: 'Variant', 'Vaccine', 'Status'

In [22]:
adata.obs

Unnamed: 0_level_0,Variant,Vaccine,Status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
VariantInfection_7,Alpha,Not vaccinated,Variant Infection - Alpha
VariantInfection_8,Alpha,Not vaccinated,Variant Infection - Alpha
VariantInfection_10,Alpha,Not vaccinated,Variant Infection - Alpha
VariantInfection_11,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_33,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_34,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_38,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_43,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_45,Delta,Not vaccinated,Variant Infection - Delta
VariantInfection_55,Delta,Not vaccinated,Variant Infection - Delta


In [23]:
adata.var

Epsilon
Beta
Iota
Gamma
B.1.526.2
Alpha
P.3
Kappa
Delta
Wuhan


In [24]:
adata.var["virus"] = adata.var_names
adata.var["target"] = "RBD"
adata.var["variant_plate_type"] = "Variant"
adata.var["antibody"] = "IgG"
adata.var

Unnamed: 0,virus,target,variant_plate_type,antibody
Epsilon,Epsilon,RBD,Variant,IgG
Beta,Beta,RBD,Variant,IgG
Iota,Iota,RBD,Variant,IgG
Gamma,Gamma,RBD,Variant,IgG
B.1.526.2,B.1.526.2,RBD,Variant,IgG
Alpha,Alpha,RBD,Variant,IgG
P.3,P.3,RBD,Variant,IgG
Kappa,Kappa,RBD,Variant,IgG
Delta,Delta,RBD,Variant,IgG
Wuhan,Wuhan,RBD,Variant,IgG


In [25]:
adata.X

array([[8.57142773e+03, 2.23851147e+03, 2.91676514e+03, 4.67183838e+03,
        1.20945449e+04, 3.30289297e+04, 4.49743213e+03, 5.76612061e+03,
        5.27109082e+03, 1.37266289e+04],
       [1.12664430e+05, 4.63520781e+04, 4.37150703e+04, 8.70936172e+04,
        1.40077438e+05, 2.35288344e+05, 4.02605508e+04, 7.15112891e+04,
        7.63719219e+04, 1.33299234e+05],
       [4.75982617e+04, 2.72368770e+04, 3.77566016e+04, 4.17850859e+04,
        5.27434297e+04, 6.78817266e+04, 3.98475820e+04, 4.90306797e+04,
        3.72153008e+04, 6.09426523e+04],
       [2.75640977e+04, 4.03067212e+03, 8.34452539e+03, 6.92922803e+03,
        1.33470439e+04, 1.08912393e+04, 4.62980371e+03, 2.18403359e+04,
        3.74485312e+04, 1.66246035e+04],
       [4.18827441e+03, 1.36977509e+02, 8.97913696e+02, 2.41940536e+02,
        1.24820032e+03, 4.03513184e+02, 1.76869186e+02, 3.18132568e+03,
        7.35772510e+03, 1.49424268e+03],
       [1.29754398e+05, 1.87468691e+04, 3.72010273e+04, 3.93478945e+04,
   

In [26]:
adata

AnnData object with n_obs × n_vars = 41 × 10
    obs: 'Variant', 'Vaccine', 'Status'
    var: 'virus', 'target', 'variant_plate_type', 'antibody'

In [27]:
adata.var["timepoint"] = "day 28 / week 4"
adata.var

Unnamed: 0,virus,target,variant_plate_type,antibody,timepoint
Epsilon,Epsilon,RBD,Variant,IgG,day 28 / week 4
Beta,Beta,RBD,Variant,IgG,day 28 / week 4
Iota,Iota,RBD,Variant,IgG,day 28 / week 4
Gamma,Gamma,RBD,Variant,IgG,day 28 / week 4
B.1.526.2,B.1.526.2,RBD,Variant,IgG,day 28 / week 4
Alpha,Alpha,RBD,Variant,IgG,day 28 / week 4
P.3,P.3,RBD,Variant,IgG,day 28 / week 4
Kappa,Kappa,RBD,Variant,IgG,day 28 / week 4
Delta,Delta,RBD,Variant,IgG,day 28 / week 4
Wuhan,Wuhan,RBD,Variant,IgG,day 28 / week 4


In [28]:
# create combined name
adata.var["combined_name"] = adata.var.apply("_".join, axis=1)
adata.var = adata.var.set_index("combined_name")

In [29]:
adata.var

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Epsilon_RBD_Variant_IgG_day 28 / week 4,Epsilon,RBD,Variant,IgG,day 28 / week 4
Beta_RBD_Variant_IgG_day 28 / week 4,Beta,RBD,Variant,IgG,day 28 / week 4
Iota_RBD_Variant_IgG_day 28 / week 4,Iota,RBD,Variant,IgG,day 28 / week 4
Gamma_RBD_Variant_IgG_day 28 / week 4,Gamma,RBD,Variant,IgG,day 28 / week 4
B.1.526.2_RBD_Variant_IgG_day 28 / week 4,B.1.526.2,RBD,Variant,IgG,day 28 / week 4
Alpha_RBD_Variant_IgG_day 28 / week 4,Alpha,RBD,Variant,IgG,day 28 / week 4
P.3_RBD_Variant_IgG_day 28 / week 4,P.3,RBD,Variant,IgG,day 28 / week 4
Kappa_RBD_Variant_IgG_day 28 / week 4,Kappa,RBD,Variant,IgG,day 28 / week 4
Delta_RBD_Variant_IgG_day 28 / week 4,Delta,RBD,Variant,IgG,day 28 / week 4
Wuhan_RBD_Variant_IgG_day 28 / week 4,Wuhan,RBD,Variant,IgG,day 28 / week 4


In [30]:
adata

AnnData object with n_obs × n_vars = 41 × 10
    obs: 'Variant', 'Vaccine', 'Status'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'timepoint'

In [31]:
# filter down variant infection types
adata.obs["Status"].value_counts()

Variant Infection - Delta                         21
Variant Infection - Delta - mRNA vaccinated       12
Variant Infection - Alpha                          4
Variant Infection - Alpha - mRNA vaccinated        2
Variant Infection - Delta - Janssen vaccinated     1
Variant Infection - Alpha - Janssen vaccinated     1
Name: Status, dtype: int64

In [32]:
# filter down variant infection types
adata = adata[
    adata.obs["Status"].isin(
        [
            "Variant Infection - Delta",
            "Variant Infection - Delta - mRNA vaccinated",
            "Variant Infection - Alpha",
        ]
    )
]

In [33]:
adata

View of AnnData object with n_obs × n_vars = 37 × 10
    obs: 'Variant', 'Vaccine', 'Status'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'timepoint'

In [34]:
adata.write(f"{config.paths.generated_data_dir}/partial.variant_infections.h5")

Trying to set attribute `.obs` of view, copying.


... storing 'Variant' as categorical


Trying to set attribute `.obs` of view, copying.


... storing 'Vaccine' as categorical


Trying to set attribute `.obs` of view, copying.


... storing 'Status' as categorical


Trying to set attribute `.var` of view, copying.


... storing 'target' as categorical


Trying to set attribute `.var` of view, copying.


... storing 'variant_plate_type' as categorical


Trying to set attribute `.var` of view, copying.


... storing 'antibody' as categorical


Trying to set attribute `.var` of view, copying.


... storing 'timepoint' as categorical
