In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import seaborn as sns
import anndata
import scanpy as sc
import genetools
from covid_serology import config

In [2]:
from IPython.display import display, Markdown
from numpy.testing import assert_array_equal

# Concatenate a subset of datasets: coronavirus plate only, for `infection_cohort1` patients and for Pfizer vaccinees (using the original Pfizer data, which includes IgM and IgA; the newer data includes only IgG)

# Load data

In [3]:
# Input AnnData files to combine, keyed by cohort label.
# The key is later written to obs["source_cohort"] for every row of that dataset.
adata_sources = {
    "Pfizer_vaccine": f"{config.paths.generated_data_dir}/partial.pfizer_vaccine.coronavirus_plate.original.h5",
    "infection_cohort1": f"{config.paths.generated_data_dir}/partial.infection_cohort1.h5",
}

In [4]:
# Read every source file with scanpy, keeping the cohort labels as dict keys.
adatas = {key: sc.read(val) for key, val in adata_sources.items()}
adatas

Only considering the two last: ['.original', '.h5'].


Only considering the two last: ['.original', '.h5'].


{'Pfizer_vaccine': AnnData object with n_obs × n_vars = 59 × 162
     obs: 'COVID Positive Ever?', 'Status'
     var: 'virus', 'target', 'variant_plate_type', 'antibody', 'measurement_original_column_name', 'timepoint',
 'infection_cohort1': AnnData object with n_obs × n_vars = 99 × 162
     obs: 'Status', 'Death'
     var: 'virus', 'target', 'variant_plate_type', 'antibody', 'measurement_original_column_name', 'timepoint'}

# combine datasets and align columns

In [5]:
# For each cohort, show its variable annotations and the distinct timepoint labels,
# so differences between the datasets' variables are visible before merging.
for name, adata in adatas.items():
    display(Markdown(f"## {name}"))
    display(adata.var)
    print(adata.var["timepoint"].unique().tolist())

## Pfizer_vaccine

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,measurement_original_column_name,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,S,Wuhan,IgG,CoV2_S_IgG_Mean,day 0 / pre-pandemic
NL63_S_Wuhan_IgG_day 0 / pre-pandemic,NL63,S,Wuhan,IgG,NL63_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,N,Wuhan,IgG,CoV2_N_IgG_Mean,day 0 / pre-pandemic
CoV1_S_Wuhan_IgG_day 0 / pre-pandemic,CoV1,S,Wuhan,IgG,CoV1_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,NTD,Wuhan,IgG,CoV2_NTD_IgG_Mean,day 0 / pre-pandemic
...,...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgA_week 7 and later,Wuhan,NTD,Wuhan,IgA,CoV2_NTD_IgA_Mean,week 7 and later
HKU1_S_Wuhan_IgA_week 7 and later,HKU1,S,Wuhan,IgA,HKU1_S_IgA_Mean,week 7 and later
OC43_S_Wuhan_IgA_week 7 and later,OC43,S,Wuhan,IgA,OC43_S_IgA_Mean,week 7 and later
h229E_S_Wuhan_IgA_week 7 and later,h229E,S,Wuhan,IgA,h229E_S_IgA_Mean,week 7 and later


['day 0 / pre-pandemic', 'day 7 / week 1', 'day 42 / weeks 5&6', 'day 21 / weeks 2&3', 'day 28 / week 4', 'week 7 and later']


## infection_cohort1

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,measurement_original_column_name,timepoint
combined_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,S,Wuhan,IgG,CoV2_S_IgG_Mean,day 0 / pre-pandemic
NL63_S_Wuhan_IgG_day 0 / pre-pandemic,NL63,S,Wuhan,IgG,NL63_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,N,Wuhan,IgG,CoV2_N_IgG_Mean,day 0 / pre-pandemic
CoV1_S_Wuhan_IgG_day 0 / pre-pandemic,CoV1,S,Wuhan,IgG,CoV1_S_IgG_Mean,day 0 / pre-pandemic
Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,NTD,Wuhan,IgG,CoV2_NTD_IgG_Mean,day 0 / pre-pandemic
...,...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgA_week 7 and later / 3 months,Wuhan,NTD,Wuhan,IgA,CoV2_NTD_IgA_Mean,week 7 and later / 3 months
HKU1_S_Wuhan_IgA_week 7 and later / 3 months,HKU1,S,Wuhan,IgA,HKU1_S_IgA_Mean,week 7 and later / 3 months
OC43_S_Wuhan_IgA_week 7 and later / 3 months,OC43,S,Wuhan,IgA,OC43_S_IgA_Mean,week 7 and later / 3 months
h229E_S_Wuhan_IgA_week 7 and later / 3 months,h229E,S,Wuhan,IgA,h229E_S_IgA_Mean,week 7 and later / 3 months


['day 0 / pre-pandemic', 'day 7 / week 1', 'day 21 / weeks 2&3', 'day 28 / week 4', 'day 42 / weeks 5&6', 'week 7 and later / 3 months']


In [6]:
# Sanity check: every pair of datasets must have disjoint obs names,
# otherwise rows could not be told apart after concatenation.
import itertools

for (name_a, adata_a), (name_b, adata_b) in itertools.combinations(adatas.items(), 2):
    intersection_of_obsnames = set(adata_a.obs_names) & set(adata_b.obs_names)
    print(name_a, name_b, intersection_of_obsnames)
    assert not intersection_of_obsnames

Pfizer_vaccine infection_cohort1 set()


In [7]:
# Note: anndata.concat does not work for us here - the merged adata.var ends up with a lot of NaNs.
# See https://github.com/theislab/anndata/issues/614. The attempted call is kept below for reference:

# adata_full = anndata.concat(
#     adatas.values(),
#     join="outer",
#     merge="first",
#     axis=0,
#     label="source_cohort",
#     keys=adatas.keys(),
# )
# adata_full

In [8]:
def _merge_two_anndatas(ad1, ad2, var_col_join):
    """Concatenate two anndatas along obs, taking the union of their variables.

    The inputs must have different obs names. A variable is identified by its
    var name together with its values in the ``var_col_join`` columns of
    ``.var``; variables that agree on all of these are combined into one
    column, and variables found in only one input are kept (pandas fills the
    other input's rows with NaN).

    Parameters
    ----------
    ad1, ad2 : anndata.AnnData
        Objects to merge. Each ``.var`` must contain every ``var_col_join`` column.
    var_col_join : str or sequence of str
        ``.var`` columns used to describe each variable. Only these columns are
        carried into the merged ``.var``. The name "varname" is reserved.

    Returns
    -------
    anndata.AnnData
        Rows of ``ad1`` followed by rows of ``ad2``; ``.obs`` is the row-wise
        concatenation of both ``.obs`` frames; ``.var`` is indexed by var name
        and holds the ``var_col_join`` columns.

    Raises
    ------
    ValueError
        If obs names overlap, if "varname" is one of ``var_col_join``, if the
        concatenation changes the number or order of rows, if two different
        var names share identical ``var_col_join`` values, or if one var name
        appears with conflicting ``var_col_join`` values.
    """
    # Normalize to a list: DataFrame column selection and the merged var column
    # list below both need a list, and a bare str would otherwise turn the
    # "varname" membership test into a substring test.
    if isinstance(var_col_join, str):
        var_col_join = [var_col_join]
    else:
        var_col_join = list(var_col_join)

    # confirm obsnames are distinct
    if len(set(ad1.obs_names).intersection(ad2.obs_names)) > 0:
        raise ValueError("Obsnames intersect")

    # "varname" is used below as the temporary column holding the var names
    if "varname" in var_col_join:
        # TODO: relax this
        raise ValueError("Cannot use varname as a var col - will be overwritten")

    def _get_df_from_anndata(adata, var_cols):
        """Return adata's matrix as a DataFrame with (var_cols..., varname) MultiIndex columns."""
        df = adata.to_df()
        # Cast to object so that categorical var columns with different
        # categories in the two inputs still compare equal value-by-value.
        var_key = adata.var[var_cols].astype(object).assign(varname=adata.var_names)
        # Build the MultiIndex explicitly rather than assigning a DataFrame to
        # df.columns and relying on implicit Index coercion.
        df.columns = pd.MultiIndex.from_frame(var_key)
        return df

    df1 = _get_df_from_anndata(ad1, var_col_join)
    df2 = _get_df_from_anndata(ad2, var_col_join)

    # Row-wise concat; columns are aligned on the full (var_cols..., varname) key.
    df_concat = pd.concat([df1, df2], axis=0)
    if df_concat.shape[0] != df1.shape[0] + df2.shape[0]:
        raise ValueError("Concat produced unexpected number of rows")

    # Turn the column key back into a var table indexed by var name.
    new_var = df_concat.columns.to_frame(index=False)
    new_var.columns = var_col_join + ["varname"]
    # recover varname
    new_var = new_var.set_index("varname")

    # Same var name but different metadata in the two inputs would leave two
    # columns sharing one var name; refuse rather than return ambiguous names.
    if new_var.index.duplicated().any():
        raise ValueError(
            "Some var names are duplicated with conflicting var columns - unexpected"
        )

    df_concat.columns = new_var.index

    new_obs = pd.concat([ad1.obs, ad2.obs], axis=0)
    if not np.array_equal(new_obs.index, df_concat.index):
        raise ValueError("Concat unexpectedly rearranged rows")

    # Different var names carrying identical metadata.
    if new_var.duplicated().any():
        raise ValueError("Some var rows are duplicated - unexpected")

    return anndata.AnnData(df_concat, var=new_var, obs=new_obs)


from functools import reduce


def merge_anndatas(
    adatas,
    var_col_join,
):
    """Progressively merge an iterable of anndatas into one, left to right.

    The running result is combined with each following anndata via
    _merge_two_anndatas, always passing the same var_col_join.
    """

    def _merge_pair(merged_so_far, next_adata):
        return _merge_two_anndatas(
            merged_so_far, next_adata, var_col_join=var_col_join
        )

    return reduce(_merge_pair, adatas)

In [9]:
# Use our merge_anndatas workaround.
# First, label each adata with a source cohort key, so every obs row records its dataset of origin.
for name, adata in adatas.items():
    adata.obs["source_cohort"] = name

# Now merge. Variables are matched on var name plus the var columns listed here;
# only these columns are carried into the merged .var.
adata_full = merge_anndatas(
    adatas.values(),
    var_col_join=["virus", "target", "variant_plate_type", "antibody", "timepoint"],
)

adata_full

AnnData object with n_obs × n_vars = 158 × 189
    obs: 'COVID Positive Ever?', 'Status', 'source_cohort', 'Death'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'timepoint'

In [10]:
adata_full.var

Unnamed: 0_level_0,virus,target,variant_plate_type,antibody,timepoint
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,S,Wuhan,IgG,day 0 / pre-pandemic
NL63_S_Wuhan_IgG_day 0 / pre-pandemic,NL63,S,Wuhan,IgG,day 0 / pre-pandemic
Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,N,Wuhan,IgG,day 0 / pre-pandemic
CoV1_S_Wuhan_IgG_day 0 / pre-pandemic,CoV1,S,Wuhan,IgG,day 0 / pre-pandemic
Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic,Wuhan,NTD,Wuhan,IgG,day 0 / pre-pandemic
...,...,...,...,...,...
Wuhan_NTD_Wuhan_IgA_week 7 and later / 3 months,Wuhan,NTD,Wuhan,IgA,week 7 and later / 3 months
HKU1_S_Wuhan_IgA_week 7 and later / 3 months,HKU1,S,Wuhan,IgA,week 7 and later / 3 months
OC43_S_Wuhan_IgA_week 7 and later / 3 months,OC43,S,Wuhan,IgA,week 7 and later / 3 months
h229E_S_Wuhan_IgA_week 7 and later / 3 months,h229E,S,Wuhan,IgA,week 7 and later / 3 months


In [11]:
adata_full.obs

Unnamed: 0,COVID Positive Ever?,Status,source_cohort,Death
pfizer00,No,Vaccinee,Pfizer_vaccine,
pfizer01,No,Vaccinee,Pfizer_vaccine,
pfizer02,No,Vaccinee,Pfizer_vaccine,
pfizer03,No,Vaccinee,Pfizer_vaccine,
pfizer04,No,Vaccinee,Pfizer_vaccine,
...,...,...,...,...
84,,ICU,infection_cohort1,0.0
87,,ICU,infection_cohort1,0.0
101,,Admit,infection_cohort1,0.0
6,,ICU,infection_cohort1,1.0


In [12]:
adatas.keys()

dict_keys(['Pfizer_vaccine', 'infection_cohort1'])

In [13]:
# Variables whose names appear in both cohorts.
var_names_in_common = adatas["infection_cohort1"].var_names.intersection(
    adatas["Pfizer_vaccine"].var_names
)
len(var_names_in_common), var_names_in_common

(135,
 Index(['Wuhan_S_Wuhan_IgG_day 0 / pre-pandemic',
        'NL63_S_Wuhan_IgG_day 0 / pre-pandemic',
        'Wuhan_N_Wuhan_IgG_day 0 / pre-pandemic',
        'CoV1_S_Wuhan_IgG_day 0 / pre-pandemic',
        'Wuhan_NTD_Wuhan_IgG_day 0 / pre-pandemic',
        'HKU1_S_Wuhan_IgG_day 0 / pre-pandemic',
        'OC43_S_Wuhan_IgG_day 0 / pre-pandemic',
        'h229E_S_Wuhan_IgG_day 0 / pre-pandemic',
        'Wuhan_RBD_Wuhan_IgG_day 0 / pre-pandemic',
        'Wuhan_S_Wuhan_IgM_day 0 / pre-pandemic',
        ...
        'Wuhan_RBD_Wuhan_IgM_day 42 / weeks 5&6',
        'Wuhan_S_Wuhan_IgA_day 42 / weeks 5&6',
        'NL63_S_Wuhan_IgA_day 42 / weeks 5&6',
        'Wuhan_N_Wuhan_IgA_day 42 / weeks 5&6',
        'CoV1_S_Wuhan_IgA_day 42 / weeks 5&6',
        'Wuhan_NTD_Wuhan_IgA_day 42 / weeks 5&6',
        'HKU1_S_Wuhan_IgA_day 42 / weeks 5&6',
        'OC43_S_Wuhan_IgA_day 42 / weeks 5&6',
        'h229E_S_Wuhan_IgA_day 42 / weeks 5&6',
        'Wuhan_RBD_Wuhan_IgA_day 42 / weeks 5&6'],

In [14]:
# Variables present in infection_cohort1 but not in Pfizer_vaccine.
# NOTE(review): Index.difference is one-directional, so variables that exist
# only in Pfizer_vaccine are not listed here.
var_names_different = adatas["infection_cohort1"].var_names.difference(
    adatas["Pfizer_vaccine"].var_names
)
len(var_names_different), var_names_different

(27,
 Index(['CoV1_S_Wuhan_IgA_week 7 and later / 3 months',
        'CoV1_S_Wuhan_IgG_week 7 and later / 3 months',
        'CoV1_S_Wuhan_IgM_week 7 and later / 3 months',
        'HKU1_S_Wuhan_IgA_week 7 and later / 3 months',
        'HKU1_S_Wuhan_IgG_week 7 and later / 3 months',
        'HKU1_S_Wuhan_IgM_week 7 and later / 3 months',
        'NL63_S_Wuhan_IgA_week 7 and later / 3 months',
        'NL63_S_Wuhan_IgG_week 7 and later / 3 months',
        'NL63_S_Wuhan_IgM_week 7 and later / 3 months',
        'OC43_S_Wuhan_IgA_week 7 and later / 3 months',
        'OC43_S_Wuhan_IgG_week 7 and later / 3 months',
        'OC43_S_Wuhan_IgM_week 7 and later / 3 months',
        'Wuhan_NTD_Wuhan_IgA_week 7 and later / 3 months',
        'Wuhan_NTD_Wuhan_IgG_week 7 and later / 3 months',
        'Wuhan_NTD_Wuhan_IgM_week 7 and later / 3 months',
        'Wuhan_N_Wuhan_IgA_week 7 and later / 3 months',
        'Wuhan_N_Wuhan_IgG_week 7 and later / 3 months',
        'Wuhan_N_Wuhan_IgM_week 

In [15]:
adata_full

AnnData object with n_obs × n_vars = 158 × 189
    obs: 'COVID Positive Ever?', 'Status', 'source_cohort', 'Death'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'timepoint'

In [16]:
# subset to columns in common
adata_full = adata_full[:, var_names_in_common].copy()
adata_full

AnnData object with n_obs × n_vars = 158 × 135
    obs: 'COVID Positive Ever?', 'Status', 'source_cohort', 'Death'
    var: 'virus', 'target', 'variant_plate_type', 'antibody', 'timepoint'

In [17]:
assert not adata_full.obs_names.duplicated().any()

In [18]:
assert not adata_full.obs["Status"].isna().any()

In [19]:
adata_full.var["timepoint"].unique().tolist()

['day 0 / pre-pandemic',
 'day 7 / week 1',
 'day 21 / weeks 2&3',
 'day 28 / week 4',
 'day 42 / weeks 5&6']

# Expand granularity of Status obs column, and add any other hue columns

In [20]:
adata_full.obs["Status"].value_counts()

Vaccinee      59
Admit         40
ICU           35
Outpatient    24
Name: Status, dtype: int64

In [21]:
adata_full.obs["Exposure"] = adata_full.obs["Status"].copy()

In [22]:
# Vaccinees whose "COVID Positive Ever?" answer is recorded and is anything
# other than "No" get their own Exposure label.
adata_full.obs.loc[
    adata_full.obs["COVID Positive Ever?"].notna()
    & (adata_full.obs["COVID Positive Ever?"] != "No")
    & (adata_full.obs["Exposure"] == "Vaccinee"),
    "Exposure",
] = "Vaccinee (CoV2+)"
adata_full.obs["Exposure"].value_counts()

Vaccinee            55
Admit               40
ICU                 35
Outpatient          24
Vaccinee (CoV2+)     4
Name: Exposure, dtype: int64

In [23]:
# Relabel the infection cohort statuses with a "Wuhan Infection - " prefix.
adata_full.obs["Exposure"] = adata_full.obs["Exposure"].replace(
    {
        "Admit": "Wuhan Infection - Admit",
        "ICU": "Wuhan Infection - ICU",
        "Outpatient": "Wuhan Infection - Outpatient",
    }
)
adata_full.obs["Exposure"].value_counts()

Vaccinee                        55
Wuhan Infection - Admit         40
Wuhan Infection - ICU           35
Wuhan Infection - Outpatient    24
Vaccinee (CoV2+)                 4
Name: Exposure, dtype: int64

In [24]:
# Relabel the vaccinee groups as "Pfizer-Pfizer (Stanford)", keeping the CoV2+ distinction.
adata_full.obs["Exposure"] = adata_full.obs["Exposure"].replace(
    {
        "Vaccinee": "Pfizer-Pfizer (Stanford)",
        "Vaccinee (CoV2+)": "Pfizer-Pfizer (Stanford), CoV2+",
    }
)
adata_full.obs["Exposure"].value_counts()

Pfizer-Pfizer (Stanford)           55
Wuhan Infection - Admit            40
Wuhan Infection - ICU              35
Wuhan Infection - Outpatient       24
Pfizer-Pfizer (Stanford), CoV2+     4
Name: Exposure, dtype: int64

In [25]:
# Collapse the three Wuhan infection labels into a single "Infection" group;
# every other Exposure label is carried over unchanged.
patient_types = [
    "Wuhan Infection - Admit",
    "Wuhan Infection - ICU",
    "Wuhan Infection - Outpatient",
]
adata_full.obs["Exposure Type"] = adata_full.obs["Exposure"].replace(
    dict.fromkeys(patient_types, "Infection")
)
adata_full.obs["Exposure Type"].value_counts()

Infection                          99
Pfizer-Pfizer (Stanford)           55
Pfizer-Pfizer (Stanford), CoV2+     4
Name: Exposure Type, dtype: int64

# Export

In [26]:
# Write the merged AnnData (common variables only, with the added Exposure columns)
# to the generated data directory.
adata_full.write(f"{config.paths.generated_data_dir}/coronavirus_plate_only.subset.h5")

... storing 'COVID Positive Ever?' as categorical


... storing 'Status' as categorical


... storing 'source_cohort' as categorical


... storing 'Exposure' as categorical


... storing 'Exposure Type' as categorical


... storing 'virus' as categorical


... storing 'target' as categorical


... storing 'variant_plate_type' as categorical


... storing 'antibody' as categorical


... storing 'timepoint' as categorical
