## Compile Polypharmacology LSA results

In [1]:
import pathlib
import pandas as pd

In [14]:
assays = ["L1000", "cell-painting"]
models = ["beta", "vanilla", "mmd"]

analysis_dir = "3.application"


def _parse_lsa_filename(file_name, assay):
    """Extract the data level and model name encoded in an LSA results file name.

    The file name is split on underscores. For the "L1000" assay the data
    level is fixed to "level5" and the model is read from the third token;
    for any other assay the third token is the data level and the fourth
    token is the model. A trailing ".tsv" is stripped from whichever token
    is used. When the model token is absent, the model defaults to "beta".

    Parameters
    ----------
    file_name : str
        Base name of the results file (no directory component).
    assay : str
        Assay the file belongs to, e.g. "L1000" or "cell-painting".

    Returns
    -------
    tuple of (str, str)
        The ``(data_level, model)`` pair.

    Raises
    ------
    IndexError
        If a non-L1000 file name has fewer than three underscore-separated
        tokens, i.e. no data level can be read.
    """
    file_info = file_name.split("_")
    if assay == "L1000":
        data_level = "level5"
        model_position = 2
    else:
        data_level = file_info[2].replace(".tsv", "")
        model_position = 3

    if len(file_info) > model_position:
        model = file_info[model_position].replace(".tsv", "")
    else:
        # No model token in the file name: fall back to the default model
        model = "beta"
    return data_level, model


# Collect one melted data frame per results file; concatenated after the loop
lsa_melt_dfs = []
for assay in assays:
    data_dir = pathlib.Path(f"../{assay}/{analysis_dir}")
    # iterdir() yields entries in arbitrary order, so sort the matches to keep
    # the row order of the compiled output reproducible across machines
    lsa_results_files = sorted(x for x in data_dir.iterdir() if "_general" in x.name)
    for lsa_results_file in lsa_results_files:

        # Extract info from filename
        data_level, model = _parse_lsa_filename(lsa_results_file.name, assay)

        # Read data and process
        lsa_df = pd.read_csv(lsa_results_file, index_col=0, sep="\t")

        # Wide to long: one row per (column, value) pair
        lsa_melt_df = lsa_df.melt(var_name="input_data_type_full", value_name="dist")

        # Split the space-separated column label into its two parts; reuse the
        # melted frame's index so the column-wise concat below aligns row by row
        id_df = pd.DataFrame(
            lsa_melt_df.input_data_type_full.str.split(" ").tolist(),
            columns=["input_data_type", "shuffled"],
            index=lsa_melt_df.index,
        )

        lsa_melt_df = (
            pd.concat([lsa_melt_df, id_df], axis="columns")
            .assign(assay=assay, data_level=data_level, model=model)
        )

        # Replace the model variable with input data type for non-VAEs (it doesn't make sense otherwise)
        non_vae = lsa_melt_df.input_data_type != "VAE"
        lsa_melt_df.loc[non_vae, "model"] = lsa_melt_df.loc[non_vae, "input_data_type"]

        lsa_melt_dfs.append(lsa_melt_df)

# Stack all files, renumber the rows, and drop rows with any missing value
lsa_dfs = pd.concat(lsa_melt_dfs).reset_index(drop=True).dropna()

# Output file for downstream figure
output_file = pathlib.Path("data", "lsa_distribution_full_results.tsv.gz")
# Make sure the output directory exists so the write works on a fresh checkout
output_file.parent.mkdir(parents=True, exist_ok=True)
lsa_dfs.to_csv(output_file, sep="\t", index=False)

print(lsa_dfs.shape)
lsa_dfs.head()

(25047, 7)


Unnamed: 0,input_data_type_full,dist,input_data_type,shuffled,assay,data_level,model
0,VAE Unshuffled,30.60872,VAE,Unshuffled,L1000,level5,mmd
1,VAE Unshuffled,4.9227,VAE,Unshuffled,L1000,level5,mmd
2,VAE Unshuffled,11.219075,VAE,Unshuffled,L1000,level5,mmd
3,VAE Unshuffled,5.653036,VAE,Unshuffled,L1000,level5,mmd
4,VAE Unshuffled,5.948821,VAE,Unshuffled,L1000,level5,mmd


In [16]:
# Sanity check: number of rows per input data type
lsa_dfs["input_data_type"].value_counts()

VAE         8349
PCA         8349
Complete    8349
Name: input_data_type, dtype: int64

In [17]:
# Sanity check: number of rows per full (input data type + shuffle status) label
lsa_dfs["input_data_type_full"].value_counts()

VAE Shuffled           7590
PCA Shuffled           7590
Complete Shuffled      7590
VAE Unshuffled          759
PCA Unshuffled          759
Complete Unshuffled     759
Name: input_data_type_full, dtype: int64

In [18]:
# Sanity check: number of rows per shuffle status
lsa_dfs["shuffled"].value_counts()

Shuffled      22770
Unshuffled     2277
Name: shuffled, dtype: int64

In [19]:
# Sanity check: row counts for every data level / assay combination
pd.crosstab(index=lsa_dfs["data_level"], columns=lsa_dfs["assay"])

assay,L1000,cell-painting
data_level,Unnamed: 1_level_1,Unnamed: 2_level_1
level4,0,8316
level5,8415,8316


In [20]:
# Sanity check: number of rows per model label (non-VAE rows carry their input data type here)
lsa_dfs["model"].value_counts()

PCA         8349
Complete    8349
mmd         2783
vanilla     2783
beta        2783
Name: model, dtype: int64