## Import

In [1]:
import os
import scanpy as sc
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
sc_path = "../phenotype_classification/output_phenotypes"

## Inflamed vs excluded

In [24]:
dea_path = 'output/infl_vs_excl'

### Load

In [9]:
results = pd.read_csv(os.path.join(dea_path, "de_results.csv"))

In [29]:
filtered_results = results[(abs(results['log2FoldChange']) >= 1) & (results['padj'] < 0.05)]
genes = filtered_results['GeneSymbol'].tolist()
len(genes)

1110

In [26]:
adata = sc.read_h5ad(os.path.join(sc_path, 'sc_phenotypes.h5ad'))

In [40]:
pheno_dict = {
    'inflamed':['WARS', 'CXCL13', 'CCL5', 'GZMB', 'TRBC1', 'LCK', 'CORO1A', 'SIRPG', 'PLAC8', 'PVRIG', 'CCL18', 'IL7R', 'IL2RG', 'NKG7', 'IGHG1'],
    #'ignored':['COL5A1', 'SPON1', 'CAMK2N1', 'FAP', 'SPOCK1', 'COL1A1', 'TPSAB1', 'ASPN', 'CPE', 'MMP2', 'AKR1C2', 'SCGB2A1', 'SCGB2A2', 'TCN1'],
    'excluded':['TMSB15A', 'CALML5', 'SDC1', 'GPRC5A', 'CXADR', 'PBX1', 'IGFBP5', 'ENTPD3', 'TUFT1', 'GREM1', 'COL10A1', 'THBS2', 'PERP', 'CEACAM6']
}

### Exclude from genes_classifier the genes not present in the adata

In [41]:
for pheno, genes in pheno_dict.items():
    missing = [g for g in genes if g not in adata.var_names]

print(f"Missing genes: {missing}")

Missing genes: []


### Check the overlapping genes

In [42]:
rows = []
for pheno, gene_list in pheno_dict.items():
    common = set(gene_list).intersection(filtered_results['GeneSymbol'])

    for g in common:
        log2fc = filtered_results.loc[filtered_results['GeneSymbol'] == g, 'log2FoldChange'].values[0]
        rows.append({"GeneSymbol": g, "Phenotype": pheno, "log2FoldChange": log2fc})

common_df = pd.DataFrame(rows)

In [43]:
common_df

Unnamed: 0,GeneSymbol,Phenotype,log2FoldChange
0,LCK,inflamed,1.939752
1,NKG7,inflamed,2.520862
2,SIRPG,inflamed,2.569643
3,CCL5,inflamed,2.609743
4,GZMB,inflamed,2.260018
5,TRBC1,inflamed,2.794885
6,IL7R,inflamed,1.976177
7,SDC1,excluded,-1.24825
8,CXADR,excluded,-1.801789
9,PBX1,excluded,-1.395997


## Inflamed vs ignored

In [3]:
dea_path = 'output/infl_vs_ign'

### Load

In [4]:
results = pd.read_csv(os.path.join(dea_path, "de_results.csv"))

In [5]:
filtered_results = results[(abs(results['log2FoldChange']) >= 1) & (results['padj'] < 0.05)]
genes = filtered_results['GeneSymbol'].tolist()
len(genes)

183

In [6]:
adata = sc.read_h5ad(os.path.join(sc_path, 'sc_phenotypes.h5ad'))

In [7]:
pheno_dict = {
    'inflamed':['WARS', 'CXCL13', 'CCL5', 'GZMB', 'TRBC1', 'LCK', 'CORO1A', 'SIRPG', 'PLAC8', 'PVRIG', 'CCL18', 'IL7R', 'IL2RG', 'NKG7', 'IGHG1'],
    'ignored':['COL5A1', 'SPON1', 'CAMK2N1', 'FAP', 'SPOCK1', 'COL1A1', 'TPSAB1', 'ASPN', 'CPE', 'MMP2', 'AKR1C2', 'SCGB2A1', 'SCGB2A2', 'TCN1'],
    #'excluded':['TMSB15A', 'CALML5', 'SDC1', 'GPRC5A', 'CXADR', 'PBX1', 'IGFBP5', 'ENTPD3', 'TUFT1', 'GREM1', 'COL10A1', 'THBS2', 'PERP', 'CEACAM6']
}

### Exclude from genes_classifier the genes not present in the adata

In [8]:
for pheno, genes in pheno_dict.items():
    missing = [g for g in genes if g not in adata.var_names]

print(f"Missing genes: {missing}")

Missing genes: []


### Check the overlapping genes

In [9]:
rows = []
for pheno, gene_list in pheno_dict.items():
    common = set(gene_list).intersection(filtered_results['GeneSymbol'])

    for g in common:
        log2fc = filtered_results.loc[filtered_results['GeneSymbol'] == g, 'log2FoldChange'].values[0]
        rows.append({"GeneSymbol": g, "Phenotype": pheno, "log2FoldChange": log2fc})

common_df = pd.DataFrame(rows)

In [10]:
common_df

## Excluded vs ignored

In [11]:
dea_path = 'output/excl_vs_ign'

### Load

In [12]:
results = pd.read_csv(os.path.join(dea_path, "de_results.csv"))

In [13]:
filtered_results = results[(abs(results['log2FoldChange']) >= 1) & (results['padj'] < 0.05)]
genes = filtered_results['GeneSymbol'].tolist()
len(genes)

49

In [14]:
adata = sc.read_h5ad(os.path.join(sc_path, 'sc_phenotypes.h5ad'))

In [15]:
pheno_dict = {
    #'inflamed':['WARS', 'CXCL13', 'CCL5', 'GZMB', 'TRBC1', 'LCK', 'CORO1A', 'SIRPG', 'PLAC8', 'PVRIG', 'CCL18', 'IL7R', 'IL2RG', 'NKG7', 'IGHG1'],
    'ignored':['COL5A1', 'SPON1', 'CAMK2N1', 'FAP', 'SPOCK1', 'COL1A1', 'TPSAB1', 'ASPN', 'CPE', 'MMP2', 'AKR1C2', 'SCGB2A1', 'SCGB2A2', 'TCN1'],
    'excluded':['TMSB15A', 'CALML5', 'SDC1', 'GPRC5A', 'CXADR', 'PBX1', 'IGFBP5', 'ENTPD3', 'TUFT1', 'GREM1', 'COL10A1', 'THBS2', 'PERP', 'CEACAM6']
}

### Exclude from genes_classifier the genes not present in the adata

In [16]:
for pheno, genes in pheno_dict.items():
    missing = [g for g in genes if g not in adata.var_names]

print(f"Missing genes: {missing}")

Missing genes: []


### Check the overlapping genes

In [17]:
rows = []
for pheno, gene_list in pheno_dict.items():
    common = set(gene_list).intersection(filtered_results['GeneSymbol'])

    for g in common:
        log2fc = filtered_results.loc[filtered_results['GeneSymbol'] == g, 'log2FoldChange'].values[0]
        rows.append({"GeneSymbol": g, "Phenotype": pheno, "log2FoldChange": log2fc})

common_df = pd.DataFrame(rows)

In [18]:
common_df