In [1]:
import sys,os
# Input dataset read later in the notebook (machine-specific absolute path).
adfile_path = r"C:\Users\21lyb\Downloads\PBMC_simulated_cnas_041025.h5ad"

# Point this process at the local R installation (Windows-specific paths;
# presumably needed by rpy2 / the R-backed CopyKAT call -- confirm).
os.environ['R_HOME'] = r"C:\Program Files\R\R-4.1.1"
r_bin_dir = r"C:\Program Files\R\R-4.1.1\bin"

# Append R's bin directory only once, so re-running this cell does not keep
# growing PATH with duplicate entries.
if r_bin_dir not in os.environ['PATH'].split(';'):
    os.environ['PATH'] += ";" + r_bin_dir

print(os.environ['R_HOME'])
print(os.environ['PATH'])

C:\Program Files\R\R-4.1.1
c:\Users\21lyb\Documents\CSCB-2025-Final\.conda;c:\Users\21lyb\Documents\CSCB-2025-Final\.conda\Library\mingw-w64\bin;c:\Users\21lyb\Documents\CSCB-2025-Final\.conda\Library\usr\bin;c:\Users\21lyb\Documents\CSCB-2025-Final\.conda\Library\bin;c:\Users\21lyb\Documents\CSCB-2025-Final\.conda\Scripts;c:\Users\21lyb\Documents\CSCB-2025-Final\.conda\bin;C:\Users\21lyb\miniconda3\condabin;C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64\libfabric\bin;C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64\libfabric\bin\utils;C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64\bin;C:\Program Files (x86)\Common Files\Intel\Shared Libraries\ia32;C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64;C:\Program Files (x86)\Common Files\Intel\Shared Libraries;C:\Program Files\PerkinElmerInformatics\ChemOffice\ChemScript\Lib;C:\Program Files (x86)\Common Files\Oracle\Java\javapath;C:\Program Files\Java\jdk-14.0.2\bin

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import infercnvpy as cnv
import matplotlib.pyplot as plt
from biomart import BiomartServer
from io import StringIO
import anndata as ad
import rpy2
import datetime
import warnings

# Silence every warning category from here on. Note that this also hides
# deprecation and convergence notices raised by the analysis libraries.
warnings.simplefilter("ignore")

# Default figure size for scanpy plots.
sc.settings.set_figure_params(figsize=(5, 5))

# Print package versions so the recorded output documents the environment.
sc.logging.print_header()

Package,Version
Component,Info
numpy,1.26.4
pandas,2.2.3
scanpy,1.11.1
infercnvpy,0.6.0
matplotlib,3.10.0
biomart,0.9.2
anndata,0.11.4
rpy2,3.5.17
Python,"3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]"
OS,Windows-10-10.0.26100-SP0

Dependency,Version
igraph,0.11.8
jupyter_core,5.7.2
colorama,0.4.6
joblib,1.4.2
pywin32,304
jaraco.text,3.12.1
jupyter_client,8.6.3
asttokens,3.0.0
texttable,1.7.0
threadpoolctl,3.6.0


In [3]:
def fetch_positions(adata):
    """Fill in missing gene coordinates in ``adata.var`` from Ensembl BioMart.

    Genes whose ``start`` or ``end`` is NaN are looked up on the GRCh37
    BioMart server using the Ensembl IDs in ``adata.var['gene_ids']``.
    Genes that BioMart does not return are dropped from the result.

    Parameters
    ----------
    adata
        AnnData whose ``var`` has ``gene_ids``, ``start`` and ``end`` columns.

    Returns
    -------
    A new AnnData holding the genes that already had positions followed by
    the genes whose positions were fetched (sorted by Ensembl ID), with the
    original ``obs`` copied over.

    Raises
    ------
    RuntimeError
        If the BioMart request does not come back with HTTP status 200.
    """
    # Split genes by whether their coordinates are already known
    missing = adata.var[['start', 'end']].isna().any(axis=1)
    no_positions = adata[:, missing]
    with_positions = adata[:, ~missing]

    # Nothing to look up: skip the network round trip entirely
    if not missing.any():
        return with_positions.copy()

    # Connect to Ensembl Biomart server
    server = BiomartServer("http://grch37.ensembl.org/biomart")
    dataset = server.datasets['hsapiens_gene_ensembl']

    # Query only the genes with missing positions
    response = dataset.search({
        'filters':{'ensembl_gene_id':list(no_positions.var['gene_ids'])},
        'attributes':['ensembl_gene_id','chromosome_name','start_position','end_position','strand']
    })

    # Fail loudly instead of continuing with an undefined annotation table
    if response.status_code != 200:
        raise RuntimeError(
            f"Request failed with status code: {response.status_code}\n{response.text}"
        )
    print("Request successful!")

    # An empty body means BioMart knew none of the requested genes
    if not response.text.strip():
        return with_positions.copy()

    gene_annotations_df = pd.read_csv(StringIO(response.text),sep='\t',header=None)
    gene_annotations_df.columns = ['gene_ids','chromosome','start','end','strand']
    # Keep chromosome names as strings so numeric and X/Y/MT names share one dtype
    gene_annotations_df['chromosome'] = gene_annotations_df['chromosome'].astype(str)

    # One row per gene, keyed by Ensembl ID, so values can be aligned by ID
    # rather than by the (arbitrary) row order of the response
    annotations = gene_annotations_df.drop_duplicates(subset='gene_ids').set_index('gene_ids')

    # Isolate fetched genes from BioMart in no_positions adata
    fetched_positions = no_positions[:, no_positions.var['gene_ids'].isin(annotations.index)].copy()

    # Sort fetched genes based on ensembl gene IDs
    gene_order = fetched_positions.var['gene_ids'].argsort().to_numpy()
    fetched_positions = fetched_positions[:, gene_order].copy()

    # Add the fetched gene positions, matched to each gene by its Ensembl ID
    aligned = annotations.reindex(fetched_positions.var['gene_ids'].to_numpy())
    for column in ('chromosome', 'start', 'end', 'strand'):
        fetched_positions.var[column] = aligned[column].values

    # Concatenate fetched genes with isolated genes already with positions
    adClean = ad.concat([with_positions, fetched_positions], axis=1)

    # Include obs into the cleaned adata
    adClean.obs = with_positions.obs.copy()

    return adClean

def standardize_chromosomes(adata):
    """Return a copy of ``adata`` restricted to genes on standard chromosomes.

    Chromosome names in ``adata.var['chromosome']`` are normalised to carry a
    ``chr`` prefix (names that already start with ``chr`` are left alone, so
    the function can safely be applied more than once). Only genes on
    chr1-chr22, chrX, chrY and chrMT are kept. The input is not modified.
    """
    adata1 = adata.copy()

    # Add 'chr' prefix to chromosome names that do not have it yet
    chromosome_names = adata1.var['chromosome'].astype(str)
    already_prefixed = chromosome_names.str.startswith('chr')
    adata1.var['chromosome'] = chromosome_names.where(already_prefixed, 'chr' + chromosome_names)

    # Define standard chromosome names with 'chr' prefix
    standard_chromosomes = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY', 'chrMT']

    # Filter adata to include only genes on standard chromosomes
    adata1 = adata1[:, adata1.var['chromosome'].isin(standard_chromosomes)].copy()

    return adata1

def qc(adata, mt_threshold_pct=20, min_genes=500, max_counts=30000, min_cells=3):
    """Apply basic cell and gene quality filters and return the filtered copy.

    Steps, in order:
      1. flag genes whose name starts with 'MT-' and compute QC metrics;
      2. keep cells with ``pct_counts_mt`` below ``mt_threshold_pct``;
      3. keep cells expressing at least ``min_genes`` genes;
      4. keep cells with at most ``max_counts`` total counts;
      5. keep genes expressed in at least ``min_cells`` cells.

    The input ``adata`` is copied first and is not modified.
    """
    annotated = adata.copy()

    # Mitochondrial genes are identified purely by their 'MT-' name prefix
    annotated.var['mt'] = annotated.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(
        annotated,
        qc_vars=['mt'],
        percent_top=None,
        log1p=False,
        inplace=True,
    )

    # Cell filter 1: mitochondrial fraction
    low_mt_cells = annotated.obs['pct_counts_mt'] < mt_threshold_pct
    filtered = annotated[low_mt_cells, :].copy()

    # Cell filters 2 and 3: minimum detected genes, maximum total counts
    sc.pp.filter_cells(filtered, min_genes=min_genes)
    sc.pp.filter_cells(filtered, max_counts=max_counts)

    # Gene filter: drop genes seen in too few cells
    sc.pp.filter_genes(filtered, min_cells=min_cells)

    return filtered

def preprocess(adata,
               min_mean=0.0125,
               max_mean=6,
               min_disp=0.25):
    """Return a library-size-normalised copy of ``adata``.

    The raw matrix is preserved in ``layers['counts']`` of the returned
    object and ``X`` is scaled to 10,000 total counts per cell. The input
    ``adata`` is left untouched.

    ``min_mean``, ``max_mean`` and ``min_disp`` are accepted for the
    highly-variable-gene step, which is currently disabled, so they have no
    effect at the moment.
    """
    adNorm = adata.copy()

    # Keep the raw counts, then normalise the copy (not the caller's object)
    adNorm.layers['counts'] = adNorm.X.copy()
    sc.pp.normalize_total(adNorm, target_sum=1e4)
    # sc.pp.log1p(adNorm)

    # Detect highly variable genes
    # sc.pp.highly_variable_genes(adNorm, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)

    # PCA
    # sc.tl.pca(adNorm, use_highly_variable=True)
    # sc.pl.pca_variance_ratio(adNorm,50)

    return adNorm

def KNN(adata, n_neighbors=20, n_pcs=10, annotation="cell_type"):
    """Build a neighbour graph, compute a UMAP and plot it coloured by ``annotation``.

    Calls scanpy's ``pp.neighbors`` (with ``n_neighbors`` and ``n_pcs``),
    ``tl.umap`` and ``pl.umap`` on ``adata`` in that order. Nothing is
    returned; the scanpy calls are expected to store their results on
    ``adata`` itself.
    """
    sc.pp.neighbors(adata,n_neighbors=n_neighbors,n_pcs=n_pcs)
    sc.tl.umap(adata)
    sc.pl.umap(adata, color=annotation)

def downsample(adata, n_cells, seed=None):
    """Randomly keep ``n_cells`` cells of ``adata`` (sampling without replacement).

    Parameters
    ----------
    adata
        Object exposing ``n_obs``, ``obs_names`` and ``[cells, :]`` indexing.
    n_cells
        Number of cells to keep.
    seed
        Optional seed for a reproducible draw. When ``None`` (the default)
        NumPy's global random state is used, exactly as before.

    Returns
    -------
    ``adata`` itself (not a copy) when it already has ``n_cells`` cells or
    fewer; otherwise a copy containing the sampled cells in sampled order.
    """
    if adata.n_obs <= n_cells:
        return adata

    if seed is None:
        # Legacy behaviour: draw from NumPy's global RNG
        sampled_names = np.random.choice(adata.obs_names, n_cells, replace=False)
    else:
        # Dedicated generator so the draw does not depend on global state
        sampled_names = np.random.default_rng(seed).choice(adata.obs_names, n_cells, replace=False)

    return adata[sampled_names, :].copy()

In [31]:
# Load the AnnData object from the .h5ad file configured in adfile_path
adata = sc.read_h5ad(adfile_path)

In [32]:
# adata = fetch_positions(adata)

In [33]:
# adata = standardize_chromosomes(adata)

In [34]:
# Run if needed: filter cells/genes, then subsample to at most 1000 cells.
adata = qc(adata)
# Seed NumPy's global RNG first: downsample() picks cells at random, and an
# unseeded re-run of this cell would select different cells than the ones any
# earlier downstream result (e.g. a saved prediction file) was computed on.
np.random.seed(0)
adata = downsample(adata,n_cells=1000)

In [35]:
# Run the preprocessing step and continue with the AnnData it returns
adata = preprocess(adata)

In [36]:
# KNN(adata,n_pcs=10)

In [37]:
# Keep an independent snapshot. A plain assignment would only create a second
# name for the same object, so later edits to adata (e.g. new obs columns)
# would show up in the "backup" as well.
adata_backup = adata.copy()

In [None]:
# Timestamped prefix passed as s_name, so each run's CopyKAT files get a unique name
file_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Feed CopyKAT the raw counts when they are available as a 'counts' layer;
# otherwise fall back to adata.X (layer=None), which is what was used before.
copykat_layer = 'counts' if 'counts' in adata.layers else None
cnv.tl.copykat(adata,gene_ids='S',organism='human',s_name=file_prefix,layer=copykat_layer)

In [None]:
# Load the tab-separated per-cell prediction table named after this run's
# file_prefix (presumably written by the CopyKAT call above -- confirm).
copykat_prediction = pd.read_csv(file_prefix+'_copykat_prediction.txt',sep='\t')
copykat_prediction

Unnamed: 0,cell.names,copykat.pred
0,CCATAAGGTGGAAATT-1,aneuploid
1,TAACGACCATGCGGTC-1,diploid
2,TGTAGACTCCCATTCG-1,aneuploid
3,GCGAGAACATCCGTTC-1,aneuploid
4,CGCATAATCGCCCAGA-1,aneuploid
...,...,...
995,GTGGAGACATCCGGCA-1,not.defined
996,GTCACGGGTTCGGGTC-1,not.defined
997,TCACATTTCGTAGCCG-1,not.defined
998,TAGCACAAGTACAACA-1,not.defined


In [41]:
# Inspect the per-cell metadata table
adata.obs

Unnamed: 0,n_genes_by_counts,total_counts,total_counts_ribo,pct_counts_ribo,total_counts_mt,pct_counts_mt,n_genes,n_counts,cell_type,simulated_cnvs
ACTTCGCGTGTCCGTG-1,1262,2404.0,1637.0,36.410141,0.0,0.0,1262,2404.0,B cell,
CCTCAACCAGTAGATA-1,1788,3818.0,3175.0,42.086426,0.0,0.0,1788,3818.0,CD4 T cell,X:106533974-112956833 (CN 4)
GTCTACCTCTGTGCGG-1,721,1022.0,362.0,25.156359,0.0,0.0,721,1022.0,CD4 T cell,X:106533974-112956833 (CN 4)
AGGTCATAGAATCCCT-1,1162,2449.0,392.0,12.128713,0.0,0.0,1162,2449.0,CD14 monocyte,
ATCATTCAGGGAACAA-1,2351,5543.0,1270.0,17.269512,0.0,0.0,2351,5543.0,CD14 monocyte,6:25435484-35035259 (CN 1)
...,...,...,...,...,...,...,...,...,...,...
CATACCCCATAGCTGT-1,2943,8206.0,1726.0,16.002224,0.0,0.0,2943,8206.0,CD14 monocyte,22:19807132-29743868 (CN 0)
AGGTCATTCCGATGTA-1,1746,3647.0,3711.0,44.210152,0.0,0.0,1746,3647.0,CD4 T cell,
TGCACGGAGCCTGCCA-1,1905,3959.0,2579.0,35.611710,0.0,0.0,1905,3959.0,CD4 T cell,
AGATCCAAGGCAATGC-1,2541,6046.0,1046.0,12.318926,0.0,0.0,2541,6046.0,CD14 monocyte,6:25435484-35035259 (CN 1)


In [42]:
# Attach CopyKAT's per-cell call to obs, matched by cell barcode. Cells that do
# not appear in the prediction table get NaN.
prediction_by_cell = copykat_prediction.set_index('cell.names')['copykat.pred']
adata.obs['copykat.pred'] = adata.obs.index.map(prediction_by_cell)

# Binary call: 'cnv' for aneuploid cells, NaN for everything else. Note that
# "everything else" includes 'diploid', 'not.defined' and cells with no
# prediction at all, so those are all treated as "no CNV" downstream.
adata.obs['cnv'] = adata.obs['copykat.pred'].map({'aneuploid': 'cnv'})

adata.obs

Unnamed: 0,n_genes_by_counts,total_counts,total_counts_ribo,pct_counts_ribo,total_counts_mt,pct_counts_mt,n_genes,n_counts,cell_type,simulated_cnvs,copykat.pred,cnv
ACTTCGCGTGTCCGTG-1,1262,2404.0,1637.0,36.410141,0.0,0.0,1262,2404.0,B cell,,,
CCTCAACCAGTAGATA-1,1788,3818.0,3175.0,42.086426,0.0,0.0,1788,3818.0,CD4 T cell,X:106533974-112956833 (CN 4),,
GTCTACCTCTGTGCGG-1,721,1022.0,362.0,25.156359,0.0,0.0,721,1022.0,CD4 T cell,X:106533974-112956833 (CN 4),,
AGGTCATAGAATCCCT-1,1162,2449.0,392.0,12.128713,0.0,0.0,1162,2449.0,CD14 monocyte,,,
ATCATTCAGGGAACAA-1,2351,5543.0,1270.0,17.269512,0.0,0.0,2351,5543.0,CD14 monocyte,6:25435484-35035259 (CN 1),,
...,...,...,...,...,...,...,...,...,...,...,...,...
CATACCCCATAGCTGT-1,2943,8206.0,1726.0,16.002224,0.0,0.0,2943,8206.0,CD14 monocyte,22:19807132-29743868 (CN 0),,
AGGTCATTCCGATGTA-1,1746,3647.0,3711.0,44.210152,0.0,0.0,1746,3647.0,CD4 T cell,,,
TGCACGGAGCCTGCCA-1,1905,3959.0,2579.0,35.611710,0.0,0.0,1905,3959.0,CD4 T cell,,,
AGATCCAAGGCAATGC-1,2541,6046.0,1046.0,12.318926,0.0,0.0,2541,6046.0,CD14 monocyte,6:25435484-35035259 (CN 1),,


In [51]:
# False positives
fp = adata[adata.obs['simulated_cnvs']==''].obs[adata[adata.obs['simulated_cnvs']==''].obs['cnv']=='cnv']
print(f'False positives: {len(fp)/adata.shape[0]}')

False positives: 0.045


In [50]:
# False negatives
fn = adata[adata.obs['simulated_cnvs']!=''].obs[adata[adata.obs['simulated_cnvs']!=''].obs['cnv']!='cnv']
print(f'False negatives: {len(fn)/adata.shape[0]}')

False negatives: 0.398
