In [1]:
import pandas as pd
pd.options.display.max_rows = 10

In [2]:
# URL for GeneNames file
GENE_NAMES_URL = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/non_alt_loci_set.txt"
# URL for TCR and BCR genes
TCR_GENES_URL = "https://github.com/nealpsmith/neals_python_functions/raw/refs/heads/master/neals_python_functions/analysis/db/tcr_genes.tsv"
BCR_GENES_URL = "https://github.com/nealpsmith/neals_python_functions/raw/refs/heads/master/neals_python_functions/analysis/db/bcr_genes.tsv"

# Human Gene MAP path
GENE_MAP_PATH = "gene-lists/homo_sapiens.csv"

# Excluded TCR genes
EXCLUDED_TCR_GENES = {'TRAV1-2', 'TRGV9', 'TRDV2'}
EXCLUDED_BCR_LABEL = "constant"

# File name with genes which should be filtered out
RESULTS_PATH = "gene-lists/filtered_out.csv"

In [3]:
# Download static tcr and bcr gene lists
!wget https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/non_alt_loci_set.txt -O gene-lists/non_alt_loci_set.txt
!wget https://github.com/nealpsmith/neals_python_functions/raw/refs/heads/master/neals_python_functions/analysis/db/tcr_genes.tsv -O gene-lists/tcr_genes.tsv
!wget https://github.com/nealpsmith/neals_python_functions/raw/refs/heads/master/neals_python_functions/analysis/db/bcr_genes.tsv -O gene-lists/bcr_genes.tsv

--2025-05-15 10:54:32--  https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/non_alt_loci_set.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.18.27, 142.250.185.187, 142.250.185.91, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.18.27|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16568000 (16M) [text/plain]
Saving to: ‘gene-lists/non_alt_loci_set.txt’


2025-05-15 10:54:34 (10.2 MB/s) - ‘gene-lists/non_alt_loci_set.txt’ saved [16568000/16568000]

--2025-05-15 10:54:34--  https://github.com/nealpsmith/neals_python_functions/raw/refs/heads/master/neals_python_functions/analysis/db/tcr_genes.tsv
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/nealpsmith/neals_python_functions/refs/heads/master/neals_python_functions/analysis/db/tcr_g

In [4]:
genenames = pd.read_csv("gene-lists/non_alt_loci_set.txt", sep="\t").fillna("")
genenames.set_index("ensembl_gene_id", inplace=True)
genenames.head(3)

  genenames = pd.read_csv("gene-lists/non_alt_loci_set.txt", sep="\t").fillna("")


Unnamed: 0_level_0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000121410,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
ENSG00000268895,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,...,,,,,URS00007E4F6E,A1BG-AS1,,HGNC:37133,,
ENSG00000148584,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,...,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,


In [5]:
# All pseudogenes (subset by “Locus Type == pseudogene”
pseudogenes = genenames.loc[genenames.locus_type.str.contains("pseudogene")]
pseudogenes.head(3)

Unnamed: 0_level_0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000256069,HGNC:8,A2MP1,alpha-2-macroglobulin pseudogene 1,pseudogene,pseudogene,Approved,12p13.31,12p13.31,,,...,,,,,,,,HGNC:8,,
ENSG00000250420,HGNC:18226,AACSP1,acetoacetyl-CoA synthetase pseudogene 1,pseudogene,pseudogene,Approved,5q35.3,05q35.3,,,...,,,,,,,,HGNC:18226,,
ENSG00000240602,HGNC:50305,AADACP1,arylacetamide deacetylase pseudogene 1,pseudogene,pseudogene,Approved,3q25.1,03q25.1,,,...,,,,,,,,HGNC:50305,,


In [6]:
# All genes where the “Gene group name” contains the word “mitochondria”
mitochondrial = genenames.loc[genenames.gene_group.str.contains(r"\b[Mm]itochondria"), "symbol"]
mitochondrial

ensembl_gene_id
ENSG00000177646      ACAD9
ENSG00000164896      FASTK
ENSG00000138399    FASTKD1
ENSG00000118246    FASTKD2
ENSG00000124279    FASTKD3
                    ...   
ENSG00000179091       CYC1
ENSG00000132676       DAP3
ENSG00000137038      DMAC1
ENSG00000105341      DMAC2
ENSG00000130159      ECSIT
Name: symbol, Length: 346, dtype: object

In [7]:
# All the pseudogenes associated with the genes selected in the Mitochondrial genes section
mitochondrial_pseudo = pd.Series()
for name in mitochondrial.values:
    df = pseudogenes.loc[pseudogenes.name.str.contains(name), "symbol"]
    mitochondrial_pseudo = pd.concat((mitochondrial_pseudo, df), ignore_index=True)
mitochondrial_pseudo.drop(columns=0, inplace=True)
mitochondrial_pseudo

0       SFXN4P1
1       GLRX5P1
2       GLRX5P3
3      HIGD2AP1
4      HIGD2AP2
         ...   
632     COX16P1
633     COX17P1
634     COX20P1
635     COX20P2
636     DMAC1P1
Length: 637, dtype: object

In [8]:
# All genes where the “Locus type” contains the word “ribosomal”
ribosomal = genenames.loc[
    genenames.gene_group.str.contains("ribosomal|Ribosomal") |
    genenames.symbol.isin(["RPL17-C18orf32", "RPL17P52", "RPL36A-HNRNPH2", "RPS10-NUDT3"]),  # additional ribosomal genes missed by above filter
    ["symbol", "name"]
]
ribosomal["symbol"].head()

ensembl_gene_id
ENSG00000085231           AK6
ENSG00000149806           FAU
ENSG00000179271    GADD45GIP1
ENSG00000001497         LAS1L
ENSG00000169288         MRPL1
Name: symbol, dtype: object

In [9]:
# All pseudogenes (subset by “Locus Type == pseudogene”) that contain the word “ribosomal” in the “Approved name” column
ribosomal_pseudo = pseudogenes.loc[
    pseudogenes["name"].str.contains(
        "|".join(ribosomal["name"].tolist())
    ) |
    pseudogenes["name"].str.contains(
        "|".join(ribosomal["symbol"].tolist())
    ) |
    pseudogenes["name"].str.contains("ribosomal"),
    "symbol"
]

ribosomal = ribosomal["symbol"]  # name column is not needed anymore

ribosomal_pseudo

  pseudogenes["name"].str.contains(


ensembl_gene_id
ENSG00000256614      AK6P1
ENSG00000257173      AK6P2
ENSG00000235297      FAUP1
ENSG00000237806      FAUP2
                     FAUP3
                    ...   
ENSG00000244329    UBA52P9
ENSG00000225082     DAP3P1
ENSG00000228507     DAP3P2
ENSG00000259404     EFL1P1
ENSG00000248998     EFL1P2
Name: symbol, Length: 2692, dtype: object

In [10]:
# All genes where the “Locus type” contains “RNA, long non-coding”
incrna = genenames.loc[genenames.locus_type.str.contains("RNA, long non-coding"), "symbol"]
incrna

ensembl_gene_id
ENSG00000268895       A1BG-AS1
ENSG00000245105        A2M-AS1
ENSG00000256661      A2ML1-AS1
ENSG00000256904      A2ML1-AS2
ENSG00000242908    AADACL2-AS1
                      ...     
ENSG00000234567       EYA4-AS2
ENSG00000233893        EZR-AS1
ENSG00000231882        F10-AS1
ENSG00000283828        F10-AS2
ENSG00000251165        F11-AS1
Name: symbol, Length: 6000, dtype: object

In [11]:
# All the pseudogenes associated with the genes selected in the IncRNA genes section
incrna_pseudo = pd.Series()
for name in incrna.values:
    df = pseudogenes.loc[pseudogenes.name.str.contains(name), "symbol"]
    incrna_pseudo = pd.concat((incrna_pseudo, df), ignore_index=True)
incrna_pseudo.drop(columns=0, inplace=True)
incrna_pseudo

0       SERBP1P1
1       SERBP1P2
2       SERBP1P3
3       SERBP1P4
4       SERBP1P5
         ...    
295    RN7SKP295
296    RN7SKP296
297    RN7SKP297
298    RN7SKP298
299    RN7SKP299
Length: 300, dtype: object

In [12]:
tcr_genes = pd.read_csv("gene-lists/tcr_genes.tsv", sep="\t")
tcr_genes.set_index("Ensembl gene ID", inplace=True)
tcr_genes = tcr_genes["Approved symbol"]
tcr_genes = tcr_genes.loc[~tcr_genes.isin(EXCLUDED_TCR_GENES)]
tcr_genes

Ensembl gene ID
NaN                    TRA
ENSG00000211782    TRAV8-1
ENSG00000211844     TRAJ45
ENSG00000211843     TRAJ46
ENSG00000211842     TRAJ47
                    ...   
ENSG00000211698      TRGV4
ENSG00000211697      TRGV5
ENSG00000228668     TRGV5P
ENSG00000226212      TRGV6
ENSG00000249978      TRGV7
Name: Approved symbol, Length: 244, dtype: object

In [13]:
bcr_genes = pd.read_csv("gene-lists/bcr_genes.tsv", sep="\t")
bcr_genes.set_index("Ensembl gene ID", inplace=True)
bcr_genes = bcr_genes["Approved symbol"]
bcr_genes

Ensembl gene ID
ENSG00000271336    IGHD1OR15-1A
ENSG00000270185    IGHD1OR15-1B
ENSG00000282599    IGHD2OR15-2A
ENSG00000282268    IGHD2OR15-2B
ENSG00000282520    IGHD3OR15-3A
                       ...     
ENSG00000254029           IGLC4
ENSG00000254030           IGLC5
ENSG00000222037           IGLC6
ENSG00000211685           IGLC7
ENSG00000253823        IGLV1-62
Name: Approved symbol, Length: 431, dtype: object

In [14]:
cap_genes = pd.read_csv(GENE_MAP_PATH, index_col=0)
cap_genes.head()

Unnamed: 0_level_0,HGNC_symbol,a,b,HGNC_symbol_unique
ENSEMBL_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000290825,DDX11L2,1,1657,DDX11L2-290825
ENSG00000223972,DDX11L1,6,632,DDX11L1-223972
ENSG00000227232,WASH7P,5,1351,WASH7P-227232
ENSG00000278267,MIR6859-1,1,68,MIR6859-1-278267
ENSG00000243485,MIR1302-2HG,5,1021,MIR1302-2HG-243485


In [15]:
# Filter unknown names as well
unknown = cap_genes.loc[cap_genes.HGNC_symbol.str.startswith("ENSG"), "HGNC_symbol"]
unknown

ENSEMBL_gene
ENSG00000290826    ENSG00000290826.1
ENSG00000238009    ENSG00000238009.6
ENSG00000239945    ENSG00000239945.1
ENSG00000268903    ENSG00000268903.1
ENSG00000269981    ENSG00000269981.1
                         ...        
ENSG00000278198    ENSG00000278198.1
ENSG00000273496    ENSG00000273496.1
ENSG00000277666    ENSG00000277666.1
ENSG00000278782    ENSG00000278782.1
ENSG00000277761    ENSG00000277761.1
Name: HGNC_symbol, Length: 20191, dtype: object

In [16]:
# Missing genes = genes which exist of CAP but not in genenames
missing = cap_genes.loc[(~cap_genes.HGNC_symbol.isin(genenames.symbol)) & (~cap_genes.HGNC_symbol.str.startswith("ENSG")), "HGNC_symbol"]
missing

ENSEMBL_gene
ENSG00000278757            U6
ENSG00000205116       TMEM88B
ENSG00000227372      TP73-AS1
ENSG00000200344         Y_RNA
ENSG00000252254         Y_RNA
                      ...    
ENSG00000286265    AC007244.1
ENSG00000286859    AL158214.2
ENSG00000287487    AC104633.1
ENSG00000287753    AC130307.1
ENSG00000288436    AC024558.2
Name: HGNC_symbol, Length: 1652, dtype: object

In [17]:
genes_to_filter = pd.concat(
    (
        mitochondrial, 
        mitochondrial_pseudo,
        ribosomal,
        ribosomal_pseudo,
        incrna,
        incrna_pseudo,
        tcr_genes,
        bcr_genes,
        unknown,
        missing,
    ), 
    ignore_index=False
)
genes_to_filter.drop_duplicates(inplace=True)

In [18]:
genes_to_filter

ENSG00000177646         ACAD9
ENSG00000164896         FASTK
ENSG00000138399       FASTKD1
ENSG00000118246       FASTKD2
ENSG00000124279       FASTKD3
                      ...    
ENSG00000286265    AC007244.1
ENSG00000286859    AL158214.2
ENSG00000287487    AC104633.1
ENSG00000287753    AC130307.1
ENSG00000288436    AC024558.2
Length: 31517, dtype: object

In [19]:
filtered_df = cap_genes.loc[cap_genes.HGNC_symbol.isin(genes_to_filter), ["HGNC_symbol", "HGNC_symbol_unique"]]
filtered_df

Unnamed: 0_level_0,HGNC_symbol,HGNC_symbol_unique
ENSEMBL_gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,MIR1302-2HG-243485
ENSG00000237613,FAM138A,FAM138A-237613
ENSG00000290826,ENSG00000290826.1,ENSG00000290826.1-290826
ENSG00000238009,ENSG00000238009.6,ENSG00000238009.6-238009
ENSG00000239945,ENSG00000239945.1,ENSG00000239945.1-239945
...,...,...
ENSG00000269721,RPL23AP51,RPL23AP51-269721
ENSG00000293596,SCREEM1,SCREEM1-293596
ENSG00000283073,SMUG1-AS1,SMUG1-AS1-283073
ENSG00000254671,STT3A-AS1,STT3A-AS1-254671


In [20]:
filtered_df.to_csv(RESULTS_PATH, index=True)