# gene location comp

this notebooks is here to help generate the gene location parquet file used in scPRINT and scPRINT2. This has to be done for each species one by one. 
Only when one wants to retrain scPRINT2 or make it work on a new species is this needed. Otherwise the data gets stored in the model checkpoints.

In [None]:
from scprint2.tokenizers import protein_embeddings_generator
from scprint2.utils.get_seq import load_fasta_species

%reload_ext autoreload
%autoreload 2

from scprint2.utils.utils import run_command
import pandas as pd
from Bio import SeqIO


[92m→[0m connected lamindb: jkobject/scprint_v2


  from pkg_resources import get_distribution, DistributionNotFound
  @custom_fwd
  @custom_bwd


In [None]:
names = {
    # "homo_sapiens": "NCBITaxon:9606",
    "mus_musculus": "NCBITaxon:10090",
    "arabidopsis_thaliana": "NCBITaxon:3702",
    "bos_taurus": "NCBITaxon:9913",
    "caenorhabditis_elegans": "NCBITaxon:6239",
    "callithrix_jacchus": "NCBITaxon:9483",
    "danio_rerio": "NCBITaxon:7955",
    "drosophila_melanogaster": "NCBITaxon:7227",
    "gallus_gallus": "NCBITaxon:9031",
    "heterocephalus_glaber_male": "NCBITaxon:10181",
    "macaca_mulatta": "NCBITaxon:9544",
    "oryctolagus_cuniculus": "NCBITaxon:9986",
    # "oryza_sativa": "NCBITaxon:39947",
    "ovis_aries": "NCBITaxon:9940",
    "pan_troglodytes": "NCBITaxon:9598",
    "sus_scrofa": "NCBITaxon:9823",
    "zea_mays": "NCBITaxon:4577",
}

In [None]:
def get_locs(file):
    locs = []
    dup = set()
    noloc = 0
    for record in SeqIO.parse(file[:-3], "fasta"):
        name = record.description.split(" gene:")[1].split(" ")[0].split(".")[0]
        if "chromosome:" in record.description:
            val = record.description.split(" chromosome:")[1].split(" ")[0]
            ref, chrom, start, end, _ = val.split(":")
        elif "primary_assembly" in record.description:
            val = record.description.split(" primary_assembly:")[1].split(" ")[0]
            ref, chrom, start, end, _ = val.split(":")
        elif "scaffold" in record.description:
            val = record.description.split(" scaffold:")[1].split(" ")[0]
            ref, chrom, start, end, _ = val.split(":")
        else:
            noloc += 1
            continue
        if name in dup:
            continue
        dup.add(name)
        locs.append([name, chrom, start, end])
    print(len(dup), " genes had duplicates")
    print(noloc, " genes had no location")
    df = pd.DataFrame(locs, columns=["name", "chrom", "start", "end"])
    df = df.astype({"start": "int32", "end": "int32"})
    df = df.sort_values(by=["chrom", "start"]).reset_index(drop=True)
    return df


# test on humans

In [4]:
file = load_fasta_species(species="homo_sapiens", load=["cdna"])

In [5]:
file = file[0]
fname = file.split("/")[-1]
run_command(["gunzip", file])

gzip: /tmp/data/fasta/Homo_sapiens.GRCh38.cdna.all.fa already exists;	not overwritten


2

In [None]:
df = get_locs(file)


In [None]:
import os

os.makedirs("../data/main/gene_locs", exist_ok=True)

In [13]:
df.set_index("name").to_parquet("../data/main/gene_locs/homo.parquet")

In [None]:
for species in names.keys():
    file = load_fasta_species(species=species, load=["cdna"])
    file = file[0]
    fname = file.split("/")[-1]
    run_command(["gunzip", file])
    df = get_locs(file)
    df.set_index("name").to_parquet(
        f"../data/main/gene_locs/{species.split('_')[0]}.parquet"
    )

gzip: /tmp/data/fasta/Mus_musculus.GRCm39.cdna.all.fa already exists;	not overwritten


35937  genes had duplicates
54  genes had no location


gzip: /tmp/data/fasta/Arabidopsis_thaliana.TAIR10.cdna.all.fa already exists;	not overwritten


27655  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Bos_taurus.ARS-UCD1.2.cdna.all.fa already exists;	not overwritten


22372  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Caenorhabditis_elegans.WBcel235.cdna.all.fa already exists;	not overwritten


22113  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Callithrix_jacchus.mCalJac1.pat.X.cdna.all.fa already exists;	not overwritten


23373  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Danio_rerio.GRCz11.cdna.all.fa already exists;	not overwritten


30302  genes had duplicates
428  genes had no location


gzip: /tmp/data/fasta/Drosophila_melanogaster.BDGP6.32.cdna.all.fa already exists;	not overwritten


14310  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cdna.all.fa already exists;	not overwritten


17068  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Heterocephalus_glaber_male.Naked_mole-rat_paternal.cdna.all.fa already exists;	not overwritten


24687  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Macaca_mulatta.Mmul_10.cdna.all.fa already exists;	not overwritten


22528  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Oryctolagus_cuniculus.OryCun2.0.cdna.all.fa already exists;	not overwritten


15523  genes had duplicates
10474  genes had no location


gzip: /tmp/data/fasta/Ovis_aries.Oar_v3.1.cdna.all.fa already exists;	not overwritten


20440  genes had duplicates
788  genes had no location


gzip: /tmp/data/fasta/Pan_troglodytes.Pan_tro_3.0.cdna.all.fa already exists;	not overwritten


22505  genes had duplicates
2393  genes had no location


gzip: /tmp/data/fasta/Sus_scrofa.Sscrofa11.1.cdna.all.fa already exists;	not overwritten


22516  genes had duplicates
0  genes had no location


gzip: /tmp/data/fasta/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cdna.all.fa already exists;	not overwritten


39035  genes had duplicates
748  genes had no location


In [None]:
get_locs("/tmp/data/fasta/Bos_taurus.ARS-UCD1.2.cdna.all.fa.gz")

In [29]:
list(names.keys())[2]

'bos_taurus'

In [13]:
pd.read_parquet(f"../data/main/gene_locs/{list(names.keys())[0].split('_')[0]}.parquet")

Unnamed: 0_level_0,chrom,start,end
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000051951,1,3276124,3286567
ENSMUSG00000102851,1,3322980,3323459
ENSMUSG00000103147,1,3602018,3602943
ENSMUSG00000025900,1,4069780,4479464
ENSMUSG00000102948,1,4326457,4330742
...,...,...,...
ENSMUSG00000100246,Y,90010986,90011680
ENSMUSG00000096178,Y,90412517,90444532
ENSMUSG00000100964,Y,90510316,90542914
ENSMUSG00000095134,Y,90764326,90774754


In [15]:
ls ../data/main/gene_locs/

arabidopsis.parquet     gallus.parquet          ovis.parquet
bos.parquet             heterocephalus.parquet  pan.parquet
caenorhabditis.parquet  homo.parquet            sus.parquet
callithrix.parquet      macaca.parquet          zea.parquet
danio.parquet           mus.parquet
drosophila.parquet      oryctolagus.parquet


In [5]:
import glob

# Get all parquet files in the directory
parquet_files = glob.glob("../data/main/gene_locs/*.parquet")

# Read all files and add a column with the filename (without path and extension)
import os

dfs = []
for f in parquet_files:
    df = pd.read_parquet(f)
    # Add a column with the filename (without extension)
    df["filename"] = os.path.splitext(os.path.basename(f))[0]
    dfs.append(df)

# Concatenate all dataframes
locs = pd.concat(dfs)

In [7]:
locs

Unnamed: 0_level_0,chrom,start,end,filename
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000051951,1,3276124,3286567,mus
ENSMUSG00000102851,1,3322980,3323459,mus
ENSMUSG00000103147,1,3602018,3602943,mus
ENSMUSG00000025900,1,4069780,4479464,mus
ENSMUSG00000102948,1,4326457,4330742,mus
...,...,...,...,...
ENSOARG00000002575,X,133931633,133935348,ovis
ENSOARG00000002583,X,134413131,134415171,ovis
ENSOARG00000002599,X,134630071,134677540,ovis
ENSOARG00000010238,X,134842697,134843194,ovis


In [None]:
prevname = ""
c = []
for _, r in locs.iterrows():
    if r["filename"] != prevname:
        prev_position = -100000
        prev_chromosome = None
        gene_position_tolerance = (
            10_000
            if r["filename"] not in ["drosophila", "caenorhabditis", "arabidopsis"]
            else 2_000
        )
        i = 0
    if r["chrom"] != prev_chromosome:
        i += 100
    if r["start"] - prev_position > gene_position_tolerance:
        i += 1
    c.append(i)
    prev_position = r["start"]
    prev_chromosome = r["chrom"]
    prevname = r["filename"]
locs["pos"] = c
for i in locs["filename"].unique():
    # print reduced per filename
    print(i)
    l = locs[locs["filename"] == i]
    print(len(set(l["pos"])) / len(l))

mus
0.8145365500737402
zea
0.6519277571410272
heterocephalus
0.8497184753108924
drosophila
0.6994409503843466
macaca
0.8736683238636364
pan
0.8767829371250833
callithrix
0.8828562871689556
bos
0.862238512426247
gallus
0.7230489805483946
oryctolagus
0.8861044901114475
danio
0.801498250940532
caenorhabditis
0.7014425903314793
arabidopsis
0.762285301030555
sus
0.8676940842067863
homo
0.797114037545531
ovis
0.8772504892367906


In [2]:
locs = pd.read_parquet("../data/main/gene_locs/all_locs.parquet")

In [None]:
locs.loc[locs.filename == "heterocephalus"].index

Index(['ENSHGLG00100011751', 'ENSHGLG00100049051', 'ENSHGLG00100047534',
       'ENSHGLG00100007061', 'ENSHGLG00100050874', 'ENSHGLG00100009882',
       'ENSHGLG00100040381', 'ENSHGLG00100007103', 'ENSHGLG00100050535',
       'ENSHGLG00100006540',
       ...
       'ENSHGLG00100042383', 'ENSHGLG00100012734', 'ENSHGLG00100045952',
       'ENSHGLG00100020782', 'ENSHGLG00100002557', 'ENSHGLG00100002511',
       'ENSHGLG00100020139', 'ENSHGLG00100041293', 'ENSHGLG00100012745',
       'ENSHGLG00100035039'],
      dtype='object', name='name', length=24687)

In [None]:
locs.loc[locs.filename == "heterocephalus"].index = (
    locs.loc[locs.filename == "heterocephalus"].index.astype(str).str[:9]
    + "0"
    + locs.loc[locs.filename == "heterocephalus"].index.astype(str).str[10:]
)

In [25]:
locs.to_parquet("../data/main/gene_locs/all_locs.parquet")

# adding some more

In [None]:
file = load_fasta_species(species="solanum_lycopersicum", load=["cdna"])
file = file[0]
fname = file.split("/")[-1]
run_command(["gunzip", file])
df = get_locs(file)
df.set_index("name").to_parquet(f"../data/main/gene_locs/solanum.parquet")

gzip: /tmp/data/fasta/Solanum_lycopersicum.SL3.0.cdna.all.fa already exists;	not overwritten


33925  genes had duplicates
733  genes had no location


In [None]:
df = pd.read_parquet("../data/main/gene_locs/solanum.parquet")
df["filename"] = "solanum"

In [None]:
prevname = ""
c = []
for _, r in df.iterrows():
    if r["filename"] != prevname:
        prev_position = -100000
        prev_chromosome = None
        gene_position_tolerance = (
            10_000
            if r["filename"]
            not in ["drosophila", "caenorhabditis", "arabidopsis", "solanum"]
            else 2_000
        )
        i = 0
    if r["chrom"] != prev_chromosome:
        i += 100
    if r["start"] - prev_position > gene_position_tolerance:
        i += 1
    c.append(i)
    prev_position = r["start"]
    prev_chromosome = r["chrom"]
    prevname = r["filename"]
df["pos"] = c
for i in df["filename"].unique():
    # print reduced per filename
    print(i)
    l = df[df["filename"] == i]
    print(len(set(l["pos"])) / len(l))

solanum
0.8852763448784082


In [4]:
df.to_parquet("../data/main/gene_locs/solanum.parquet")

In [None]:
file = load_fasta_species(species="equus_caballus", load=["cdna"])
file = file[0]
fname = file.split("/")[-1]
run_command(["gunzip", file])
df = get_locs(file)
df.set_index("name").to_parquet(f"../data/main/gene_locs/equus.parquet")

21893  genes had duplicates
0  genes had no location


In [None]:
df = pd.read_parquet("../data/main/gene_locs/equus.parquet")
df["filename"] = "equus"

In [None]:
prevname = ""
c = []
for _, r in df.iterrows():
    if r["filename"] != prevname:
        prev_position = -100000
        prev_chromosome = None
        gene_position_tolerance = (
            10_000
            if r["filename"]
            not in ["drosophila", "caenorhabditis", "arabidopsis", "solanum"]
            else 2_000
        )
        i = 0
    if r["chrom"] != prev_chromosome:
        i += 100
    if r["start"] - prev_position > gene_position_tolerance:
        i += 1
    c.append(i)
    prev_position = r["start"]
    prev_chromosome = r["chrom"]
    prevname = r["filename"]
df["pos"] = c
for i in df["filename"].unique():
    # print reduced per filename
    print(i)
    l = df[df["filename"] == i]
    print(len(set(l["pos"])) / len(l))

equus
0.8723792993194172


In [7]:
df.to_parquet("../data/main/gene_locs/equus.parquet")

# cat / tiger 

In [None]:
file = load_fasta_species(species="felis_catus", load=["cdna"])
file = file[0]
fname = file.split("/")[-1]
run_command(["gunzip", file])
df = get_locs(file)
df.set_index("name").to_parquet(f"../data/main/gene_locs/felis_catus.parquet")

gzip: /tmp/data/fasta/Felis_catus.Felis_catus_9.0.cdna.all.fa already exists;	not overwritten


19866  genes had duplicates
268  genes had no location


In [None]:
df = pd.read_parquet("../data/main/gene_locs/felis_catus.parquet")
df["filename"] = "felis_catus"

In [None]:
prevname = ""
c = []
for _, r in df.iterrows():
    if r["filename"] != prevname:
        prev_position = -100000
        prev_chromosome = None
        gene_position_tolerance = (
            10_000
            if r["filename"]
            not in ["drosophila", "caenorhabditis", "arabidopsis", "solanum"]
            else 2_000
        )
        i = 0
    if r["chrom"] != prev_chromosome:
        i += 100
    if r["start"] - prev_position > gene_position_tolerance:
        i += 1
    c.append(i)
    prev_position = r["start"]
    prev_chromosome = r["chrom"]
    prevname = r["filename"]
df["pos"] = c
for i in df["filename"].unique():
    # print reduced per filename
    print(i)
    l = df[df["filename"] == i]
    print(len(set(l["pos"])) / len(l))

felis_catus
0.861773885029699


In [7]:
df.to_parquet("../data/main/gene_locs/felis_catus.parquet")

In [None]:
file = load_fasta_species(species="panthera_tigris_altaica", load=["cdna"])
file = file[0]
fname = file.split("/")[-1]
run_command(["gunzip", file])
df = get_locs(file)
df.set_index("name").to_parquet(
    f"../data/main/gene_locs/panthera_tigris_altaica.parquet"
)

gzip: /tmp/data/fasta/Panthera_tigris_altaica.PanTig1.0.cdna.all.fa already exists;	not overwritten


18517  genes had duplicates
0  genes had no location


In [None]:
df = pd.read_parquet("../data/main/gene_locs/panthera_tigris_altaica.parquet")
df["filename"] = "panthera_tigris_altaica"

In [28]:
prevname = ""
c = []
for _, r in df.iterrows():
    if r["filename"] != prevname:
        prev_position = -100000
        prev_chromosome = None
        gene_position_tolerance = (
            10_000
            if r["filename"]
            not in ["drosophila", "caenorhabditis", "arabidopsis", "solanum"]
            else 2_000
        )
        i = 0
    if r["chrom"] != prev_chromosome:
        i += 100
    if r["start"] - prev_position > gene_position_tolerance:
        i += 1
    c.append(i)
    prev_position = r["start"]
    prev_chromosome = r["chrom"]
    prevname = r["filename"]
df["pos"] = c
for i in df["filename"].unique():
    # print reduced per filename
    print(i)
    l = df[df["filename"] == i]
    print(len(set(l["pos"])) / len(l))

panthera_tigris_altaica
0.8775719609007938


In [29]:
df.to_parquet("../data/main/gene_locs/panthera_tigris_altaica.parquet")