In [1]:
import os
import sys
# Put the absolute path of the parent directory on sys.path (only once), so
# modules located there become importable from this notebook.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
# Relative prefix prepended to every data file path in the cells below.
home_dir = "../"

import pandas as pd

In [2]:
# Load the MANE summary table (gzip-compressed, tab-separated): the
# RefSeq/Ensembl transcript and protein mapping for each gene symbol.
# The deprecated `delim_whitespace` keyword is deliberately not passed:
# sep="\t" alone fully specifies the delimiter, and passing the keyword
# explicitly triggers a FutureWarning on pandas >= 2.2.
mane_df = pd.read_csv(
    home_dir+"data/MANE/MANE.GRCh38.v1.0.summary.txt.gz",
    compression='gzip',
    sep="\t",
)
print(mane_df.columns)
print(mane_df.shape)

print(mane_df["MANE_status"].value_counts())
# Some genes map to more than one protein because they have both a
# "MANE Select" and a "MANE Plus Clinical" entry.
print(mane_df["symbol"].value_counts())
# Example of such a gene: CACNA1D has one row per MANE status.
mane_df[mane_df["symbol"]=="CACNA1D"]

Index(['#NCBI_GeneID', 'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name',
       'RefSeq_nuc', 'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot',
       'MANE_status', 'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand'],
      dtype='object')
(19120, 14)
MANE Select           19062
MANE Plus Clinical       58
Name: MANE_status, dtype: int64
CACNA1D         2
CACNA1C         2
DYNC2H1         2
SLC39A14        2
CLPB            2
               ..
ZNF432          1
SAFB2           1
DZIP3           1
MARF1           1
LOC122539214    1
Name: symbol, Length: 19062, dtype: int64


Unnamed: 0,#NCBI_GeneID,Ensembl_Gene,HGNC_ID,symbol,name,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand
542,GeneID:776,ENSG00000157388.20,HGNC:1391,CACNA1D,calcium voltage-gated channel subunit alpha1 D,NM_000720.4,NP_000711.1,ENST00000288139.11,ENSP00000288139.3,MANE Plus Clinical,3,53494611,53813733,+
543,GeneID:776,ENSG00000157388.20,HGNC:1391,CACNA1D,calcium voltage-gated channel subunit alpha1 D,NM_001128840.3,NP_001122312.1,ENST00000350061.11,ENSP00000288133.5,MANE Select,3,53494611,53813733,+


In [3]:
def _print_snp_gene_summary(frame):
    """Print the frame's shape, then its distinct SNP_ID and GENE_NAME counts."""
    print(frame.shape)
    print(len(frame["SNP_ID"].unique()))
    print(len(frame["GENE_NAME"].unique()))


# Load the parsed dbSNP SNP-to-gene table.
# index_col=False is required (original author's note); it tells pandas not
# to use the first column as the index.
snp_genes_path = home_dir+"data/dbsnp/parsed/genes.tsv"
snp_genes_df = pd.read_csv(snp_genes_path, sep="\t", index_col=False)
print(snp_genes_df.columns)
_print_snp_gene_summary(snp_genes_df)

# Drop fully duplicated rows (keeping the first occurrence) and renumber the
# index, then report the same summary again for comparison.
snp_genes_df = snp_genes_df.drop_duplicates(keep="first", ignore_index=True)
_print_snp_gene_summary(snp_genes_df)

snp_genes_df

Index(['SNP_ID', 'GENE_NAME', 'GENE_ID'], dtype='object')
(8334039, 3)
6791045
26019
(8242603, 3)
6791045
26019


Unnamed: 0,SNP_ID,GENE_NAME,GENE_ID
0,rs2100203794,INTS14,81556
1,rs2100192439,PTK2,5747
2,rs2100192436,PTK2,5747
3,rs2100178510,PTK2,5747
4,rs2100169198,PTK2,5747
...,...,...,...
8242598,rs1011,VAMP8,8673
8242599,rs980,INPP5B,3633
8242600,rs676,GMPR,2766
8242601,rs664,COL1A2,1278


In [4]:
# Number of rows per SNP_ID, most frequent first.
snp_id_counts = snp_genes_df["SNP_ID"].value_counts()
snp_id_counts

rs772843725     22
rs183549806     22
rs143630962     22
rs762138055     22
rs151293422     22
                ..
rs1316711539     1
rs1316711547     1
rs1316711968     1
rs1316712118     1
rs298            1
Name: SNP_ID, Length: 6791045, dtype: int64

In [5]:
# Inner-join the MANE table with the SNP-to-gene table on the gene symbol,
# keeping only symbols present in both. A symbol that occurs on several MANE
# rows yields one merged row per (MANE row, SNP row) pair.
merged_genes_df = mane_df.merge(
    snp_genes_df,
    how="inner",
    left_on="symbol",
    right_on="GENE_NAME",
)
print(merged_genes_df.columns)
print(merged_genes_df.shape)
# Distinct SNP count (as an int), then distinct gene-name count (as a shape tuple).
print(len(merged_genes_df["SNP_ID"].unique()))
print(merged_genes_df["GENE_NAME"].unique().shape)

Index(['#NCBI_GeneID', 'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name',
       'RefSeq_nuc', 'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot',
       'MANE_status', 'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand',
       'SNP_ID', 'GENE_NAME', 'GENE_ID'],
      dtype='object')
(7336390, 17)
6665168
(18961,)


In [6]:
# Columns kept for export: SNP and gene identifiers, RefSeq accessions,
# MANE status and the GRCh38 coordinates.
export_columns = [
    'SNP_ID', 'GENE_NAME', 'GENE_ID',
    'RefSeq_nuc', 'RefSeq_prot', 'MANE_status',
    'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand',
]
df = merged_genes_df[export_columns]
df

Unnamed: 0,SNP_ID,GENE_NAME,GENE_ID,RefSeq_nuc,RefSeq_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand
0,rs2051986291,A1BG,1,NM_130786.4,NP_570602.2,MANE Select,19,58345183,58353492,-
1,rs2051986199,A1BG,1,NM_130786.4,NP_570602.2,MANE Select,19,58345183,58353492,-
2,rs2051985919,A1BG,1,NM_130786.4,NP_570602.2,MANE Select,19,58345183,58353492,-
3,rs2051984100,A1BG,1,NM_130786.4,NP_570602.2,MANE Select,19,58345183,58353492,-
4,rs2051983854,A1BG,1,NM_130786.4,NP_570602.2,MANE Select,19,58345183,58353492,-
...,...,...,...,...,...,...,...,...,...,...
7336385,rs61734108,LOC122539214,122539214,NM_001396016.1,NP_001382945.1,MANE Select,19,52650447,52690496,-
7336386,rs3745105,LOC122539214,122539214,NM_001396016.1,NP_001382945.1,MANE Select,19,52650447,52690496,-
7336387,rs113146313,LOC122539214,122539214,NM_001396016.1,NP_001382945.1,MANE Select,19,52650447,52690496,-
7336388,rs111311236,LOC122539214,122539214,NM_001396016.1,NP_001382945.1,MANE Select,19,52650447,52690496,-


In [7]:
# Write the selected columns as a tab-separated file, without the row index.
mane_mapped_genes_path = home_dir+"data/dbsnp/processed/mane_mapped_genes.tsv"
df.to_csv(mane_mapped_genes_path, sep="\t", index=False)