In [1]:
# Append the absolute path of the parent directory to sys.path
# (presumably so the project-local `utils` imports below resolve when the
# notebook is run from its own sub-folder — confirm against the repo layout).
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Relative prefix that every data path in this notebook is built from.
home_dir = "../"

# NOTE(review): `os` is already imported above; this second import is redundant but harmless.
import os
import pandas as pd
from utils.pandas_extented_filters import filter_remove_null_nan_empty_entries
from utils.column_names_reader import get_col_names_from_gzip

In [2]:
# merging the downloaded dataset
def clean_df(data_filepath):
    """Load one tab-separated dbSNP search-result file and de-duplicate it.

    Parameters
    ----------
    data_filepath : str or path-like
        Location of a tab-separated file that contains a ``snp_id`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with only the first row kept for each ``snp_id``.
    """
    # `sep="\t"` alone selects tab-delimited parsing. The former
    # `delim_whitespace=False` merely restated the default and the keyword is
    # deprecated in recent pandas releases, so it is no longer passed.
    df = pd.read_csv(data_filepath, sep="\t")
    return df.drop_duplicates(subset="snp_id", keep="first")

# The dbSNP search result was downloaded in four parts; each part is loaded
# and de-duplicated on its own by clean_df, in order 0..3.
part_paths = (
    f"{home_dir}data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_0.txt",
    f"{home_dir}data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_1.txt",
    f"{home_dir}data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_2.txt",
    f"{home_dir}data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_3.txt",
)
df_0, df_1, df_2, df_3 = map(clean_df, part_paths)

# Kept for parity with the earlier version, which left the last path in this name.
data_filepath = part_paths[-1]

In [3]:
# Stack the four parts and keep a single row per snp_id across all of them.
df = (
    pd.concat([df_0, df_1, df_2, df_3], ignore_index=True)
    .drop_duplicates(subset="snp_id", keep="first")
)
print(df.shape)

# Drop SNPs without any protein variant: rows whose `variations` value is
# null, NaN or empty (the helper reports how many rows of each kind it removed).
df = filter_remove_null_nan_empty_entries(df, a_col_name="variations")
print(df.shape)
print(df.columns)

(5140350, 14)
Number of NAN rows removed: 151569
Number of NULL rows removed: 0
Number of empty rows removed: 0
(4988781, 14)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations'],
      dtype='object')


In [4]:
# Work on a copy so `df` keeps the bare ids, then turn each id into its
# "rs"-prefixed text form (the same form the ALFA `ID` column is later matched on).
initially_cleaned_df = df.copy()
initially_cleaned_df["snp_id"] = [
    "rs" + str(raw_id) for raw_id in initially_cleaned_df["snp_id"]
]

# Distinct prefixed ids, used further down to filter the ALFA frequency file.
dbsnp_ids = initially_cleaned_df["snp_id"].unique().tolist()
print(len(dbsnp_ids))

# How many SNPs each distinct `genes` annotation carries.
initially_cleaned_df[["genes"]].value_counts()

4988781


genes                                                                  
TTN:7273,TTN-AS1:100506866                                                 9381
MUC16:94025                                                                8341
MUC4:4585                                                                  5782
TTN:7273                                                                   5151
AHNAK2:113146                                                              4726
                                                                           ... 
CASP5:838,LOC124902742:124902742                                              1
SLX1B:79008,BOLA2:552900,SLX1B-SULT1A4:100526831,BOLA2-SMG1P6:107282092       1
RASSF7:8045,LMNTD2:256329,LMNTD2-AS1:692247                                   1
CASP4:837,LOC124902813:124902813                                              1
EDA:1896,MIR676:100500887                                                     1
Length: 27296, dtype: int64

### Reading and filtering the ALFA data in chunks, because the file is large

In [None]:
# Do not run this unless something upstream changed: it takes about 1 hour.
# Prefer the following block, which reloads the saved result instead.
# Variants reported on GRCh38, readme: https://www.ncbi.nlm.nih.gov/snp/docs/gsr/ftp_help/
from utils.column_names_reader import get_col_names_from_gzip
filepath = home_dir+"data/ALFA_population_freq/freq.vcf.gz"
# Presumably returns the column names found on the header line starting with
# "#CHROM" — confirm against utils.column_names_reader.
col_names = get_col_names_from_gzip(filepath, "#CHROM")
# Stream the gzip-compressed file in chunks of 1,000,000 rows. Lines starting
# with "#" are skipped as comments, so the column names are supplied explicitly.
# `sep="\t"` alone selects tab parsing; the redundant, deprecated
# `delim_whitespace=False` keyword is no longer passed.
alfa_df_iterator = pd.read_csv(filepath, compression='gzip', chunksize=1000000, comment="#", names=col_names, sep="\t")

# Keep, from every chunk, only the records whose ID is one of the dbSNP ids collected above.
chunk_list = []
for i, alfa_df in enumerate(alfa_df_iterator):
    chunk = alfa_df[alfa_df["ID"].isin(dbsnp_ids)]
    chunk_list.append(chunk)
    print(i, chunk.shape)
    # if i==0: break  # uncomment for a quick single-chunk trial run

filtered_alfa_df = pd.concat(chunk_list)
print(filtered_alfa_df.shape)
print(filtered_alfa_df.columns)
filtered_alfa_df

In [5]:
# Location of the saved, already-filtered ALFA table.
# NOTE(review): this directory is spelled "temp_popu_freq_data", while the
# to_csv calls later in the notebook write to "temp_popufreq_data" — confirm
# that both directories are intended.
filtered_alfa_filepath = f"{home_dir}data/temp_popu_freq_data/alfa_filtered.tsv"

# Run once after the slow filtering cell to (re)create the saved table:
# filtered_alfa_df.to_csv(filtered_alfa_filepath, index=False, sep="\t")
filtered_alfa_df = pd.read_csv(filtered_alfa_filepath, sep="\t")

print(filtered_alfa_df.columns)
print(filtered_alfa_df.shape)
# Shape of the distinct IDs, to compare against the row count printed above.
print(filtered_alfa_df["ID"].unique().shape)

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'SAMN10492695', 'SAMN10492696', 'SAMN10492697', 'SAMN10492698',
       'SAMN10492699', 'SAMN10492700', 'SAMN10492701', 'SAMN10492702',
       'SAMN11605645', 'SAMN10492703', 'SAMN10492704', 'SAMN10492705'],
      dtype='object')
(4988568, 21)
(4988568,)


In [21]:
# Preview the first five rows of the filtered ALFA table.
filtered_alfa_df.head(5)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMN10492695,...,SAMN10492697,SAMN10492698,SAMN10492699,SAMN10492700,SAMN10492701,SAMN10492702,SAMN11605645,SAMN10492703,SAMN10492704,SAMN10492705
0,NC_000001.11,69116,rs757299236,G,A,.,.,.,AN:AC,7618:0,...,84:0,2708:0,146:0,610:0,24:0,94:0,470:1,2816:0,108:0,11862:1
1,NC_000001.11,69134,rs781394307,A,G,.,.,.,AN:AC,7618:9,...,84:0,2708:0,146:0,610:5,24:0,94:0,470:0,2816:0,108:0,11862:14
2,NC_000001.11,69149,rs1458521218,T,A,.,.,.,AN:AC,7618:3,...,84:0,2708:0,146:1,610:0,24:0,94:0,470:0,2816:0,108:0,11862:4
3,NC_000001.11,69184,rs1231643173,G,A,.,.,.,AN:AC,7618:0,...,84:0,2708:0,146:0,610:0,24:0,94:0,470:0,2816:0,108:0,11862:0
4,NC_000001.11,69200,rs1453226491,T,C,.,.,.,AN:AC,7618:0,...,84:0,2708:5,146:0,610:0,24:0,94:0,470:0,2816:5,108:0,11862:5


In [6]:
# Inner join: keep only the dbSNP rows that have a matching ALFA record
# (dbSNP `snp_id` matched against ALFA `ID`).
snps_with_alfa_df = pd.merge(
    left=initially_cleaned_df,
    right=filtered_alfa_df,
    how="inner",
    left_on="snp_id",
    right_on="ID",
)
print(snps_with_alfa_df.shape)
# Number of distinct SNP ids, to compare against the row count above.
print(len(snps_with_alfa_df["snp_id"].unique()))

(4988568, 35)
4988568


In [7]:
def is_all_mt_variants_zero_popu_count(row):
    """Tell whether every count listed after the ":" in ``SAMN10492705`` is zero.

    The field is read as ``"<first>:<c1>[,<c2>...]"`` (the table's FORMAT
    column is ``AN:AC``): the part after the first ":" is split on "," and
    each piece is parsed as an integer.

    Parameters
    ----------
    row : object exposing a ``SAMN10492705`` string attribute
        For example a row produced by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        ``True`` when all parsed counts equal 0, ``False`` as soon as one does not.
    """
    allele_counts = row.SAMN10492705.split(":")[1].split(",")
    return all(int(count) == 0 for count in allele_counts)

# SNPs for which every SAMN10492705 count is zero (row-wise check).
snps_with_allmtvariants_popucount_zero_df = snps_with_alfa_df.loc[
    snps_with_alfa_df.apply(is_all_mt_variants_zero_popu_count, axis=1)
]
snps_with_allmtvariants_popucount_zero_df.shape

(3275495, 35)

In [8]:
# Complement of the all-zero selection: SNPs with at least one non-zero
# SAMN10492705 count. The row-wise check is evaluated again here.
snps_with_atleast1mtvariants_popucount_nonzero_df = snps_with_alfa_df.loc[
    ~snps_with_alfa_df.apply(is_all_mt_variants_zero_popu_count, axis=1)
]
snps_with_atleast1mtvariants_popucount_nonzero_df.shape

(1713073, 35)

In [10]:
# Persist both halves of the split as tab-separated files without the index column.
snps_with_allmtvariants_popucount_zero_df.to_csv(
    f"{home_dir}data/temp_popufreq_data/snps_with_allmtvariants_popucount_zero.tsv",
    sep="\t",
    index=False,
)
snps_with_atleast1mtvariants_popucount_nonzero_df.to_csv(
    f"{home_dir}data/temp_popufreq_data/snps_with_atleast1mtvariants_popucount_nonzero.tsv",
    sep="\t",
    index=False,
)

### Mapping to MANE proteins

In [13]:
# Load the MANE summary table (gene -> RefSeq/Ensembl transcript and protein mapping).
# `sep="\t"` alone selects tab parsing; the redundant, deprecated
# `delim_whitespace=False` keyword is no longer passed.
mane_df = pd.read_csv(home_dir+"data/refseq/MANE.GRCh38.v1.0.summary.txt.gz", compression='gzip', sep="\t")
print(mane_df.shape)
print(mane_df.columns)

print(mane_df["MANE_status"].value_counts())
# Some genes map to more than one protein because they have both a
# "MANE Select" and a "MANE Plus Clinical" entry.
print(mane_df["symbol"].value_counts())
# Example of such a gene with two entries.
mane_df[mane_df["symbol"]=="CACNA1D"]

(19120, 14)
Index(['#NCBI_GeneID', 'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name',
       'RefSeq_nuc', 'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot',
       'MANE_status', 'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand'],
      dtype='object')
MANE Select           19062
MANE Plus Clinical       58
Name: MANE_status, dtype: int64
CACNA1D         2
CACNA1C         2
DYNC2H1         2
SLC39A14        2
CLPB            2
               ..
ZNF432          1
SAFB2           1
DZIP3           1
MARF1           1
LOC122539214    1
Name: symbol, Length: 19062, dtype: int64


Unnamed: 0,#NCBI_GeneID,Ensembl_Gene,HGNC_ID,symbol,name,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand
542,GeneID:776,ENSG00000157388.20,HGNC:1391,CACNA1D,calcium voltage-gated channel subunit alpha1 D,NM_000720.4,NP_000711.1,ENST00000288139.11,ENSP00000288139.3,MANE Plus Clinical,3,53494611,53813733,+
543,GeneID:776,ENSG00000157388.20,HGNC:1391,CACNA1D,calcium voltage-gated channel subunit alpha1 D,NM_001128840.3,NP_001122312.1,ENST00000350061.11,ENSP00000288133.5,MANE Select,3,53494611,53813733,+


In [14]:
# Expand SNPs annotated with several genes into one record per (SNP, gene).
# `genes` is a comma-separated list of "symbol:id" entries; each entry yields
# a copy of the row with two extra fields, `gene_symbol` and `gene_id`.
data = []
for i, row in enumerate(snps_with_atleast1mtvariants_popucount_nonzero_df.itertuples(index=False)):
    row_fields = row._asdict()
    for gene in row.genes.split(","):
        gene_symbol, gene_id = gene.split(":")
        data.append({**row_fields, "gene_symbol": gene_symbol, "gene_id": int(gene_id)})

    # Progress marker every 100,000 input rows.
    if i % 100000 == 0:
        print(i)

snps_with_alfa_expanded_on_genes_df = pd.DataFrame(data)
print(snps_with_alfa_expanded_on_genes_df.shape)
print(snps_with_alfa_expanded_on_genes_df.columns)
# The distinct SNP count should be unchanged by the expansion.
print(snps_with_alfa_expanded_on_genes_df["snp_id"].unique().shape)
snps_with_alfa_expanded_on_genes_df

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
(2055157, 37)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations', 'CHROM', 'POS', 'ID', 'REF', 'ALT',
       'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMN10492695', 'SAMN10492696',
       'SAMN10492697', 'SAMN10492698', 'SAMN10492699', 'SAMN10492700',
       'SAMN10492701', 'SAMN10492702', 'SAMN11605645', 'SAMN10492703',
       'SAMN10492704', 'SAMN10492705', 'gene_symbol', 'gene_id'],
      dtype='object')
(1713073,)


Unnamed: 0,snp_id,acc,chrpos,spdi,tax_id,snp_class,create_date,update_date,clinical_significance,fxn_class,...,SAMN10492699,SAMN10492700,SAMN10492701,SAMN10492702,SAMN11605645,SAMN10492703,SAMN10492704,SAMN10492705,gene_symbol,gene_id
0,rs1491003598,NC_000001.11,1:205269218,NC_000001.11:205269217:A:G,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"coding_sequence_variant,missense_variant",...,0:0,0:0,30:0,0:0,2276:1,594:0,56:0,8988:1,TMCC2,9911
1,rs1491003245,NC_000002.12,2:105881549,NC_000002.12:105881548:C:T,9606,snv,2017/11/09 09:55,2022/10/12 20:41,,"coding_sequence_variant,intron_variant,missens...",...,146:0,610:0,26:0,98:0,496:0,2898:0,112:0,14050:1,NCK2,8440
2,rs1491001877,NC_000011.10,11:4385428,NC_000011.10:4385427:G:A,9606,snv,2017/11/09 09:55,2022/10/16 13:47,,"missense_variant,coding_sequence_variant",...,146:0,610:0,26:0,98:0,496:0,2898:0,112:0,14050:1,TRIM21,6737
3,rs1491001822,NC_000019.10,19:54576083,"NC_000019.10:54576082:G:A,NC_000019.10:5457608...",9606,snv,2017/11/09 09:55,2022/10/17 02:29,,"genic_downstream_transcript_variant,missense_v...",...,"146:0,0","610:0,0","56:0,0","98:0,0","2772:0,0","3492:0,0","168:0,0","23038:1,0",LILRA2,11027
4,rs1491000935,NC_000016.10,16:21980608,NC_000016.10:21980607:G:A,9606,snv,2017/11/09 09:55,2022/10/16 22:57,,"genic_downstream_transcript_variant,missense_v...",...,0:0,0:0,30:0,0:0,2276:0,594:0,56:0,8988:1,UQCRC2,7385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055152,rs334,NC_000011.10,11:5227002,"NC_000011.10:5227001:T:A,NC_000011.10:5227001:...",9606,snv,2000/09/19 17:02,2022/10/16 14:17,"protective,other,pathogenic,benign,likely-beni...","missense_variant,coding_sequence_variant",...,498:11,626:0,56:0,98:0,7046:12,3340:10,168:0,44606:35,HBB,3043
2055153,rs300,NC_000008.11,8:19959376,NC_000008.11:19959375:A:G,9606,snv,2000/09/19 17:02,2022/10/14 11:06,"benign,benign-likely-benign","missense_variant,coding_sequence_variant",...,1318:3,2530:2,1952:0,360:0,20152:9,8148:71,6814:0,278238:92,LPL,4023
2055154,rs298,NC_000008.11,8:19959349,NC_000008.11:19959348:G:A,9606,snv,2000/09/19 17:02,2022/10/14 11:06,,"missense_variant,coding_sequence_variant",...,146:0,610:0,56:0,98:0,3052:0,4606:1,168:0,30584:3,LPL,4023
2055155,rs268,NC_000008.11,8:19956018,NC_000008.11:19956017:A:G,9606,snv,2000/09/19 17:02,2022/10/14 11:06,"conflicting-interpretations-of-pathogenicity,u...","missense_variant,coding_sequence_variant",...,818:9,1028:11,1850:0,280:2,16346:209,6136:28,6406:0,218762:3560,LPL,4023


In [15]:
# Inner join on the gene symbol: keeps only (SNP, gene) records whose gene has
# a MANE entry. A gene with two MANE entries produces two rows per record.
mane_filtered_snps_df = pd.merge(
    left=snps_with_alfa_expanded_on_genes_df,
    right=mane_df,
    how="inner",
    left_on="gene_symbol",
    right_on="symbol",
)
print(mane_filtered_snps_df.shape)
print(mane_filtered_snps_df.columns)
# Distinct SNPs and distinct gene symbols that survived the join.
print(len(mane_filtered_snps_df["snp_id"].unique()))
print(len(mane_filtered_snps_df["gene_symbol"].unique()))

(1864179, 51)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations', 'CHROM', 'POS', 'ID', 'REF', 'ALT',
       'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMN10492695', 'SAMN10492696',
       'SAMN10492697', 'SAMN10492698', 'SAMN10492699', 'SAMN10492700',
       'SAMN10492701', 'SAMN10492702', 'SAMN11605645', 'SAMN10492703',
       'SAMN10492704', 'SAMN10492705', 'gene_symbol', 'gene_id',
       '#NCBI_GeneID', 'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name',
       'RefSeq_nuc', 'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot',
       'MANE_status', 'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand'],
      dtype='object')
1696911
18823


In [16]:
def filter_variants_on_canonical_proteins(x):
    """Keep only the variations reported on the row's ``RefSeq_prot`` accession.

    Parameters
    ----------
    x : mapping with ``"variations"`` and ``"RefSeq_prot"`` keys
        ``variations`` is a comma-separated string of entries; the text of
        each entry before its first ":" is compared with ``RefSeq_prot``.

    Returns
    -------
    str
        The matching entries joined by "," in their original order, or an
        empty string when none match.
    """
    candidates = x["variations"].split(",")
    target_protein = x["RefSeq_prot"]
    matching = [entry for entry in candidates if entry.split(":")[0] == target_protein]
    return ",".join(matching)

# For every row, keep only the variations reported on that row's MANE RefSeq protein.
mane_filtered_snps_df["mane_protein_variants"] = (
    mane_filtered_snps_df[["variations", "RefSeq_prot"]]
    .apply(filter_variants_on_canonical_proteins, axis=1)
)
print(mane_filtered_snps_df.shape)

# Drop rows whose `mane_protein_variants` is null, NaN or empty, i.e. rows
# with no variation left on the MANE protein.
mane_filtered_snps_df = filter_remove_null_nan_empty_entries(
    mane_filtered_snps_df, a_col_name="mane_protein_variants"
)
print(mane_filtered_snps_df.shape)
# Distinct SNPs, proteins and gene symbols remaining.
print(len(mane_filtered_snps_df["snp_id"].unique()))
print(len(mane_filtered_snps_df["RefSeq_prot"].unique()))
print(len(mane_filtered_snps_df["gene_symbol"].unique()))

(1864179, 52)
Number of NAN rows removed: 0
Number of NULL rows removed: 0
Number of empty rows removed: 195936
(1668243, 52)
1656512
18858
18800


In [17]:
# Snapshot the MANE-filtered table under its final name and save it as a
# tab-separated file without the index column.
snps_with_alfa_mane_df = mane_filtered_snps_df.copy()
snps_with_alfa_mane_df.to_csv(
    f"{home_dir}data/temp_popufreq_data/snps_with_alfa_mane_raw.tsv",
    sep="\t",
    index=False,
)

In [18]:
# List the columns of the saved table, then preview its first ten rows.
print(snps_with_alfa_mane_df.columns)
snps_with_alfa_mane_df.head(10)

Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations', 'CHROM', 'POS', 'ID', 'REF', 'ALT',
       'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMN10492695', 'SAMN10492696',
       'SAMN10492697', 'SAMN10492698', 'SAMN10492699', 'SAMN10492700',
       'SAMN10492701', 'SAMN10492702', 'SAMN11605645', 'SAMN10492703',
       'SAMN10492704', 'SAMN10492705', 'gene_symbol', 'gene_id',
       '#NCBI_GeneID', 'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name',
       'RefSeq_nuc', 'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot',
       'MANE_status', 'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand',
       'mane_protein_variants'],
      dtype='object')


Unnamed: 0,snp_id,acc,chrpos,spdi,tax_id,snp_class,create_date,update_date,clinical_significance,fxn_class,...,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand,mane_protein_variants
0,rs1491003598,NC_000001.11,1:205269218,NC_000001.11:205269217:A:G,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"coding_sequence_variant,missense_variant",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Lys339Arg
3,rs1436854664,NC_000001.11,1:205271828,NC_000001.11:205271827:T:A,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"coding_sequence_variant,synonymous_variant,mis...",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Cys612Ser
4,rs1431352460,NC_000001.11,1:205241637,NC_000001.11:205241636:A:G,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"genic_upstream_transcript_variant,coding_seque...",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Met114Val
5,rs1429019865,NC_000001.11,1:205241752,NC_000001.11:205241751:T:C,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"genic_upstream_transcript_variant,coding_seque...",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Leu152Pro
6,rs1423575191,NC_000001.11,1:205269047,NC_000001.11:205269046:C:T,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"missense_variant,coding_sequence_variant",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Pro282Leu
7,rs1415408852,NC_000001.11,1:205241616,NC_000001.11:205241615:C:A,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"genic_upstream_transcript_variant,missense_var...",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Gln107Lys
8,rs1410312003,NC_000001.11,1:205241511,NC_000001.11:205241510:C:G,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"genic_upstream_transcript_variant,missense_var...",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Gln72Glu
9,rs1401358469,NC_000001.11,1:205271242,NC_000001.11:205271241:C:T,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"coding_sequence_variant,missense_variant",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Ala602Val
10,rs1375284370,NC_000001.11,1:205269052,NC_000001.11:205269051:C:T,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"coding_sequence_variant,missense_variant",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Arg284Trp
12,rs1363717681,NC_000001.11,1:205271876,NC_000001.11:205271875:G:A,9606,snv,2017/11/09 09:55,2022/10/12 18:08,,"missense_variant,coding_sequence_variant,3_pri...",...,NM_014858.4,NP_055673.2,ENST00000358024.8,ENSP00000350718.3,MANE Select,1,205227946,205273343,+,NP_055673.2:p.Val628Met
