In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
home_dir = "../"

import time
import os
import pandas as pd
from utils.pandas_extented_filters import filter_drop_duplicates, filter_remove_null_nan_empty_entries
from utils.column_names_reader import get_col_names_from_gzip

### Merging downloaded missense variants

In [2]:
def clean_df(data_filepath):
    df = pd.read_csv(data_filepath, delim_whitespace=False, sep="\t")
    # print(df.shape)
    # df.head()
    df = filter_drop_duplicates(df, a_col_name="snp_id")
    # print(df.shape)
    return df

In [3]:
data_filepath = home_dir+"data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_0.txt"
df_0 = clean_df(data_filepath)

data_filepath = home_dir+"data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_1.txt"
df_1 = clean_df(data_filepath)

data_filepath = home_dir+"data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_2.txt"
df_2 = clean_df(data_filepath)

data_filepath = home_dir+"data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_3.txt"
df_3 = clean_df(data_filepath)

(1049000, 14)
Number of duplicates rows removed: 11
(1048989, 14)
(1554500, 14)
Number of duplicates rows removed: 174
(1554326, 14)
(1671500, 14)
Number of duplicates rows removed: 0
(1671500, 14)
(1141322, 14)
Number of duplicates rows removed: 60
(1141262, 14)


In [4]:
df = pd.concat([df_0, df_1, df_2, df_3], ignore_index=True)

### Initial cleaning of downloaded dataset

In [5]:
df_remove_dup = filter_drop_duplicates(df, a_col_name="snp_id")
df_remove_NNE = filter_remove_null_nan_empty_entries(df_remove_dup, a_col_name="variations") # NNE: null, nan or empty variants

Number of duplicates rows removed: 275727
Number of NAN rows removed: 151569
Number of NULL rows removed: 0
Number of empty rows removed: 0


In [6]:
# out_filepath = home_dir+"data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_full.txt"
# df_remove_NNE.to_csv(out_filepath, sep="\t", index=False, header=True)
df_remove_NNE.shape

(4988781, 14)

### Mapping to MANE proteins

In [7]:
gene2refseq_mapping_filepath = home_dir+"data/refseq/MANE.GRCh38.v1.0.summary.txt.gz" # "symbol" is the Gene-symbol col
# gene2refseq_mapping_filepath = home_dir+"data/gene/gene2refseq_filtered.gz" # "Symbol" is the Gene-symbol col
# gene2refseq_mapping_filepath = home_dir+"data/refseq/HumanLRGRefSeqGene.txt.gz"  # "Symbol" is the Gene-symbol col

gene_symbol_col_name = "symbol" if gene2refseq_mapping_filepath.__contains__("MANE") else "Symbol"
gene2refseq_mapping_df = pd.read_csv(gene2refseq_mapping_filepath, compression='gzip', delim_whitespace=False, sep="\t")#, header=None, names=col_names)

print(gene2refseq_mapping_df.shape)
print(len(gene2refseq_mapping_df[gene_symbol_col_name].unique())) # some gene can be mapped to multiple proteins
gene2refseq_mapping_df.head()

(19120, 14)
19062


Unnamed: 0,#NCBI_GeneID,Ensembl_Gene,HGNC_ID,symbol,name,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand
0,GeneID:1,ENSG00000121410.12,HGNC:5,A1BG,alpha-1-B glycoprotein,NM_130786.4,NP_570602.2,ENST00000263100.8,ENSP00000263100.2,MANE Select,19,58345183,58353492,-
1,GeneID:2,ENSG00000175899.15,HGNC:7,A2M,alpha-2-macroglobulin,NM_000014.6,NP_000005.3,ENST00000318602.12,ENSP00000323929.8,MANE Select,12,9067708,9115919,-
2,GeneID:9,ENSG00000171428.15,HGNC:7645,NAT1,N-acetyltransferase 1,NM_000662.8,NP_000653.3,ENST00000307719.9,ENSP00000307218.4,MANE Select,8,18210109,18223689,+
3,GeneID:10,ENSG00000156006.5,HGNC:7646,NAT2,N-acetyltransferase 2,NM_000015.3,NP_000006.2,ENST00000286479.4,ENSP00000286479.3,MANE Select,8,18391282,18401218,+
4,GeneID:12,ENSG00000196136.18,HGNC:16,SERPINA3,serpin family A member 3,NM_001085.5,NP_001076.2,ENST00000393078.5,ENSP00000376793.3,MANE Select,14,94612391,94624053,+


In [8]:
dbsnp_missense_variants_df = df_remove_NNE
dbsnp_missense_variants_df["gene_symbol"] = dbsnp_missense_variants_df["genes"].apply(lambda x: x.split(",")[0].split(":")[0])#HSFX4:101927685"
# gene2refseq_filtered_df[gene2refseq_filtered_df["Symbol"]=="ANK2"] # 73 refseq transcripts and 73 protein isoforms
merged_df = dbsnp_missense_variants_df.merge(gene2refseq_mapping_df, how="inner", left_on="gene_symbol", right_on=gene_symbol_col_name) # mapping clinvar mRNA transcripts to protein accession
merged_df.shape

(4987804, 29)

In [9]:
def filter_protein_isoforms_to_canonical(x):
    # return x["variations"]
    variations = x["variations"].split(",")
    # if len(variations) == 1: return ",".join(variations)
    new_variations = []
    for var in variations:
        if var.split(":")[0] == x["RefSeq_prot"]:
            new_variations.append(var)
    return ",".join(new_variations)

# merged_df[["variations", "RefSeq_prot"]].apply(xx, axis=1)
merged_df["new_variations"] = merged_df[["variations", "RefSeq_prot"]].apply(filter_protein_isoforms_to_canonical, axis=1)
print(merged_df.shape)
merged_df_remove_NNE = filter_remove_null_nan_empty_entries(merged_df, a_col_name="new_variations") # NNE: null, nan or empty variants
print(merged_df_remove_NNE.shape)

(4987804, 30)
Number of NAN rows removed: 0
Number of NULL rows removed: 0
Number of empty rows removed: 312738
(4675066, 30)


### Optimizing

In [10]:
merged_df_remove_NNE.columns
columns = ['snp_id', 'acc', 'chrpos', 'spdi', 'create_date', 'update_date',
           'gene_symbol', 'HGNC_ID', 'symbol', 'name', 'RefSeq_prot', 'new_variations']
optim_df = merged_df_remove_NNE[columns]
optim_df["snp_id"] = optim_df["snp_id"].apply(lambda x: "rs"+str(x))

# an rs_id can be mapped to multiple proteins, because MANE sometimes maps single gene to multiple proteins.
# the following is an example of that phenomena
optim_df = optim_df.rename(columns={'new_variations': 'variations'})
print(optim_df.shape)
print(len(list(optim_df["snp_id"].unique())))
print(optim_df["snp_id"].value_counts())
optim_df[optim_df["snp_id"]=="rs753176482"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  optim_df["snp_id"] = optim_df["snp_id"].apply(lambda x: "rs"+str(x))


(4675066, 12)
4646054
rs753176482     2
rs140002194     2
rs141704244     2
rs141536854     2
rs141497053     2
               ..
rs1186734094    1
rs1186176189    1
rs1186159489    1
rs1184408711    1
rs11798010      1
Name: snp_id, Length: 4646054, dtype: int64


Unnamed: 0,snp_id,acc,chrpos,spdi,create_date,update_date,gene_symbol,HGNC_ID,symbol,name,RefSeq_prot,variations
2580070,rs753176482,NC_000002.12,2:71664422,"NC_000002.12:71664421:C:G,NC_000002.12:7166442...",2015/04/01 22:25,2022/10/17 18:18,DYSF,HGNC:3097,DYSF,dysferlin,NP_003485.1,"NP_003485.1:p.Pro1681Ala,NP_003485.1:p.Pro1681Ser"
2580071,rs753176482,NC_000002.12,2:71664422,"NC_000002.12:71664421:C:G,NC_000002.12:7166442...",2015/04/01 22:25,2022/10/17 18:18,DYSF,HGNC:3097,DYSF,dysferlin,NP_001124459.1,"NP_001124459.1:p.Pro1720Ala,NP_001124459.1:p.P..."


### Mapping with ALFA population frequency data

In [11]:
filepath = home_dir+"data/ALFA_population_freq/freq.vcf.gz"
col_names = get_col_names_from_gzip(filepath, "#CHROM")
print(col_names)
alfa_popu_freq_df_chunk_iterator = pd.read_csv(filepath, compression='gzip', chunksize=1000000, comment="#", names=col_names, delim_whitespace=False, sep="\t",)
first_df = alfa_popu_freq_df_chunk_iterator.__next__()
first_df

['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMN10492695', 'SAMN10492696', 'SAMN10492697', 'SAMN10492698', 'SAMN10492699', 'SAMN10492700', 'SAMN10492701', 'SAMN10492702', 'SAMN11605645', 'SAMN10492703', 'SAMN10492704', 'SAMN10492705']


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMN10492695,...,SAMN10492697,SAMN10492698,SAMN10492699,SAMN10492700,SAMN10492701,SAMN10492702,SAMN11605645,SAMN10492703,SAMN10492704,SAMN10492705
0,NC_000001.9,144135212,rs1553120241,G,A,.,.,.,AN:AC,8560:5387,...,256:224,336:288,32:24,170:117,32:24,18:13,20:15,344:296,288:248,9432:6100
1,NC_000001.9,144148243,rs2236566,G,T,.,.,.,AN:AC,5996:510,...,0:0,0:0,0:0,0:0,0:0,0:0,84:8,0:0,0:0,6080:518
2,NC_000001.9,146267105,rs1553119693,T,G,.,.,.,AN:AC,37168:28800,...,56:44,1378:839,18:14,70:60,10:9,4836:3639,452:322,1414:861,66:53,44024:33749
3,NC_000001.9,148488564,.,C,A,.,.,.,AN:AC,8552:0,...,256:0,338:0,32:0,170:0,32:0,16:0,20:0,346:0,288:0,9424:0
4,NC_000001.10,2701535,rs371068661,C,T,.,.,.,AN:AC,134:9,...,0:0,48:1,0:0,0:0,0:0,0:0,188:15,48:1,0:0,370:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,NC_000001.11,3065495,.,G,T,.,.,.,AN:AC,6962:0,...,84:0,2210:0,146:0,610:0,24:0,94:0,466:0,2294:0,108:0,10680:0
999996,NC_000001.11,3065497,.,C,T,.,.,.,AN:AC,6962:0,...,84:0,2210:0,146:0,610:0,24:0,94:0,466:0,2294:0,108:0,10680:0
999997,NC_000001.11,3065498,rs981679567,C,"G,T",.,.,.,AN:AC,"9688:0,0",...,"86:0,0","2784:0,0","146:0,0","610:0,0","26:0,0","98:0,0","496:0,0","2898:0,0","112:0,0","14048:0,0"
999998,NC_000001.11,3065499,.,G,A,.,.,.,AN:AC,6962:0,...,84:0,2210:0,146:0,610:0,24:0,94:0,466:0,2294:0,108:0,10680:0


In [12]:
dbsnp_ids = list(optim_df["snp_id"].unique())
print(len(dbsnp_ids))

chunk_list = []
for i, alfa_popu_freq_df in enumerate(alfa_popu_freq_df_chunk_iterator):
    chunk = alfa_popu_freq_df[alfa_popu_freq_df["ID"].isin(dbsnp_ids)]
    print(i, chunk.shape)
    chunk_list.append(chunk)
    # dbsnp_df = dbsnp_df.merge(filtered_alfa_popu_freq_df, how="left", left_on="snp_id", right_on="ID") 
    # print(dbsnp_df.shape)
    # if i==0: break

filtered_alfa_popu_freq_df = pd.concat(chunk_list)    

out_filepath = home_dir+"data/ALFA_population_freq/freq_filtered.vcf.gz"
if os.path.exists(out_filepath): os.remove(out_filepath)
filtered_alfa_popu_freq_df.to_csv(out_filepath, sep="\t", compression='gzip', chunksize=1000000, index=False, mode="a", header=True)
filtered_alfa_popu_freq_df.shape

4646054
0 (5200, 21)
1 (8667, 21)
2 (13006, 21)
3 (10625, 21)
4 (11543, 21)
5 (12827, 21)
6 (8312, 21)
7 (13537, 21)
8 (5981, 21)
9 (9720, 21)
10 (9072, 21)
11 (9952, 21)
12 (12861, 21)
13 (8517, 21)
14 (2349, 21)
15 (10570, 21)
16 (2179, 21)
17 (2070, 21)
18 (5511, 21)
19 (3642, 21)
20 (1520, 21)
21 (2519, 21)
22 (3767, 21)
23 (361, 21)
24 (3408, 21)
25 (3303, 21)
26 (5676, 21)
27 (6519, 21)
28 (639, 21)
29 (4112, 21)
30 (1814, 21)
31 (688, 21)
32 (12630, 21)
33 (7093, 21)
34 (5982, 21)
35 (3813, 21)
36 (0, 21)
37 (0, 21)
38 (3822, 21)
39 (7766, 21)
40 (22367, 21)
41 (17159, 21)
42 (16528, 21)
43 (12488, 21)
44 (1776, 21)
45 (4456, 21)
46 (7628, 21)
47 (4279, 21)
48 (4476, 21)
49 (7196, 21)
50 (6156, 21)
51 (3435, 21)
52 (445, 21)
53 (963, 21)
54 (5332, 21)
55 (9199, 21)
56 (11319, 21)
57 (8436, 21)
58 (4118, 21)
59 (4160, 21)
60 (3690, 21)
61 (3427, 21)
62 (7464, 21)
63 (12263, 21)
64 (5571, 21)
65 (5045, 21)
66 (7062, 21)
67 (2979, 21)
68 (2618, 21)
69 (8968, 21)
70 (5499, 21)
71 (1

(4627886, 21)

In [15]:
print(filtered_alfa_popu_freq_df.shape)
len(list(filtered_alfa_popu_freq_df["ID"].unique()))

(4627886, 21)


4627886

In [21]:
dbsnp_with_prots_df = optim_df
dbsnp_with_prots_popufreq_df = dbsnp_with_prots_df.merge(filtered_alfa_popu_freq_df, how="inner", left_on="snp_id", right_on="ID") # mapping clinvar mRNA transcripts to protein accession
dbsnp_with_prots_popufreq_df.shape

(4656898, 33)

In [22]:
out_filepath = home_dir+"data/ALFA_population_freq/dbsnp_with_prots_and_population_freq_mapping.vcf.gz"
if os.path.exists(out_filepath): os.remove(out_filepath)
dbsnp_with_prots_popufreq_df.to_csv(out_filepath, sep="\t", compression='gzip', chunksize=1000000, index=False, mode="a", header=True)
dbsnp_with_prots_popufreq_df.shape

(4656898, 33)

In [23]:
dbsnp_with_prots_popufreq_df_iterator = pd.read_csv(out_filepath, sep="\t", compression='gzip', chunksize=10000)
dbsnp_with_prots_popufreq_first_df = dbsnp_with_prots_popufreq_df_iterator.__next__()
out_filepath = home_dir+"data/ALFA_population_freq/dbsnp_with_prots_and_population_freq_mapping_small.txt"
dbsnp_with_prots_popufreq_first_df.to_csv(out_filepath, sep="\t", index=False, header=True)

In [24]:
dbsnp_with_prots_popufreq_df.columns

Index(['snp_id', 'acc', 'chrpos', 'spdi', 'create_date', 'update_date',
       'gene_symbol', 'HGNC_ID', 'symbol', 'name', 'RefSeq_prot', 'variations',
       'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'SAMN10492695', 'SAMN10492696', 'SAMN10492697', 'SAMN10492698',
       'SAMN10492699', 'SAMN10492700', 'SAMN10492701', 'SAMN10492702',
       'SAMN11605645', 'SAMN10492703', 'SAMN10492704', 'SAMN10492705'],
      dtype='object')

### Filtering common, rare and ultra-rare variants (deprecated)

In [48]:
def filter_on_ALFA_mafs(all_mafs, maf_type):
    # maf: minor allele frequency
    alfa_mafs = [v.split(":")[1].strip() for v in all_mafs.split(",") if "ALFA" in v]
    if len(alfa_mafs)>1: 
        # It is checked that there is only one ALFA maf for each snp_id, which could be multiple. ie GnomAD_exomes:C=0.000004/1,TOPMED:C=0.000008/2,ALFA:C=0./0,ALFA:A=0./0
        # This error is for only next implementation
        # This error can be commented out without any problem
        raise NotImplementedError("A snp_id is mapped to multiple ALFA")
    alfa_maf= alfa_mafs[0]
    freq = float(alfa_maf[alfa_maf.index("=")+1:alfa_maf.index("/")])
    
    # the following ranges are taken from https://www.ncbi.nlm.nih.gov/snp/docs/gsr/alfa/ALFA_20201027095038/
    if maf_type=="common": return freq >= 0.01
    elif maf_type=="rare": return 0.01 > freq >= 0.001
    elif maf_type=="ultrarare": return 0.001 > freq >= 0.0


maf_type="ultrarare"
out_df = merged_df_remove_NNE[merged_df_remove_NNE["mafs"].apply(filter_on_ALFA_mafs, maf_type=maf_type)]
print(out_df.shape)
out_filepath = home_dir+f"data/dbsnp/filtered/dbsnp_HumanMissenseALFAVariants_{maf_type}.txt"
out_df.to_csv(out_filepath, sep="\t", index=False, header=True)


(4577439, 30)
