In [2]:
import os
import sys

# Put the parent of the current working directory on sys.path (once) so that
# packages living one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Prefix used when building every data path in this notebook.
home_dir = "../"

import time
import os
import pandas as pd
from utils.pandas_extented_filters import filter_drop_duplicates, filter_remove_null_nan_empty_entries
from utils.column_names_reader import get_col_names_from_gzip

### Merging downloaded missense variants

In [2]:
def clean_df(data_filepath):
    """Load one tab-separated dbSNP search-result file and de-duplicate it on snp_id.

    Args:
        data_filepath: Path of a tab-delimited text file that has a ``snp_id`` column.

    Returns:
        Whatever ``filter_drop_duplicates`` returns for the ``snp_id`` column
        (presumably the frame with one row per snp_id -- TODO confirm against
        the helper in ``utils.pandas_extented_filters``).
    """
    # `delim_whitespace=False` was dropped: it is the default, it is redundant next
    # to an explicit `sep`, and the keyword is deprecated since pandas 2.2.
    df = pd.read_csv(data_filepath, sep="\t")
    return filter_drop_duplicates(df, a_col_name="snp_id")

In [3]:
# The search results were downloaded in four numbered parts (_0 ... _3); load and
# de-duplicate each of them with the same helper instead of four copy-pasted calls.
# `data_filepath` ends up pointing at the last part, exactly as before.
part_dfs = []
for part_no in range(4):
    data_filepath = home_dir+"data/dbsnp/search_results/dbsnp_HumanMissenseALFAVariants_"+str(part_no)+".txt"
    part_dfs.append(clean_df(data_filepath))

# Keep the per-part names that the following cell concatenates.
df_0, df_1, df_2, df_3 = part_dfs

Number of duplicates rows removed: 11
Number of duplicates rows removed: 174
Number of duplicates rows removed: 0
Number of duplicates rows removed: 60


In [4]:
# Stack the four de-duplicated parts into one frame; ignore_index=True renumbers the rows.
downloaded_parts = (df_0, df_1, df_2, df_3)
df = pd.concat(downloaded_parts, ignore_index=True)
for summary in (df.shape, df.columns):
    print(summary)

(5416077, 14)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations'],
      dtype='object')


### Initial cleaning of downloaded dataset

In [5]:
# The same snp_id can also occur in more than one downloaded part, so the combined
# frame is de-duplicated once more.
df_remove_dup = filter_drop_duplicates(df, a_col_name="snp_id")
print(df_remove_dup.shape, df_remove_dup.columns, sep="\n")
# Optional export of the rows that have no protein-level variation:
# snvs_with_no_NP = df_remove_dup[pd.isna(df_remove_dup["variations"])]
# snvs_with_no_NP.to_csv(home_dir+"data/dbsnp/snvs_with_no_NP.csv", index=False, header=True)

# Filter on the `variations` column; the helper reports how many NaN / NULL / empty rows it removed.
df_remove_NNE = filter_remove_null_nan_empty_entries(df_remove_dup, a_col_name="variations") # NNE: null, nan or empty variants
print(df_remove_NNE.shape, df_remove_NNE.columns, sep="\n")

Number of duplicates rows removed: 275727
(5140350, 14)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations'],
      dtype='object')
Number of NAN rows removed: 151569
Number of NULL rows removed: 0
Number of empty rows removed: 0
(4988781, 14)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations'],
      dtype='object')


In [9]:
# Peek at the raw `genes` column; judging by the displayed values, each entry is a
# comma-separated list of SYMBOL:GeneID pairs.
df_remove_NNE["genes"]

0                            UGGT1:56886
1                             ITGA2:3673
2          NECAB3:63941,C20orf144:128864
3                           DPYSL5:56896
4                             TMCC2:9911
                       ...              
5416072                         HBB:3043
5416073                         LPL:4023
5416074                         LPL:4023
5416075                         LPL:4023
5416076                    THSD7A:221981
Name: genes, Length: 4988781, dtype: object

### Mapping to MANE proteins

In [7]:
# Gene -> RefSeq protein mapping table. Switch the source by (un)commenting one of these paths.
gene2refseq_mapping_filepath = home_dir+"data/refseq/MANE.GRCh38.v1.0.summary.txt.gz" # "symbol" is the Gene-symbol col
# gene2refseq_mapping_filepath = home_dir+"data/gene/gene2refseq_filtered.gz" # "Symbol" is the Gene-symbol col
# gene2refseq_mapping_filepath = home_dir+"data/refseq/HumanLRGRefSeqGene.txt.gz"  # "Symbol" is the Gene-symbol col

# The gene-symbol column is spelled differently depending on the chosen source file.
gene_symbol_col_name = "symbol" if "MANE" in gene2refseq_mapping_filepath else "Symbol"
# `delim_whitespace=False` was dropped: redundant with `sep` and deprecated since pandas 2.2.
gene2refseq_mapping_df = pd.read_csv(gene2refseq_mapping_filepath, compression='gzip', sep="\t")#, header=None, names=col_names)

print(gene2refseq_mapping_df.shape)
print(gene2refseq_mapping_df[gene_symbol_col_name].unique().shape) # some gene can be mapped to multiple proteins
# Guarded so that the cell still runs when a mapping file without this column is selected above.
if "MANE_status" in gene2refseq_mapping_df.columns:
    print(gene2refseq_mapping_df["MANE_status"].value_counts())
gene2refseq_mapping_df.head()

(19120, 14)
(19062,)
MANE Select           19062
MANE Plus Clinical       58
Name: MANE_status, dtype: int64


Unnamed: 0,#NCBI_GeneID,Ensembl_Gene,HGNC_ID,symbol,name,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand
0,GeneID:1,ENSG00000121410.12,HGNC:5,A1BG,alpha-1-B glycoprotein,NM_130786.4,NP_570602.2,ENST00000263100.8,ENSP00000263100.2,MANE Select,19,58345183,58353492,-
1,GeneID:2,ENSG00000175899.15,HGNC:7,A2M,alpha-2-macroglobulin,NM_000014.6,NP_000005.3,ENST00000318602.12,ENSP00000323929.8,MANE Select,12,9067708,9115919,-
2,GeneID:9,ENSG00000171428.15,HGNC:7645,NAT1,N-acetyltransferase 1,NM_000662.8,NP_000653.3,ENST00000307719.9,ENSP00000307218.4,MANE Select,8,18210109,18223689,+
3,GeneID:10,ENSG00000156006.5,HGNC:7646,NAT2,N-acetyltransferase 2,NM_000015.3,NP_000006.2,ENST00000286479.4,ENSP00000286479.3,MANE Select,8,18391282,18401218,+
4,GeneID:12,ENSG00000196136.18,HGNC:16,SERPINA3,serpin family A member 3,NM_001085.5,NP_001076.2,ENST00000393078.5,ENSP00000376793.3,MANE Select,14,94612391,94624053,+


In [19]:
# Some genes appear more than once in the mapping; DYSF, for example, has both a
# "MANE Plus Clinical" and a "MANE Select" row.
# Uses gene_symbol_col_name (instead of a hard-coded "symbol") to stay consistent
# with the cell that loads the mapping file.
print(gene2refseq_mapping_df[gene_symbol_col_name].value_counts())
gene2refseq_mapping_df[gene2refseq_mapping_df[gene_symbol_col_name]=="DYSF"]

CACNA1D         2
CACNA1C         2
DYNC2H1         2
SLC39A14        2
CLPB            2
               ..
ZNF432          1
SAFB2           1
DZIP3           1
MARF1           1
LOC122539214    1
Name: symbol, Length: 19062, dtype: int64


Unnamed: 0,#NCBI_GeneID,Ensembl_Gene,HGNC_ID,symbol,name,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand
5407,GeneID:8291,ENSG00000135636.16,HGNC:3097,DYSF,dysferlin,NM_003494.4,NP_003485.1,ENST00000258104.8,ENSP00000258104.3,MANE Plus Clinical,2,71453561,71686763,+
5408,GeneID:8291,ENSG00000135636.16,HGNC:3097,DYSF,dysferlin,NM_001130987.2,NP_001124459.1,ENST00000410020.8,ENSP00000386881.3,MANE Select,2,71466699,71686763,+


In [14]:
# Work on a copy so that adding a column does not touch df_remove_NNE.
dbsnp_missense_variants_df = df_remove_NNE.copy()

# A `genes` entry looks like "HSFX4:101927685", optionally followed by further
# comma-separated pairs; only the symbol of the FIRST listed gene is kept.
dbsnp_missense_variants_df["gene_symbol"] = dbsnp_missense_variants_df["genes"].apply(
    lambda genes_entry: genes_entry.partition(",")[0].partition(":")[0]
)
print(dbsnp_missense_variants_df["gene_symbol"].unique().shape)
# gene2refseq_filtered_df[gene2refseq_filtered_df["Symbol"]=="ANK2"] # 73 refseq transcripts and 73 protein isoforms

# Inner join on the gene symbol: every variant receives the mapping row(s) of its
# gene, and variants whose symbol is absent from the mapping table are dropped.
merged_df = pd.merge(
    dbsnp_missense_variants_df,
    gene2refseq_mapping_df,
    how="inner",
    left_on="gene_symbol",
    right_on=gene_symbol_col_name,
)
print(merged_df.shape)
print(merged_df.columns)
print(merged_df["gene_symbol"].unique().shape, merged_df[gene_symbol_col_name].unique().shape)


(18534,)
(4987804, 29)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations', 'gene_symbol', '#NCBI_GeneID',
       'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name', 'RefSeq_nuc',
       'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot', 'MANE_status',
       'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand'],
      dtype='object')
(18281,) (18281,)


In [15]:
def filter_protein_isoforms_to_canonical(x):
    """Keep only the variations that are reported on the row's own RefSeq protein.

    Args:
        x: Row-like mapping with a comma-separated ``variations`` string, whose
            entries start with an accession followed by ``:``, and a
            ``RefSeq_prot`` accession.

    Returns:
        The entries whose accession (the text before the first ``:``) equals
        ``x["RefSeq_prot"]``, joined with commas in their original order; an
        empty string when no entry matches.
    """
    candidates = x["variations"].split(",")
    target_accession = x["RefSeq_prot"]
    kept = [entry for entry in candidates if entry.partition(":")[0] == target_accession]
    return ",".join(kept)

# Row-wise filter: each row keeps only the variations located on its own RefSeq_prot.
# Note that this adds the `new_variations` column to merged_df itself.
isoform_filter_inputs = merged_df[["variations", "RefSeq_prot"]]
merged_df["new_variations"] = isoform_filter_inputs.apply(filter_protein_isoforms_to_canonical, axis=1)
print(merged_df.shape)

# Rows whose `new_variations` came out empty had no variation on that protein;
# the helper reports how many NaN / NULL / empty rows it removed.
merged_df_remove_NNE = filter_remove_null_nan_empty_entries(merged_df, a_col_name="new_variations") # NNE: null, nan or empty variants
print(merged_df_remove_NNE.shape)
for id_col in ("RefSeq_prot", "gene_symbol"):
    print(len(merged_df_remove_NNE[id_col].unique()))

(4987804, 30)
Number of NAN rows removed: 0
Number of NULL rows removed: 0
Number of empty rows removed: 312738
(4675066, 30)
18326
18268


### Optimizing: keep only the needed columns and prefix the SNP IDs with "rs"

In [16]:
print(merged_df_remove_NNE.columns)
columns = ['snp_id', 'acc', 'chrpos', 'spdi', 'create_date', 'update_date',
           'gene_symbol', 'HGNC_ID', 'symbol', 'name', 'RefSeq_prot', 'new_variations']
# Take an explicit copy of the column selection: assigning into the bare selection
# is what triggered pandas' SettingWithCopyWarning in this cell.
optim_df = merged_df_remove_NNE[columns].copy()
# dbSNP IDs are stored without their prefix; turn e.g. 753176482 into "rs753176482".
optim_df["snp_id"] = optim_df["snp_id"].apply(lambda x: "rs"+str(x))

# An rs ID can map to more than one protein, because MANE sometimes lists several
# proteins for a single gene. rs753176482 (DYSF), shown below, is an example of
# that phenomenon.
optim_df = optim_df.rename(columns={'new_variations': 'variations'})
print(optim_df.shape)
print(optim_df["snp_id"].unique().shape)
print(optim_df["snp_id"].value_counts())
optim_df[optim_df["snp_id"]=="rs753176482"]

Index(['snp_id', 'acc', 'chrpos', 'spdi', 'tax_id', 'snp_class', 'create_date',
       'update_date', 'clinical_significance', 'fxn_class', 'validated',
       'genes', 'mafs', 'variations', 'gene_symbol', '#NCBI_GeneID',
       'Ensembl_Gene', 'HGNC_ID', 'symbol', 'name', 'RefSeq_nuc',
       'RefSeq_prot', 'Ensembl_nuc', 'Ensembl_prot', 'MANE_status',
       'GRCh38_chr', 'chr_start', 'chr_end', 'chr_strand', 'new_variations'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  optim_df["snp_id"] = optim_df["snp_id"].apply(lambda x: "rs"+str(x))


(4675066, 12)
(4646054,)
rs753176482     2
rs140002194     2
rs141704244     2
rs141536854     2
rs141497053     2
               ..
rs1186734094    1
rs1186176189    1
rs1186159489    1
rs1184408711    1
rs11798010      1
Name: snp_id, Length: 4646054, dtype: int64


Unnamed: 0,snp_id,acc,chrpos,spdi,create_date,update_date,gene_symbol,HGNC_ID,symbol,name,RefSeq_prot,variations
2580070,rs753176482,NC_000002.12,2:71664422,"NC_000002.12:71664421:C:G,NC_000002.12:7166442...",2015/04/01 22:25,2022/10/17 18:18,DYSF,HGNC:3097,DYSF,dysferlin,NP_003485.1,"NP_003485.1:p.Pro1681Ala,NP_003485.1:p.Pro1681Ser"
2580071,rs753176482,NC_000002.12,2:71664422,"NC_000002.12:71664421:C:G,NC_000002.12:7166442...",2015/04/01 22:25,2022/10/17 18:18,DYSF,HGNC:3097,DYSF,dysferlin,NP_001124459.1,"NP_001124459.1:p.Pro1720Ala,NP_001124459.1:p.P..."


### Mapping with ALFA population frequency data

In [26]:
import gzip

filepath = home_dir+"data/ALFA_population_freq/freq.vcf.gz"
col_names = get_col_names_from_gzip(filepath, "#CHROM")
print(col_names)

# Parse only the first 1000 records for a quick look. Reading with `nrows` replaces
# the former chunk iterator + `__next__()` call, which left an open reader behind;
# the full-file chunk iterator is created in the next cell, where it is consumed.
# Lines starting with "#" (VCF meta lines and the header line) are skipped, hence
# the explicit column names.
first_df = pd.read_csv(filepath, compression='gzip', nrows=1000, comment="#", names=col_names, sep="\t")

# Print the first 11 raw lines of the file to show what the unparsed VCF looks like.
with gzip.open(filepath, "rt") as file_handle:
    for line_no, line in enumerate(file_handle):
        print(line)
        if line_no==10: break

['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMN10492695', 'SAMN10492696', 'SAMN10492697', 'SAMN10492698', 'SAMN10492699', 'SAMN10492700', 'SAMN10492701', 'SAMN10492702', 'SAMN11605645', 'SAMN10492703', 'SAMN10492704', 'SAMN10492705']
##fileformat=VCFv4.0	

##build_id=20201027095038	

##Population=https://www.ncbi.nlm.nih.gov/biosample/?term=GRAF-pop	

##FORMAT=<ID=AN,Number=1,Type=Integer,Description="Total allele count for the population, including REF">	

##FORMAT=<ID=AC,Number=A,Type=Integer,Description="Allele count for each ALT allele for the population">	

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMN10492695	SAMN10492696	SAMN10492697	SAMN10492698	SAMN10492699	SAMN10492700	SAMN10492701	SAMN10492702	SAMN11605645	SAMN10492703	SAMN10492704	SAMN10492705

NC_000001.9	144135212	rs1553120241	G	A	.	.	.	AN:AC	8560:5387	8:8	256:224	336:288	32:24	170:117	32:24	18:13	20:15	344:296	288:248	9432:6100

NC_000001.9	144148243	rs2236566	G	T	.	.	.	AN:AC	5996:510	

In [27]:
# do not run this unless you changed something before, it takes 1 hour
# `delim_whitespace=False` was dropped: redundant with `sep` and deprecated since pandas 2.2.
alfa_popu_freq_df_chunk_iterator = pd.read_csv(filepath, compression='gzip', chunksize=1000000, comment="#", names=col_names, sep="\t")

# Unique rs IDs of the protein-mapped variants. The array is built once and passed to
# `isin` as-is, so the multi-million-element Python list is not re-converted for
# every chunk; `dbsnp_ids` is still provided as a list.
unique_snp_ids = optim_df["snp_id"].unique()
dbsnp_ids = list(unique_snp_ids)
print(len(dbsnp_ids))

# Stream the file in 1,000,000-row chunks and keep only the records whose ID is one of our rs IDs.
chunk_list = []
for i, alfa_popu_freq_df in enumerate(alfa_popu_freq_df_chunk_iterator):
    chunk = alfa_popu_freq_df[alfa_popu_freq_df["ID"].isin(unique_snp_ids)]
    print(i, chunk.shape)  # progress: chunk number and the shape of the kept rows
    chunk_list.append(chunk)

filtered_alfa_popu_freq_df = pd.concat(chunk_list)

4646054
0 (18099, 21)
1 (5200, 21)
2 (8667, 21)
3 (13006, 21)
4 (10625, 21)
5 (11543, 21)
6 (12827, 21)
7 (8312, 21)
8 (13537, 21)
9 (5981, 21)
10 (9720, 21)
11 (9072, 21)
12 (9952, 21)
13 (12861, 21)
14 (8517, 21)
15 (2349, 21)
16 (10570, 21)
17 (2179, 21)
18 (2070, 21)
19 (5511, 21)
20 (3642, 21)
21 (1520, 21)
22 (2519, 21)
23 (3767, 21)
24 (361, 21)
25 (3408, 21)
26 (3303, 21)
27 (5676, 21)
28 (6519, 21)
29 (639, 21)
30 (4112, 21)
31 (1814, 21)
32 (688, 21)
33 (12630, 21)
34 (7093, 21)
35 (5982, 21)
36 (3813, 21)
37 (0, 21)
38 (0, 21)
39 (3822, 21)
40 (7766, 21)
41 (22367, 21)
42 (17159, 21)
43 (16528, 21)
44 (12488, 21)
45 (1776, 21)
46 (4456, 21)
47 (7628, 21)
48 (4279, 21)
49 (4476, 21)
50 (7196, 21)
51 (6156, 21)
52 (3435, 21)
53 (445, 21)
54 (963, 21)
55 (5332, 21)
56 (9199, 21)
57 (11319, 21)
58 (8436, 21)
59 (4118, 21)
60 (4160, 21)
61 (3690, 21)
62 (3427, 21)
63 (7464, 21)
64 (12263, 21)
65 (5571, 21)
66 (5045, 21)
67 (7062, 21)
68 (2979, 21)
69 (2618, 21)
70 (8968, 21)
71 (

In [28]:
# This cell belongs to the previous one: uncomment it to save the filtered ALFA table (freq_filtered.vcf.gz) after re-running that cell.
# out_filepath = home_dir+"data/ALFA_population_freq/freq_filtered.vcf.gz"
# if os.path.exists(out_filepath): os.remove(out_filepath)
# filtered_alfa_popu_freq_df.to_csv(out_filepath, sep="\t", compression='gzip', chunksize=1000000, index=False, mode="a", header=True)
# print(filtered_alfa_popu_freq_df.shape)
# print(filtered_alfa_popu_freq_df["ID"].unique().shape)
# filtered_alfa_popu_freq_df.head()

(4645985, 21)
(4645985,)


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMN10492695,...,SAMN10492697,SAMN10492698,SAMN10492699,SAMN10492700,SAMN10492701,SAMN10492702,SAMN11605645,SAMN10492703,SAMN10492704,SAMN10492705
14405,NC_000001.11,69116,rs757299236,G,A,.,.,.,AN:AC,7618:0,...,84:0,2708:0,146:0,610:0,24:0,94:0,470:1,2816:0,108:0,11862:1
14410,NC_000001.11,69134,rs781394307,A,G,.,.,.,AN:AC,7618:9,...,84:0,2708:0,146:0,610:5,24:0,94:0,470:0,2816:0,108:0,11862:14
14414,NC_000001.11,69149,rs1458521218,T,A,.,.,.,AN:AC,7618:3,...,84:0,2708:0,146:1,610:0,24:0,94:0,470:0,2816:0,108:0,11862:4
14419,NC_000001.11,69184,rs1231643173,G,A,.,.,.,AN:AC,7618:0,...,84:0,2708:0,146:0,610:0,24:0,94:0,470:0,2816:0,108:0,11862:0
14422,NC_000001.11,69200,rs1453226491,T,C,.,.,.,AN:AC,7618:0,...,84:0,2708:5,146:0,610:0,24:0,94:0,470:0,2816:5,108:0,11862:5


In [4]:
# Reload the previously saved, filtered ALFA table instead of re-scanning the full VCF.
out_filepath = home_dir+"data/ALFA_population_freq/freq_filtered.vcf.gz"
filtered_alfa_popu_freq_df = pd.read_csv(out_filepath, sep="\t")

# Sanity checks: overall shape, number of distinct IDs, and the REF / ALT allele distributions.
print(filtered_alfa_popu_freq_df.shape)
print(filtered_alfa_popu_freq_df["ID"].unique().shape)
for allele_col in ("REF", "ALT"):
    print(filtered_alfa_popu_freq_df[allele_col].value_counts())
filtered_alfa_popu_freq_df.head()


(4645985, 21)
(4645985,)
C                           1547270
G                           1534441
A                            783094
T                            781174
TA                                1
CCACTTCTTTAATCACCACCACCA          1
CCTT                              1
ATAGCCTCTCCTCCTGTTCACGT           1
AT                                1
ATCT                              1
Name: REF, dtype: int64
A                           1137912
T                           1135304
G                            826882
C                            823314
A,T                          236409
G,T                          154449
A,C                          153408
C,G                           69366
A,G                           32040
C,T                           31499
A,G,T                         19538
A,C,T                         19148
A,C,G                          2887
C,G,T                          2677
T,A                             275
T,G                             194
A,T,G          

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMN10492695,...,SAMN10492697,SAMN10492698,SAMN10492699,SAMN10492700,SAMN10492701,SAMN10492702,SAMN11605645,SAMN10492703,SAMN10492704,SAMN10492705
0,NC_000001.11,69116,rs757299236,G,A,.,.,.,AN:AC,7618:0,...,84:0,2708:0,146:0,610:0,24:0,94:0,470:1,2816:0,108:0,11862:1
1,NC_000001.11,69134,rs781394307,A,G,.,.,.,AN:AC,7618:9,...,84:0,2708:0,146:0,610:5,24:0,94:0,470:0,2816:0,108:0,11862:14
2,NC_000001.11,69149,rs1458521218,T,A,.,.,.,AN:AC,7618:3,...,84:0,2708:0,146:1,610:0,24:0,94:0,470:0,2816:0,108:0,11862:4
3,NC_000001.11,69184,rs1231643173,G,A,.,.,.,AN:AC,7618:0,...,84:0,2708:0,146:0,610:0,24:0,94:0,470:0,2816:0,108:0,11862:0
4,NC_000001.11,69200,rs1453226491,T,C,.,.,.,AN:AC,7618:0,...,84:0,2708:5,146:0,610:0,24:0,94:0,470:0,2816:5,108:0,11862:5


In [30]:
# Inner-join the protein-mapped variants with their ALFA records on the rs ID
# (`snp_id` on the left, the VCF `ID` column on the right).
dbsnp_with_prots_df = optim_df
dbsnp_with_prots_popufreq_df = pd.merge(
    dbsnp_with_prots_df,
    filtered_alfa_popu_freq_df,
    how="inner",
    left_on="snp_id",
    right_on="ID",
)
print(dbsnp_with_prots_popufreq_df.shape)
print(dbsnp_with_prots_popufreq_df.columns)
print(len(dbsnp_with_prots_popufreq_df["RefSeq_prot"].unique()))

(4674997, 33)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'create_date', 'update_date',
       'gene_symbol', 'HGNC_ID', 'symbol', 'name', 'RefSeq_prot', 'variations',
       'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'SAMN10492695', 'SAMN10492696', 'SAMN10492697', 'SAMN10492698',
       'SAMN10492699', 'SAMN10492700', 'SAMN10492701', 'SAMN10492702',
       'SAMN11605645', 'SAMN10492703', 'SAMN10492704', 'SAMN10492705'],
      dtype='object')
18326


In [31]:
out_filepath = home_dir+"data/ALFA_population_freq/dbsnp_with_prots_and_population_freq_mapping.vcf.gz"
# Write with the default mode="w", which replaces any previous export in one step;
# this supersedes the former "delete the file, then open it in append mode" sequence.
# Rows are written 1,000,000 at a time, as a gzip-compressed, tab-separated table.
dbsnp_with_prots_popufreq_df.to_csv(out_filepath, sep="\t", compression='gzip', chunksize=1000000, index=False, header=True)
dbsnp_with_prots_popufreq_df.shape

(4674997, 33)

In [23]:
# dbsnp_with_prots_popufreq_df_iterator = pd.read_csv(out_filepath, sep="\t", compression='gzip', chunksize=10000)
# dbsnp_with_prots_popufreq_first_df = dbsnp_with_prots_popufreq_df_iterator.__next__()
# out_filepath = home_dir+"data/ALFA_population_freq/dbsnp_with_prots_and_population_freq_mapping_small.txt"
# dbsnp_with_prots_popufreq_first_df.to_csv(out_filepath, sep="\t", index=False, header=True)

In [32]:
# Display the merged frame (the notebook shows an abbreviated view of its rows and columns).
dbsnp_with_prots_popufreq_df

Unnamed: 0,snp_id,acc,chrpos,spdi,create_date,update_date,gene_symbol,HGNC_ID,symbol,name,...,SAMN10492697,SAMN10492698,SAMN10492699,SAMN10492700,SAMN10492701,SAMN10492702,SAMN11605645,SAMN10492703,SAMN10492704,SAMN10492705
0,rs1491004261,NC_000002.12,2:128176888,NC_000002.12:128176887:T:C,2017/11/09 09:55,2022/10/12 21:08,UGGT1,HGNC:15663,UGGT1,UDP-glucose glycoprotein glucosyltransferase 1,...,86:0,2784:0,146:0,610:0,26:0,98:0,496:0,2898:0,112:0,14050:0
1,rs1490185682,NC_000002.12,2:128157261,NC_000002.12:128157260:T:C,2017/11/09 09:55,2022/10/12 21:08,UGGT1,HGNC:15663,UGGT1,UDP-glucose glycoprotein glucosyltransferase 1,...,86:0,2784:0,146:0,610:0,26:0,98:0,496:0,2898:0,112:0,14050:0
2,rs1489586107,NC_000002.12,2:128157294,NC_000002.12:128157293:A:G,2017/11/09 09:55,2022/10/12 21:08,UGGT1,HGNC:15663,UGGT1,UDP-glucose glycoprotein glucosyltransferase 1,...,0:0,20:0,354:0,18:0,0:0,0:0,4092:0,20:0,0:0,21382:1
3,rs1487964835,NC_000002.12,2:128138817,NC_000002.12:128138816:G:A,2017/11/09 09:55,2022/10/12 21:08,UGGT1,HGNC:15663,UGGT1,UDP-glucose glycoprotein glucosyltransferase 1,...,26:0,586:0,0:0,0:0,30:0,0:0,2276:0,594:0,56:0,8988:1
4,rs1485620659,NC_000002.12,2:128133175,NC_000002.12:128133174:A:C,2017/11/09 09:55,2022/10/12 21:08,UGGT1,HGNC:15663,UGGT1,UDP-glucose glycoprotein glucosyltransferase 1,...,86:0,2784:0,146:0,610:0,26:0,98:0,496:0,2898:0,112:0,14050:0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674992,rs201657622,NC_000001.11,1:144373466,NC_000001.11:144373465:C:T,2012/05/04 13:09,2022/10/12 17:08,PPIAL4E,HGNC:33997,PPIAL4E,peptidylprolyl isomerase A like 4E,...,0:0,0:0,0:0,0:0,0:0,0:0,8:0,0:0,0:0,4294:59
4674993,rs200452456,NC_000024.10,Y:23229611,NC_000024.10:23229610:A:T,2012/05/04 13:09,2022/10/13 18:31,DAZ2,HGNC:15964,DAZ2,deleted in azoospermia 2,...,10:0,260:0,0:0,0:0,12:0,0:0,1050:0,264:0,22:0,4262:2
4674994,rs142112856,NC_000004.12,4:9325189,NC_000004.12:9325188:A:G,2011/05/09 23:43,2022/10/13 04:44,USP17L24,HGNC:44453,USP17L24,ubiquitin specific peptidase 17 like family me...,...,84:35,2708:2323,146:34,610:37,24:10,94:7,470:138,2816:2427,108:45,11862:3689
4674995,rs28613881,NC_000004.12,4:9363153,NC_000004.12:9363152:A:G,2005/05/24 13:24,2022/10/13 04:44,USP17L30,HGNC:44458,USP17L30,ubiquitin specific peptidase 17 like family me...,...,84:42,2708:1618,146:77,610:292,24:12,94:47,470:243,2816:1685,108:54,11862:6624
