In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
home_dir = "../"

import pandas as pd
from utils.column_names_reader import get_col_names

In [2]:
inp_filepath = home_dir+"data/gene/gene2refseq.gz"
col_names = get_col_names(inp_filepath, "#tax_id")
gene2refseq_df_terator = pd.read_csv(inp_filepath, compression='gzip', comment='#', chunksize=10000, delim_whitespace=False, sep="\t", header=None, names=col_names)
df = gene2refseq_df_terator.__next__()
print(df.shape)
df.head()

(10000, 16)


Unnamed: 0,tax_id,GeneID,status,RNA_nucleotide_accession.version,RNA_nucleotide_gi,protein_accession.version,protein_gi,genomic_nucleotide_accession.version,genomic_nucleotide_gi,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,assembly,mature_peptide_accession.version,mature_peptide_gi,Symbol
0,24,77267466,,-,-,WP_011787424.1,500111419,NZ_CP104755.1,2310801391,0,1385,+,-,-,-,dnaA
1,24,77267467,,-,-,WP_011787425.1,500111420,NZ_CP104755.1,2310801391,1406,2506,+,-,-,-,dnaN
2,24,77267468,,-,-,WP_261762234.1,2310800657,NZ_CP104755.1,2310801391,2703,3785,+,-,-,-,recF
3,24,77267469,,-,-,WP_261762235.1,2310800658,NZ_CP104755.1,2310801391,3802,6219,+,-,-,-,gyrB
4,24,77267470,,-,-,WP_261762236.1,2310800659,NZ_CP104755.1,2310801391,6421,7467,+,-,-,-,N5094_RS00025


In [3]:
# filtering gene2refseq on criteria defined below
out_filepath = home_dir+"data/gene/gene2refseq_filtered.gz"
if os.path.exists(out_filepath): os.remove(out_filepath)

total_rows_filtered = 0
for i, df in enumerate(gene2refseq_df_terator):

    # keeping those that is associated with valid Protein-coding transcripts starting with "NM_"
    # keeping those that is associated with valid complete genomic molecule starting with "NC_"
    # keeping only those that associated with assymble GRCh38
    mask = (df["RNA_nucleotide_accession.version"].str.contains("NM_")) & (df["genomic_nucleotide_accession.version"].str.contains("NC_")) & (df["assembly"].str.contains("GRCh38"))
    df = df[mask]
    
    if not os.path.exists(out_filepath):
        df.to_csv(out_filepath, mode="a", compression='gzip', chunksize=10000, sep="\t", index=False, header=True)
    else:
        df.to_csv(out_filepath, mode="a", compression='gzip', chunksize=10000, sep="\t", index=False, header=False)
    total_rows_filtered += df.shape[0]
    if df.shape[0] !=0: print(i, df.shape)
    
    # if i==200: break

print(total_rows_filtered)

1650 (894, 16)
1651 (2531, 16)
1652 (2756, 16)
1653 (2643, 16)
1654 (2610, 16)
1655 (2767, 16)
1656 (2710, 16)
1657 (2548, 16)
1658 (2601, 16)
1659 (2615, 16)
1660 (2732, 16)
1661 (2460, 16)
1662 (2590, 16)
1663 (2071, 16)
1664 (2635, 16)
1665 (2511, 16)
1666 (2458, 16)
1667 (2539, 16)
1668 (2392, 16)
1669 (2394, 16)
1670 (2470, 16)
1671 (2351, 16)
1672 (2210, 16)
1673 (2238, 16)
1674 (1739, 16)
1675 (1409, 16)
1676 (690, 16)
1677 (490, 16)
1678 (156, 16)
1679 (171, 16)
1680 (68, 16)
1681 (98, 16)
1682 (13, 16)
1683 (3, 16)
1684 (6, 16)
1685 (26, 16)
1686 (54, 16)
1687 (68, 16)
1688 (21, 16)
63738


In [4]:
gene2refseq_filtered_filepath = home_dir+"data/gene/gene2refseq_filtered.gz"
gene2refseq_filtered_df = pd.read_csv(gene2refseq_filtered_filepath, compression='gzip', comment='#', delim_whitespace=False, sep="\t")#, header=None, names=col_names)
print(gene2refseq_filtered_df.shape)

print(gene2refseq_filtered_df[pd.isna(gene2refseq_filtered_df["GeneID"])].index)
print(gene2refseq_filtered_df[pd.isnull(gene2refseq_filtered_df["GeneID"])].index)
print(gene2refseq_filtered_df[gene2refseq_filtered_df['GeneID'] == ""].index)
print(gene2refseq_filtered_df[pd.isna(gene2refseq_filtered_df["Symbol"])].index)
print(gene2refseq_filtered_df[pd.isnull(gene2refseq_filtered_df["Symbol"])].index)
print(gene2refseq_filtered_df[gene2refseq_filtered_df['Symbol'] == ""].index)
print(gene2refseq_filtered_df[pd.isna(gene2refseq_filtered_df["protein_accession.version"])].index)
print(gene2refseq_filtered_df[pd.isnull(gene2refseq_filtered_df["protein_accession.version"])].index)
print(gene2refseq_filtered_df[gene2refseq_filtered_df["protein_accession.version"] == ""].index)

(63738, 16)
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')


In [6]:
# same gene can be in the X and Y chromosome. Must keep them. After merging NM_ to NM_, one must remove wrongly mapped variants. See example in clinvar_mapping_gene2refseq.
print(gene2refseq_filtered_df[["RNA_nucleotide_accession.version", "protein_accession.version"]].value_counts())
gene2refseq_filtered_df[gene2refseq_filtered_df["RNA_nucleotide_accession.version"]=="NM_001161532.2"] # 
# gene2refseq_filtered_df.groupby("RNA_nucleotide_accession.version").filter(lambda x: len(x) > 1)

RNA_nucleotide_accession.version  protein_accession.version
NM_005088.3                       NP_005079.2                  2
NM_001379165.1                    NP_001366094.1               2
NM_001379155.1                    NP_001366084.1               2
NM_001379156.1                    NP_001366085.1               2
NM_001161532.2                    NP_001155004.1               2
                                                              ..
NM_001320005.2                    NP_001306934.1               1
NM_001320006.2                    NP_001306935.1               1
NM_001320007.2                    NP_001306936.1               1
NM_001320010.2                    NP_001306939.1               1
NM_214711.4                       NP_999876.2                  1
Length: 63675, dtype: int64


Unnamed: 0,tax_id,GeneID,status,RNA_nucleotide_accession.version,RNA_nucleotide_gi,protein_accession.version,protein_gi,genomic_nucleotide_accession.version,genomic_nucleotide_gi,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,assembly,mature_peptide_accession.version,mature_peptide_gi,Symbol
3576,9606,1438,REVIEWED,NM_001161532.2,1824163926,NP_001155004.1,238908521,NC_000023.11,568815575,1268813,1325217,+,Reference GRCh38.p14 Primary Assembly,-,-,CSF2RA
3577,9606,1438,REVIEWED,NM_001161532.2,1824163926,NP_001155004.1,238908521,NC_000024.10,568815574,1268813,1325217,+,Reference GRCh38.p14 Primary Assembly,-,-,CSF2RA
