In [2]:
import os
import sys
home_dir = "../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import fileinput
from utils.ncbi_proteins import download_protein_list_mpi, download_protein_list, create_combined_fasta
from Bio import SeqIO

from Bio.PDB.Polypeptide import protein_letters_3to1 # 20 amino acids

In [3]:
def three_to_one(aa):
    """Map a three-letter amino-acid code to its one-letter symbol.

    The code is upper-cased before the lookup in ``protein_letters_3to1``,
    so the match is case-insensitive.

    Args:
        aa: Three-letter residue code, e.g. "Gly" or "GLY".

    Returns:
        The one-letter symbol, or the string "unknown" when the upper-cased
        code is not a key of ``protein_letters_3to1``.
    """
    code = str.upper(aa)
    return protein_letters_3to1.get(code, "unknown")

def get_variants_df(inp_filepath:str, pathogenicity_type=None) -> pd.DataFrame:
    """Extract single-residue, single-nucleotide variants from a ClinVar TSV export.

    A row of the input is kept only when all of the following hold:
      * its "Name" splits on whitespace into exactly two tokens, the second
        being a protein change such as "(p.Gly426Ser)";
      * the text between the two residue codes parses as one integer and both
        residue codes are recognised by `three_to_one`;
      * the first "Canonical SPDI" entry has a one-character reference allele
        and a one-character alternate allele.

    Args:
        inp_filepath: Tab-separated ClinVar export, handed to `pd.read_csv`.
        pathogenicity_type: Not used. Accepted so existing calls keep working;
            the "class" column is derived per row from
            "Clinical significance (Last reviewed)".

    Returns:
        DataFrame with one row per kept variant and the columns clinvar_id,
        gene_symbol, gene_id, snp_id, chrom_acc_version, chrom_pos,
        ref_allele, alt_allele, prot_acc_version, prot_pos, wt, mut, class.
        The columns are present even when no row is kept.

    Raises:
        Exception: any error raised while processing a row (missing column,
            non-string cell, malformed SPDI, ...) is reported together with
            the offending row and then re-raised unchanged.
    """
    df = pd.read_csv(inp_filepath, sep="\t")

    # Fixed output schema: without it an input with no usable rows would give
    # a frame with no columns and later lookups such as df["class"] would fail.
    out_columns = [
        "clinvar_id", "gene_symbol", "gene_id", "snp_id",
        "chrom_acc_version", "chrom_pos", "ref_allele", "alt_allele",
        "prot_acc_version", "prot_pos", "wt", "mut", "class",
    ]

    variations = []
    # read_csv is called without index_col, so the index is 0..n-1 and the
    # label `row_i` used with .loc coincides with the row position.
    for row_i in range(df.shape[0]):
        try:
            clinical_sig = df.loc[row_i, "Clinical significance (Last reviewed)"]
            # The label comes from the row itself, not from the
            # `pathogenicity_type` argument. Any value containing
            # "Likely pathogenic" (also combined ones) is "Likely-pathogenic".
            if "Likely pathogenic" in clinical_sig:
                variant_class = "Likely-pathogenic"
            else:
                variant_class = "Pathogenic"

            # Usable : NM_000478.6(ALPL):c.1276G>A (p.Gly426Ser)
            # Skipped: NM_032756.4(HPDL):c.[832G>A;91T>C]  (no protein-change token)
            name_parts = df.loc[row_i, "Name"].split()
            if len(name_parts) != 2:
                continue

            # Layout of the token: "(p." + 3-letter wt + position + 3-letter mut + ")"
            prot_variant = name_parts[1]
            try:
                prot_pos = int(prot_variant[6:-4])
            except ValueError:
                # Not a plain substitution, e.g.
                # NM_003001.5(SDHC):c.452_455delinsATGA (p.Ser151_Gly152delinsTyrGlu)
                continue

            wt = three_to_one(prot_variant[3:6])
            mut = three_to_one(prot_variant[-4:-1])
            if wt == "unknown" or mut == "unknown":
                continue  # not considering unknown protein variants

            # Several SPDIs may be joined with "|", e.g.
            # NC_000023.11:154532268:C:A|NC_000023.11:154532045:A:C -> take the 1st.
            spdi = df.loc[row_i, "Canonical SPDI"].split("|")[0]
            # The SPDI position (2nd field) is 0-indexed and deliberately ignored;
            # the 1-indexed GRCh38Location column is used instead.
            chrom_acc_version, _, ref_allele, alt_allele = spdi.split(":")
            if len(ref_allele) > 1 or len(alt_allele) > 1:
                continue  # only considering single nucleotide variants

            chrom_pos = int(df.loc[row_i, "GRCh38Location"])  # 1-indexed

            variations.append({
                "clinvar_id": df.loc[row_i, "VariationID"],
                "gene_symbol": df.loc[row_i, "Symbol"],
                "gene_id": df.loc[row_i, "GeneID"],
                "snp_id": df.loc[row_i, "dbSNP ID"],

                "chrom_acc_version": chrom_acc_version,
                "chrom_pos": chrom_pos,  # 1-indexed
                "ref_allele": ref_allele,
                "alt_allele": alt_allele,

                "prot_acc_version": df.loc[row_i, "protein_accession.version"],
                "prot_pos": prot_pos,  # NCBI prot variants are 1-indexed
                "wt": wt,  # 1-letter amino acid
                "mut": mut,
                "class": variant_class,
            })
        except Exception:
            # Show which row broke the parser, then let the error propagate.
            print("Error occured: ")
            print(row_i, df.loc[row_i])
            raise

    return pd.DataFrame(variations, columns=out_columns)

In [4]:
# Parse the ClinVar "pathogenic" missense export into one row per usable variant.
inp_filepath = f"{home_dir}data/clinvar/filtered/clinvar_HumanPathogenicMissenseVariants01012022To14022023.txt"
patho_variations_df = get_variants_df(inp_filepath=inp_filepath, pathogenicity_type="pathogenic")
print(patho_variations_df.shape)

(3802, 13)


In [5]:
# Parse the ClinVar "likely pathogenic" missense export into one row per usable variant.
inp_filepath = f"{home_dir}data/clinvar/filtered/clinvar_HumanLikelyPathogenicMissenseVariants01012022To14022023.txt"
likelypatho_variations_df = get_variants_df(inp_filepath=inp_filepath, pathogenicity_type="likely_pathogenic")
print(likelypatho_variations_df.shape)

(6703, 13)


In [6]:
# Merge the pathogenic and likely-pathogenic variants, then drop rows that are
# exact duplicates across all columns (the first occurrence is kept).
variations_df = (
    pd.concat([patho_variations_df, likelypatho_variations_df], ignore_index=True)
    .drop_duplicates(keep="first", ignore_index=True)
)
variations_df["class"].value_counts()

Likely-pathogenic    6668
Pathogenic           3614
Name: class, dtype: int64

In [7]:
# Fetch the sequence of every distinct protein accession referenced by the variants.
protein_acc_list = variations_df["prot_acc_version"].unique().tolist()
download_protein_list(protein_acc_list, start_i=0, home_dir=home_dir)  # sequential downloading
# MPI alternative: download_protein_list_mpi(protein_acc_list, len(protein_acc_list))
print("#-unique NCBI protein sequences downloaded: ", len(protein_acc_list))

0 NP_001381642.1 Already existis
1 NP_542172.2 Already existis
2 NP_001164006.1 Already existis
3 NP_002065.1 Already existis
4 NP_000806.2 Already existis
5 NP_003027.1 Already existis
6 NP_002608.1 Already existis
7 NP_056030.1 Already existis
8 NP_009193.2 Already existis
9 NP_001036146.1 Already existis
10 NP_004276.2 Already existis
11 NP_005017.3 Already existis
12 NP_004949.1 Already existis
13 NP_005948.3 Already existis
14 NP_055689.1 Already existis
15 NP_009203.2 Already existis
16 NP_055816.2 Already existis
17 NP_000076.2 Already existis
18 NP_002991.2 Already existis
19 NP_000469.3 Already existis
20 NP_001782.1 Already existis
21 NP_005817.1 Already existis
22 NP_000138.2 Already existis
23 NP_065184.2 Already existis
24 NP_006006.3 Already existis
25 NP_060116.2 Already existis
26 NP_116182.2 Already existis
27 NP_000301.1 Already existis
28 NP_683763.2 Already existis
29 NP_006507.2 Already existis
30 NP_001352928.1 Already existis
31 NP_060620.2 Already existis
32 NP_

In [8]:
# Keep only the variants whose protein sequence has at most MAX_SEQ_LEN residues,
# and attach the sequence and its length to every kept variant row.
MAX_SEQ_LEN = 1022  # NOTE(review): presumably the input limit of the downstream model — confirm
data = []
n_prots = len(protein_acc_list)
for i, prot in enumerate(protein_acc_list):
    filepath = home_dir+f"data/proteins/fastas/{prot}.fasta"
    # Only the first record of the file is read. Opening the file ourselves
    # guarantees the handle is closed; passing the path to SeqIO.parse and
    # abandoning the iterator leaves closing to garbage collection.
    with open(filepath) as fasta_handle:
        seq_record = next(SeqIO.parse(fasta_handle, format="fasta"))
    seq = str(seq_record.seq)
    seq_len = len(seq)
    if seq_len<=MAX_SEQ_LEN:
        print(f"{i}/{n_prots}")
        # .copy() so that adding columns does not write into a view of variations_df
        x = variations_df[variations_df['prot_acc_version']==prot].copy()
        x["seq_len"] = int(seq_len)
        x["seq"] = seq
        data.append(x)
# Rows keep their original index labels and are now grouped by protein.
variations_df = pd.concat(data)
variations_df["class"].value_counts()

Likely-pathogenic    4168
Pathogenic           2308
Name: class, dtype: int64

In [9]:
# Write the filtered variants as a TSV and build one combined FASTA for their proteins.
filename = "patho_and_likelypatho"
out_filepath = f"{home_dir}models/aa_common/datasets_pathogenicity/{filename}"

print("\nLog: saving variants ...")
variations_df.to_csv(f"{out_filepath}.tsv", sep="\t", header=True, index=False)


print("\nLog: Creating merged fasta document ...")
# Recomputed here because the sequence-length filter removed some proteins.
protein_acc_list = variations_df["prot_acc_version"].unique().tolist()
create_combined_fasta(protein_acc_list, f"{out_filepath}.fasta", home_dir)


Log: saving variants ...

Log: Creating merged fasta document ...
0 NP_001381642.1 Already existis
1 NP_542172.2 Already existis
2 NP_001164006.1 Already existis
3 NP_002065.1 Already existis
4 NP_000806.2 Already existis
5 NP_003027.1 Already existis
6 NP_002608.1 Already existis
7 NP_009193.2 Already existis
8 NP_004276.2 Already existis
9 NP_005948.3 Already existis
10 NP_055689.1 Already existis
11 NP_009203.2 Already existis
12 NP_000076.2 Already existis
13 NP_002991.2 Already existis
14 NP_000469.3 Already existis
15 NP_001782.1 Already existis
16 NP_005817.1 Already existis
17 NP_000138.2 Already existis
18 NP_065184.2 Already existis
19 NP_060116.2 Already existis
20 NP_116182.2 Already existis
21 NP_000301.1 Already existis
22 NP_683763.2 Already existis
23 NP_006507.2 Already existis
24 NP_060620.2 Already existis
25 NP_000365.3 Already existis
26 NP_116145.1 Already existis
27 NP_060209.4 Already existis
28 NP_000089.1 Already existis
29 NP_000783.2 Already existis
30 NP_7

In [10]:
# Class balance of `variations_df` as it stands after the sequence-length filter.
variations_df["class"].value_counts()

Likely-pathogenic    4168
Pathogenic           2308
Name: class, dtype: int64