In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
home_dir = "../"

import pandas as pd
from typing import List

In [2]:
def summarize_functional_effect(mut_function:str):
    # print(mut_function)
    effects = []
    effect = ""
    flag = False
    for i, ch in enumerate(mut_function):
        if ch=='[':
            flag = True
            continue
        if flag and ch in ['-', '+'] and mut_function[i+1] in ['-', '+', ']', ' ']:
            effect += ch
        elif flag and ch == '=':
            effects.append('=')
        elif flag and ch == '0' and mut_function[i+1]=="]":
            effects.append('0')
            
        if ch=="]":
            flag = False
            if len(effect)>0: effects.append(effect)
            effect=""
    
    if len(effects)>0: return effects
    else: return ""

def get_functional_effect(effects:List):
    counts = [] 
    functional_effect = ""
    for ef in effects:
        if "0" in ef:
            return "Knock-out"
        elif "=" in ef:
            functional_effect = "No-effect"
        else:
            counts.append(len(ef)-2)

    
    if len(counts)==0 and functional_effect=="No-effect": return "No-effect"
    elif max(counts)<3: return "Effect"
    else: return "Knock-out"

In [3]:
data_path = home_dir+f"data/pmd/dfPMD-NSFP43a_subset_func.tsv.gz"
pmd_df = pd.read_csv(data_path, sep="\t", compression="gzip")
print(pmd_df.shape)
print(pmd_df.columns.to_list())

pmd_df["wt"] = pmd_df["mut_real"].apply(lambda x: x[0]) # 1-letter amino acid
pmd_df["mut"] = pmd_df["mut_real"].apply(lambda x: x[-1])
pmd_df["prot_pos"] = pmd_df["mut_real"].apply(lambda x: int(x[1:-1])) # mutation position is 1-indexed

pmd_df["function_summarized"] = pmd_df["function"].apply(summarize_functional_effect)
pmd_df = pmd_df[pmd_df["function_summarized"].apply(lambda x:  len(x)>0)] # removing rows that does not have any functional effect annotations
pmd_df["functional_effect"] = pmd_df["function_summarized"].apply(get_functional_effect)
print("after removing non-functional annotations:", pmd_df.shape)

pmd_df["pmd_nr_id"] = pmd_df["pmd_id"]+pmd_df["nr"].apply(lambda x: "_"+str(int(x)))

pmd_df = pmd_df[pmd_df["source"].apply(lambda x: True if "Human" in str(x) else False)] # only keeping human variants
print("after removing non-human entries:", pmd_df.shape)

# pmd_df.head()
print(pmd_df["functional_effect"].value_counts())
print("#-of unique proteins", pmd_df[["pmd_id", "nr"]].drop_duplicates(keep="first").shape) # 2263
print("#-of unique genes", pmd_df["genename"].drop_duplicates(keep="first").shape) # 1089
print(pmd_df.shape)

  pmd_df = pd.read_csv(data_path, sep="\t", compression="gzip")


(9029, 667)
['mut_id', 'md5', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ens_real', 'uniprot_id.1', 'ensembl_id', 'other_ref', 'protein', 'source', 'expression_sys', 'mut_PMD', 'mut_real', 'function', 'fb', 'variant', 'col:Id', '#chr', 'pos(1-based)', 'ref', 'alt', 'aaref', 'aaalt', 'rs_dbSNP', 'hg19_chr', 'hg19_pos(1-based)', 'hg18_chr', 'hg18_pos(1-based)', 'aapos', 'genename', 'Ensembl_geneid', 'Ensembl_transcriptid', 'Ensembl_proteinid', 'Uniprot_acc', 'Uniprot_entry', 'HGVSc_ANNOVAR', 'HGVSp_ANNOVAR', 'HGVSc_snpEff', 'HGVSp_snpEff', 'HGVSc_VEP', 'HGVSp_VEP', 'APPRIS', 'GENCODE_basic', 'TSL', 'VEP_canonical', 'cds_strand', 'refcodon', 'codonpos', 'codon_degeneracy', 'Ancestral_allele', 'AltaiNeandertal', 'Denisova', 'VindijiaNeandertal', 'ChagyrskayaNeandertal', 'SIFT_score', 'SIFT_converted_rankscore', 'SIFT_pred', 'SIFT4G_score', 'SIFT4G_converted_rankscore', 'SIFT4G_pred', 'Polyphen2_HDIV_score', 'Polyphen2_HDIV_rankscore', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'Polyphe

In [150]:
# print(pmd_df["crossref"].unique().shape)

# def clean_crossref_ids(x):
#     x = x.split()[0]
#     if x[0]=="(":
#         x = x[1:]
#     if x[-1]==")":
#         x = x[:-1]
#     return x

# pmd_df['crossref_clean'] = pmd_df['crossref'].apply(clean_crossref_ids)
# crossref_ids = pmd_df['crossref_clean'].drop_duplicates(keep="first").values
# crossref_ids = list(crossref_ids)
# print(len(crossref_ids)) # 3898 ids
# crossref_ids = ", ".join(crossref_ids)
# print(len(crossref_ids.split(', ')))
# crossref_ids

# crossref_to_np_df = pd.read_csv(home_dir+f"data/uniprot_id_mapping_tools_outputs/pmd_crossref_to_np_mapping.tsv", sep="\t")
# crossref_to_np_df.rename(columns={"From": "crossref_clean", "To": "prot_acc_version"}, inplace=True)
# crossref_to_np_df = crossref_to_np_df[crossref_to_np_df["prot_acc_version"].apply(lambda x: x.startswith("NP_"))]
# print(crossref_to_np_df["crossref_clean"].drop_duplicates(keep="first").shape)
# crossref_to_np_df.value_counts()

In [4]:
info_cols = ['mut_id', 'md5', 'pmd_id', 'nr', 'pmd_nr_id', 'crossref', 'uniprot_id', 'protein', 'source', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos', '#chr', 'pos(1-based)', 'ref', 'alt', 'rs_dbSNP', 'genename', 'ensembl_id', 'function', 'function_summarized', 'functional_effect']
method_score_cols = ["CADD_raw", "MetaRNN_score", "MVP_score", "Polyphen2_HVAR_score", "REVEL_score", "SIFT_score", "integrated_fitCons_score", "phyloP17way_primate", "phastCons17way_primate", "bStatistic"]
other_cols = ['seq']
columns = info_cols + method_score_cols + other_cols
pmd_df = pmd_df[columns]

In [5]:
# filtering the dataset on the protein sequence len <=1022
sequences_df = pmd_df[['pmd_nr_id', 'crossref', 'seq']].drop_duplicates(keep="first")
print("#-of sequences b4 seq-len filter", sequences_df.shape[0])
protid_seq_tuple_list = [(tuple.pmd_nr_id, tuple.seq) for tuple in sequences_df.itertuples() if len(str(tuple.seq))<=1022]
print("#-of sequences after seq-len filter", len(protid_seq_tuple_list))

new_protein_acc_list = list(zip(*protid_seq_tuple_list))[0]
pmd_df = pmd_df[pmd_df["pmd_nr_id"].isin(new_protein_acc_list)]
print("after filtering on seq-len <=1022:", pmd_df.shape)
print(pmd_df["functional_effect"].value_counts())

print("#-of unique proteins", pmd_df[["pmd_id", "nr"]].drop_duplicates(keep="first").shape) # 2068
print("#-of unique genes", pmd_df["genename"].drop_duplicates(keep="first").shape) # 997

#-of sequences b4 seq-len filter 2237
#-of sequences after seq-len filter 2042
after filtering on seq-len <=1022: (7310, 35)
Effect       3859
No-effect    1809
Knock-out    1642
Name: functional_effect, dtype: int64
#-of unique proteins (2042, 2)
#-of unique genes (990,)


In [6]:
# computing missing rows for each methods
import numpy as np    
def compute_avg(x):
    x = str(x).split(";")
    return np.mean([float(i) for i in x if i!="."]) 

def compute_number_of_missing_rows(col_name): 
    avg_scores = pmd_df[col_name].apply(compute_avg)
    avg_scores = avg_scores[pd.isna(avg_scores)]
    n_missing_rows = avg_scores.shape[0]
    total_rows = pmd_df.shape[0]
    print(col_name, f"\t100*({n_missing_rows}/{total_rows})=", f"{100*(n_missing_rows/total_rows):.4f}")

for col_name in method_score_cols:
    compute_number_of_missing_rows(col_name)
    # break

# pmd_df["MVP_score"].value_counts()

  return _methods._mean(a, axis=axis, dtype=dtype,


CADD_raw 	100*(3/7310)= 0.0410
MetaRNN_score 	100*(3/7310)= 0.0410
MVP_score 	100*(116/7310)= 1.5869
Polyphen2_HVAR_score 	100*(138/7310)= 1.8878
REVEL_score 	100*(53/7310)= 0.7250
SIFT_score 	100*(168/7310)= 2.2982
integrated_fitCons_score 	100*(419/7310)= 5.7319
phyloP17way_primate 	100*(0/7310)= 0.0000
phastCons17way_primate 	100*(0/7310)= 0.0000
bStatistic 	100*(61/7310)= 0.8345


In [7]:
pmd_df.to_csv(home_dir+"data/pmd/pmd_with_dbnsfp.tsv", sep="\t", index=False)
# pmd_df.to_csv(home_dir+"models/aa_common/datasets_pmd_analysis/pmd_data.tsv", sep="\t", index=False)
# sequences_df = pmd_df[['pmd_nr_id', 'crossref', 'seq']].drop_duplicates(keep="first")
# print("#-of sequences", sequences_df.shape)

# out_fasta_filepath = home_dir+"models/aa_common/datasets_pmd_analysis/pmd_sequences.fasta"
# out_fasta_file_handle = open(out_fasta_filepath, "w")

# for tuple in sequences_df.itertuples():
#     out_fasta_file_handle.write(f">{tuple.pmd_nr_id} | {tuple.crossref}\n")
#     out_fasta_file_handle.write(f"{tuple.seq}\n")
#     # break
# out_fasta_file_handle.close()  

### testing different things here...

In [156]:
pmd_df[["pmd_nr_id", "prot_pos"]].value_counts()
pmd_df[(pmd_df["pmd_nr_id"]=="A000006_2") & (pmd_df["prot_pos"]==271)][["wt", "mut", "prot_pos"]]

Unnamed: 0,wt,mut,prot_pos
0,C,Y,271


In [157]:
print("#-of rows mapped to rs-ids", pmd_df[pmd_df["rs_dbSNP"]!="."].shape)  
print("#-rows not mapped to rs-ids", pmd_df[pmd_df["rs_dbSNP"]=="."].shape)

#-of rows mapped to rs-ids (2311, 49)
#-rows not mapped to rs-ids (4999, 49)


In [158]:
print(pmd_df[pmd_df["crossref"].apply(lambda x: False if "HUMAN" in x else True)]["crossref"].unique())
print(pmd_df[pmd_df["source"].apply(lambda x: False if "Human" in str(x) else True)]["source"].unique())
pmd_df[pmd_df["source"].apply(lambda x: True if "Human" in str(x) else False)].shape

['DIRECT1' 'S10AA_RABIT' 'B45022' 'PPIA_PONPY' 'FKB1A_RABIT' 'A60386']
[]


(7310, 49)