In [9]:
import pandas as pd
import os
from collections import Counter

In [49]:
# Lookup table of predictor score names. Only the *values* are consumed in this
# notebook: they are collected into all_feat below and used as column labels.
# NOTE(review): the key "Polyphen2_HVAR_scroe" looks like a typo of "..._score";
# left untouched because the keys may be referenced outside this notebook.
score_mapping = {
    "GenoCanyon_score": "GenoCanyon_score",
    "M-CAP_score": "M-CAP_score",
    "MPC_score": "MPC_score",
    "MutationAssessor_score": "MutationAssessor_score",
    "Polyphen2_HVAR_scroe": "Polyphen2_HVAR_score",
    "CADD_score": "CADD_raw",
    "DEOGEN2_score": "DEOGEN2_score",
    "FATHMM_score": "FATHMM_score",
    "Integrated_fitCons_score": "integrated_fitCons_score",
    "LRT_score": "LRT_score",
    "Provean_score": "PROVEAN_score",
    "SIFT4G_score": "SIFT4G_score",
    "VEST4_score": "VEST4_score",
}

# Variant consequence labels. Not referenced by any other cell visible here.
variant_list = [
    "upstream_gene_variant",
    "splice_donor_variant",
    "NMD_transcript_variant",
    "non_coding_transcript_variant",
    "synonymous_variant",
    "splice_acceptor_variant",
    "5_prime_UTR_variant",
    "incomplete_terminal_codon_variant",
    "non_coding_transcript_exon_variant",
    "splice_region_variant",
    "splice_polypyrimidine_tract_variant",
    "intron_variant",
    "missense_variant",
    "downstream_gene_variant",
    "coding_sequence_variant",
    "3_prime_UTR_variant"
]

# Every mapped score column (in dict insertion order) followed by the two extra
# feature columns.
all_feat = [*score_mapping.values(), "APF_score", "LoFtool"]

In [4]:
# Change the working directory so that the relative "output/..." and "data/..."
# paths used by the read/write calls in this notebook resolve against this folder.
# NOTE(review): hard-coded absolute local path; the notebook only runs on a
# machine that has this directory.
os.chdir("d:/functional-prediction")
# Alternative input files are left commented out; only allSNP.csv is loaded.
# df = pd.read_csv("output/NovelCommonSNP.csv")
df = pd.read_csv("output/allSNP.csv")
# df = pd.read_csv("output/KnownRareSNP.csv")
# df = pd.read_csv("output/NovelRareSNP.csv")

In [51]:
# Keep only the columns listed in all_feat (a label missing from df raises KeyError).
# This rebinds df, so the full table read from CSV is no longer reachable by this name.
df = df[all_feat]

In [52]:
# Coverage of each feature column: number of non-null cells and that number as a
# fraction of all rows (rounded to 4 decimals).
n_rows = len(df)
feat_count_list = [
    {"feat": name, "count": present, "ratio": round(present / n_rows, 4)}
    for name, present in ((label, df[label].notnull().sum()) for label in df.columns)
]

# One row per feature, least-populated feature first.
df_count = pd.DataFrame(feat_count_list).sort_values(by=["count"])

# Feature names in that same ascending-coverage order.
feat_list = df_count["feat"].values

print(feat_list)

['DEOGEN2_score' 'M-CAP_score' 'MPC_score' 'MutationAssessor_score'
 'LRT_score' 'FATHMM_score' 'PROVEAN_score' 'Polyphen2_HVAR_score'
 'integrated_fitCons_score' 'VEST4_score' 'SIFT4G_score' 'LoFtool'
 'APF_score' 'GenoCanyon_score' 'CADD_raw']


In [53]:
# Save the per-feature coverage table as tab-separated text, without the index column.
df_count.to_csv("output/column_ratio.tsv", sep="\t", index=False)

In [54]:
# Load the binary score table; the bare len() as the cell's last expression
# displays its row count.
df_binary = pd.read_csv("output/qd_binary_score.csv")
len(df_binary)

487989

In [55]:
# For every column whose name contains "score_eliminate_", record how many rows
# score above 0.5 ("positive") and what fraction of all rows that is.
col_dict = {
    "column": [],
    "positive": [],
    "ratio": []
}
for col in df_binary.columns:
    if "score_eliminate_" not in col:
        continue
    scores = df_binary[col]
    # Vectorized threshold count instead of materialising a Python list with one
    # element per row. NaN > 0.5 evaluates to False, so missing scores are counted
    # as non-positive, exactly as the element-wise `x > 0.5` test did.
    n_positive = int((scores > 0.5).sum())

    col_dict["column"].append(col)
    col_dict["positive"].append(n_positive)
    # The denominator is every row in the column, including rows with missing scores.
    col_dict["ratio"].append(n_positive / len(scores))


In [56]:
# Turn the collected per-column lists into a table (one row per matching column)
# and save it as tab-separated text, without the index column.
df_eliminate = pd.DataFrame(col_dict)
df_eliminate.to_csv("output/column_eliminate.tsv", sep="\t", index=False)

In [57]:
# Read the tab-separated LoFtool table and index its percentile column by gene.
df_lof = pd.read_csv("data/LoFtool_scores.txt", sep="\t")

# Gene -> LoFtool_percentile; if a gene appears on several rows the last row wins.
lof_dict = {
    gene_name: percentile
    for gene_name, percentile in zip(
        df_lof["Gene"].values,
        df_lof["LoFtool_percentile"].values,
    )
}

In [58]:
# Per-gene summary of the "score_eliminate_0" column: number of rows for the gene,
# number of rows scoring above 0.5 ("positive"), their ratio, and the gene's
# LoFtool percentile from lof_dict ("" when the gene has no entry there).
gene_dict = {
    "gene": [],
    "variant_count": [],
    "positive": [],
    "ratio": [],
    "LOF": []
}
# Group on the scalar key "gene", not the list ["gene"]: from pandas 2.0 on,
# iterating a groupby built from a length-1 list yields 1-tuple keys such as
# ("CYP2C19",), which would never match lof_dict and would put tuples in the
# "gene" column. A scalar key yields plain gene names on every pandas version.
for gene, scores in df_binary.groupby("gene")["score_eliminate_0"]:
    col_len = len(scores)
    # NaN > 0.5 is False, so rows with a missing score count as non-positive.
    col_sum = int((scores > 0.5).sum())
    col_ratio = col_sum / col_len

    gene_dict["gene"].append(gene)
    gene_dict["variant_count"].append(col_len)
    gene_dict["positive"].append(col_sum)
    gene_dict["ratio"].append(col_ratio)
    gene_dict["LOF"].append(lof_dict.get(gene, ""))

In [59]:
# Number of genes summarised (one list entry was appended per group).
len(gene_dict["gene"])

249

In [60]:
# Rank genes by their positive ratio, highest first, then preview the first 50 rows.
df_gene_ratio = pd.DataFrame(gene_dict).sort_values(by="ratio", ascending=False)
df_gene_ratio.head(50)

Unnamed: 0,gene,variant_count,positive,ratio,LOF
84,CYP2C19,4304,4272,0.992565,0.923
215,SLCO1B1,4286,4241,0.989501,0.994
83,CYP2C18,2132,2099,0.984522,0.857
86,CYP2C9,2183,2144,0.982135,0.903
214,SLCO1A2,4821,4707,0.976353,0.994
238,UGT2A1,2716,2642,0.972754,0.903
128,GSTA1,602,583,0.968439,0.89
85,CYP2C8,1292,1249,0.966718,0.924
22,ADH7,793,765,0.964691,0.972
129,GSTA2,578,555,0.960208,0.846


In [61]:
# Rows 50-99 of the ranking, selected by position (not by index label).
df_gene_ratio.iloc[50:100]

Unnamed: 0,gene,variant_count,positive,ratio,LOF
114,EPHX2,2119,1413,0.666824,0.949
172,PROS1,3659,2431,0.664389,0.27
149,MGMT,13372,8803,0.658316,0.823
79,CYP2A6,403,263,0.652605,0.916
122,FMO4,964,625,0.64834,
104,CYP4F8,638,413,0.647335,
123,FMO5,1402,904,0.644793,0.978
92,CYP39A1,3624,2300,0.634658,0.428
131,GSTA4,590,374,0.633898,0.736
15,ACE,975,614,0.629744,0.822


In [62]:
# Rows 100-149 of the ranking, selected by position (not by index label).
df_gene_ratio.iloc[100:150]

Unnamed: 0,gene,variant_count,positive,ratio,LOF
20,ADH5,598,223,0.37291,0.826
246,XDH,3333,1217,0.365137,0.922
223,SULT1A1,259,90,0.34749,0.205
170,PPP1R9A,12886,4421,0.343086,0.146
87,CYP2D6,311,106,0.340836,
165,PON2,965,325,0.336788,0.823
143,IL12B,546,182,0.333333,0.69
58,CHST8,3822,1196,0.312925,0.629
196,SLC22A2,1745,540,0.309456,0.614
184,SLC10A1,818,253,0.309291,0.99


In [63]:
# Rows 150-199 of the ranking, selected by position (not by index label).
df_gene_ratio.iloc[150:200]

Unnamed: 0,gene,variant_count,positive,ratio,LOF
180,RYR1,6804,612,0.089947,0.00248
148,METTL1,112,10,0.089286,0.409
117,F5,2546,221,0.086803,0.0863
139,GSTZ1,431,37,0.085847,0.929
70,CYP1B1,354,30,0.084746,0.351
126,GBA,390,32,0.082051,0.00827
75,CYP26C1,316,25,0.079114,
21,ADH6,503,38,0.075547,
9,ABCC6,3003,221,0.073593,0.0194
77,CYP27B1,184,13,0.070652,0.456


In [64]:
# Rows from position 200 up to (at most) 249 of the ranking, selected by position.
df_gene_ratio.iloc[200:250]

Unnamed: 0,gene,variant_count,positive,ratio,LOF
147,MAT1A,682,18,0.026393,0.0748
209,SLC5A6,536,14,0.026119,0.125
44,CBR3,465,12,0.025806,0.601
190,SLC19A1,1423,36,0.025299,0.187
197,SLC22A3,3616,89,0.024613,0.682
199,SLC22A5,948,22,0.023207,0.403
221,SPG7,2837,64,0.022559,0.103
45,CDA,1164,24,0.020619,0.229
5,ABCC2,2377,49,0.020614,0.0978
39,ATP7B,2970,61,0.020539,0.034


In [65]:
# Save the ranked per-gene table as tab-separated text, without the index column.
df_gene_ratio.to_csv("output/gene_conservative_ratio.tsv", sep="\t", index=False)

In [5]:
# Load the multi-class score table; the bare len() as the cell's last expression
# displays its row count.
df_multi = pd.read_csv("output/qd_multi_score.csv")
len(df_multi)

51645

In [7]:
# Pull the "score_eliminate_0" column out as a plain array of values.
result = df_multi["score_eliminate_0"].values

In [14]:
# Tally how often each distinct value occurs in result, then display the tally
# for the "normal function" label (a Counter returns 0 for a label it never saw).
c_dict = Counter(result)
c_dict["normal function"]

30692

In [15]:
# Fraction of rows whose "score_eliminate_0" value is anything other than
# "normal function". Derived from the loaded table and the tally instead of
# numbers hand-copied from earlier cell outputs, so it cannot go stale when the
# input file changes.
(len(df_multi) - c_dict["normal function"]) / len(df_multi)

0.40571207280472454