In [27]:
import os
import warnings
# Switch the process working directory to the deployed script folder; the
# relative paths and os.getcwd() calls later in this notebook resolve against it.
os.chdir('/var/www/html/predicationScript')
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

In [28]:
import os
import re
import sys
import math
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Identifier of the upload to process; it is interpolated into the
# fileUpload/<rndNum>/outFiles path further down in this cell.
rndNum = '931'
# NOTE(review): not referenced in the visible part of this notebook —
# presumably a protein accession for the run; confirm where it is consumed.
uniID = 'P51688'

# Alternative run configuration kept for reference:
# rndNum = '672'
# uniID = 'P34059'

def _summed_property(residues, table, table_name):
    """Return the sum of ``table`` values over every character of ``residues``.

    A single-letter entry yields that residue's value; a multi-letter entry
    yields the sum over its letters; an empty string yields 0.0.

    Raises
    ------
    KeyError
        If a character of ``residues`` has no entry in ``table``.
    """
    values = []
    for residue in residues:
        value = table.get(residue)
        if value is None:
            raise KeyError(f"residue {residue!r} not found in {table_name}")
        values.append(float(value))
    return math.fsum(values)


def _property_delta(df, table, table_name):
    """Return, per row of ``df``, the mutant-minus-wild summed property."""
    return [
        _summed_property(mutant, table, table_name)
        - _summed_property(wild, table, table_name)
        for mutant, wild in zip(df["Mutant"], df["Wild"])
    ]


def biological_feature(df):
    """Add residue-property and position-annotation columns to ``df``.

    Lookup tables are read from CSV files located one directory above the
    current working directory:

    * ``../residual_volume.csv``, ``../polarity.csv``, ``../hydrophobicity.csv``
      are each turned into a mapping with ``dict(table.values)``, so every row
      must hold exactly two fields (key, value). Keys are looked up by single
      residue letter.
    * ``../domain.csv`` must contain the columns ``Domain`` and
      ``Modification`` plus exactly one further column that serves as the
      lookup key (presumably the residue position; confirm against the file).

    Columns written to ``df`` (in this order):

    * ``dd_changeinresidualvolume`` - mutant minus wild residual volume
    * ``Domain`` / ``Modification`` - values found for ``int(pos)``; missing
      when the position is not listed in ``domain.csv``
    * ``dd_polarity`` - mutant minus wild polarity
    * ``dd_hydrophobicity`` - mutant minus wild hydrophobicity

    Entries of ``Mutant`` / ``Wild`` longer than one letter are treated as a
    sequence of residues whose property values are summed. All property values
    are converted to ``float`` before subtraction, for single- and multi-letter
    entries alike.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``"Mutant"``, ``"Wild"`` and ``"pos"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.

    Raises
    ------
    KeyError
        If a residue letter is absent from one of the property tables.
    """
    dom = pd.read_csv('../domain.csv')
    rdv = pd.read_csv('../residual_volume.csv')

    # Residual volume
    aa_RV_dict = dict(rdv.values)
    df["dd_changeinresidualvolume"] = _property_delta(
        df, aa_RV_dict, 'residual_volume.csv'
    )

    # Domain and Modification: dropping one annotation column leaves the
    # (key, annotation) pairs that dict() needs.
    dom_dom = dict(dom.drop(columns="Modification").values)
    dom_mod = dict(dom.drop(columns="Domain").values)
    positions = [int(p) for p in df["pos"]]
    df["Domain"] = [dom_dom.get(p) for p in positions]
    df["Modification"] = [dom_mod.get(p) for p in positions]

    # Polarity and Hydrophobicity
    pol = pd.read_csv('../polarity.csv')
    hydrophobicity = pd.read_csv('../hydrophobicity.csv')
    aa_pol_dic = dict(pol.values)
    aa_hydro_dic = dict(hydrophobicity.values)

    # Compute both deltas before assigning so a lookup failure in either one
    # leaves neither column half-written.
    dd_pol = _property_delta(df, aa_pol_dic, 'polarity.csv')
    dd_hydrophobicity = _property_delta(df, aa_hydro_dic, 'hydrophobicity.csv')

    df["dd_polarity"] = dd_pol
    df["dd_hydrophobicity"] = dd_hydrophobicity
    return df

def create_class(df):
    """Split every entry of ``df["Mutation"]`` around its digit runs.

    Each mutation string is split with the pattern ``(\\d+)``; the pieces used
    are the text before the first digit run (wild residue), the first digit
    run itself (position, kept as a string) and the text following that digit
    run up to the next digit run or the end of the string (mutant residue).
    For example ``"A12G"`` yields ``"A"``, ``"12"`` and ``"G"``.

    Parameters
    ----------
    df : mapping or pandas.DataFrame
        Object whose ``"Mutation"`` item is an iterable of strings.

    Returns
    -------
    list
        ``[wild, mutant, pos]`` - three lists aligned with the input order.

    Raises
    ------
    IndexError
        If an entry contains no digits, so no position/mutant parts exist.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    digit_run = re.compile(r'(\d+)')
    wild = []
    atlaa = []
    pos = []
    for ele in df["Mutation"]:
        eleLst = digit_run.split(ele)
        wild.append(eleLst[0])
        atlaa.append(eleLst[2])
        pos.append(eleLst[1])
    return [wild, atlaa, pos]

# Locate this run's tool outputs: <parent of cwd>/fileUpload/<rndNum>/outFiles.
# NOTE: os.chdir changes the working directory for every later cell as well,
# so the '../' paths below resolve to fileUpload/<rndNum>/.
prntDir = os.getcwd().rsplit('/',1)[0]
os.chdir(f"{prntDir}/fileUpload/{rndNum}/outFiles")

# Every directory entry is treated as one tool's output file; its score column
# is named after the text before the first '.' plus '_score'.
tools = sorted(os.listdir())
feature = [tool.split('.')[0] + '_score' for tool in tools]

# Start from the list of mutations (one per line, no header row).
df = pd.DataFrame()
Mut = pd.read_csv('../mutation.txt', header=None)
df["Mutation"] = Mut

# Names of tool output files that parsed to zero rows.
empty_df = []
for i,j in zip(tools,feature):
    # Read each file once and reuse the parsed frame.
    outF = pd.read_csv(i)
    if outF.empty:
        empty_df.append(i)
    else:
        # Align the tool's scores to the mutation list; mutations the tool
        # did not score become 0.
        df[j] =  df['Mutation'].map(outF.set_index('Mutation')['Score'])
        df = df.fillna(0)

# Split each mutation into wild residue, mutant residue and integer position.
mut_info = create_class(df)
df.insert(1,"Wild",mut_info[0])
df.insert(2,"Mutant",mut_info[1])
df.insert(3,"pos",list(map(int, mut_info[2])))
# new_df = biological_feature(df)
# new_df is the same object as df (no copy), so the column added below is
# visible through both names.
new_df = df

# adding Clinical Features to the dataframe
# The CSV is read with explicit column names; mutations without a clinical
# entry get phenotype 0.
cliniFile = pd.read_csv('../MPS_IIIA_Clinical_Significance.csv', names=['Mut','clinical'])
new_df['clinical_phenotype'] = new_df.Mutation.map(cliniFile.set_index('Mut')['clinical']).fillna(0).astype(int)

In [30]:
# new_df

In [35]:
# Feature Selection using One-way ANOVA test 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

# Predictors: every column except the identifiers and the target, coerced to
# numeric (non-numeric cells become NaN).
X = new_df.drop(columns=["Mutation","Wild","Mutant","pos","clinical_phenotype"]).apply(pd.to_numeric, errors = 'coerce')
y = pd.DataFrame(new_df['clinical_phenotype'], columns=['clinical_phenotype']).apply(pd.to_numeric, errors = 'coerce')

# Standardise the predictors, keeping the original column names.
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X),columns = X.columns)
# X_test = pd.DataFrame(ss.transform(X_test),columns = X_test.columns)

# Cap k at the number of available features so the selector also works when
# fewer than 10 score columns exist.
bestfeatures = SelectKBest(score_func = f_classif, k=min(10, X.shape[1]))
# y stays a one-column DataFrame for later use, but the selector expects a 1-D
# target; ravel() avoids the column-vector conversion warning.
fit = bestfeatures.fit(X, y.values.ravel())
dfScore = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframe for better visualization
featuresScore = pd.concat([dfcolumns, dfScore], axis=1)
featuresScore.columns = ['Specs', 'Score']# naming the dataframe columns

print(featuresScore.nlargest(10, 'Score'))

             Specs     Score
2    iMutant_score  2.459678
7  polyphen2_score  2.459678
8        sdm_score  2.039771
9     snpsGO_score  1.137088
5    metaSNP_score  1.103908
0       duet_score  1.079014
1      foldX_score  1.079014
3       mCSM_score  1.079014
4    maestro_score  1.079014
6      muPro_score  0.173903
