## IMPORTING NECESSARY PACKAGES

In [3]:
from fragsys_analysis import *

## READING INPUT DATA

In [4]:
# Input/output locations.
# NOTE: these are absolute, machine-specific paths; point them at your local
# copy of the data before running the notebook.
main_dir = "/Users/2394007/Documents/PHD/LOCAL/phase4/prots"
# One entry per protein directory, ordered by modification time (oldest first).
# Plain files (e.g. macOS ".DS_Store") are skipped: every later cell lists
# "<prot_dir>/unsupp_cifs" and would fail on a non-directory entry.
prot_dirs = sorted((entry for entry in Path(main_dir).iterdir() if entry.is_dir()), key = os.path.getmtime)
results_dir = "/Users/2394007/Documents/PHD/LOCAL/FRAGSYS_DEF/results/new"

In [5]:
len(prot_dirs) # number of entries found in main_dir (each one is treated as a protein directory)

35

## PROTEIN SUMMARY DATAFRAME

In [6]:
# Build one summary table from the per-group results files.
# Each protein directory contains an "unsupp_cifs" folder whose entries name
# the groups; a group contributes rows only if "<prot>_<group>_results.csv"
# exists directly inside the protein directory.
results_dfs = []
for prot_dir in prot_dirs:
    prot = prot_dir.name # the directory name is used as the protein identifier
    subdirs = sorted(os.listdir(os.path.join(prot_dir, "unsupp_cifs"))) # one for each group
    for subdir in subdirs:
        res_path = os.path.join(prot_dir, "{}_{}_results.csv".format(prot, subdir))
        if not os.path.isfile(res_path):
            # group goes first, protein second, so the message reads "Group <group> of <protein>"
            print("Group {} of {} did not present results".format(subdir, prot))
            continue
        prot_res = pd.read_csv(res_path)
        results_dfs.append(prot_res)

fragsys_results_df = pd.concat(results_dfs).reset_index(drop = True)
# variant densities, rounded to two decimals
fragsys_results_df["vars_per_seq"] = (fragsys_results_df.n_vars/fragsys_results_df.n_human_seqs).round(2)
fragsys_results_df["vars_per_res"] = (fragsys_results_df.n_vars/fragsys_results_df.n_human_res).round(2)

Group P0DTD1 of 1 did not present results
Group P0DTD1 of 3 did not present results


In [7]:
fragsys_results_df.shape # (rows, columns); rows = total number of protein segments

(37, 13)

In [8]:
fragsys_results_df.head(3) # preview of the first three protein segments

Unnamed: 0,acc,group,n_strucs,n_ligs,n_un_ligs,n_bs,n_seqs,n_human_seqs,n_var_seqs,n_vars,n_human_res,vars_per_seq,vars_per_res
0,H0Y4R8,0,1,1,1,1,596,143,138,3566,10864,24.94,0.33
1,O43809,0,10,19,10,4,10,1,1,206,390,206.0,0.53
2,Q5T0W9,0,11,24,10,6,415,16,16,1398,2254,87.38,0.62


In [9]:
fragsys_results_df.to_pickle(os.path.join(results_dir, "all_prots.pkl")) # all protein segments in data set, written to results_dir

## BINDING SITES DATAFRAME

In [18]:
# Build one table with a row per binding site across all proteins and groups.
# A group is included only if "results/<prot>_<group>_BS_df.csv" exists.
bs_dfs = []
for prot_dir in prot_dirs:
    prot = prot_dir.name # the directory name is used as the protein identifier
    subdirs = sorted(os.listdir(os.path.join(prot_dir, "unsupp_cifs"))) # one for each group
    for subdir in subdirs:
        bs_res_path = os.path.join(prot_dir, "results", "{}_{}_BS_df.csv".format(prot, subdir))
        if not os.path.isfile(bs_res_path):
            # group goes first, protein second, so the message reads "Group <group> of <protein>"
            print("Group {} of {} did not present binding sites data".format(subdir, prot))
            continue
        prot_bs_res = pd.read_csv(bs_res_path)
        prot_bs_def = pd.read_csv(os.path.join(prot_dir, "results", "{}_{}_BS_def_OC_single_i_rel_0.66.csv".format(prot, subdir)))
        # rows per binding site in the definition table, keyed as "BS<id>" to match bs_id
        # (presumably one row per ligand -- confirm against the definition file)
        bs_lig_occ_dict = {"BS"+str(k):v for k, v in prot_bs_def.binding_site.value_counts().to_dict().items()}
        prot_bs_res["number_ligs"] = prot_bs_res.bs_id.map(bs_lig_occ_dict)
        # share of this group's counted ligands that falls in each binding site
        prot_bs_res["prop_ligs"] = prot_bs_res.number_ligs/prot_bs_res.number_ligs.sum()
        prot_bs_res["protein"] = prot
        prot_bs_res["group"] = subdir
        bs_dfs.append(prot_bs_res)
# numeric columns are rounded to four decimals; non-numeric columns are left as they are
all_bss_dfs = pd.concat(bs_dfs).reset_index(drop = True).round(4)

Group P0DTD1 of 1 did not present binding sites data
Group P0DTD1 of 3 did not present binding sites data


In [19]:
all_bss_dfs.shape # (rows, columns); rows = total number of binding sites

(293, 14)

In [20]:
all_bss_dfs.head(3) # preview of the first three binding sites

Unnamed: 0,bs_id,vars,occ,vars_per_occ,MES,p,norm_shenkin_rel,shenkin_ci,MES_ci,number_bs_res,number_ligs,prop_ligs,protein,group
0,BS0,168,420,0.4,0.2065,0.0281,54.0885,10.6309,0.1831,6,1,1.0,H0Y4R8,0
1,BS0,24,48,0.5,-0.0623,0.8951,20.8011,9.8766,0.522,24,14,0.7368,O43809,0
2,BS1,13,26,0.5,-0.0587,1.0,28.6694,12.4626,0.6883,13,2,0.1053,O43809,0


In [21]:
all_bss_dfs.to_pickle(os.path.join(results_dir, "all_bss.pkl")) # all binding sites, written to results_dir

## BINDING RESIDUES DATAFRAME

In [10]:
# Build one table with a row per binding residue across all proteins and groups.
# A group is included only if "results/<prot>_<group>_fragsys_df.csv" exists.
fragsys_dfs = []
for prot_dir in prot_dirs:
    prot = prot_dir.name # the directory name is used as the protein identifier
    subdirs = sorted(os.listdir(os.path.join(prot_dir, "unsupp_cifs"))) # one for each group
    for subdir in subdirs:
        res_path = os.path.join(prot_dir, "results", "{}_{}_fragsys_df.csv".format(prot, subdir))
        if not os.path.isfile(res_path):
            # group goes first, protein second, so the message reads "Group <group> of <protein>"
            print("Group {} of {} did not present results".format(subdir, prot))
            continue
        prot_res = pd.read_csv(res_path)
        prot_res["protein"] = prot
        prot_res["group"] = subdir
        fragsys_dfs.append(prot_res)

fragsyss_df = pd.concat(fragsys_dfs).reset_index(drop = True)
# missing secondary-structure labels are replaced with "C" (presumably coil -- confirm)
fragsyss_df["SS"] = fragsyss_df["SS"].fillna("C")

Group P0DTD1 of 1 did not present results
Group P0DTD1 of 3 did not present results


In [11]:
fragsyss_df.shape # (rows, columns); rows = total number of ligand binding residues

(14172, 62)

In [12]:
fragsyss_df.head(3) # preview of the first three binding residues

Unnamed: 0,index,UniProt_ResNum,Pfam_column,alignment_column,shenkin,occ,gaps,occ_pct,gaps_pct,rel_norm_shenkin,...,BS14,BS15,BS16,BS17,BS18,BS19,BS20,BS21,BS22,BS23
0,52,101,79,79,76.10628,389,207,0.652685,0.347315,75.041931,...,,,,,,,,,,
1,62,106,85,85,58.693017,389,207,0.652685,0.347315,56.402733,...,,,,,,,,,,
2,212,190,317,317,52.334433,240,356,0.402685,0.597315,49.596489,...,,,,,,,,,,


In [13]:
fragsyss_df.to_pickle(os.path.join(results_dir, "all_bs_ress.pkl")) # all binding site residues, written to results_dir

## DSSP DATAFRAME

In [14]:
# Build one table of DSSP rows, using a single example structure per group,
# and attach the UniProt residue number from the matching SIFTS mapping.
dssp_dfs = []
for prot_dir in prot_dirs:
    prot = prot_dir.name # the directory name is used as the protein identifier
    subdirs = sorted(os.listdir(os.path.join(prot_dir, "unsupp_cifs"))) # one for each group
    for subdir in subdirs:
        dssp_subdir = os.path.join(prot_dir, "results", "dssp", subdir)
        sifts_subdir = os.path.join(prot_dir, "results", "sifts", subdir)
        # Sorted so that the example structure is the same on every run and
        # filesystem (os.listdir order is arbitrary). A missing directory or a
        # non-file entry is treated the same as "no DSSP data".
        dssp_files = sorted(
            file for file in os.listdir(dssp_subdir)
            if file.startswith("dssp") and os.path.isfile(os.path.join(dssp_subdir, file))
        ) if os.path.isdir(dssp_subdir) else []
        if not dssp_files:
            # group goes first, protein second, so the message reads "Group <group> of <protein>"
            print("Group {} of {} did not present DSSP data".format(subdir, prot))
            continue
        pdb_id = dssp_files[0].split("_")[1] # second "_"-separated token of the file name
        example_dssp = os.path.join(dssp_subdir, dssp_files[0]) #only grabbing one example dssp file
        example_sifts = os.path.join(sifts_subdir, "sifts_mapping_{}.csv".format(pdb_id))
        dssp_df = pd.read_csv(example_dssp)
        sifts_df = pd.read_csv(example_sifts)
        dssp_df["protein"] = prot
        dssp_df["group"] = subdir
        dssp_df["structure"] = pdb_id
        # left join keeps every DSSP row; rows without a mapping get NaN in UniProt_ResNum
        merged_df = dssp_df.merge(sifts_df[["PDB_ResNum", "PDB_ChainID", "UniProt_ResNum"]], how = "left", left_on = ["PDB_ResNum", "CHAIN"], right_on = ["PDB_ResNum", "PDB_ChainID"])
        dssp_dfs.append(merged_df)

all_dssp_dfs = pd.concat(dssp_dfs)
# keep only residues that map to a UniProt position, then store that position as an integer
all_dssp_dfs_filt = (
    all_dssp_dfs
    .dropna(subset = ["UniProt_ResNum"])
    .assign(UniProt_ResNum = lambda df: df["UniProt_ResNum"].astype(int))
)

In [15]:
all_dssp_dfs_filt.shape # (rows, columns); rows = residues with structural information and a UniProt residue number

(13195, 17)

In [16]:
all_dssp_dfs_filt.head(3) # preview of the first three DSSP rows

Unnamed: 0,PDB_ResNum,CHAIN,AA,SS,ACC,TCO,KAPPA,ALPHA,PHI,PSI,CHAIN_FULL,RSA,protein,group,structure,PDB_ChainID,UniProt_ResNum
0,701,A,N,,101,0.0,360.0,360.0,360.0,154.3,A,64.331,H0Y4R8,0,5pwc,A,75
1,702,A,S,,47,-0.194,360.0,-139.0,-58.4,154.5,A,36.154,H0Y4R8,0,5pwc,A,76
2,703,A,N,S,126,0.27,81.3,57.9,-101.6,5.5,A,80.255,H0Y4R8,0,5pwc,A,77


In [17]:
all_dssp_dfs_filt.to_pickle(os.path.join(results_dir, "all_dssp_dfs.pkl")) # all dssp data frames (UniProt-mapped rows only), written to results_dir

## VARIANTS DATAFRAME

In [26]:
# Build one table with a row per alignment column carrying variation data.
# A group is included only if its "<prot>_<group>_missense_df.csv" exists
# under "results/varalign/<group>/".
missense_dfs = []
for prot_dir in prot_dirs:
    prot = prot_dir.name # the directory name is used as the protein identifier
    subdirs = sorted(os.listdir(os.path.join(prot_dir, "unsupp_cifs"))) # one for each group
    for subdir in subdirs:
        missense_df_path = os.path.join(prot_dir,"results/varalign/{}/{}_{}_missense_df.csv".format(subdir, prot, subdir))
        if not os.path.isfile(missense_df_path):
            # group goes first, protein second, so the message reads "Group <group> of <protein>"
            print("Group {} of {} did not present variation data".format(subdir, prot))
            continue
        miss_df = pd.read_csv(missense_df_path)
        miss_df["protein"] = prot
        miss_df["group"] = subdir
        missense_dfs.append(miss_df)

all_missense_df = pd.concat(missense_dfs).reset_index(drop = True)

Group P0DTD1 of 1 did not present variation data
Group P0DTD1 of 3 did not present variation data


In [27]:
all_missense_df.shape # (rows, columns); rows = total number of alignment columns with variation data

(10408, 17)

In [28]:
all_missense_df.head(3) # preview of the first three alignment columns

Unnamed: 0,col,shenkin,occ,gaps,occ_pct,gaps_pct,variants,rel_norm_shenkin,abs_norm_shenkin,oddsratio,log_oddsratio,pvalue,ci_dist,miss_class,miss_color,protein,group
0,1,6.0,13,126,0.086957,0.913043,5,0.0,0.0,1.171991,0.158704,0.785343,1.032115,CME,green,H0Y4R8,0
1,2,34.559062,28,111,0.195652,0.804348,3,30.569689,25.051809,0.325849,-1.121321,0.059301,1.191288,,grey,H0Y4R8,0
2,3,52.142235,50,89,0.355072,0.644928,12,49.39076,40.475645,0.730264,-0.314348,0.378002,0.631191,,grey,H0Y4R8,0


In [29]:
all_missense_df.to_pickle(os.path.join(results_dir, "all_miss_dfs.pkl")) # all variant-containing columns, written to results_dir

## INTERACTIONS DATAFRAME

In [22]:
# Gather every Arpeggio contact table into a single frame, tagging each row
# with the structure and the protein it was read from.
all_cons_dfs = []
for prot_dir in prot_dirs:
    prot = str(prot_dir).rsplit("/", 1)[-1]
    subdirs = os.listdir(os.path.join(prot_dir, "unsupp_cifs"))
    subdirs.sort()
    for subdir in subdirs:
        arpeggio_subdir = os.path.join(prot_dir, "results", "arpeggio", subdir)
        # only the split all-contacts tables are of interest; listing order is kept
        arpeggio_files = list(filter(lambda name: name.startswith("arpeggio_all_cons_split"), os.listdir(arpeggio_subdir)))
        for file in arpeggio_files:
            # four characters before the last four of the file name
            # (presumably the PDB ID ahead of a ".csv" suffix -- confirm)
            pdb_id = file[-8:-4]
            file_df = pd.read_csv(os.path.join(arpeggio_subdir, file)).assign(struc = pdb_id, protein = prot)
            all_cons_dfs.append(file_df)
all_fragsys_cons = pd.concat(all_cons_dfs, ignore_index = True)

In [23]:
all_fragsys_cons.shape # (rows, columns); rows = total number of interactions

(269013, 30)

In [24]:
all_fragsys_cons.head(3) # preview of the first three interactions

Unnamed: 0,Chain (Atom1),ResNum (Atom1),ResName (Atom1),Atom (Atom1),Chain (Atom2),ResNum (Atom2),ResName (Atom2),Atom (Atom2),Clash,Covalent,...,Carbonyl,Polar,Weak Polar,Atom proximity,Vdw proximity,Interacting entities,contact_type,UniProt_Resnum,struc,protein
0,A,727,HIS,CB,H,906,W77,C,0,0,...,0,0,0,4.841,1.441,INTER,sidechain,101.0,5pwc,H0Y4R8
1,A,727,HIS,CB,H,906,W77,N1,0,0,...,0,0,0,3.62,0.37,INTER,sidechain,101.0,5pwc,H0Y4R8
2,A,727,HIS,ND1,H,906,W77,C6,0,0,...,0,0,0,4.64,1.39,INTER,sidechain,101.0,5pwc,H0Y4R8


In [25]:
all_fragsys_cons.to_pickle(os.path.join(results_dir, "all_bss_cons.pkl")) # all binding site contacts, written to results_dir