In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from Bio import SeqIO

from matplotlib import pyplot as plt

from scipy import stats
from Bio.Alphabet.IUPAC import IUPACProtein

import glob
import sklearn.metrics

import datetime
# Take ONE snapshot of today's date so `year` and `month` can never disagree
# (two separate today() calls could straddle midnight at a month/year boundary).
today = datetime.date.today()
year = today.year
month = today.month

import os
# Figures are written to a per-month folder: ../Results/Figures/<year>_<2-digit month>.
figs_dir = '../Results/Figures/{}_{:02}'.format(year, month)
# exist_ok=True makes re-running this cell idempotent and removes the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(figs_dir, exist_ok=True)

In [3]:
import matplotlib
# Notebook-wide matplotlib styling, applied in a single update. Every entry is
# still validated by matplotlib exactly as with item assignment.
matplotlib.rcParams.update({
    # Font sizes for tick labels, axis labels and titles.
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.labelsize': 12,
    'axes.titlesize': 12,

    # Grid: enabled, with thin mid-grey lines.
    'axes.grid': True,
    'grid.color': '0.5',
    'grid.linewidth': '0.5',

    # Colours of the axes frame and of the ticks.
    'axes.edgecolor': '0.25',
    'xtick.color': '0',
    'ytick.color': '0',

    # Tick geometry and which spines are drawn (all four).
    'xtick.major.width': 1,
    'ytick.major.width': 1,
    'ytick.major.size': 5,
    'xtick.major.size': 5,
    'axes.spines.right': True,
    'axes.spines.left': True,
    'axes.spines.top': True,
    'axes.spines.bottom': True,

    # Typeface, and keep grid/axis artists underneath the plotted data.
    'font.family': 'sans-serif',
    'font.sans-serif': 'Helvetica',
    'font.weight': 'normal',
    'axes.axisbelow': True,
})



In [4]:
# Project-local helpers. The star import hides which names come from this
# module; the analysis cell below calls process_contacts_df,
# process_couplings_df, merge_contacts_couplings and ppv_from_df, which are
# presumably defined here.
# TODO(review): replace with an explicit import list once every name used
# across the notebook has been confirmed.
from supporting_functions import *

In [26]:
# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------
# number_to_test = round(n_rows_of_contact_matrix * contact_number_modifier)
# is what gets handed to ppv_from_df for every protein/metric.
contact_number_modifier = 1
# Forwarded to process_contacts_df / process_couplings_df.
# NOTE(review): presumably the minimum sequence separation -- confirm in
# supporting_functions.
chain_cutoff = 12
# Order matters: 'CA' must come first, because the number of CA contacts found
# for a protein determines the distance thresholds used for the other metrics.
metrics = ['CA', 'CB', 'SCcenter']
assert metrics[0] == 'CA', "'CA' must be the first metric"

server = 'raptorx'
# A CA pair with distance strictly below this value is counted as a contact.
CA_distance_cutoff = 8

# ---------------------------------------------------------------------------
# Accumulators (parallel across proteins, in the order files are processed)
# ---------------------------------------------------------------------------
df_contacts_dict = {metric: {} for metric in metrics}  # metric -> {prot_name: stacked contacts}
ppv_dict = {metric: [] for metric in metrics}          # metric -> [ppv per protein]
lengths = []                # number of columns of each protein's CA contact matrix
sequences = []              # number of sequences in each protein's alignment
CB_distance_cutoffs = {}    # prot_name -> derived CB distance threshold
SC_distance_cutoffs = {}    # prot_name -> derived SCcenter distance threshold

for couplings_file in glob.glob('../Data/Empirical_ml/*.{}.processed.couplings'.format(server)):
    # The protein name is the first two '_'-separated fields of the file name.
    prot_name = '_'.join(os.path.basename(couplings_file).split('_')[:2])
    alignment_file = couplings_file.replace('Empirical_ml/', 'analyzed_fastas/')
    alignment_file = alignment_file.replace('.{}.processed.couplings'.format(server), '.mafft.processed.afa')
    alignments = list(SeqIO.parse(alignment_file, 'fasta'))
    nseqs = len(alignments)
    print('### {} ({})'.format(prot_name, nseqs))

    # Load the contact matrices for ALL metrics up front. If any is missing the
    # whole protein is skipped: processing only some metrics would either crash
    # (no CA contact count to derive the other thresholds from) or leave the
    # per-metric lists in ppv_dict out of step with `lengths` / `sequences`.
    raw_contacts = {}
    try:
        for metric in metrics:
            contact_file_loc = '../Data/Contact_matrices/{}_{}_contacts.csv'.format(prot_name, metric)
            raw_contacts[metric] = pd.read_csv(contact_file_loc, index_col=0)
    except FileNotFoundError:
        print(couplings_file)
        continue

    # Per-protein inputs that do not depend on the metric: read them once.
    df_couplings = pd.read_csv(couplings_file, sep=',')
    records = list(SeqIO.parse('../Data/fastas/{}.rosetta.fasta'.format(prot_name), 'fasta'))
    assert len(records)==1
    seq = str(records[0].seq)

    total_contacts = 0
    cutoff = 0
    for metric in metrics:
        df_contacts, df_contacts_stack = process_contacts_df(raw_contacts[metric], chain_cutoff)
        # Pass a copy so every metric starts from the unmodified couplings
        # table, whether or not the helper changes its argument in place.
        df_couplings_stack, df_couplings_pivot = process_couplings_df(df_couplings.copy(), df_contacts, chain_cutoff)
        merged_df = merge_contacts_couplings(df_contacts_stack, df_couplings_stack, seq)

        number_to_test = round(len(df_contacts.index) * contact_number_modifier)
        if metric == 'CA':
            total_contacts = merged_df[merged_df['distance'] < CA_distance_cutoff]['distance'].count()
            if total_contacts == 0:
                # Without any CA contact there is no rank from which to derive
                # the CB / SCcenter thresholds. Nothing has been recorded for
                # this protein yet, so dropping it keeps all lists aligned.
                print('Skipping {}: no CA pairs closer than {}'.format(prot_name, CA_distance_cutoff))
                break
            cutoff = CA_distance_cutoff
            lengths.append(len(df_contacts.columns))
            sequences.append(nseqs)
        else:
            # Threshold for this metric = distance of the total_contacts-th
            # closest pair, i.e. the same rank as the number of CA contacts.
            # iloc[-1] equals iloc[total_contacts-1] whenever enough pairs
            # exist, and falls back to the farthest pair otherwise.
            closest_pairs = df_contacts_stack.sort_values('distance')[:total_contacts]
            cutoff = closest_pairs.iloc[-1]['distance']
            if metric == 'CB':
                CB_distance_cutoffs[prot_name] = cutoff
            elif metric == 'SCcenter':
                SC_distance_cutoffs[prot_name] = cutoff

        ppv, counts = ppv_from_df(merged_df, number_to_test, length_cutoff=cutoff)

        ppv_dict[metric].append(ppv)
        df_contacts_dict[metric][prot_name] = df_contacts_stack

### 1RW1_A (3860)
### 1AOE_A (9186)
### 1H2E_A (9981)
### 1JBK_A (9996)
### 1VMB_A (5565)


In [27]:
# Display the values returned by ppv_from_df, collected in the loop above:
# one list per metric, with one entry appended per processed protein.
ppv_dict

{'CA': [0.60360360360360366,
  0.80729166666666663,
  0.86956521739130432,
  0.74603174603174605,
  0.88785046728971961],
 'CB': [0.66666666666666663,
  0.8125,
  0.92753623188405798,
  0.75661375661375663,
  0.95327102803738317],
 'SCcenter': [0.60360360360360366,
  0.609375,
  0.77294685990338163,
  0.60846560846560849,
  0.7289719626168224]}