In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Reload imported modules before each cell execution so that edits to local
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import glob
import os

import numpy as np
import pandas as pd
import sklearn.metrics
from Bio import SeqIO
from matplotlib import pyplot as plt

# from scipy import stats


In [None]:
import process_couplings

In [None]:
import matplotlib

# Notebook-wide figure styling, applied as a single batch so that the whole
# look-and-feel can be read (and tuned) in one place.
matplotlib.rcParams.update({
    # Text sizes for tick labels, axis labels and titles
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'axes.labelsize': 18,
    'axes.titlesize': 18,

    # Thin mid-grey grid on every axes
    'axes.grid': True,
    'grid.color': '0.5',
    'grid.linewidth': '0.5',

    # Frame and tick colours (greyscale strings: '0' is black)
    'axes.edgecolor': '0.25',
    'xtick.color': '0',
    'ytick.color': '0',

    # Tick geometry and which spines are drawn
    'xtick.major.width': 1,
    'ytick.major.width': 1,
    'ytick.major.size': 5,
    'xtick.major.size': 5,
    'axes.spines.right': True,
    'axes.spines.left': True,
    'axes.spines.top': True,
    'axes.spines.bottom': True,

    # Fonts, and keep grid/ticks underneath the plotted data
    'font.family': 'sans-serif',
    'font.sans-serif': 'helvetica',
    'font.weight': 'normal',
    'axes.axisbelow': True,
})

In [None]:
# Figures are written to a per-month folder named '<year>_<month, zero-padded>'.
# Read the date once so that year and month always come from the same instant
# (two separate today() calls can disagree across a month/year boundary).
today = datetime.date.today()
year = today.year
month = today.month
figs_dir = '../Results/Figures/{}_{:02}'.format(year, month)
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(figs_dir, exist_ok=True)

In [None]:
# Input locations, relative to the notebook's working directory.
# Coupling result files (*.mat); the file name carries the protein and parameter set.
couplings_dir = '../Results/couplings_revision/'
# Per-protein contact tables, read as '<protein>_SCcenter_contacts.csv'.
contacts_dir = '../Data/psicov150_aln_pdb/pdb/'
# Per-protein FASTA files, read as '<protein>.fasta'; only the first record is used.
fastas_dir = '../Data/psicov150_aln_pdb/aln_fasta_max1k/'

In [None]:
# Multiplied by the sequence length and passed (as an int) to ppv_from_df;
# presumably the number of top-ranked couplings to score -- TODO confirm in process_couplings.
length_based_modifier = 1.

# Passed to process_couplings.remove_close to drop pairs that are close in
# primary sequence; exact inclusive/exclusive semantics live in that helper.
primary_distance_cutoff = 6

# Distance threshold below which a residue pair counts as a contact
# (units presumably Angstrom -- verify against the contact tables).
contact_definition = 7.5 

# Substring used to select coupling files and to build the result keys.
weights_type = 'GSC'

In [None]:
# Accumulators keyed by the parameter string taken from each file name
# (e.g. 'raw.GSC_meanScale_lfactor0.05'); each value is a list holding one
# score per protein, in sorted file order.
results_dicty_ppv = {}
results_dicty_aupr = {}

types_to_test = ['raw', 'apc', 'ent']

# The directory listing does not depend on the coupling type, so build it once
# instead of re-globbing and re-sorting for every entry in types_to_test.
coupling_files = sorted(glob.glob(couplings_dir+'*{}*.mat'.format(weights_type)))

for type_to_test in types_to_test:
    for infile in coupling_files:
        # File names have the form '<protein>.<params>.mat', where <params>
        # may itself contain dots and starts with the coupling type.
        name_parts = os.path.basename(infile).split('.')
        prot_name = name_parts[0]
        params = '.'.join(name_parts[1:-1])
        if not params.startswith(type_to_test):
            continue
        #Read in the couplings for the protein of interest
        testy_df = process_couplings.process_ccmpredpy(infile)
        #Read in the contacts
        df_contacts = pd.read_csv(contacts_dir+'{}_SCcenter_contacts.csv'.format(prot_name), index_col=0)
        df_contacts, df_contacts_stack = process_couplings.process_contacts_df(df_contacts)
        #Read in the fasta sequence (only the first record of the file is used)
        seq = list(SeqIO.parse(fastas_dir+'{}.fasta'.format(prot_name), 'fasta'))[0]
        seq = str(seq.seq)
        #Merge everyone together
        df_merged = process_couplings.merge_contacts_couplings(df_contacts_stack, testy_df, seq)
        #Remove pairs that are close in primary distance space
        df_merged = process_couplings.remove_close(df_merged, primary_distance_cutoff)
        #Calculate the PPV and add to a results dictionary
        ppv_val, ns = process_couplings.ppv_from_df(df_merged, int(len(seq)*length_based_modifier),
                                                    length_cutoff=contact_definition)
        # setdefault replaces the former bare try/except, which would also have
        # silently swallowed unrelated errors (including KeyboardInterrupt).
        results_dicty_ppv.setdefault(params, []).append(ppv_val)
        #########
        #Further process the merged dataframe to include a binary variable for contacts
        df_merged['contact'] = df_merged['distance']<contact_definition
        #Calculate the area under the precision-recall curve and add to a results dictionary
        aupr = sklearn.metrics.average_precision_score(df_merged['contact'], df_merged['couplings'])
        results_dicty_aupr.setdefault(params, []).append(aupr)

In [None]:
# Choose what the figure below shows.
# coup_type is the prefix of the result keys to plot (one of types_to_test).
coup_type = 'raw'
# results_dicty and metric must be switched together: results_dicty_ppv with
# 'PPV', or results_dicty_aupr with a matching label; metric is only the y-axis label.
results_dicty = results_dicty_ppv
metric = 'PPV'

In [None]:
# Sanity check: list the parameter sets collected, i.e. the keys the plot cell can look up.
results_dicty.keys()

In [None]:
# Plot the across-protein mean of the selected metric against the pairwise
# regularization coefficient, one curve per weight-scaling scheme.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

fig, ax = plt.subplots(figsize=(8,3))

# Scanned coefficients, spelled exactly as they appear in the result keys
# ('0.05', '0.1', ..., '0.9', '1.0'), and the matching numeric x positions.
lfactor_labels = ['0.05'] + ['0.{}'.format(i) for i in range(1,10)] + ['1.0']
xvals = [0.05] + [i/10 for i in range(1,10)] + [1.0]

# Both curves are built the same way; only the scale name in the key, the
# legend label and the colour differ, so build them in one loop.
for color_idx, scale_name in enumerate(['mean', 'max']):
    data = [
        np.mean(results_dicty['{}.{}_{}Scale_lfactor{}'.format(coup_type, weights_type, scale_name, lfactor)])
        for lfactor in lfactor_labels
    ]
    ax.plot(xvals, data, marker='s', zorder=4, label='{} {} scale'.format(weights_type, scale_name),
            markersize=8, c=colors[color_idx])

# Shaded band highlighting the region around a coefficient of 0.2.
ax.axvspan(0.175,0.225, color='k', alpha=0.2)

ax.set_ylabel(metric)
ax.set_xlabel('Pairwise regularization coefficient')
legend = ax.legend(loc='best', fontsize=14, framealpha=1.0)

# Save through the figure object rather than pyplot's implicit current figure.
# NOTE(review): the file name does not include `metric`, so plotting a second
# metric for the same weights_type/coup_type overwrites this PDF.
fig.savefig('{}/{}_{}_lfactor.pdf'.format(figs_dir, weights_type, coup_type), bbox_inches='tight')