In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib
import random
import scipy.stats

In [3]:
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles

In [9]:
from matplotlib.colors import ListedColormap
# Colormap shared by all constructs; every label below is mapped to the RGBA value
# that the colormap returns for a fixed integer index.
c = matplotlib.cm.viridis
# c = ListedColormap(sns.color_palette('mako', 256))

# NOTE(review): 'amac' uses index 60 while 'amacGFP' uses 50 -- confirm this is intended.
colors = {
    label: c(colormap_index)
    for label, colormap_index in [
        ('amacV14L', 0), ('amac', 60), ('av', 240), ('pplu', 200), ('cgre', 110),
        ('amacV14LGFP', 0), ('amacGFP', 50), ('avGFP', 240), ('ppluGFP', 200), ('cgreGFP', 110),
        ('amacGFP:V14L', 0), ('ppluGFP2', 200),
    ]
}

### Get folder IDs, etc.

In [9]:
import os

In [10]:
def find_folder_upstream(folder_name, max_iterations=50):
    """Walk up from the current working directory until a folder called `folder_name` is found.

    Parameters
    ----------
    folder_name : str
        Base name (not a full path) of the folder to look for.
    max_iterations : int
        Maximum number of parent-directory steps to take from the working directory.

    Returns
    -------
    str or None
        Path of the first enclosing folder whose base name equals `folder_name`
        (the working directory itself counts), or None when no such folder is
        found within `max_iterations` steps.
    """
    current_folder = os.getcwd()
    # The working directory plus `max_iterations` ancestors are inspected, so a match
    # reached on the very last permitted step is still reported.
    for _ in range(max_iterations + 1):
        if os.path.basename(current_folder) == folder_name:
            return current_folder
        parent_folder = os.path.dirname(current_folder)
        if parent_folder == current_folder:
            # dirname() of the filesystem root is the root itself: nothing left to search.
            return None
        current_folder = parent_folder
    return None

In [11]:
# Locate the repository root by walking up from the current working directory.
root_folder_name = 'Orthologous_GFP_Fitness_Peaks'
root_folder = find_folder_upstream(root_folder_name)
if not root_folder:
    # Every path below is derived from root_folder; continuing with None would only
    # fail one line later with an opaque TypeError inside os.path.join.
    raise FileNotFoundError('Did not find the root folder for our github repository')

# The trailing '' component makes os.path.join end each folder path with a separator.
data_folder = os.path.join(root_folder, 'data', '')
structure_folder = os.path.join(data_folder, 'protein_structure', '')

analysis_folder = os.path.join(root_folder, 'analysis', '')

figures_folder = os.path.join(analysis_folder, 'figures', '')
pymol_folder = os.path.join(analysis_folder, 'pymol', '')
notebooks_folder = os.path.join(analysis_folder, 'notebooks', '')
ngs_folder = os.path.join(analysis_folder, 'ngs_processing', '')

## Variables

In [12]:
# Control sequences grouped by label (four gene labels plus 'count').
# NOTE(review): presumably these are barcode sequences of control constructs -- confirm.
bc_controls = {
    'av': [
        'ACCCCAAGCAAAAACAGCCG',
        'AAACCAGTAAATGAAAAACA',
        'ACCCCCCAATCCCCCACCTC',
        'CTCCACTATAACACAATCAC',
    ],
    'amac': [
        'CTTTCCGACGCTATATCCCC',
        'CTTTTACTGTGTGTAATTTT',
    ],
    'cgre': [
        'AACCCACGCCCCCATTTTTT',
        'TTCCCCCCCGCCCCATTCCT',
        'AATTTCCTCTTTACTTCATA',
        'ACCTCACATCTTCTGACTTT',
    ],
    'pplu': [
        'CCCCGCCCTCTTACATGCCT',
        'CCACCATATCCACACCCCCA',
    ],
    'count': [
        'CCAGCACCACCAAAGCATGA',
        'ACTCGCACTTTCGAAACCCA',
        'CTCCCCCCTAGCAATCCACC',
        'AACCCCCAAAAGAAAAACCC',
        'TCTAATTACAAAACAATTCC',
    ],
}

In [13]:
# Four residues listed per gene, plus the matching entries under the 'pseudopos' key.
# NOTE(review): presumably the chromophore-forming / catalytic residues -- confirm.
chromomuts = dict(
    amac=['R95', 'Y65', 'G66', 'E221'],
    cgre=['R96', 'Y68', 'G69', 'E223'],
    pplu=['R86', 'Y57', 'G58', 'E209'],
    pseudopos=['R99', 'Y68', 'G69', 'E229'],
)

In [14]:
# Index assigned to each gene label; 'amacV14L' shares the index of 'amac'.
genekey = dict(av=0, amac=1, cgre=2, pplu=3, amacV14L=1)

In [15]:
# Seven border values per gene, on the raw (linear) scale.
gate_borders = {
    'amac': np.array([897.5, 1571.0, 2865.5, 4447.5, 7381.5, 10533.0, 15205.0]),
    'cgre': np.array([1179.0, 2594.0, 5606.0, 11382.5, 18682.5, 26388.5, 37143.5]),
    'pplu': np.array([1179.0, 2388.5, 4721.5, 9176.0, 13279.0, 18263.5, 24702.5]),
}

# The same borders after a log10 transform.
gate_borders_log = {gene: np.log10(borders) for gene, borders in gate_borders.items()}

In [16]:
# Reference points used for rescaling: 0 = dark (middle of the darkest gate), 1 = WT.
# Each entry is a (dark, WT) pair; values taken from the final log10 aminoacid datasets.
scaling_refs = {
    'amac': (2.652246341003323, 3.9707333545555503),
    'cgre': (2.7708520116421442, 4.49691401841976),
    'pplu': (2.7708520116421442, 4.225823746695675),
    'av': (1.28341923933, 3.7192121319),
}

In [17]:
# Log-scale gate borders mapped onto the 0 (dark reference) .. 1 (WT reference) scale.
gate_borders_scaled = {
    gene: (gate_borders_log[gene] - dark_ref) / (wt_ref - dark_ref)
    for gene, (dark_ref, wt_ref) in (
        (label, scaling_refs[label]) for label in ['amac', 'cgre', 'pplu']
    )
}

In [1]:
# Short label -> display name.
names = {
    # amacGFP and its V14L variant (two aliases for the variant)
    'amac': 'amacGFP',
    'amacV14L': 'amacGFP:V14L',
    'V14L': 'amacGFP:V14L',
    # remaining genes
    'av': 'avGFP',
    'cgre': 'cgreGFP',
    'pplu': 'ppluGFP2',
    # numbered cgreGFP entries
    'cgre132': 'cgreGFP:132',
    'cgre1338': 'cgreGFP:1338',
    'cgre4111': 'cgreGFP:4111',
    'cgre9708': 'cgreGFP:9708',
}

### Structure info

In [20]:
%run lgs01_functions_for_getting_aa_pseudopositions.ipynb

General: buried positions, conserved positions, etc.

In [21]:
# Buried positions from the avGFP data: native numbering, counted from 1 (Met).
buried_pos = [
    8, 12, 14, 16, 18, 20, 27, 29, 31, 33, 35, 40, 42, 44, 46, 48,
    54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
    68, 69, 70, 71, 72, 74, 82, 83, 84, 85, 86, 87, 88, 92, 94, 96, 98, 100,
    106, 108, 110, 112, 114, 119, 121, 123, 125, 127, 130,
    141, 143, 145, 148, 150, 152, 161, 163, 165, 167, 169,
    179, 181, 183, 185, 199, 201, 203, 205, 207, 218, 220, 222, 224, 226,
]

# Shift to a 0-based count and translate to pseudo positions in a single pass;
# the result is a set. nativepos_to_pseudopos is not defined in this cell.
buried_pos = {nativepos_to_pseudopos[native_pos - 1][0] for native_pos in buried_pos}

Secondary structures (extracted from PyMOL)

    PYMOL> iterate n. CA, print resi + ':' + ss + ','

In [None]:
# One-letter codes used as values in the per-residue secondary-structure dictionaries.
L, S, H = 'L', 'S', 'H'

In [None]:
# Residue number -> secondary-structure code (the L / S / H constants), transcribed from
# the PyMOL `iterate` output. A few keys appear twice with the same value (e.g. 47, 109);
# that is harmless in a dict literal because the later entry simply overwrites the earlier.
amac_ss_pymol = {2:L,3:H,4:H,5:H,6:H,7:H,8:H,9:L,10:L,11:L,12:S,13:S,14:S,15:S,16:S,17:S,18:S,19:S,20:S,21:S,22:S,
23:L,24:L,25:S,26:S,27:S,28:S,29:S,30:S,31:S,32:S,33:S,34:S,35:S,36:S,37:H,38:H,39:H,40:S,41:S,42:S,43:S,44:S,45:S,46:S,
47:L,47:L,48:L,49:L,50:L,51:L,52:L,53:L,54:L,55:L,56:L,57:H,58:H,59:H,60:H,61:H,62:H,63:H,64:H,68:L,69:H,70:H,
71:H,72:L,73:L,74:L,75:L,76:H,77:H,78:H,79:H,80:H,81:H,82:L,83:H,84:H,85:H,86:H,87:H,88:L,89:L,90:L,91:L,92:S,
93:S,94:S,95:S,96:S,97:S,98:S,99:S,100:S,101:L,102:L,103:S,104:S,105:S,106:S,107:S,108:S,109:L,109:L,110:S,
111:S,112:S,113:S,114:S,115:S,116:L,117:L,118:S,119:S,120:S,121:S,122:S,123:S,124:S,125:S,126:L,126:L,
127:L,128:L,129:L,130:L,131:L,132:L,133:L,134:L,135:H,136:H,137:H,138:H,139:L,140:L,141:L,142:L,143:L,144:L,145:L,
146:L,147:L,147:L,148:L,149:L,150:S,151:S,152:S,153:S,154:S,155:S,156:L,157:L,158:L,159:S,160:S,161:S,162:S,163:S,
164:S,165:S,166:L,167:S,168:S,169:S,170:S,171:S,172:L,173:L,174:S,175:S,176:S,177:S,178:S,179:S,
180:L,180:L,181:S,182:S,183:S,184:S,185:S,186:S,187:S,188:L,189:L,190:L,191:L,192:L,193:L,194:L,
195:L,196:L,197:L,197:L,198:L,199:S,200:S,201:S,202:S,203:L,203:L,204:S,205:S,206:S,207:S,208:S,
209:L,210:L,211:L,212:L,213:L,214:L,215:L,216:L,
217:L,218:S,219:S,220:S,221:S,222:L,222:L,223:S,224:S,225:S,226:S,227:S,228:L,}

# Convert the dict into a string with one character per residue number 1 .. len(amac_wt)-1;
# residue numbers without an entry (e.g. 1 and 65-67) become '-'. The name is rebound from
# dict to list to str. amac_wt is not defined in this cell.
# NOTE(review): range(1, len(amac_wt)) stops one short of len(amac_wt) -- confirm the last
# residue is meant to be left out.
amac_ss_pymol = [amac_ss_pymol[i] if i in amac_ss_pymol else '-' for i in range(1,len(amac_wt))]
amac_ss_pymol = ''.join(amac_ss_pymol)

In [None]:
# Residue number -> secondary-structure code (the L / S / H constants), transcribed from
# the PyMOL `iterate` output.
cgre_ss_pymol = {5:H,6:H,7:H,8:H,9:H,10:H,11:L,12:L,13:L,14:L,15:S,16:S,17:S,18:S,19:S,20:S,
21:S,22:S,23:S,24:S,25:S,26:L,27:L,28:S,29:S,30:S,31:S,32:S,33:S,34:S,35:S,36:S,37:S,38:S,39:S,
40:L,41:L,42:L,43:L,44:S,45:S,46:S,47:S,48:S,49:S,50:S,51:S,52:L,53:L,54:L,55:L,56:L,57:L,58:L,
59:H,60:H,61:H,62:H,63:H,64:H,65:L,66:L,67:L,71:H,72:H,73:H,74:H,75:H,76:L,77:L,78:L,79:L,80:L,81:L,82:L,83:H,
84:H,85:H,86:H,87:H,88:H,89:L,90:L,91:L,92:L,93:S,94:S,95:S,96:S,97:S,98:S,99:S,100:S,101:S,102:L,103:L,
104:L,105:L,106:S,107:S,108:S,109:S,110:S,111:S,112:S,113:S,114:S,115:S,116:S,117:L,118:L,119:S,
120:S,121:S,122:S,123:S,124:S,125:S,126:S,127:S,128:S,129:S,130:L,131:L,132:L,133:L,134:L,135:L,136:L,
137:L,138:L,139:L,140:L,141:L,142:L,143:L,144:L,145:L,146:L,147:L,148:L,149:S,150:S,151:S,152:S,153:S,154:S,
155:S,156:S,157:H,158:H,159:H,160:H,161:S,162:S,163:S,164:S,165:S,166:S,167:S,168:S,169:S,170:S,
171:S,172:L,173:L,174:L,175:L,176:L,177:S,178:S,179:S,180:S,181:S,182:S,183:S,184:S,185:S,186:S,
187:S,188:S,189:L,190:L,191:L,192:L,193:L,194:L,195:L,196:L,197:L,198:L,199:L,200:S,201:S,202:S,203:S,
204:S,205:S,206:S,207:S,208:S,209:S,210:L,211:L,212:L,213:L,214:L,215:L,216:L,217:L,218:L,219:S,
220:S,221:S,222:S,223:S,224:S,225:S,226:S,227:S,228:S,229:S,230:H,231:H,232:H,233:H,234:H,235:H}

# Convert the dict into a string with one character per residue number 1 .. len(cgre_wt)-1;
# residue numbers without an entry (e.g. 1-4 and 68-70) become '-'. The name is rebound
# from dict to list to str. cgre_wt is not defined in this cell.
# NOTE(review): range(1, len(cgre_wt)) stops one short of len(cgre_wt) -- confirm the last
# residue is meant to be left out.
cgre_ss_pymol = [cgre_ss_pymol[i] if i in cgre_ss_pymol else '-' for i in range(1,len(cgre_wt))]
cgre_ss_pymol = ''.join(cgre_ss_pymol)

In [None]:
# Residue number -> secondary-structure code (the L / S / H constants), transcribed from
# the PyMOL `iterate` output.
pplu_ss_pymol = {3:L,4:S,5:S,6:S,7:S,8:S,9:S,10:S,11:S,12:S,13:S,14:S,15:L,
16:L,17:S,18:S,19:S,20:S,21:S,22:S,23:S,24:S,25:L,26:S,27:S,28:S,29:H,30:H,31:H,32:S,33:S,
34:S,35:S,36:S,37:S,38:S,39:S,40:S,41:L,42:L,43:L,44:L,45:L,46:L,47:L,48:L,49:H,50:H,51:H,52:H,53:H,54:H,
55:H,56:L,60:L,61:L,62:L,63:L,64:L,65:L,66:L,67:L,68:L,69:L,70:L,71:L,72:L,73:H,74:H,75:H,76:H,77:H,
78:H,79:H,80:L,81:L,82:L,83:S,84:S,85:S,86:S,87:S,88:S,89:S,90:S,91:S,92:L,93:L,94:S,95:S,96:S,97:S,
98:S,99:S,100:S,101:S,102:S,103:S,104:S,105:S,106:L,107:L,108:L,109:S,110:S,111:S,112:S,113:S,114:S,
115:S,116:S,117:S,118:S,119:S,120:L,121:L,122:L,123:L,124:L,125:L,126:L,127:L,128:L,129:L,130:L,131:L,
132:L,133:S,134:S,135:S,136:L,137:L,138:S,139:S,140:S,141:S,142:S,143:S,144:S,145:S,146:S,147:L,148:L,
149:S,150:S,151:S,152:S,153:S,154:S,155:S,156:S,157:S,158:S,159:S,160:S,161:L,162:L,163:S,164:S,165:S,
166:S,167:S,168:S,169:S,170:S,171:S,172:S,173:S,174:S,175:S,176:S,177:L,178:L,179:L,180:L,181:L,182:L,
183:L,184:L,185:L,188:L,189:L,190:L,191:S,192:S,193:S,194:S,195:S,196:S,197:S,198:S,199:S,200:S,201:S,
202:L,203:L,204:L,205:S,206:S,207:S,208:S,209:S,210:S,211:S,212:S,213:S,214:S,215:S,216:L,217:L,218:L}

# Convert the dict into a string with one character per residue number 1 .. len(pplu_wt)-1;
# residue numbers without an entry (e.g. 1-2, 57-59 and 186-187) become '-'. The name is
# rebound from dict to list to str. pplu_wt is not defined in this cell.
# NOTE(review): range(1, len(pplu_wt)) stops one short of len(pplu_wt) -- confirm the last
# residue is meant to be left out.
pplu_ss_pymol = [pplu_ss_pymol[i] if i in pplu_ss_pymol else '-' for i in range(1,len(pplu_wt))]
pplu_ss_pymol = ''.join(pplu_ss_pymol)

In [None]:
# Residue number -> secondary-structure code (the L / S / H constants), transcribed from
# the PyMOL `iterate` output. Several keys appear twice with the same value (e.g. 15, 42,
# 43); that is harmless in a dict literal because the later entry overwrites the earlier.
av_ss_pymol = {3:L,4:L,5:H,6:H,7:H,8:H,9:H,10:L,
               11:L,12:L,13:L,14:S,15:S,15:S,16:S,17:S,18:S,19:S,20:L,21:L,22:L,23:L,24:L,
25:L,26:L,27:L,28:L,29:S,30:S,31:L,32:L,33:L,34:L,35:L,36:L,37:L,38:L,39:L,40:L,41:L,42:S,42:S,43:S,
43:S,44:S,45:S,46:S,47:S,48:S,49:L,50:L,51:L,52:L,53:L,54:L,55:L,56:H,57:H,58:H,59:H,60:H,61:H,62:L,
63:L,64:L,68:H,69:H,70:H,71:H,72:H,73:L,73:L,74:L,75:H,76:H,77:H,78:H,79:H,80:H,81:H,82:H,83:H,84:H,
85:H,86:H,86:H,87:H,88:L,89:L,90:L,90:L,91:L,92:L,93:S,94:S,95:S,96:S,97:S,98:S,99:S,100:S,101:L,102:L,
103:L,104:L,105:L,106:S,107:S,108:S,109:S,110:S,111:S,112:S,113:S,114:S,115:L,116:L,117:L,118:L,119:S,
120:S,121:S,122:S,123:S,124:S,124:S,125:S,126:S,127:L,128:L,129:L,130:L,131:L,131:L,132:L,133:L,134:L,
135:L,136:L,137:L,138:L,139:L,140:L,141:L,142:L,142:L,143:L,144:L,145:L,146:L,147:L,148:L,149:S,150:S,
151:S,152:S,153:S,154:L,155:H,156:H,157:H,158:H,159:H,160:L,161:L,162:S,163:S,164:S,164:S,165:S,166:S,
166:S,167:S,168:S,169:S,170:S,171:L,172:L,173:L,174:L,175:L,176:S,177:S,178:S,179:S,180:S,181:S,182:S,
183:S,184:S,184:S,185:S,186:S,187:L,188:L,189:L,190:L,190:L,191:L,192:L,193:L,194:L,195:L,196:L,197:L,
198:L,199:S,200:S,201:S,202:S,203:S,204:S,205:S,206:S,207:S,208:S,209:L,210:L,211:L,212:L,213:L,214:L,
215:L,216:L,217:S,218:S,219:S,220:S,221:S,222:S,223:S,224:S,225:S,226:S,227:S,228:L,229:L,230:L,231:L}

# Convert the dict into a string with one character per residue number 1 .. len(av_wt)-1;
# residue numbers without an entry (e.g. 1-2 and 65-67) become '-'. The name is rebound
# from dict to list to str. av_wt is not defined in this cell.
# NOTE(review): range(1, len(av_wt)) stops one short of len(av_wt) -- confirm the last
# residue is meant to be left out.
av_ss_pymol = [av_ss_pymol[i] if i in av_ss_pymol else '-' for i in range(1,len(av_wt))]
av_ss_pymol = ''.join(av_ss_pymol)

In [None]:
def _secondary_structure_runs(residue_ss):
    """Split a per-residue code sequence into (code, start, end) runs; `end` is exclusive."""
    runs = []
    n_residues = len(residue_ss)
    run_start = 0
    for i in range(1, n_residues + 1):
        # A run is closed either at the end of the sequence or where the code changes.
        if i == n_residues or residue_ss[i] != residue_ss[run_start]:
            runs.append((residue_ss[run_start], run_start, i))
            run_start = i
    return runs


def plot_secondary_structure(residue_ss, y, arrow_width=0.1, head_length=1, 
                             linewidth=3, hel=0.25, c='k', shift=0):
    """Draw a secondary-structure track at height `y` on the current matplotlib axes.

    Parameters
    ----------
    residue_ss : str or sequence
        One code per residue: 'L' is drawn as a straight line, 'H' as a zig-zag line,
        'S' as an arrow. Runs of any other code (e.g. '-') are not drawn.
    y : float
        Vertical position of the track.
    arrow_width, head_length : float
        Shaft width and head length of the 'S' arrows (head width is twice `arrow_width`).
    linewidth : float
        Line width for 'L' runs; 'H' runs use `linewidth + 2`.
    hel : float
        Amplitude of the 'H' zig-zag around `y`.
    c : color
        Colour used for every element.
    shift : float
        Value subtracted from all x coordinates.

    An empty `residue_ss` draws nothing, and a single-residue input is drawn as a
    run of length one.
    """
    runs = _secondary_structure_runs(residue_ss)
    loops = [(start, end) for code, start, end in runs if code == 'L']
    helices = [(start, end) for code, start, end in runs if code == 'H']
    betas = [(start, end) for code, start, end in runs if code == 'S']

    # Drawing order (loops, then helices, then strands) is kept from the original layout.
    for start, end in loops:
        plt.plot([start - shift, end - shift], [y, y], linestyle='-', linewidth=linewidth, color=c)
    for start, end in helices:
        n_points = end - start + 1
        # Alternate above/below y at every residue boundary to get the zig-zag.
        plt.plot(np.linspace(start - shift, end - shift, n_points),
                 [y + (hel) * (-1) ** i for i in range(n_points)],
                 linewidth=linewidth + 2, color=c)
    for start, end in betas:
        plt.arrow(x=start - shift, y=y, dx=end - start, dy=0, color=c, width=arrow_width,
                  length_includes_head=True, head_length=head_length, head_width=arrow_width * 2)

## General functions

In [3]:
def flatten(list_of_lists):
    """Concatenate the elements of every inner iterable into a single flat list.

    Only one level of nesting is removed. Inner containers are iterated rather than
    indexed, so tuples, sets and generators work as well as lists.
    """
    return [item for inner in list_of_lists for item in inner]

In [5]:
def omit_wt_state(mutations):
    """Drop the first character of every mutation label.

    Parameters
    ----------
    mutations : list, tuple, set, frozenset or dict
        Mutation labels (dict keys in the case of a dict). Subclasses of these
        container types are accepted too.

    Returns
    -------
    A container of the same basic kind with each label replaced by `label[1:]`;
    dict values are carried over unchanged.

    Raises
    ------
    TypeError
        For any other input type, rather than silently returning None.
    """
    if isinstance(mutations, dict):
        return {label[1:]: value for label, value in mutations.items()}
    if isinstance(mutations, frozenset):
        return frozenset(label[1:] for label in mutations)
    if isinstance(mutations, set):
        return {label[1:] for label in mutations}
    if isinstance(mutations, list):
        return [label[1:] for label in mutations]
    if isinstance(mutations, tuple):
        return tuple(label[1:] for label in mutations)
    raise TypeError('omit_wt_state expects a list, tuple, set, frozenset or dict, got %s'
                    % type(mutations).__name__)

In [4]:
def pseudify(seq, gene, filler='-'):
    """Lay `seq` out along 246 pseudo positions, returning a list with one item per position.

    For each pseudo position the matching native index for `gene` is looked up in
    pseudopos_to_nativepos (column chosen via genekey); where that entry prints as
    'nan' the `filler` is used instead of a sequence item.

    NOTE(review): 246 positions are produced here while get_effects_by_position
    iterates up to 247 -- confirm which count is intended.
    """
    gene_column = genekey[gene]
    aligned = []
    for pseudo_index in range(246):
        native_index = pseudopos_to_nativepos[pseudo_index][gene_column]
        if str(native_index) == 'nan':
            aligned.append(filler)
        else:
            aligned.append(seq[native_index])
    return aligned

## Plots

#### General

In [9]:
def label_plot_axis(x='', y='', t='', fontsize_x=13, fontsize_y=13, fontsize_t=16, **kwargs):
    """Set the title and both axis labels of the current plot.

    `t`, `x` and `y` are the title, x-label and y-label texts; each has its own
    font size argument. Extra keyword arguments are forwarded to all three calls.
    """
    label_specs = (
        (plt.title, t, fontsize_t),
        (plt.xlabel, x, fontsize_x),
        (plt.ylabel, y, fontsize_y),
    )
    for set_label, text, size in label_specs:
        set_label(text, fontsize=size, **kwargs)
#     plt.legend()

In [2]:
def subplots(width=4, height=4, nplots_per_row=2, nplots=4, dpi=200):
    """Create a new figure holding a grid of `nplots` subplots and return their axes.

    Parameters
    ----------
    width, height : float
        Size of a single subplot; the figure is `width * nplots_per_row` wide and
        `height * nrows` tall.
    nplots_per_row : int
        Number of grid columns.
    nplots : int
        Number of subplots to create (the last row may be partly empty).
    dpi : int
        Figure resolution.

    Returns
    -------
    list
        The axes, in creation order (row by row).
    """
    # Ceiling division: keeps nrows an int, whereas `nplots / nplots_per_row` yields a
    # float under Python 3 and plt.subplot expects integer grid dimensions.
    nrows = -(-nplots // nplots_per_row)

    plt.figure(figsize=[width * nplots_per_row, height * nrows], dpi=dpi)
    return [plt.subplot(nrows, nplots_per_row, i + 1) for i in range(nplots)]

#### Histograms & Violins

In [1]:
def plot_multi_hist(dataset, masks, column, labels, bins=50, r=(2.5,5),
                    norm_to_1=True, colors='krbygcm', size=[20,4], **kwargs):
    '''Plot the same variable (column) from one dataframe under several masks.

    Parameters
    ----------
    dataset : DataFrame
        Source data.
    masks : list
        Masks to apply to `dataset`; one histogram is drawn per mask.
    column : str
        Column to histogram.
    labels : sequence
        Legend label for each mask.
    bins, r :
        Bin count (or edges) and value range. They are applied in both modes; in
        the non-normalised mode a `range` given in kwargs takes precedence over `r`.
    norm_to_1 : bool
        If True, bar heights of each histogram are scaled to sum to 1.
    colors : sequence
        Colour for each mask.
    size :
        Unused; kept so existing calls keep working.
    **kwargs :
        Forwarded to plt.hist.
    '''
    for i, mask in enumerate(masks):
        selected = dataset[mask][column]
        if norm_to_1:
            counts, edges = np.histogram(selected, bins=bins, range=r)
            total = counts.sum()
            # An empty selection has nothing to normalise; avoid dividing by zero.
            weights = counts / total if total > 0 else counts
            plt.hist(edges[:-1], bins=edges, color=colors[i], label=labels[i],
                     weights=weights, **kwargs)
        else:
            hist_kwargs = dict(kwargs)
            hist_kwargs.setdefault('range', r)
            plt.hist(selected, bins=bins, color=colors[i], label=labels[i], **hist_kwargs)
    plt.legend()

In [None]:
def plot_violins(list_of_datasets, labels, colors='rkbgycm', **kwargs):
    """Draw one violin (with median line) per dataset and label the x ticks.

    Bodies are coloured from `colors` in order; the bar, extrema and median lines
    are drawn black with line width 1. Extra keyword arguments go to plt.violinplot.
    """
    samples = [list(dataset) for dataset in list_of_datasets]
    violins = plt.violinplot(samples, showmedians=True, **kwargs)

    for body, body_color in zip(violins['bodies'], colors):
        body.set_color(body_color)

    for part_name in ['cbars', 'cmaxes', 'cmins', 'cmedians']:
        part = violins[part_name]
        part.set_color('k')
        part.set_linewidth(1)

    # Tick positions start at 1, one per dataset.
    tick_positions = range(1, len(list_of_datasets) + 1)
    plt.xticks(tick_positions, labels)

    https://matplotlib.org/3.1.1/api/collections_api.html

In [None]:
import matplotlib.patheffects as pe

def plot_half_violin(list_of_datasets, side, color, widths=0.5, alpha=0.6, 
                     show_medians=True, positions=None, chonkylines=False):
    """Draw half violins for each dataset and optionally a line through the medians.

    Parameters
    ----------
    list_of_datasets : sequence of iterables
        One violin is drawn per dataset.
    side : str
        'left' or 'right': the half of each violin to keep. With any other value
        the violin bodies are left exactly as plt.violinplot produced them.
    color : sequence
        `color[0]` colours the violin bodies, `color[1]` the median line.
    widths : float
        Passed to plt.violinplot.
    alpha : float
        Opacity of the violin bodies.
    show_medians : bool
        Whether to draw a line connecting the per-dataset medians (NaNs ignored).
    positions : sequence, optional
        x positions of the violins; defaults to 0 .. len(list_of_datasets)-1.
    chonkylines : bool
        If true, the median line is drawn semi-transparent with a white outline.
    """
    # `is None` rather than `== None`: the latter is evaluated elementwise for arrays.
    if positions is None:
        positions = range(len(list_of_datasets))
    violins = plt.violinplot([list(x) for x in list_of_datasets], showmedians=False, showextrema=False,
                             positions=positions, widths=widths)

    if side in ('left', 'right'):
        for body in violins['bodies']:
            vertices = body.get_paths()[0].vertices
            centre = np.mean(vertices[:, 0])
            # Clamp x coordinates at the body's mean x so only one half remains.
            if side == 'left':
                vertices[:, 0] = np.clip(vertices[:, 0], -np.inf, centre)
            else:
                vertices[:, 0] = np.clip(vertices[:, 0], centre, np.inf)
            body.set_color(color[0])
            body.set_alpha(alpha)
            body.set_linewidth(0)

    if show_medians:
        medians = [np.nanmedian(x) for x in list_of_datasets]
        if chonkylines:
            plt.plot(positions, medians, color=color[1], alpha=0.75,
                     lw=2, path_effects=[pe.Stroke(linewidth=5, foreground='w'), pe.Normal()])
        else:
            plt.plot(positions, medians, color=color[1], linestyle='-', ms=10)

In [20]:
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D

def legendary(colors, labels, edges=None, **kwargs):
    '''Add a frameless legend made of coloured patches to the current axes.

    Parameters
    ----------
    colors : sequence
        Face colour of each patch.
    labels : sequence
        Legend text for each patch.
    edges : sequence, optional
        Edge colour of each patch; defaults to `colors`.
    **kwargs :
        Forwarded to plt.legend. Possible kwargs: loc (location), ncol (for
        horizontal alignment of labels).
    '''
    # `is None` rather than `== None`: the latter is evaluated elementwise for arrays.
    if edges is None:
        edges = colors
    handles = [mpatches.Patch(facecolor=colors[i], edgecolor=edges[i]) for i in range(len(colors))]
    plt.legend(handles=handles, labels=labels, frameon=False, **kwargs)

#### Scatterplots

In [3]:
def density_plot(x, y, nbins=100, log=False, **kwargs):
    """Scatter plot in which every point is coloured by the 2D-histogram count of its bin.

    Parameters
    ----------
    x, y : array-like
        Coordinates; pairs where either value is NaN are dropped.
    nbins : int
        Number of histogram bins, passed to np.histogram2d.
    log : bool
        If True, colours use log(count + 1) instead of the raw count.
    **kwargs :
        Forwarded to plt.scatter.

    Points are drawn in order of increasing count so that dense regions end up on top.
    """
    # Plain float arrays make the boolean and integer-array indexing below positional
    # for any array-like input.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    keep = ~(np.isnan(x) | np.isnan(y))
    x = x[keep]
    y = y[keep]

    H, xedges, yedges = np.histogram2d(x, y, bins=nbins)
    # Histogram bins are half-open [edge_k, edge_k+1), so the bin of a value is
    # searchsorted(..., side='right') - 1. The maximum lies on the last edge and is
    # clipped back into the last bin.
    ix = np.clip(np.searchsorted(xedges, x, side='right') - 1, 0, len(xedges) - 2)
    iy = np.clip(np.searchsorted(yedges, y, side='right') - 1, 0, len(yedges) - 2)

    density = H[ix, iy]
    order = density.argsort()
    cc = density[order]
    if log:
        cc = np.log(cc + 1)
    # 'none' is the valid spelling for "no marker edge".
    plt.scatter(x[order], y[order], c=cc, edgecolor='none', **kwargs)

In [11]:
def plot_correlation(dataset, x_axis, y_axis, title='', **kwargs):
    """Print correlation statistics for two columns and draw their density scatter plot.

    `x_axis` and `y_axis` are column names of `dataset`; they are also used as the
    axis labels. Rows with a missing value in either column are left out of the
    plot. Extra keyword arguments are forwarded to density_plot.
    """
    get_correlation(dataset=dataset, x_axis=x_axis, y_axis=y_axis)
#     plt.figure()
    complete_rows = dataset[[x_axis, y_axis]].dropna().copy(deep=True)
    x_values = np.array(complete_rows[x_axis])
    y_values = np.array(complete_rows[y_axis])
    density_plot(x_values, y_values, **kwargs)

    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.title(title, fontsize=14)

#### Other

In [44]:
def get_effects_by_position(gene, dataset, positions='pseudo', aa=False, func=np.nanmedian, col='effect_in_'):
    """Collect one effect value per position from the column `col + gene` of `dataset`.

    Parameters
    ----------
    gene : str
        Gene label; it selects the column `col + gene`.
    dataset : DataFrame
        Must have a 'position' column, and a 'mutation' column when `aa` is given.
    positions : {'pseudo', 'native'}
        'pseudo' returns one value for every position 0..246. 'native' (only without
        `aa`) skips positions whose pseudopos_to_nativepos entry for the gene is 'nan'.
    aa : str or False
        If given, only rows whose 'mutation' equals `aa` are used and positions
        1..246 are returned, with NaN where the gene has no residue or no row matches.
    func : callable
        Aggregation applied to the values of each position when `aa` is not given.
    col : str
        Column-name prefix.

    Returns
    -------
    list of float

    Raises
    ------
    ValueError
        For an unsupported `positions` / `aa` combination, or when more than one row
        matches a (position, aa) pair.

    NOTE(review): the `aa` branch starts at position 1 while the others start at 0 --
    confirm this difference is intended.
    """
    genekey = {'av':0, 'amac':1, 'cgre':2, 'pplu':3}
    effect_column = col + gene
    position_column = dataset['position']

    if not aa:
        if positions == 'native':
            return [func(dataset[position_column == i][effect_column]) for i in range(0, 247)
                    if str(pseudopos_to_nativepos[i][genekey[gene]]) != 'nan']
        if positions == 'pseudo':
            return [func(dataset[position_column == i][effect_column]) for i in range(0, 247)]
    elif positions == 'pseudo':
        aa_rows = dataset['mutation'] == aa
        effects = []
        for i in range(1, 247):
            matching = dataset[(position_column == i) & aa_rows][effect_column]
            if str(pseudopos_to_nativepos[i][genekey[gene]]) == 'nan' or len(matching) == 0:
                effects.append(np.nan)
            elif len(matching) == 1:
                effects.append(float(matching.iloc[0]))
            else:
                raise ValueError('More than one row for position %s and mutation %s' % (i, aa))
        return effects

    # Previously this fell through to an UnboundLocalError on the return statement.
    raise ValueError('Unsupported combination: positions=%r, aa=%r' % (positions, aa))

## Statistics

In [None]:
def get_correlation(dataset, x_axis, y_axis):
    """Print Pearson and Spearman correlation (coefficient and p-value) of two columns.

    Rows with a missing value in either column are dropped first. Nothing is returned.
    """
    complete_rows = dataset[[x_axis, y_axis]].dropna()
    x_values = complete_rows[x_axis]
    y_values = complete_rows[y_axis]

    # Both statistics are computed before anything is printed.
    outcomes = [stat(x_values, y_values) for stat in (scipy.stats.pearsonr, scipy.stats.spearmanr)]
    templates = ('Pearson correlation coefficient: %.2f, p=%.5f',
                 'Spearman correlation coefficient: %.2f, p=%.5f')
    for template, outcome in zip(templates, outcomes):
        print(template % (outcome[0], outcome[1]))

## Pymol

In [45]:
# Gene label -> path of its structure file inside <structure_folder>/PDB_structures.
pdb = {
    gene: os.path.join(structure_folder, 'PDB_structures', pdb_filename)
    for gene, pdb_filename in (
        ('av', 'avGFP__2wur.pdb'),
        ('cgre', 'cgreGFP__2hpw.pdb'),
        ('pplu', 'ppluGFP2__2g3o_monomer.pdb'),
        ('amac', 'amacGFP__7lg4.pdb'),
    )
}

In [2]:
import pymol
from pymol import cmd, stored
import Bio.PDB

In [1]:
def start_pymol():
    """Launch PyMOL with the '-qc' flags and keep this process's stdout/stderr in place.

    sys.stdout and sys.stderr are saved before the launch and put back afterwards,
    also when the launch raises.
    """
    import sys
    import pymol
    pymol.pymol_argv = ['pymol','-qc'] #+ sys.argv[1:]
    saved_stdout = sys.stdout
    saved_stderr = sys.stderr
    try:
        pymol.finish_launching()
    finally:
        # Restore the streams even if the launch fails, so later output is not lost.
        sys.stdout = saved_stdout
        sys.stderr = saved_stderr
    
def open_or_fetch(PDB_ID_or_filename, object_name=None):
    """Fetch a structure by ID, or load it from a file.

    A 4- or 5-character argument without a '.' is passed to cmd.fetch; anything
    else is passed to cmd.load under `object_name` (the argument itself is used
    as the object name when none is given). `object_name` is not used when fetching.
    """
    looks_like_id = '.' not in PDB_ID_or_filename and len(PDB_ID_or_filename) in [4,5]
    if looks_like_id:
        cmd.fetch(PDB_ID_or_filename)#, async=0)
        return

    target_name = object_name if object_name else PDB_ID_or_filename
    cmd.load(PDB_ID_or_filename, target_name)

        
def save_session(filename_pse='test.pse', pymol_viewer_version='1.72'):
    """Save the current PyMOL session to `filename_pse`.

    The 'pse_export_version' setting is set to `pymol_viewer_version` before saving.
    """
    cmd.set('pse_export_version', pymol_viewer_version)
    cmd.save(filename_pse)


def white_and_beautiful(representation='cartoon'):
    """Apply a plain white style to everything loaded in PyMOL.

    Lines are hidden, `representation` is shown for all atoms, water is hidden,
    everything is coloured white on a white background, and the surface quality,
    transparency and ray-tracing background settings are adjusted.
    """
    cmd.hide('lines', 'all')
    cmd.show(representation, 'all')
    # 'name o' would also match every backbone carbonyl oxygen (atom name O) and hide
    # it in atom-based representations; the 'solvent' selector matches water only.
    cmd.select('waters', 'solvent')
    cmd.hide('everything', 'waters')
    cmd.color('white', 'all')
    cmd.set('bg_rgb', '(1,1,1)')
    cmd.set('surface_quality', '1')
    cmd.set('transparency', '0.5')
    cmd.set('ray_opaque_background', 'off')

In [None]:
def color_positions(positions, values=None, representation='spheres', colormap=matplotlib.cm.cool, 
    constant_color=120, print_colors=False):
    """Show and colour the given residue positions in PyMOL.

    Parameters
    ----------
    positions : iterable
        Residue identifiers; each is selected with 'resi <position>'.
    values : sequence of float, optional
        One value per position. Values are min-max scaled to 0..1 (NaNs ignored) and
        mapped through `colormap`; positions with a NaN value are coloured white.
    representation : str
        PyMOL representation to show for each position.
    colormap : callable
        Maps a number to an RGB(A) tuple.
    constant_color : int, float, str, tuple or list
        Used only when `values` is None: a number is passed to `colormap`, a string is
        read as a hex colour, a tuple is used as RGB(A) directly, and a list supplies
        one hex colour per position.
    print_colors : bool
        If True, return the set of (scaled value, colour) pairs that were used.

    Returns
    -------
    set or None

    Raises
    ------
    TypeError
        If `values` is None and `constant_color` has an unsupported type.
    """
    colors_used = []
    white = (1.0, 1.0, 1.0)
    color = None

    if values is None:
        if isinstance(constant_color, (int, float, np.integer, np.floating)):
            color = colormap(constant_color)
        elif isinstance(constant_color, str):
            color = matplotlib.colors.hex2color(constant_color)
        elif isinstance(constant_color, tuple):
            color = constant_color
        elif not isinstance(constant_color, list):
            # Without a usable colour nothing below could work; fail here with a clear message.
            raise TypeError('Weird color!')
    else:
        values = np.asarray(values, dtype=float)
        if values.size and not np.all(np.isnan(values)):
            # NaN-aware min-max scaling; builtin min/max give order-dependent results with NaN.
            values = values - np.nanmin(values)
            span = np.nanmax(values)
            if span > 0:
                values = values / span

    for index, position in enumerate(positions):
        if values is not None:
            value = values[index]
            if np.isnan(value):
                # Missing value: colour white instead of reusing the previous colour.
                color = white
                colors_used.append((value, 'white'))
            else:
                color = colormap(value)
                colors_used.append((value, color))
        elif isinstance(constant_color, list):
            color = matplotlib.colors.hex2color(constant_color[index])
        colorName = "color_" + str(position)
        selName = "temp_selection"
        cmd.set_color(colorName, color[0:3])
        cmd.select(selName, 'resi %s' %position)
        cmd.show(representation, selName)
        cmd.color(colorName, selName)

    if print_colors:
        return set(colors_used)
    else:
        return None