In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

# Module-level constant that none of the visible cells reference.
# NOTE(review): presumably padding applied to y-axis limits by plotting cells
# outside this view -- confirm before removing.
ylim_offset = 0.75

def subset(x, interval=10):
    """Thin out a sequence by keeping every ``interval``-th element.

    Args:
        x: Indexable sequence supporting ``len()``.
        interval (int): Stride between retained elements; positions
            0, interval, 2 * interval, ... are kept.

    Returns:
        list: The retained elements, in their original order.
    """
    return [x[pos] for pos in range(len(x)) if pos % interval == 0]


# Systems to load. The first four characters of each name select the
# sub-directory below ``distances/`` that holds its files.
dir_names = [
    "4zw9", "4zw9_asp89", "4zw9_trp331",
    "5eqi", "5eqi_asp91", "5eqi_trp333",
]
label_names = dir_names
names = {dir_name: label for dir_name, label in zip(dir_names, label_names)}

# Distance series requested for every system; files that do not exist are
# reported on stdout and skipped.
tm_settings = [
    "TM5_TM11_in", "TM5_TM11_out",
    "TM1_TM7_in", "TM1_TM7_out",
    "TM2_TM8_in", "TM2_TM8_out",
    "TM2_TM11_in", "TM2_TM11_out",
    "TM5_TM8_in", "TM5_TM8_out",
    "TM2_TM3_in", "TM2_TM3_out",
    "TM8_TM9_in", "TM8_TM9_out",
]

# Long-format records: one dict per retained (time, distance) sample.
distance_dataset = []
for protein in dir_names:
    structure = protein[0:4]
    for item in tm_settings:
        for rep in range(1, 4):
            path = f"distances/{structure}/{protein}_{item}_rep{rep}.com.xvg"
            if not os.path.isfile(path):
                print(f"{path} not found. Continue.")
                continue

            # Lines starting with "@" or "#" are treated as comments; the two
            # data columns are unpacked into separate arrays.
            times, distances = np.loadtxt(path, comments=["@", "#"], unpack=True)

            # Only every 100th sample of each column is kept.
            distance_dataset.extend(
                {
                    "protein": protein,
                    "structure": structure,
                    "rep": str(rep),
                    "tm": item,
                    "time": t,
                    "distance": d,
                }
                for t, d in zip(subset(times, 100), subset(distances, 100))
            )

# Wide format: one row per (protein, structure, rep, time) and one column per
# distance series, followed by a combined protein/replica key.
distance_dataset_df = (
    pd.DataFrame(distance_dataset)
    .pivot_table(
        index=["protein", "structure", "rep", "time"],
        columns="tm",
        values="distance",
    )
    .reset_index()
)
distance_dataset_df["protein_rep"] = distance_dataset_df["protein"] + "_" + distance_dataset_df["rep"]
distance_dataset_df


distances/4zw9/4zw9_TM2_TM11_in_rep1.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM11_in_rep2.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM11_in_rep3.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM11_out_rep1.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM11_out_rep2.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM11_out_rep3.com.xvg not found. Continue.
distances/4zw9/4zw9_TM5_TM8_in_rep1.com.xvg not found. Continue.
distances/4zw9/4zw9_TM5_TM8_in_rep2.com.xvg not found. Continue.
distances/4zw9/4zw9_TM5_TM8_in_rep3.com.xvg not found. Continue.
distances/4zw9/4zw9_TM5_TM8_out_rep1.com.xvg not found. Continue.
distances/4zw9/4zw9_TM5_TM8_out_rep2.com.xvg not found. Continue.
distances/4zw9/4zw9_TM5_TM8_out_rep3.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM3_in_rep1.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM3_in_rep2.com.xvg not found. Continue.
distances/4zw9/4zw9_TM2_TM3_in_rep3.com.xvg not found. Continue.
distances/4zw

tm,protein,structure,rep,time,TM1_TM7_in,TM1_TM7_out,TM2_TM8_in,TM2_TM8_out,TM5_TM11_in,TM5_TM11_out,protein_rep
0,4zw9,4zw9,1,0.0,2.912,1.906,2.904,2.409,1.838,2.595,4zw9_1
1,4zw9,4zw9,1,20000.0,2.858,1.921,3.081,2.341,1.823,2.525,4zw9_1
2,4zw9,4zw9,1,40000.0,2.925,1.910,3.006,2.369,1.817,2.562,4zw9_1
3,4zw9,4zw9,1,60000.0,2.903,1.937,2.890,2.231,1.889,2.549,4zw9_1
4,4zw9,4zw9,1,80000.0,2.856,1.974,3.059,2.411,1.768,2.456,4zw9_1
...,...,...,...,...,...,...,...,...,...,...,...
40267,5eqi_trp333,5eqi,3,44000000.0,2.990,1.911,2.885,2.143,1.715,2.406,5eqi_trp333_3
40268,5eqi_trp333,5eqi,3,44020000.0,3.093,1.933,2.982,2.007,1.697,2.449,5eqi_trp333_3
40269,5eqi_trp333,5eqi,3,44040000.0,3.006,1.924,2.982,1.989,1.702,2.424,5eqi_trp333_3
40270,5eqi_trp333,5eqi,3,44060000.0,2.995,1.902,2.964,2.112,1.780,2.402,5eqi_trp333_3


In [7]:
# Label every row with a variant name: the four systems listed here get a
# shared label, every other system keeps its ``protein`` value unchanged.
variant_labels = {
    "4zw9_asp89": "G91D",
    "5eqi_asp91": "G91D",
    "4zw9_trp331": "R333W",
    "5eqi_trp333": "R333W",
}
distance_dataset_df['variant'] = distance_dataset_df['protein'].map(
    lambda protein_name: variant_labels.get(protein_name, protein_name)
)

In [8]:
def hist_overlap(a, b):
    """Fraction of histogram ``a`` that is shared with histogram ``b``.

    The bin-wise minima of the two histograms are summed and divided by the
    total of ``a``. The result is normalised by ``a`` only, so it is not
    symmetric in its arguments.

    Args:
        a (sequence): Bin counts of the reference histogram.
        b (sequence): Bin counts of the histogram compared against ``a``;
            it is indexed at every position of ``a``.

    Returns:
        float: ``sum(min(a[i], b[i])) / sum(a)``.
    """
    shared = sum(min(a[k], b[k]) for k in range(len(a)))
    return shared / sum(a)

def hist_shift(a_hist, a_edge, b_hist, b_edge):
    """Offset between the most populated bins of two histograms.

    Args:
        a_hist: Bin counts of the first histogram; must provide ``argmax``.
        a_edge: Bin edges belonging to ``a_hist``.
        b_hist: Bin counts of the second histogram; must provide ``argmax``.
        b_edge: Bin edges belonging to ``b_hist``.

    Returns:
        The edge value at the index of the highest bin of ``b`` minus the
        edge value at the index of the highest bin of ``a``.
    """
    a_peak_bin = a_hist.argmax(axis=0)
    b_peak_bin = b_hist.argmax(axis=0)
    return b_edge[b_peak_bin] - a_edge[a_peak_bin]


def _overlap_shift(structure, wildtype, mutants, setting, bin_width=0.005):
    """Compare the distance histogram of each mutant with the wildtype's.

    Reads the module-level ``distance_dataset_df`` and relies on
    ``hist_overlap`` and ``hist_shift`` defined in this notebook.

    Args:
        structure (str): Value of the ``structure`` column to select.
        wildtype (str): ``protein`` value used as the reference histogram.
        mutants (iterable of str): ``protein`` values compared against the
            reference, in the order their results are returned.
        setting (str): Name of the distance column to histogram.
        bin_width (float): Width of the histogram bins.

    Returns:
        tuple: ``(overlap, shift)`` for each entry of ``mutants``, flattened
        into a single tuple.

    Raises:
        KeyError: If ``setting`` is not a column of ``distance_dataset_df``.
        ValueError: If the selected structure has no finite values in the
            ``setting`` column.
    """
    selection = distance_dataset_df[distance_dataset_df['structure'] == structure]

    min_d, max_d = np.min(selection[setting]), np.max(selection[setting])
    if not (np.isfinite(min_d) and np.isfinite(max_d)):
        raise ValueError(
            f"No '{setting}' distances available for structure '{structure}'"
        )

    # np.histogram silently ignores samples outside the bin edges, so the
    # edges have to span [min_d, max_d] completely.
    # Rounding to two decimals can land above min_d; step one grid unit down
    # in that case so the smallest samples are not dropped.
    start = np.round(min_d, 2)
    if start > min_d:
        start -= 0.01
    # np.arange excludes its stop value (and float rounding can leave the last
    # edge just short of it); padding by two widths guarantees the maximum
    # lies inside a bin. A trailing empty bin changes neither the overlap nor
    # the position of the peak.
    bins = np.arange(start, max_d + 2 * bin_width, bin_width)

    # All histograms share the same edges so that bins are comparable.
    wt_vals, wt_edge = np.histogram(
        selection.loc[selection['protein'] == wildtype, setting], bins=bins
    )

    results = []
    for mutant in mutants:
        mut_vals, mut_edge = np.histogram(
            selection.loc[selection['protein'] == mutant, setting], bins=bins
        )
        results.append(hist_overlap(wt_vals, mut_vals))
        results.append(hist_shift(wt_vals, wt_edge, mut_vals, mut_edge))
    return tuple(results)


def overlap_shift_5eqi(setting="TM2_TM8_in"):
    """Overlap and peak shift of the 5eqi mutants relative to 5eqi.

    Args:
        setting (str): Name of the distance column to analyse.

    Returns:
        tuple: ``(overlap1, shift1, overlap2, shift2)`` where index 1 refers
        to ``5eqi_asp91`` and index 2 to ``5eqi_trp333``.
    """
    return _overlap_shift("5eqi", "5eqi", ("5eqi_asp91", "5eqi_trp333"), setting)


def overlap_shift_4zw9(setting="TM2_TM8_in"):
    """Overlap and peak shift of the 4zw9 mutants relative to 4zw9.

    Args:
        setting (str): Name of the distance column to analyse.

    Returns:
        tuple: ``(overlap1, shift1, overlap2, shift2)`` where index 1 refers
        to ``4zw9_asp89`` and index 2 to ``4zw9_trp331``.
    """
    return _overlap_shift("4zw9", "4zw9", ("4zw9_asp89", "4zw9_trp331"), setting)


In [9]:

# Print overlap and peak shift of both 5eqi mutants for every distance column.
for pair in ("TM2_TM8", "TM1_TM7", "TM5_TM11"):
    for side in ("in", "out"):
        print(f"{pair} 5eqi {side}")
        ov_first, sh_first, ov_second, sh_second = overlap_shift_5eqi(f"{pair}_{side}")
        overlap_cells = [f"{fraction*100:.0f}%" for fraction in (ov_first, ov_second)]
        shift_cells = [f"{delta:.2f}" for delta in (sh_first, sh_second)]
        print("Overlap", *overlap_cells)
        print("Shift", *shift_cells)
        print()


TM2_TM8 5eqi in
Overlap 88% 91%
Shift 0.01 0.00

TM2_TM8 5eqi out
Overlap 60% 73%
Shift -0.00 0.02

TM1_TM7 5eqi in
Overlap 36% 85%
Shift 0.08 -0.02

TM1_TM7 5eqi out
Overlap 44% 76%
Shift -0.08 0.00

TM5_TM11 5eqi in
Overlap 74% 68%
Shift -0.01 -0.01

TM5_TM11 5eqi out
Overlap 44% 48%
Shift -0.11 -0.09



In [5]:
# Print overlap and peak shift of both 4zw9 mutants for every distance column.
for pair in ("TM2_TM8", "TM1_TM7", "TM5_TM11"):
    for side in ("in", "out"):
        print(f"{pair} 4zw9 {side}")
        ov_first, sh_first, ov_second, sh_second = overlap_shift_4zw9(f"{pair}_{side}")
        overlap_cells = [f"{fraction*100:.0f}%" for fraction in (ov_first, ov_second)]
        shift_cells = [f"{delta:.2f}" for delta in (sh_first, sh_second)]
        print("Overlap", *overlap_cells)
        print("Shift", *shift_cells)
        print()

TM2_TM8 4zw9 in
Overlap 59% 93%
Shift 0.03 -0.01

TM2_TM8 4zw9 out
Overlap 70% 77%
Shift -0.07 -0.04

TM1_TM7 4zw9 in
Overlap 63% 80%
Shift 0.05 -0.01

TM1_TM7 4zw9 out
Overlap 17% 90%
Shift -0.10 -0.01

TM5_TM11 4zw9 in
Overlap 73% 75%
Shift -0.03 -0.02

TM5_TM11 4zw9 out
Overlap 28% 41%
Shift -0.10 -0.08

