In [5]:
import os
import sys
import pylab as p
import pylab as pl
import matplotlib as mpl
import pylab as pl
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import csv
from sklearn.metrics import r2_score
from matplotlib.offsetbox import AnchoredText
import matplotlib.ticker as ticker
sns.reset_defaults()
import warnings
warnings.filterwarnings('ignore')

# Settings for matplotlib
plt.rcParams["font.family"] = "Arial"
mpl.rcParams['axes.linewidth'] = 2
mpl.rcParams['pdf.fonttype'] = 42


In [9]:
# Dependency Function Definitions:
def calc_rmsd(x, y):
    if len(x) == 0:
        return 0.0
    chi2 = sum((y - x) ** 2)
    rmsd = np.sqrt(chi2 / len(y))
    return rmsd

def calc_r2(x, y):
    if len(x) == 0:
        return 0.0
    
    return np.corrcoef(x, y)[0,1]**2

def diff(list1, list2):
    c = set(list1).union(set(list2))  # or c = set(list1) | set(list2)
    d = set(list1).intersection(set(list2))  # or d = set(list1) & set(list2)
    common = []
    for element in list1:
        if element in list2:
            common.append(element)
    return list(c - d), common

In [38]:
# GS == Honglue ensemble

# Read in CSV data
dfp = pd.read_csv("data/Orca_DFT/mtTAR_FARFAR_Ensemble_Benchmark_DFT.csv").drop_duplicates()
dfm = pd.read_csv("data/HIV_TAR_measured_cs.csv")
df_ls = dfp.copy()

# Process & combine predicted & measured data
dfp['shift_ens_avg'] = np.mean(dfp.iloc[:,2:], axis=1)
dfp['shift_ens_std'] = np.std(dfp.iloc[:,2:], axis=1)
dfp = dfp.loc[:, ["res", "atom", "shift_ens_avg", "shift_ens_std"]]
dfm = dfm.loc[:, ["Resi", "Atom", "Shift", "SDev"]].rename(columns = {
    'Resi': 'res', 'Atom': 'atom', 'Shift': 'shift_nmr_avg', 'SDev': 'shift_nmr_std'})
df = dfm.merge(dfp, on=["res", "atom"], how="inner")

df_ls = dfm.merge(df_ls,on=["res", "atom"], how="inner" )

# Define masks for TAR domains:
mask = {}
mask['H1'] = df.res.isin([17, 18, 19, 20 ,21, 22, 40, 41, 42, 43, 44, 45])
mask['H2'] = df.res.isin([26, 27, 28, 29, 36, 37, 38, 39])
mask['B'] = df.res.isin([23, 24, 25])
mask['L'] = df.res.isin([30, 31, 32, 33, 34, 35])

# Define resonances & TAR domains to plot:
domains = ['H1', 'H2', 'B']
domain_cols = {'H1':'red', 'H2':'blue', 'B':'orange'}
resonances = ["C1'", "C2'", "C3'", "C4'", "C5'", "C8", "C6", "C2", "C5", "N1/N3"]
N = len(resonances)

LinearCorrectedDFs = []

# Loop through diff. nuclei:
for idx, resonance in enumerate(resonances):
    if resonance == "N1/N3":
        G_iminos = [18, 21, 26, 28, 36, 43]
        U_iminos = [38, 42]
        filter_N1 = df.res.isin(G_iminos) & (df["atom"] == "N1")
        filter_N3 = df.res.isin(U_iminos) & (df["atom"] == "N3")
        filter_H1 = df.res.isin(G_iminos) & (df["atom"] == "H1")
        filter_H3 = df.res.isin(U_iminos) & (df["atom"] == "H3")
        filter1 = filter_N1 | filter_N3
        filter2 = filter_H1 | filter_H3
        resonance_H = "H1/H3"
    else:
        filter1 = (df["atom"] == resonance)
        resonance_H = "H" + resonance[1:]
        filter2 = (df["atom"] == resonance_H)
    
    # Calculate linear correction:
    x = df.loc[filter1]["shift_ens_avg"]
    y = df.loc[filter1]["shift_nmr_avg"]
    m, b = np.polyfit(x, y, 1)   
    print(resonance , ": y = %3.2f x + %3.2f"%(m, b))
    x_H = df.loc[filter2]["shift_ens_avg"]
    y_H = df.loc[filter2]["shift_nmr_avg"]
    m_H, b_H = np.polyfit(x_H, y_H, 1)   
    print(resonance_H , ": y = %3.2f x + %3.2f"%(m_H, b_H))
    
    # Apply linear correction:
    predicted_shift = np.array(df_ls.loc[filter1].iloc[:, -20:])
    predicted_shift = predicted_shift * m + b
    t = pd.DataFrame()
    t["res"] = df_ls.loc[filter1, "res"] 
    t["res"] = df_ls.loc[filter1, "atom"]
    t["lc_predicted_cs_avg_GS"] = np.mean(predicted_shift, axis=1)
    t["lc_predicted_cs_std_GS"] = np.std(predicted_shift, axis=1)
    t["lc_measured_cs_ref_GS"] = df_ls.loc[filter1, "shift_nmr_avg"]
    LinearCorrectedDFs.append(t)

LinearCorrectedDF = pd.concat(LinearCorrectedDFs, ignore_index=True)
print(LinearCorrectedDF.head())
print(LinearCorrectedDF.shape)







C1' : y = 0.83 x + 10.31
H1' : y = 0.73 x + 1.02
C2' : y = 0.26 x + 55.08
H2' : y = 0.83 x + 0.40
C3' : y = 0.66 x + 22.80
H3' : y = 0.46 x + 2.29
C4' : y = 0.62 x + 31.20
H4' : y = -0.34 x + 6.07
C5' : y = 0.62 x + 23.37
H5' : y = 1.55 x + -2.42
C8 : y = 0.89 x + 19.64
H8 : y = 0.71 x + 2.41
C6 : y = 0.71 x + 42.70
H6 : y = 0.37 x + 4.74
C2 : y = 0.82 x + 29.55
H2 : y = 0.56 x + 2.94
C5 : y = 0.63 x + 37.47
H5 : y = 0.47 x + 3.02
N1/N3 : y = 0.58 x + 56.26
H1/H3 : y = 1.02 x + 0.16
   res  lc_predicted_cs_avg  lc_predicted_cs_std  lc_measured_cs_ref
0  C1'            93.026825             1.293270              93.020
1  C1'            94.023481             1.299277              93.807
2  C1'            93.544829             1.209390              92.944
3  C1'            93.195263             1.465194              92.625
4  C1'            92.301362             1.062920              92.441
(148, 4)
