In [3]:
import csv
import pandas
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import collections
import re
from pathlib import Path, PureWindowsPath



In [86]:
def IntegralIncrease1(nums: list) ->float:
    """Return the summed rise of ``nums``, scanning from its first element.

    Method 1 (original note: start from the beginning point, r5 substate).
    The sequence is cut into runs that never decrease; a strict decrease ends
    the current run. Each run contributes (its last value - its first value)
    and the contributions are added up. Equal neighbours neither extend nor
    end a run.

    Args:
        nums: sequence of numbers supporting ``len()`` and integer indexing.

    Returns:
        Sum of (local max - local min) over all runs. 0 for an empty or
        single-element sequence.
    """
    # Nothing to integrate; the unguarded nums[0] used to raise IndexError here.
    if len(nums) == 0:
        return 0

    loc_min = loc_max = nums[0]
    res = 0
    for i in range(1, len(nums)):
        if nums[i] > nums[i-1]:
            # still climbing: push the top of the current run upwards
            loc_max = nums[i]
        elif nums[i] < nums[i-1]:
            # a drop closes the run: bank its rise and restart at this point
            res += loc_max - loc_min
            loc_min = loc_max = nums[i]
    # bank the run that was still open when the data ended
    res += loc_max - loc_min
    return res

def IntegralIncrease2(nums: list)->float:
    """Return the summed rise of ``nums``, scanned cyclically from its second-to-last element.

    Method 2 (original note: start from the second last point, r1 substate).
    The scan starts at ``nums[-2]``, steps to ``nums[-1]``, wraps around to
    ``nums[0]`` and continues up to ``nums[-2]`` again, relying on negative
    indices for the wrap-around. Along that path the sequence is cut into runs
    that never decrease; each run contributes (its last value - its first
    value) and the contributions are added up.

    Args:
        nums: sequence of numbers supporting ``len()`` and negative integer
            indexing.

    Returns:
        Sum of (local max - local min) over all runs on the cyclic path.
        0 when ``nums`` has fewer than two elements.
    """
    # With fewer than two points there is no step to take; the unguarded
    # nums[-2] used to raise IndexError here.
    if len(nums) < 2:
        return 0

    loc_min = loc_max = nums[-2]
    res = 0
    # i = -1 compares nums[-1] with nums[-2]; i = 0 compares nums[0] with
    # nums[-1] (the wrap-around); the last i compares nums[-2] with nums[-3].
    for i in range(-1, len(nums)-1):
        if nums[i] > nums[i-1]:
            loc_max = nums[i]
        elif nums[i] < nums[i-1]:
            # a drop closes the run: bank its rise and restart at this point
            res += loc_max - loc_min
            loc_min = loc_max = nums[i]

    # bank the run that was still open when the scan ended
    res += loc_max - loc_min
    return res



def crgstringtolist(crg_s: str)->list:
    """Split a packed charge string into a list of integer charges.

    The string is consumed left to right. A '0' or '1' is a one-character
    token; any other character starts a two-character token (for example
    '-1') that is converted with ``int``.

    Args:
        crg_s: packed charge sequence, e.g. ``"-101-1-1-1"``.

    Returns:
        List of ints, one per token, in input order.

    Raises:
        ValueError: if a two-character token is not a valid integer literal.
    """
    charges = []
    pos = 0
    end = len(crg_s)
    while pos < end:
        # bare '0'/'1' occupy one character, everything else takes two
        width = 1 if crg_s[pos] in ('1', '0') else 2
        charges.append(int(crg_s[pos:pos + width]))
        pos += width
    return charges

def readMsE2df(dicts: dict, substate: str):
    """Summarise all_ms_E-format tables into one row per structure.

    For every (structure name, table) pair only the first seven rows of the
    table are considered. From those rows the entries looked up with key 2
    fill the '(-3)' columns and the entries looked up with key 1 fill the
    '(-4)' columns (presumably the total-charge -3 / -4 microstates -- TODO
    confirm against the file layout). The difference columns are
    (entry 1 - entry 2).

    Args:
        dicts: mapping of structure name -> table with the columns
            'Min_E(Kcal)', 'Crg_seq', 'E_self(Kcal)', 'E_pw(Kcal)' and
            'E_mfe(Kcal)'.
        substate: label appended to every generated column name.

    Returns:
        pandas.DataFrame with one row per structure.
    """
    tag = str(substate)
    header = [
        'state_snap',
        'E(-3)_min_'+tag, 'E(-4)_min_'+tag,
        'ms(-3)_minE_'+tag, 'ms(-4)_minE_'+tag,
        'E_self(-3)_'+tag, 'E_self(-4)_'+tag,
        'E_pw(-3)_'+tag, 'E_pw(-4)_'+tag,
        'E_mfe(-3)_'+tag, 'E_mfe(-4)_'+tag,
        r'$\bigtriangleup$E_'+tag,
        r'$\bigtriangleup$E_self_'+tag,
        r'$\bigtriangleup$E_pw_'+tag,
        r'$\bigtriangleup$E_mfe_'+tag,
    ]

    records = []
    for struc, table in dicts.items():
        head = table.iloc[:7]
        min_e = head['Min_E(Kcal)']
        crg_seq = head['Crg_seq']
        e_self = head['E_self(Kcal)']
        e_pw = head['E_pw(Kcal)']
        e_mfe = head['E_mfe(Kcal)']

        records.append([
            struc,
            min_e[2], min_e[1],
            crgstringtolist(crg_seq[2]), crgstringtolist(crg_seq[1]),
            e_self[2], e_self[1],
            e_pw[2], e_pw[1],
            e_mfe[2], e_mfe[1],
            min_e[1] - min_e[2],
            e_self[1] - e_self[2],
            e_pw[1] - e_pw[2],
            # NOTE(review): only the E_mfe difference is doubled -- confirm this is intended
            (e_mfe[1] - e_mfe[2]) * 2,
        ])

    return pandas.DataFrame(records, columns=header)

def readMsOcc2df(dicts: dict, substate: str):
    """Flatten ms_occ-format tables into one long table of microstates.

    Every row of every input table becomes one output row, tagged with the
    name of the structure it came from. The packed 'Crg_seq' string is kept
    as 'ms_str' and additionally parsed into a list of charges ('ms').

    Args:
        dicts: mapping of structure name -> table with the columns
            'Tot_crg', 'E(Kcal)', 'Occ', 'Crg_seq' and 'Conf_Names'.
        substate: converted to ``str`` but otherwise unused here.

    Returns:
        pandas.DataFrame with columns 'state_snap', 'Tot_crg', 'E(Kcal)',
        'Occ', 'ms_str', 'ms', 'Conf_Names'.
    """
    substate = str(substate)
    header = ['state_snap', 'Tot_crg', 'E(Kcal)', 'Occ',
              'ms_str', 'ms', 'Conf_Names']

    records = []
    for struc, table in dicts.items():
        for pos in range(table.shape[0]):
            entry = table.iloc[pos]
            packed = entry['Crg_seq']
            records.append([
                struc,
                entry['Tot_crg'],
                entry['E(Kcal)'],
                entry['Occ'],
                packed,
                crgstringtolist(packed),
                entry['Conf_Names'],
            ])

    return pandas.DataFrame(records, columns=header)


def color_struc(s, color_map, column):
    '''
    Build a table of CSS background colours, one colour per row of s.

    Every row whose value in `column` equals a key of `color_map` is painted
    with the mapped colour in all of its cells; rows matching no key get
    'background-color: blue '. Suitable for Styler.apply(..., axis=None).

    s         : DataFrame being styled (left unchanged).
    color_map : dict mapping a value of `column` to a CSS colour name.
    column    : name of the column of s whose values select the colour.

    Returns a DataFrame of CSS strings with the same index and columns as s.
    '''
    # Start from a fresh all-string frame rather than overwriting a copy of
    # the data: writing strings into numeric columns is an incompatible-dtype
    # assignment that newer pandas versions deprecate.
    styles = pandas.DataFrame('background-color: blue ', index=s.index, columns=s.columns)
    for key, colour in color_map.items():
        styles.loc[s[column] == key, :] = 'background-color: {}'.format(colour)
    return styles


def readClusteringFile(file, dataframe):
    """Read cluster sizes from a clustering file and outer-merge them into ``dataframe``.

    The first line of the file is skipped and blank lines are ignored. Every
    other line is split on ':', ',', '(' and ')':

    * 1 field  -> the line is ignored;
    * 2 fields -> field 0 names the trajectory / x-ray structure that the
      following records belong to;
    * otherwise -> a record: field 2 is the cluster size and field 4 the time
      point (both parsed as integers).

    Args:
        file: path of the clustering file.
        dataframe: frame with 'traj/xray' and 'time_point' columns. The time
            points read from the file are stored as strings, so they only
            match string-valued 'time_point' entries.

    Returns:
        Outer merge of ``dataframe`` with the parsed records on
        ['traj/xray', 'time_point'], adding a 'cluster_size' column.

    Raises:
        ValueError: if a record appears before any structure line, or a
            size / time-point field is not an integer.
        IndexError: if a record line has fewer than five fields.
    """
    columns = ['traj/xray', 'time_point', 'cluster_size']
    # raw string: '\(' and '\)' are invalid escapes in an ordinary literal
    splitter = re.compile(r':|,|\(|\)')

    data = []
    struc = None
    with open(file, 'r') as f:
        next(f, None)  # skip the header line; an empty file simply yields no records
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = splitter.split(line)

            if len(fields) == 1:
                continue
            if len(fields) == 2:  # read the traj/xray information
                struc = fields[0]
                continue
            if struc is None:
                raise ValueError(
                    'cluster record found before any traj/xray line: {!r}'.format(line))
            time_point = int(fields[4])
            cluster_size = int(fields[2])
            # int() then str() normalises the time point (e.g. strips padding)
            data.append([struc, str(time_point), cluster_size])

    df = pandas.DataFrame(data, columns=columns)
    return pandas.merge(dataframe, df, how='outer', on=['traj/xray', 'time_point'])

def colormaps(color_map, column):
    '''
    Translate each value of column into a colour name.

    color_map : dict mapping a value to a colour name.
    column    : iterable of values to look up.

    Returns a list with one colour per value, in input order; values that
    are not keys of color_map get 'blue'.
    '''
    return [color_map.get(x, 'blue') for x in column]
    

        

# Import proton binding energy into dataframe: df_sum

In [87]:
#import crystal and md at e1/f3 substate data files
filepath_cry_e1='/Users/caixiuhong/Dropbox/cai/btype_cco/crg_data/quick_run_mdlip/free_no_water/ms_occ_E1.xlsx'
filepath_md_e1='/Users/caixiuhong/Dropbox/cai/btype_cco/crg_data/quick_run_mdlip/clustering/PLS/ms_occ_E1.xlsx'
#f3 data files pathway
#filepath_cry_f3='/Users/caixiuhong/Dropbox/cai/btype_cco/crg_data/quick_run_mdlip/free_no_water/ms_occ_f3.xlsx'
#filepath_md_f3='/Users/caixiuhong/Dropbox/cai/btype_cco/crg_data/quick_run_mdlip/clustering/PLS/ms_occ_f3.xlsx'

#crystal structure name
cry_name= ["1ehk","1xme","4gp4","4gp5","3eh3","3eh5","3s3b","3s3d","3eh4","3s3a","3s3c","3s8f"]


# sheet_name=None loads every sheet into a dict {sheet name: DataFrame}.
# (The old spelling `sheetname` is no longer accepted by current pandas.)
file_cry_e1 = pandas.read_excel(filepath_cry_e1, sheet_name=None, index_col=1)
file_md_e1 = pandas.read_excel(filepath_md_e1, sheet_name=None, index_col=1)
#file_cry_f3 = pandas.read_excel(filepath_cry_f3, sheet_name=cry_name, index_col=1)
#file_md_f3 = pandas.read_excel(filepath_md_f3, sheet_name=None, index_col=1)


# Merge the MD sheets into the crystal dict in place; on a shared sheet name
# the MD sheet replaces the crystal one.
file_cry_e1.update(file_md_e1)
file_e1=file_cry_e1   # store data for e1 substate

#file_cry_f3.update(file_md_f3)
#file_f3= file_cry_f3   # store data for f3 substate


# flatten every sheet into one long table: one row per microstate
# (total charge, energy, occupancy, charge sequence, conformer names)
df_e1=readMsOcc2df(file_e1,'e1')
#df_f3=readMsE2df(file_f3,'f3')


In [96]:
# Collect the distinct microstates in first-seen order:
#   tot_ms_str / tot_ms   -> over the whole table (packed string / parsed list)
#   snap_ms_str / snap_ms -> separately for every 'state_snap'
snap_ms=collections.defaultdict(list)
snap_ms_str=collections.defaultdict(list)

tot_ms=[]
tot_ms_str=[]

# Sets give O(1) "already seen?" checks; the lists above keep the order.
seen_ms_str=set()
seen_ms_str_by_snap=collections.defaultdict(set)

for index, row in df_e1.iterrows():
    ms_str, ms, snap = row['ms_str'], row['ms'], row['state_snap']

    if ms_str not in seen_ms_str:
        seen_ms_str.add(ms_str)
        tot_ms_str.append(ms_str)
        tot_ms.append(ms)

    # A defaultdict hands back an empty container for a new snapshot, so a
    # single membership test covers both "new snapshot" and "known snapshot".
    if ms_str not in seen_ms_str_by_snap[snap]:
        seen_ms_str_by_snap[snap].add(ms_str)
        snap_ms_str[snap].append(ms_str)
        snap_ms[snap].append(ms)

In [97]:
# Show the distinct microstates (parsed charge lists) in the order they were first appended.
tot_ms

[[-1, 0, 1, -1, -1, -1],
 [-1, 0, 0, 0, -1, -1],
 [-1, 0, 0, -1, 0, -1],
 [-1, 0, 1, -1, -1, 0],
 [-1, 0, 0, -1, -1, -1],
 [-1, -1, 0, -1, 0, -1],
 [-1, -1, 1, -1, 0, -1],
 [-1, -1, 0, 0, 0, -1],
 [-1, -1, 0, -1, -1, -1],
 [-1, -1, 0, 0, -1, -1],
 [-1, 0, 0, -1, -1, 0],
 [-1, -1, 0, -1, 0, 0],
 [-1, -1, 1, -1, -1, -1],
 [-1, -1, 0, -1, -1, 0]]

# group the data by traj/xray and summarize the information

In [13]:
# Per traj/xray group: mean, standard deviation and group size of the four
# summary columns; reset_index() turns the group key back into a column.
groups = (
    df_res
    .groupby('traj/xray')[[
        r'$\int$dsum_pls6_m2',
        'sum_pls6_avecrg',
        r'$\bigtriangleup$E_e1',
        r'$\bigtriangleup$E_f3',
    ]]
    .agg(['mean', 'std', 'size'])
    .reset_index()
)

# style the dataframe and save the dataframe to excel file

In [14]:
def color_struc(s, color_map, column):
    '''
    Build a table of CSS background colours, one colour per row of s.

    NOTE: this re-defines the color_struc from an earlier cell of this
    notebook; keep a single definition when tidying up.

    Every row whose value in `column` equals a key of `color_map` is painted
    with the mapped colour in all of its cells; rows matching no key get
    'background-color: blue '. Suitable for Styler.apply(..., axis=None).

    s         : DataFrame being styled (left unchanged).
    color_map : dict mapping a value of `column` to a CSS colour name.
    column    : name of the column of s whose values select the colour.

    Returns a DataFrame of CSS strings with the same index and columns as s.
    '''
    # Start from a fresh all-string frame rather than overwriting a copy of
    # the data: writing strings into numeric columns is an incompatible-dtype
    # assignment that newer pandas versions deprecate.
    styles = pandas.DataFrame('background-color: blue ', index=s.index, columns=s.columns)
    for key, colour in color_map.items():
        styles.loc[s[column] == key, :] = 'background-color: {}'.format(colour)
    return styles

# Background colour for each 'traj/xray' value; rows whose value is not
# listed here fall back to the default colour applied by color_struc.
color_maps = {
    "p1": "orange",
    "dddro": "orange",
    "p2": "purple",
    "pddro": "purple",
    "p3": "green",
    "ppdro": "green",
    "p4": "cyan",
    "dpdro": "cyan",
    "d372p": "yellow",
    "d372ph376p": "red",
}

# Round to two decimals, then colour whole rows (axis=None passes the full
# frame to color_struc in one call).
df_res_styler = (
    df_res.round(2)
    .style
    .apply(color_struc, axis=None, color_map=color_maps, column='traj/xray')
)