In [6]:
import numpy as np
from scipy.spatial.distance import cdist
import os
from tqdm import tqdm

In [7]:
# ss: NumPy array of per-residue secondary structure labels drawn from ['-','H','S'] ('-' = linker, 'S' = sheet)

In [8]:
# Example secondary structure: consecutive runs of '-' (6), 'H' (5), 'S' (6), '-' (5), 'S' (5)
secondary = np.array(list('-'*6 + 'H'*5 + 'S'*6 + '-'*5 + 'S'*5))

In [9]:
# NOTE(review): section_finder is defined in a later cell; on a fresh kernel run that
# definition first, otherwise this line raises the NameError shown below.
np.where(section_finder(secondary)=='-')[0]

NameError: name 'section_finder' is not defined

# Fundamental structure extraction

In [10]:
def section_finder(ss):
    
    '''Find protein sub-unit sections from the full secondary structure
    
    Every run of identical consecutive labels is collapsed to a single entry,
    e.g. ['-','-','H','H','S'] -> ['-','H','S'].
    
    Parameters
    ss (numpy array):        Secondary structure labels, one per residue
    
    Returns
    sections (numpy array):  Label of each consecutive section, in sequence order
                             (empty when ss is empty)
    '''
    
    ss = np.asarray(ss)
    
    if ss.shape[0] == 0:
        return ss[:0]
    
    # A section ends at index i wherever the label at i+1 differs from the label at i
    section_ends = np.flatnonzero(ss[1:] != ss[:-1])
    
    # The last section is never closed by a label change, so its label (ss[-1]) is
    # appended explicitly. Taking the label at the last *difference* index instead
    # would duplicate the previous section when the final section is one residue long.
    sections = np.concatenate([ss[section_ends], ss[-1:]])
    
    return sections


def find_linker_indices(sections):
    
    '''Locate the linker entries in an array of section labels
    
    Parameters
    sections (numpy array):        Section labels (array of strings)
    
    Returns
    linker_indices (numpy array):  Positions in sections whose label is '-'
    '''
    
    is_linker = sections == '-'
    return np.where(is_linker)[0]


def find_sheet_indices(sections):
    
    '''Locate the sheet entries in an array of section labels
    
    Parameters
    sections (numpy array):       Section labels (array of strings)
    
    Returns
    sheet_indices (numpy array):  Positions in sections whose label is 'S'
    '''

    is_sheet = sections == 'S'
    return np.where(is_sheet)[0]

# Generation

In [11]:
def generate_random_structures(coords_file, fingerprint_file, linker_indices):
    
    '''Generate random structures changing one linker section at a time
    
    Runs the external ./generate_structure executable (resolved relative to the
    current working directory) once per linker index.
    
    Parameters
    coords_file:       /path/ to CA coordinates file (passed through to the executable)
    fingerprint_file:  /path/ to fingerprint file (passed through to the executable)
    linker_indices:    Indices of linker sub-unit sections of protein
    
    Return
    None. The output prefix handed to the executable for linker index l is
    <current working directory>/rand_structures/section_<l>
    NOTE(review): the names and format of the files the executable writes under
    that prefix are not visible from this notebook - confirm against generate_structure.
    ''' 
    
    current = os.getcwd()
    random = 'rand_structures'
    random_working = os.path.join(current, random)

    # Create the output directory; any OSError (e.g. it already exists) is reported
    # and execution continues. The [11:] slice drops the leading "[Errno NN] " prefix
    # of the message, leaving e.g. "File exists: '<path>'".
    try:
        os.mkdir(random_working)
    except OSError as error:
        print(str(error)[11:])

    # linker_indices = linker_prep(coords_file, fingerprint_file)
    
    print('Beginning random structures generation \n')
    for l in tqdm(linker_indices):
        
        outputname = '/section_'+str(l)
        # IPython shell escape: the {name} fields are filled in from the Python variables above
        !./generate_structure {fingerprint_file} {coords_file} {random_working}{outputname} {l}
        
    print('')
    print('Finished generating random structures')

In [13]:
# Collapse the per-residue labels into sections, then pick out the '-' (linker) sections
sections = section_finder(secondary)
linker_indices = find_linker_indices(sections)

In [14]:
linker_indices

array([0, 3])

In [12]:

# Coordinates input handed to generate_random_structures as coords_file
coord_file = 'Example/Lysozyme/Lysozyme.pdb'

# NOTE(review): fingerprint_file is not assigned anywhere in the visible notebook;
# it must be set to the matching fingerprint file path before this call will run
# on a fresh kernel.
generate_random_structures(coord_file, fingerprint_file, linker_indices)

File exists: '/home/josh/Documents/CarbonaraDev/CarbonARA/rand_structures'
Beginning random structures generation 



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.14it/s]


Finished generating random structures





# Post processing

In [3]:
def sheet_group_mask(ss):
     
    '''Groups adjacent sheets in secondary structure file and returns a grouping mask ( 0 : not a sheet;  1+: sheet )
    
    Each run of consecutive 'S' labels gets its own group number, counting from 1
    in sequence order. A run that starts at the very first residue is labelled in
    full, index 0 included.
    
    Parameters
    ss (numpy array):            Secondary structure labels (array of strings)
    
    Returns
    sheet_groups (numpy array):  Mask of grouped sheet sections (float, same length as ss)
    '''
    
    ss = np.asarray(ss)
    
    if ss.shape[0] == 0:
        return np.zeros(0)
    
    is_sheet = ss == 'S'
    
    # A run starts at every sheet residue whose predecessor is not a sheet residue;
    # index 0 has no predecessor, so a leading sheet residue always starts a run.
    run_starts = is_sheet.copy()
    run_starts[1:] &= ~is_sheet[:-1]
    
    # The running count of run starts is the group number of the run a residue is in
    sheet_groups = np.where(is_sheet, np.cumsum(run_starts), 0).astype(float)
                
    return sheet_groups


def linker_group_mask(ss):
    
    '''Label each run of consecutive linker residues with its own group number ( 0 : not a linker;  1+: linker )
    
    Parameters
    ss (numpy array):             Secondary structure labels (array of strings)
    
    Returns
    linker_groups (numpy array):  Mask of grouped linker sections
    '''
    
    is_linker = (ss == '-') * 1
    mask = np.zeros(ss.shape[0])
    run_id = 1
    
    # The first residue has no predecessor, so it is handled before the diff loop
    inside_run = is_linker[0] == 1
    if inside_run:
        mask[0] = run_id

    # step == +1: a linker run begins at pos; step == -1: the run ended just before pos
    for pos, step in enumerate(np.diff(is_linker), start=1):
        if step == 1:
            inside_run = True
        elif step == -1:
            inside_run = False
            run_id += 1

        if inside_run:
            mask[pos] = run_id
                
    return mask


def get_sheet_coords(coords, sheet_groups):

    '''Finds CA coordinates of each grouped sheet section
    
    Parameters
    coords (numpy array):        xyz coordinates of all protein CA atoms
    sheet_groups (numpy array):  Mask of grouped sheet sections
    
    Returns
    sheet_coords (numpy array):  xyz coordinates of CA atoms in each sheet structure [ [...sheet 1 coords...] [...sheet 2 coords...] ... ]
                                 A regular array when every sheet has the same number of atoms,
                                 otherwise a 1-D object array holding one coordinate array per sheet
    '''
    
    coords = np.asarray(coords)
    sheet_groups = np.asarray(sheet_groups)
    
    # Group 0 marks non-sheet residues, so only positive group labels are collected
    per_sheet = [coords[sheet_groups == g] for g in np.unique(sheet_groups) if g > 0]
    
    # Equal-sized sheets (or no sheets at all) stack cleanly into a regular array
    if len({block.shape for block in per_sheet}) <= 1:
        return np.asarray(per_sheet)
    
    # Sheets of different lengths cannot form a rectangular array; recent NumPy
    # versions raise on np.asarray for such input, so fill an object array instead.
    sheet_coords = np.empty(len(per_sheet), dtype=object)
    for k, block in enumerate(per_sheet):
        sheet_coords[k] = block
    
    return sheet_coords


def sheet_pairwise_bond_number(sheet_coords, thr=5.5):
    
    '''Finds the number of pairs of CA atoms within some threshold between all sheet sections
    
    Parameters
    sheet_coords (numpy array): xyz coordinates of CA atoms in each sheet structure [ [...sheet 1 coords...] [...sheet 2 coords...] ... ]
    thr (float) {optional}:     Cutoff distance for inter-sheet bonding (default = 5.5 Å)
    
    Returns
    pairwise_bond_num (numpy array): Lower triangular array containing the number of individual CA bonds within threshold between each sheet pair
    
    '''
    
    # thr is deliberately not reassigned here, so the caller's cutoff is honoured
    n_sheets = len(sheet_coords)
    pairwise_bond_num = np.zeros([n_sheets, n_sheets])

    # Only pairs with j < i are visited, so the diagonal and upper triangle stay zero
    for i in range(1, n_sheets):

        for j in range(i):

            dist_matrix = cdist(sheet_coords[j], sheet_coords[i])
            pairwise_bond_num[i, j] = np.count_nonzero(dist_matrix < thr)

    return pairwise_bond_num  




In [4]:
def get_section_groupings(ss, structure_change):
    
    '''Assign every residue the 0-based index of the section it belongs to
    
    Parameters
    ss (numpy array):                 Secondary structure labels; only its shape is used
    structure_change (sequence):      Entry i is non-zero when residue i+1 starts a new section
    
    Returns
    structural_groups (numpy array):  Section index per residue (float, same shape as ss)
    '''
    
    structural_groups = np.zeros(ss.shape)
    section_id = 0
    structural_groups[0] = section_id

    # Entry i of structure_change describes the step from residue i to residue i+1
    for position, change in enumerate(structure_change, start=1):
        section_id += int(change != 0)
        structural_groups[position] = section_id

    return structural_groups

In [5]:
def sheet_pairwise_bond_number(sheet_coords, thr=5.5):
    
    '''Finds the number of pairs of CA atoms within some threshold between all sheet sections
    
    Parameters
    sheet_coords (numpy array): xyz coordinates of CA atoms in each sheet structure [ [...sheet 1 coords...] [...sheet 2 coords...] ... ]
    thr (float) {optional}:     Cutoff distance for inter-sheet bonding (default = 5.5 Å)
    
    Returns
    pairwise_bond_num (numpy array): Lower triangular array containing the number of individual CA bonds within threshold between each sheet pair
    
    '''
    
    # thr is deliberately not reassigned here, so the caller's cutoff is honoured
    n_sheets = len(sheet_coords)
    pairwise_bond_num = np.zeros([n_sheets, n_sheets])

    # Only pairs with j < i are visited, so the diagonal and upper triangle stay zero
    for i in range(1, n_sheets):

        for j in range(i):

            dist_matrix = cdist(sheet_coords[j], sheet_coords[i])
            pairwise_bond_num[i, j] = np.count_nonzero(dist_matrix < thr)

    return pairwise_bond_num  

In [65]:
# Toy "sheets" of two points each, used to exercise sheet_pairwise_bond_number
a = np.zeros((2, 3), dtype=int)
b = np.array([[1, 2, 3]] * 2)
c = np.array([[1, 4, 4],
              [1, 4, 1]])
d = np.array([[1, 1, 1],
              [1, 88, 1]])



In [69]:
# Sanity check on four toy sheets: entry [i, j] (j < i) counts point pairs closer than the cutoff between sheet i and sheet j
sheet_pairwise_bond_number( np.asarray([a,b,a,d]) )

array([[0., 0., 0., 0.],
       [4., 0., 0., 0.],
       [4., 4., 0., 0.],
       [2., 2., 2., 0.]])

In [27]:
# Label the linker runs of the example sequence (0 = not a linker; 1, 2, ... = successive linker runs)
linker_group_mask(secondary)

array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       2., 2., 2., 2., 2., 0., 0., 0., 0., 0.])

In [None]:
def sheet_finder(ss):
    
    '''Groups adjacent sheets in the secondary structure and returns a grouping mask ( 0 : not a sheet;  1+: sheet )
    
    Each run of consecutive 'S' labels gets its own group number, counting from 1
    in sequence order. A run that starts at the very first residue is labelled in
    full, index 0 included.
    
    Parameters
    ss (numpy array):         Secondary structure labels (array of strings)
    
    Returns
    ss_groups (numpy array):  Mask of grouped sheet sections (float, same length as ss)
    '''
    
    ss = np.asarray(ss)
    
    if ss.shape[0] == 0:
        return np.zeros(0)
    
    is_sheet = ss == 'S'
    
    # A run starts at every sheet residue whose predecessor is not a sheet residue;
    # index 0 has no predecessor, so a leading sheet residue always starts a run.
    run_starts = is_sheet.copy()
    run_starts[1:] &= ~is_sheet[:-1]
    
    # The running count of run starts is the group number of the run a residue is in
    ss_groups = np.where(is_sheet, np.cumsum(run_starts), 0).astype(float)
                
    return ss_groups


def linker_finder(ss):
    
    '''Label each run of consecutive linker residues and return the labels with the raw linker mask
    
    Parameters
    ss (numpy array):            Secondary structure labels (array of strings)
    
    Returns
    ss_groups (numpy array):     Grouping mask ( 0 : not a linker;  1+: linker run number )
    linker_index (numpy array):  1 where the label is '-', 0 elsewhere
    '''
    
    linker_index = (ss == '-') * 1
    run_labels = np.zeros(ss.shape[0])
    run_id = 1
    
    # The first residue has no predecessor, so it is handled before the diff loop
    inside_run = linker_index[0] == 1
    if inside_run:
        run_labels[0] = run_id

    # step == +1: a linker run begins at pos; step == -1: the run ended just before pos
    for pos, step in enumerate(np.diff(linker_index), start=1):
        if step == 1:
            inside_run = True
        elif step == -1:
            inside_run = False
            run_id += 1

        if inside_run:
            run_labels[pos] = run_id
                
    return run_labels, linker_index


def get_sheet_coords(coords, sheet_groups):
    
    '''Collect the coordinates belonging to each sheet group
    
    Parameters
    coords (numpy array):        Coordinates, indexable by a boolean mask built from sheet_groups
    sheet_groups (numpy array):  Mask of grouped sheet sections ( 0 : not a sheet;  1+: sheet )
    
    Returns
    sheet_reg_lst (list):        One coordinate array per positive group label, in ascending label order
    '''
    
    # Group 0 marks non-sheet residues and is skipped
    return [
        coords[sheet_groups == label]
        for label in np.unique(sheet_groups)
        if label > 0
    ]


def linker_prep(coords_file, fingerprint_file):
    
    '''Read the secondary structure from a fingerprint file and return its linker section indices
    
    Parameters
    coords_file:                   Unused; kept so existing callers need not change
    fingerprint_file:              /path/ to fingerprint file; its last line is read as the
                                   per-residue secondary structure string
    
    Returns
    linker_indices (numpy array):  Indices of the '-' sections among the sections found
    '''
    
    with open(fingerprint_file, 'r') as handle:
        last_line = handle.readlines()[-1]
    
    # Strip only a trailing newline, so a file without one keeps its final label
    ss = np.asarray(list(last_line.rstrip('\n')))
    
    # section_finder returns the section labels only, so the linker indices are
    # derived from them here and not unpacked from its return value
    sections = section_finder(ss)
    linker_indices = find_linker_indices(sections)
    return linker_indices


def get_section_groupings(ss, structure_change):
    
    '''Assign every residue the 0-based index of the section it belongs to
    
    Parameters
    ss (numpy array):                 Secondary structure labels; only its shape is used
    structure_change (sequence):      Entry i is non-zero when residue i+1 starts a new section
    
    Returns
    structural_groups (numpy array):  Section index per residue (float, same shape as ss)
    '''
    
    structural_groups = np.zeros(ss.shape)
    section_id = 0
    structural_groups[0] = section_id

    # Entry i of structure_change describes the step from residue i to residue i+1
    for position, change in enumerate(structure_change, start=1):
        section_id += int(change != 0)
        structural_groups[position] = section_id

    return structural_groups

def structure_parser(coords_file, fingerprint_file):
    
    '''Load coordinates and secondary structure, then derive the linker grouping
    
    Parameters
    coords_file:                 /path/ to coordinates file readable by np.genfromtxt
    fingerprint_file:            /path/ to fingerprint file; its last line is read as the
                                 per-residue secondary structure string
    
    Returns
    ss (numpy array):            Secondary structure labels, one character per entry
    linker_groups (numpy array): Grouping mask of linker runs ( 0 : not a linker;  1+: linker )
    linker_index (numpy array):  1 where the label is '-', 0 elsewhere
    '''
    
    coords = np.genfromtxt(coords_file)
    
    # Drop rows with any value genfromtxt could not parse as a number
    coords = coords[~np.isnan(coords).any(axis=1)]
    
    with open(fingerprint_file, 'r') as handle:
        last_line = handle.readlines()[-1]
    
    # Strip only a trailing newline, so a file without one keeps its final label
    ss = np.asarray(list(last_line.rstrip('\n')))

    # section_finder returns the section labels only, so the per-residue change
    # signal needed below is computed here directly
    structure_change = np.diff(np.unique(ss, return_inverse=True)[1])
    
    # NOTE(review): section_groups, sheet_groups and sheet_coords are computed but
    # are not part of the return value
    
    # get section structure groups
    section_groups = get_section_groupings(ss, structure_change)
    
    # find linkers
    linker_groups, linker_index = linker_finder(ss)
    
    # find sheets
    sheet_groups = sheet_finder(ss)
    
    # get sheet coordinates
    sheet_coords = get_sheet_coords(coords, sheet_groups)
    
    return ss, linker_groups, linker_index
    
    
def find_bond_number(sheet_coords, thr=5.5):
    
    '''Count the CA pairs closer than a cutoff between every pair of sheet sections
    
    Parameters
    sheet_coords (sequence):         Coordinate array of each sheet [ [...sheet 1 coords...] [...sheet 2 coords...] ... ]
    thr (float) {optional}:          Cutoff distance (default = 5.5)
    
    Returns
    pairwise_bond_num (numpy array): Lower triangular array; entry [i, j] (j < i) is the number of
                                     coordinate pairs between sheet i and sheet j closer than thr
    '''
    
    # thr is deliberately not reassigned here, so the caller's cutoff is honoured
    n_sheets = len(sheet_coords)
    pairwise_bond_num = np.zeros([n_sheets, n_sheets])

    # Only pairs with j < i are visited, so the diagonal and upper triangle stay zero
    for i in range(1, n_sheets):

        for j in range(i):

            dist_matrix = cdist(sheet_coords[j], sheet_coords[i])
            pairwise_bond_num[i, j] = np.count_nonzero(dist_matrix < thr)

    return pairwise_bond_num    


def get_coords_from_file(coords_file):
    
    '''Load a numeric table with np.genfromtxt and drop every row containing a NaN
    
    Parameters
    coords_file:           Anything np.genfromtxt accepts (e.g. a /path/ to a whitespace-delimited file)
    
    Returns
    coords (numpy array):  The loaded rows that contain no NaN values
    '''
    
    table = np.genfromtxt(coords_file)
    row_has_nan = np.isnan(table).any(axis=1)
    return table[np.logical_not(row_has_nan)]

def get_secondary_from_fingerprint(fingerprint_file):
    
    '''Read the per-residue secondary structure labels from a fingerprint file
    
    Parameters
    fingerprint_file:  /path/ to fingerprint file; its last line is read as the
                       secondary structure string
    
    Returns
    ss (numpy array):  One single-character label per entry
    '''
    
    # The context manager closes the file even if reading fails
    with open(fingerprint_file, 'r') as handle:
        last_line = handle.readlines()[-1]
    
    # Strip only a trailing newline; slicing off the last character unconditionally
    # would discard a real label when the file does not end with a newline
    ss = np.asarray(list(last_line.rstrip('\n')))
    return ss
    
    
def sheet_pipe(coords_file, fingerprint_file):
    
    '''Go from input files straight to the per-sheet coordinates
    
    Parameters
    coords_file:       Passed to get_coords_from_file
    fingerprint_file:  Passed to get_secondary_from_fingerprint
    
    Returns
    sheet_coords:      Result of get_sheet_coords for the sheet groups found by sheet_finder
    '''
    
    ca_xyz = get_coords_from_file(coords_file)
    labels = get_secondary_from_fingerprint(fingerprint_file)
    
    # Group the sheet residues, then slice the coordinates group by group
    return get_sheet_coords(ca_xyz, sheet_finder(labels))


def sheet_groups_pipe(fingerprint_file):
    
    '''Go from a fingerprint file straight to the sheet grouping mask
    
    Parameters
    fingerprint_file:  Passed to get_secondary_from_fingerprint
    
    Returns
    sheet_groups:      Result of sheet_finder on the labels read from the file
    '''
    
    return sheet_finder(get_secondary_from_fingerprint(fingerprint_file))

def structure_coloring_v2(file_dir, fingerprint_file, linker_indices):
    
    '''Compute the pairwise sheet bond matrix of every structure file, grouped by linker index
    
    A file belongs to linker index l when the second '_'-separated field of its
    name equals str(l).
    
    Parameters
    file_dir:                Directory listed with listdir_nohidden; file names are appended to it
                             by plain string concatenation, so it needs its trailing separator
    fingerprint_file:        Passed to sheet_pipe for every structure file
    linker_indices:          Linker indices to collect files for
    
    Returns
    linker_bond_lst (dict):  linker index -> list of find_bond_number results, one per matching
                             file, in sorted file-name order
    
    TODO: the per-linker sheet change / coloring step that used to follow as commented-out
    code is not implemented here.
    '''
    
    all_files = listdir_nohidden(file_dir)
    
    # Phase 1: bucket the sorted file names by the linker index embedded in their name
    files_by_linker = {}
    for linker in linker_indices:
        files_by_linker[linker] = [
            name
            for name in np.sort(all_files)
            if name.split('_')[1] == str(linker)
        ]
        
    # Phase 2: one bond matrix per file, kept in the same per-linker order
    bonds_by_linker = {}
    for linker in linker_indices:
        bonds_by_linker[linker] = [
            find_bond_number(sheet_pipe(file_dir + name, fingerprint_file))
            for name in files_by_linker[linker]
        ]
        
    return bonds_by_linker