#### Uncomment to install biobox - used in PDB parsing

In [2]:
# !pip install biobox

# Imports + functions

In [344]:
import os
import biobox as bb
from tqdm import tqdm

def setup_working_directory():
    
    current = os.getcwd()
    working = 'Fitting'
    working_path = os.path.join(current, working)
    try:
        os.mkdir(working_path)
        print('Making directory ', working_path)
    except OSError as error:
        print(str(error)[11:])
        
    try:
        os.mkdir(working_path+'/fitdata')
        print('Making directory for fit data')
    except OSError as error:
        print(str(error)[11:])
    print('Complete')
    return working_path


def pdb_2_biobox(pdb_file):
    M = bb.Molecule()
    M.import_pdb(pdb_file)
    return M


def extract_CA_coordinates(M):
    ca_idx = (M.data['name']=='CA').values
    ca_coords = M.coordinates[0][ca_idx]
    
    if ca_coords.shape[0] != M.data['resid'].nunique():
        raise Exception("You better check your PDB... The number of CA atoms does not equal the number of ResIDs in your PDB file!") 
    else:
        return ca_coords

    
def extract_sequence(M):
    
    
    aa_names = {
                'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU',
                'F': 'PHE', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE',
                'K': 'LYS', 'L': 'LEU', 'M': 'MET', 'N': 'ASN',
                'P': 'PRO', 'Q': 'GLN', 'R': 'ARG', 'S': 'SER',
                'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR'
                }

    names_aa = {y: x for x, y in aa_names.items()}
    
    ca_idx = (M.data['name']=='CA').values
    resnames = M.data['resname'][ca_idx].map(names_aa).values
    
    if resnames.shape[0] != M.data['resid'].nunique():
        raise Exception("You better check your PDB... The number of CA atoms does not equal the number of ResIDs in your PDB file!") 
    else:
        return resnames


def write_fingerprint_file(number_chains, sequence, secondary_structure, working_path):
    
    assert isinstance(number_chains, int), 'Yikes... The number of chains is not int type!'
    
    if number_chains > 1:
        print('Are sure you have more than one chain - if not this will cause segmentation errors later! You have been warned...')
    
    seq_run = ''.join(list(sequence))
    ss_run = ''.join(list(secondary_structure))
    
    if len(seq_run) != len(ss_run):
        raise Exception("Uh Oh... The length of sequence and secondary structure is not equal!") 
    
    f = open(working_path+"/fingerPrint1.dat", "w")
    f.write(str(number_chains))
    f.write('\n \n')
    f.write(seq_run)
    f.write('\n \n')
    f.write(ss_run)
    f.close()
    
    
def write_coordinates_file(ca_coords, working_path):
    
    assert type(coords).__module__ == np.__name__, 'Thats never good... the CA coordinates are not a numpy array'
    np.savetxt(working_path+'/coordinates1.dat', coords, delimiter=' ', fmt='%s',newline='\n', header='', footer='')
    
    
def write_mixture_file(working_path):
    # if default:
    f = open(working_path+"/mixtureFile.dat", "w")
    f.write(str(1))
        
#     else:
#          copy input file


def write_varysections_file(varying_sections, working_path):
    # auto: run beta sheet breaking code; write output sections to file
    f = open(working_path+"/varyingSectionSecondary1.dat", "w")
    for i, s in enumerate(varying_sections):
        f.write(str(s))
        
        if i < len(varying_sections)-1:
            f.write('\n')
    f.close()

    
def copy_saxs(SAXS_file, working_path):
    
    saxs_arr = np.genfromtxt(SAXS_file)
    
    if saxs_arr.shape[1] == 3:
        saxs_arr = saxs_arr[:,:2]
        
    np.savetxt(working_path+'/Saxs.dat', saxs_arr, delimiter=' ', fmt='%s',newline='\n', header='', footer='')


def read_dssp_file(dssp_filename):
    
    simplify_dict = {'H': 'H', 'B': 'S', 'E': 'S', 'G': 'H', 'I': 'H', 'T': '-', 'S': '-', '-': '-', ' ': '-'}
    
    lines=[]
    with open(dssp_filename) as input_data:
        # Skips text before the beginning of the interesting block:
        for line in input_data:
            if line.strip() == '#  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O    O-->H-N    N-H-->O    O-->H-N    TCO  KAPPA ALPHA  PHI   PSI    X-CA   Y-CA   Z-CA': 
                break
        # Reads text until the end of the block:
        for line in input_data:  # This keeps reading the file
            lines.append(simplify_dict[line[16]])
    return ''.join(lines)
    
    
def simplify_secondary(dssp_struct):
    
    simplify_dict = {'H': 'H', 'B': 'S', 'E': 'S', 'G': 'H', 'I': 'H', 'T': '-', 'S': '-', '-': '-', ' ': '-'}
    
    secondary_structure = []
    
    for s in dssp_struct:
        
        if s not in list(simplify_dict.keys()):
            print('>>> ', s, ' <<<')
            raise Exception('Secondary structure not recognised!')
            
        secondary_structure.append(simplify_dict[s])
        
    return secondary_structure


def write_sh_file(working_path, fit_n_times, min_q, max_q, max_fit_steps):
    
    curr = os.getcwd()
    run_file = curr + '/RunMe.sh'

    with open(run_file, 'w+') as fout:
        fout.write('#!/bin/bash')
        
        saxs_file = working_path+'/Saxs.dat'
        FP_file = working_path+"/fingerPrint1.dat"
        coords_file = working_path+'/coordinates1.dat'
        varying_file = working_path+"/varyingSectionSecondary1.dat"
        mixture_file = working_path+"/mixtureFile.dat"
        
        # Auto assign min / max q from SAXS profile
        # saxs_arr = np.genfromtxt(saxs_file)
        # min_q = np.round(saxs_arr[:,0].min(),2)
        # max_q = np.round(saxs_arr[:,0].max(),2)
        
        fout.write('\nfor i in {1..'+str(fit_n_times)+'}')

        fout.write('\n\ndo')
        fout.write('\n\n   echo " Run number : $i "')
        fout.write('\n\n   ./predictStructure ' + saxs_file + ' ' + working_path+'/' + ' ' + coords_file + ' ' + 'none' + ' ' + varying_file + ' ' + '1' + ' ' + 'none' + \
                   ' ' + 'none' + ' ' + str(min_q) + ' ' + str(max_q) + ' ' + str(max_fit_steps) + ' ' + working_path+'/fitdata/fitmolecule$i' + ' ' + working_path+'/fitdata/scatter$i.dat' + ' ' + mixture_file + ' ' +'1')
                   
        fout.write('\n\ndone')
        
    print('Successfully written bash script to: ', working_path+run_file)

# SMARCAL Example

#### Create a working directory for Carbonara - input files & predictions will be written to this directory

In [335]:
working_path = setup_working_directory()

Making directory  /Users/josh/Documents/PhD/TEMP_Carb/CarbonARA/Fitting
Making directory for fit data


#### Provide a PDB containing the known structure for protein - xyz coordinates and sequence extracted from PDB

In [336]:
# Define a PDB file location
pdb_file = 'Example/SMARCAL/human_SMARCAL1.pdb'

# Read in a pdb file
M = pdb_2_biobox(pdb_file)

# Extract coordinates + primary sequence
coords = extract_CA_coordinates(M)
sequence = extract_sequence(M)

#### Carbonara requires the protein's secondary structure - user can copy and paste a string or give a DSSP file!

> Note: Carbonara uses a simplified secondary structure dictionary: H (alpha helices), S (beta strands), and - (Linkers/Loops) 

In [337]:
# Manually defining the simplified secondary structure string
secondary = ['------SSSSSSS----SSSSSS---HHHHHHHH-----SSS----SSSSSHHHHHHHHHHH-----SSSS---HHHHHH---HHH-------------------HHHHH---HHHHHHHHHHHH---SSSS-------HHHHHHHHHHHHHH---SSSSS----HHHHHHHHHHH-----HHHSSS------------SSSSSHHHH------------SSS---HHHH-----HHHHHHHHHHH---SSSSS--------HHHHHHHHHHH-------HHHHHHHH---SS---SSS------HHHHHHHHHHHH-----HHHH------SSSSSS---HH---HHHHHHHHHHHHHHH-----HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH-----SSSS---HHHHHHHHHHHHH----SSSS-------HHHHHHHHHHH-----SSSSS--------------SSSS------HHHHHHHH-----------SSSSSSS-----HHHHHHHHHHHHHHHHH---------HHHH--']

# --- Simplify your secondary structure string ---

# Have DSSP secondary structure string? Simplify with:
# secondary = simplify_secondary(DSSP_string)

# --- DSSP file procedure ---

# Just have a DSSP file?
# secondary = read_dssp_file(DSSP_file)

#### Writing files to be used as input for Carbonara

In [338]:
# Write files ready for Carbonara!
write_fingerprint_file(1, sequence, secondary, working_path)
write_coordinates_file(coords, working_path)
write_mixture_file(working_path)

#### Sections to be resampled - User can manually select these or have these automatically assigned

In [339]:
# Manually defining sections to change
varying_sections = [43, 51, 79, 81]  # <<< Give us your selection! #
write_varysections_file(varying_sections, working_path)

#### Provide a SAXS profile

In [340]:
# Copy SAXS data file into correct format
SAXS_file = 'Example/SMARCAL/smrclcnc_a2.dat'
copy_saxs(SAXS_file, working_path)

#### Generate bash file to run Carbonara

**Inputs**
 > fit_n_times   : number of unique fits to be generated (sequentially)
 
 > min_q         : lower bound q in SAXS data
 
 > max_q         : upper bound q in SAXS data
 
 > max_fit_steps : maximum number of fitting iterations to be performed (recommend 4000-10,000) 

In [341]:
write_sh_file(working_path=working_path, fit_n_times=3, min_q=0.02, max_q=0.25, max_fit_steps=10)

Successfully written bash script to:  /Users/josh/Documents/PhD/TEMP_Carb/CarbonARA/Fitting/Users/josh/Documents/PhD/TEMP_Carb/CarbonARA/RunMe.sh


In [342]:
!sh RunMe.sh

 Run number : 1 
 Run number : 2 
 Run number : 3 


**********