# Calculate secondary structure with NUPACK

This notebook requires a local installation of NUPACK. The results were calculated with NUPACK version 4.0.1.1 and saved to `stucture_results_28C.pickle` (the file name is spelled exactly as the code below writes it).

In [1]:
import pickle
import pandas
import nupack

In [2]:
# Read the gzipped, tab-separated input table; its first column becomes the row index
data_full = pandas.read_csv(
    filepath_or_buffer='preprocess_data/Zb_5UTR_MPRA_TPM_MRL.tsv.gz',
    sep='\t',
    index_col=0,
)

In [3]:
# Constant sequences placed before and after every insert.
# Source: https://benchling.com/s/seq-uUa6FERaYAzU182uPg9k?m=slm-vz2Gz1jlNp1fA3RxFcQA
seq_prefix = 'GAAGAGTAGCCTGCAGATAGAC'
# Suffix: only the first 100 bases of GFP (written here as two 50-base pieces)
seq_suffix = (
    'ATGGTGTCTAAAGGAGAGGAGCTGTTCACAGGCGTGGTGCCAATCCTGGT'
    'GGAGCTGGATGGAGACGTGAACGGCCACAAGTTCAGCGTGAGAGGCGAGG'
)

In [4]:
# Set up the NUPACK model used for every sequence.
# `temp` is the temperature in Celsius; it is also reused in the output file name.
# temp = 37  (alternative setting, disabled)
temp = 28
model1 = nupack.Model(celsius=temp, material='rna')

In [5]:
# Run the NUPACK analysis for all sequences in `data_full` and pickle the results.
#
# For each row, the analysed sequence is seq_prefix + row['insert_seq'] + seq_suffix.
# Per-sequence results are collected in `seq_results`, keyed by the row index, and
# finally saved together with the prefix/suffix to `stucture_results_{temp}C.pickle`.
seq_results = {}

for i, (index, row) in enumerate(data_full.iterrows()):
    # Progress message every 100 sequences
    if i%100==0:
        print(f"Processing sequence {i:,} / {len(data_full):,}...")

    # Run analysis: a single strand in a single tube, using the model set up earlier
    seq = seq_prefix + row['insert_seq'] + seq_suffix
    strand = nupack.Strand(seq, name=index)
    # 1e-9 is the strand concentration passed to NUPACK (presumably molar -- confirm)
    tube = nupack.Tube(strands={strand: 1e-9}, name='tube')
    tube_results = nupack.tube_analysis(tubes=[tube], model=model1, compute=['mfe', 'pairs'])
    # The single-strand complex is looked up by its name, "(<strand name>)"
    complex_results = tube_results[f"({index})"]

    # Extract relevant results
    seq_info = {}
    seq_info['free_energy'] = complex_results.free_energy
    seq_info['mfe_structures_str'] = [str(m.structure) for m in complex_results.mfe]
    seq_info['mfe_energy'] = [m.energy for m in complex_results.mfe]
    # The full pair-probability matrix is deliberately not stored:
    # seq_info['pair_prob'] = complex_results.pairs.to_array()
    # Only the diagonal of the pair matrix is kept. `.diagonal()` returns a view that
    # would keep the whole N x N matrix alive for every sequence held in `seq_results`,
    # so take an independent copy to let the full matrix be freed each iteration.
    seq_info['unpaired_prob'] = complex_results.pairs.to_array().diagonal().copy()

    seq_results[index] = seq_info

print('Saving results...')
results_to_save = {}
results_to_save['seq_prefix'] = seq_prefix
results_to_save['seq_suffix'] = seq_suffix
results_to_save['structure_results'] = seq_results

# NOTE: the file name spelling ('stucture') is kept unchanged so that it still
# matches the file name documented at the top of this notebook.
with open(f'stucture_results_{temp}C.pickle', 'wb') as handle:
    pickle.dump(results_to_save, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('Done.')

Processing sequence 0 / 18,021...
Processing sequence 100 / 18,021...
Processing sequence 200 / 18,021...
Processing sequence 300 / 18,021...
Processing sequence 400 / 18,021...
Processing sequence 500 / 18,021...
Processing sequence 600 / 18,021...
Processing sequence 700 / 18,021...
Processing sequence 800 / 18,021...
Processing sequence 900 / 18,021...
Processing sequence 1,000 / 18,021...
Processing sequence 1,100 / 18,021...
Processing sequence 1,200 / 18,021...
Processing sequence 1,300 / 18,021...
Processing sequence 1,400 / 18,021...
Processing sequence 1,500 / 18,021...
Processing sequence 1,600 / 18,021...
Processing sequence 1,700 / 18,021...
Processing sequence 1,800 / 18,021...
Processing sequence 1,900 / 18,021...
Processing sequence 2,000 / 18,021...
Processing sequence 2,100 / 18,021...
Processing sequence 2,200 / 18,021...
Processing sequence 2,300 / 18,021...
Processing sequence 2,400 / 18,021...
Processing sequence 2,500 / 18,021...
Processing sequence 2,600 / 18,021