In [1]:
import pandas as pd
import numpy as np
import xlsxwriter
import math
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

In [2]:
"""DataFrame Initialization
One for the original sequence reference
and the other for oligos
"""

# Sequence reference table: three text columns plus a (still empty) length column.
d = dict.fromkeys(("SeqID", "Seq", "RevComp"), "")
d["SeqLen"] = []
df = pd.DataFrame(d)

# Oligo order sheet: one empty list per output column.
opsheet = {column: [] for column in ("Target", "Probe Name", "Sequence (5'-3')")}
opf = pd.DataFrame(opsheet)

In [3]:
"""Oligo generation and original sequence reference recording

Iterate through the number of sequences to update the original sequence reference dataframe with
the sequence ID, the original sequence, its reverse and complementary sequence, and its sequence length 

For each sequence, iterate per 60 bp to update the oligo dataframe with
the target name that is the sequence ID, 
the probe name that is the sequence ID plus the oligo number for that sequence ID,
and the 60 bp or shorter oligo sequence itself in the 5' to 3' direction

Note that oligos shorter than 4 bp are eliminated
"""

OLIGO_LEN = 60      # maximum oligo length in bp
MIN_OLIGO_LEN = 4   # a trailing oligo shorter than this is dropped

# Rows are collected in plain lists and turned into DataFrames once at the end.
# DataFrame.append() was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration. Building the frames from scratch also makes
# this cell safe to re-run without duplicating rows.
seq_rows = []
oligo_rows = []

# Length of the longest sequence ID seen so far; the export cell scales this
# value to size the "Probe Name" column.
col2width = 0

for seq_record in SeqIO.parse("Compiled Seq.fasta", "fasta"):
    revcomp = str(seq_record.seq.reverse_complement())
    seqlen = len(seq_record)
    # The target name is the part of the record ID before the first "-".
    targetname = seq_record.id.split("-")[0]

    col2width = max(col2width, len(seq_record.id))

    seq_rows.append({"SeqID": seq_record.id,
                     "Seq": str(seq_record.seq),
                     "RevComp": revcomp,
                     "SeqLen": seqlen})

    # Walk the reverse complement in OLIGO_LEN-sized windows; slicing clamps at
    # the end of the string, so only the final window can be shorter.
    for olignumb, seqstart in enumerate(range(0, seqlen, OLIGO_LEN), start=1):
        oligo = revcomp[seqstart:seqstart + OLIGO_LEN]

        if len(oligo) < MIN_OLIGO_LEN:
            print("Skip the last oligo for " + str(targetname)
                  + ", as it is shorter than " + str(MIN_OLIGO_LEN) + " bp")
            continue

        oligo_rows.append({"Target": targetname,
                           "Probe Name": str(targetname) + "-SDRNA-" + str(olignumb),
                           "Sequence (5'-3')": oligo})

# Explicit column lists keep the column order (and the columns themselves)
# stable even when the FASTA file yields no records.
df = pd.DataFrame(seq_rows, columns=["SeqID", "Seq", "RevComp", "SeqLen"])
opf = pd.DataFrame(oligo_rows, columns=["Target", "Probe Name", "Sequence (5'-3')"])
            

Skip the last oligo for ITS1, as it is shorter than 4 bp
Skip the last oligo for RDN5, as it is shorter than 4 bp


In [4]:
# Display the oligo table (Target, Probe Name, Sequence) as the cell output.
opf

Unnamed: 0,Target,Probe Name,Sequence (5'-3')
0,ETS1,ETS1-SDRNA-1,ACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACC...
1,ETS1,ETS1-SDRNA-2,TTTTTCCCACCTATTCCCTCTTGCTAGAAGATACTTATTGAGTTTG...
2,ETS1,ETS1-SDRNA-3,TCCAGAAAAATTGCTTTTTCAGGTCTCTCTGCTGCCGGAAATGCTC...
3,ETS1,ETS1-SDRNA-4,TTTTACACTCTTGACCAGCGCACTCCGTCACCATACCATAGCACTC...
4,ETS1,ETS1-SDRNA-5,AATCAGGTTCCACCAAACAGATACCCCGGTGTTTCACGGAATGGTA...
...,...,...,...
113,RDN25,RDN25-SDRNA-56,CTCGCCGTTACTAAGGCAATCCCGGTTGGTTTCTTTTCCTCCGCTT...
114,RDN25,RDN25-SDRNA-57,GTTCAGCGGGTACTCCTACCTGATTTGAGGTCAAAC
115,RDN58,RDN58-SDRNA-1,AAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCA...
116,RDN58,RDN58-SDRNA-2,ATTCGATGATTCACGGAATTCTGCAATTCACATTACGTATCGCATT...


In [5]:
# Store sequence lengths as integers, then display the sequence reference table.
df = df.astype({"SeqLen": "int"})
df

Unnamed: 0,SeqID,Seq,RevComp,SeqLen
0,ETS1-1,ATGCGAAAGCAGTTGAAGACAAGTTCGAAAAGAGTTTGGAAACGAA...,ACTATCTTAAAAGAAGAAGCAACAAGCAGTAAAAAAGAAAGAAACC...,700
1,ETS2-1,TTTTTATTTCTTTCTAAGTGGGTACTGGCAGGAGCCGGGGCCTAGT...,ATAGTAAATAGTAACTTACATACATTAGTAAATGGTACACTCTTAC...,211
2,ITS1-1,AAGAAATTTAATAATTTTGAAAATGGATTTTTTTGTTTTGGCAAGA...,TTAATATTTTAAAATTTCCAGTTACGAAAATTCTTGTTTTTGACAA...,361
3,ITS2-1,CCTTCTCAAACATTCTGTTTGGTAGTGAGTGATACTCTTTGGAGTT...,TTTAAGAACATTGTTCGCCTAGACGCTCTCTTCTTATCGATAACGT...,232
4,RDN5-1,GGTTGCGGCCATATCTACCAGAAAGCACCGTTTCCCGTCCGATCAA...,AGATTGCAGCACCTGAGTTTCGCGTATGGTCACCCACTACACTACT...,121
5,RDN18-1,TATCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTA...,TAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTT...,1800
6,RDN25-1,GTTTGACCTCAAATCAGGTAGGAGTACCCGCTGAACTTAAGCATAT...,ACAAATCAGACAACAAAGGCTTAATCTCAGCAGATCGTAACAACAA...,3396
7,RDN58-1,AAACTTTCAACAACGGATCTCTTGGTTCTCGCATCGATGAAGAACG...,AAATGACGCTCAAACAGGCATGCCCCCTGGAATACCAAGGGGCGCA...,158


In [6]:
"""Export the dataframes to an Excel file
One dataframe per sheet with adjustments such as
the column widths, alignments, and text fonts.
"""

# The writer is used as a context manager so the workbook is saved and closed
# exactly once, even if a formatting call raises. ExcelWriter.save() was
# deprecated in pandas 1.5 and removed in 2.0; leaving the `with` block (i.e.
# close()) already writes the file, so the separate save() call is dropped.
with pd.ExcelWriter("RNA oligos for SDRNA in S. cerevisiae.xlsx", engine='xlsxwriter') as writer:
    opf.to_excel(writer, sheet_name="Oligos", header=True, index=False, index_label=None)
    df.to_excel(writer, sheet_name="Sequences", header=True, index=False, index_label=None)

    # Shared cell formats: monospaced, wrapped, vertically centred text.
    workbook = writer.book
    leftformat = workbook.add_format({"text_wrap": True,
                                      "valign": "vcenter",
                                      "font_name": "Courier New"})
    centerformat = workbook.add_format({"text_wrap": True,
                                        "align": "center",
                                        "valign": "vcenter",
                                        "font_name": "Courier New"})

    # "Oligos" sheet: a width of None keeps the default column width and only
    # applies the format.
    worksheet1 = writer.sheets["Oligos"]
    worksheet1.set_column("A:A", None, leftformat)
    # col2width comes from the oligo-generation cell; 2.5 is an empirical scale factor.
    worksheet1.set_column("B:B", col2width*2.5, leftformat)
    worksheet1.set_column("C:C", 80, leftformat)

    # "Sequences" sheet: centred ID and length columns, wide sequence columns.
    worksheet2 = writer.sheets["Sequences"]
    worksheet2.set_column("A:A", None, centerformat)
    worksheet2.set_column("D:D", None, centerformat)
    worksheet2.set_column("B:C", 60, leftformat)