In [1]:
import pandas as pd
import numpy as np
import subprocess

## **GPA1 file preparation**
We need inserts at the 10th, 36th, and 67th position. As we save the GPA1 sequence +/- 1000 nt, we can remove the first 10*3nt=30nt, 36*3nt=108nt, 67*3nt=201nt and the program will insert them at the thousandst position as it would normally consider the +/- 1000 additional buffer nucleotides. 

In [215]:
# read in S288C_YHR005C_GPA1_flanking.fsa
with open('Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking.fsa', 'r') as file:
    gpa1_seq = file.read()
print(gpa1_seq)

>GPA1 YHR005C SGDID:S000001047, Chromosome VIII:113499..114917+/- 1kb
TCATACTTAGAAAAACATGTCTTTCTTAGGTTTCGGTGGTGGTCAGCCTCAATTATCATC
TCAACAAAAGATTCAAGCTGCGGAAGCTGAACTAGATTTGGTCACAGACATGTTCAATAA
ATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAA
TAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCA
AGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAA
ATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATCCATAAGCTGTTTT
ACTCGACTCAACGTTATAATTAGTATTATTGATTCATATCCTGTATATACAAGTAACATT
ATACTCTTTTCTGTACTTCATCTTTACCTTTTTATCTTACATGGCACATGTTGTTTGAAA
ACAAGATCATAGGTGGATAAAGCAAGCCGAATCTAAAAAAAAAAAAATGTCTCTATTGGA
AAACTGAATGCATAACGATATTCCCTTTTCATGCAGATAACACTGACTAGTTTCAATTTG
AAACGCATCTTCGTGTTATTTCACCGAAACGCACTCGGCTCAGCATGTTAAAAAGCACAT
CAATTTAGGGCTCTGCGCGTCCTTCTGCGTATTCTTCCTTGTAGAAATGCAATTAAATGG
AGAGCAGAAATTTTTTTGTTACATATTGTTTTCCTTAAAGGGAAATATTAAAAAATAGTC
TAAAATGAAGAGGATAGTAGAATTCCACCAATTTCTTTACGTTTTATATTATTCGTAATC
TTTTGATCTGTTATTCATTTTTTCTTGTCACTCCGTTTCTAACATTTTTGACCATTTCTA
AGACCAAACTGAGTA

In [217]:
print('In gpa1_seq:', gpa1_seq[:150], '...', 'we want to remove the nucleotides beginning from','---', gpa1_seq[70:80],'...', 'i.e. beginning from the index 70 of gpa1_seq')

In gpa1_seq: >GPA1 YHR005C SGDID:S000001047, Chromosome VIII:113499..114917+/- 1kb
TCATACTTAGAAAAACATGTCTTTCTTAGGTTTCGGTGGTGGTCAGCCTCAATTATCATC
TCAACAAAAGATTCAAGCT ... we want to remove the nucleotides beginning from --- TCATACTTAG ... i.e. beginning from the index 70 of gpa1_seq


the indexing is including when-from, but excluding when-to, so if we want to eliminate the first thirty nt, we need to index from 30+1.

In [None]:
# the index is not included in the slice when -from, and included when -to
gpa1_seq[:1] == gpa1_seq[0], gpa1_seq[1:], gpa1_seq[1]  

(True,
 'GPA1 YHR005C SGDID:S000001047, Chromosome VIII:113499..114917+/- 1kb\nTCATACTTAGAAAAACATGTCTTTCTTAGGTTTCGGTGGTGGTCAGCCTCAATTATCATC\nTCAACAAAAGATTCAAGCTGCGGAAGCTGAACTAGATTTGGTCACAGACATGTTCAATAA\nATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAA\nTAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCA\nAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAA\nATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATCCATAAGCTGTTTT\nACTCGACTCAACGTTATAATTAGTATTATTGATTCATATCCTGTATATACAAGTAACATT\nATACTCTTTTCTGTACTTCATCTTTACCTTTTTATCTTACATGGCACATGTTGTTTGAAA\nACAAGATCATAGGTGGATAAAGCAAGCCGAATCTAAAAAAAAAAAAATGTCTCTATTGGA\nAAACTGAATGCATAACGATATTCCCTTTTCATGCAGATAACACTGACTAGTTTCAATTTG\nAAACGCATCTTCGTGTTATTTCACCGAAACGCACTCGGCTCAGCATGTTAAAAAGCACAT\nCAATTTAGGGCTCTGCGCGTCCTTCTGCGTATTCTTCCTTGTAGAAATGCAATTAAATGG\nAGAGCAGAAATTTTTTTGTTACATATTGTTTTCCTTAAAGGGAAATATTAAAAAATAGTC\nTAAAATGAAGAGGATAGTAGAATTCCACCAATTTCTTTACGTTTTATATTATTCGTAATC\nTTTTGATCTGTTATTCATTTTTTCTTGTCACTCCGTTTCTAACATTTTTGACC

here we remove first 30, then 108, and last 201 nt by adding the difference to the first index of the starting sequence to the nt that we want to remove. 

In [211]:
gpa1_seq_without_30 = gpa1_seq[70+30+1:]
gpa1_seq_without_108 = gpa1_seq[70+108+1:]
gpa1_seq_without_201 = gpa1_seq[70+201+1:]
gpa1_seq_without_30, gpa1_seq_without_108, gpa1_seq_without_201

('TTCGGTGGTGGTCAGCCTCAATTATCATC\nTCAACAAAAGATTCAAGCTGCGGAAGCTGAACTAGATTTGGTCACAGACATGTTCAATAA\nATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAA\nTAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCA\nAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAA\nATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATCCATAAGCTGTTTT\nACTCGACTCAACGTTATAATTAGTATTATTGATTCATATCCTGTATATACAAGTAACATT\nATACTCTTTTCTGTACTTCATCTTTACCTTTTTATCTTACATGGCACATGTTGTTTGAAA\nACAAGATCATAGGTGGATAAAGCAAGCCGAATCTAAAAAAAAAAAAATGTCTCTATTGGA\nAAACTGAATGCATAACGATATTCCCTTTTCATGCAGATAACACTGACTAGTTTCAATTTG\nAAACGCATCTTCGTGTTATTTCACCGAAACGCACTCGGCTCAGCATGTTAAAAAGCACAT\nCAATTTAGGGCTCTGCGCGTCCTTCTGCGTATTCTTCCTTGTAGAAATGCAATTAAATGG\nAGAGCAGAAATTTTTTTGTTACATATTGTTTTCCTTAAAGGGAAATATTAAAAAATAGTC\nTAAAATGAAGAGGATAGTAGAATTCCACCAATTTCTTTACGTTTTATATTATTCGTAATC\nTTTTGATCTGTTATTCATTTTTTCTTGTCACTCCGTTTCTAACATTTTTGACCATTTCTA\nAGACCAAACTGAGTAGAAGCTATTCATACTGTAAATTGGTATTTTAGCATCACATCAATA\nATCCAGAGGTGTATAAATTGATATATTAAGGTAGGAA

In [183]:
# see what we had so far
with open('Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_30_nt.fsa', 'r') as file:
    gpa1_seq_without_30_manual = file.read()

with open('Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_108_nt.fsa', 'r') as file:
    gpa1_seq_without_108_manual = file.read()

with open('Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_201_nt.fsa', 'r') as file:
    gpa1_seq_without_201_manual = file.read()

gpa1_seq_without_30_manual, gpa1_seq_without_108_manual, gpa1_seq_without_201_manual

('>GPA1 YHR005C SGDID:S000001047, Chromosome VIII:113499..114917+/- 1kb\nGTTTCGGTGGTGGTCAGCCTCAATTATCATC\nTCAACAAAAGATTCAAGCTGCGGAAGCTGAACTAGATTTGGTCACAGACATGTTCAATAA\nATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAA\nTAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCA\nAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAA\nATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATCCATAAGCTGTTTT\nACTCGACTCAACGTTATAATTAGTATTATTGATTCATATCCTGTATATACAAGTAACATT\nATACTCTTTTCTGTACTTCATCTTTACCTTTTTATCTTACATGGCACATGTTGTTTGAAA\nACAAGATCATAGGTGGATAAAGCAAGCCGAATCTAAAAAAAAAAAAATGTCTCTATTGGA\nAAACTGAATGCATAACGATATTCCCTTTTCATGCAGATAACACTGACTAGTTTCAATTTG\nAAACGCATCTTCGTGTTATTTCACCGAAACGCACTCGGCTCAGCATGTTAAAAAGCACAT\nCAATTTAGGGCTCTGCGCGTCCTTCTGCGTATTCTTCCTTGTAGAAATGCAATTAAATGG\nAGAGCAGAAATTTTTTTGTTACATATTGTTTTCCTTAAAGGGAAATATTAAAAAATAGTC\nTAAAATGAAGAGGATAGTAGAATTCCACCAATTTCTTTACGTTTTATATTATTCGTAATC\nTTTTGATCTGTTATTCATTTTTTCTTGTCACTCCGTTTCTAACATTTTTGACCATTTCTA\nAGACCAAACTGAGTAGAAGCTATTCA

It doesnt match with our previous input (by manually counting, the S288C_YH.. files), and the new lines `\n` appear to be counted. So we repeat by first removing `\n` from gpa1_seq.

In [214]:
# remove the \n from the gpa1_seq and create new gpa1_without_30, gpa1_seq_without_108, gpa1_seq_without_201
gpa1_seq = gpa1_seq.replace('\n', '')
gpa1_seq_without_30 = gpa1_seq[70+30+1:]
gpa1_seq_without_108 = gpa1_seq[70+108+1:]
gpa1_seq_without_201 = gpa1_seq[70+201+1:]
gpa1_seq_without_30, gpa1_seq_without_108, gpa1_seq_without_201

('TCGGTGGTGGTCAGCCTCAATTATCATCTCAACAAAAGATTCAAGCTGCGGAAGCTGAACTAGATTTGGTCACAGACATGTTCAATAAATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAATAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCAAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAAATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATCCATAAGCTGTTTTACTCGACTCAACGTTATAATTAGTATTATTGATTCATATCCTGTATATACAAGTAACATTATACTCTTTTCTGTACTTCATCTTTACCTTTTTATCTTACATGGCACATGTTGTTTGAAAACAAGATCATAGGTGGATAAAGCAAGCCGAATCTAAAAAAAAAAAAATGTCTCTATTGGAAAACTGAATGCATAACGATATTCCCTTTTCATGCAGATAACACTGACTAGTTTCAATTTGAAACGCATCTTCGTGTTATTTCACCGAAACGCACTCGGCTCAGCATGTTAAAAAGCACATCAATTTAGGGCTCTGCGCGTCCTTCTGCGTATTCTTCCTTGTAGAAATGCAATTAAATGGAGAGCAGAAATTTTTTTGTTACATATTGTTTTCCTTAAAGGGAAATATTAAAAAATAGTCTAAAATGAAGAGGATAGTAGAATTCCACCAATTTCTTTACGTTTTATATTATTCGTAATCTTTTGATCTGTTATTCATTTTTTCTTGTCACTCCGTTTCTAACATTTTTGACCATTTCTAAGACCAAACTGAGTAGAAGCTATTCATACTGTAAATTGGTATTTTAGCATCACATCAATAATCCAGAGGTGTATAAATTGATATATTAAGGTAGGAAATAATGGGGTGTACAGTGAGTACGCAAACAATA

Now we can proceed saving these new input sequences and run the PIPOline again.

In [219]:
# first add th first 70 strings of gpa1_seq to gpa1_seq_without_30, _108, _201, and then save them to fasta files
gpa1_seq_without_30 = gpa1_seq[:70] + gpa1_seq_without_30
gpa1_seq_without_108 = gpa1_seq[:70] + gpa1_seq_without_108
gpa1_seq_without_201 = gpa1_seq[:70] + gpa1_seq_without_201


with open('Protein_design/GPA1_Venus/S288C_YH_gpa1_seq_without_30_nt.fsa', 'w') as file:
    file.write(gpa1_seq_without_30)

with open('Protein_design/GPA1_Venus/S288C_YH_gpa1_seq_without_108_nt.fsa', 'w') as file:
    file.write(gpa1_seq_without_108)

with open('Protein_design/GPA1_Venus/S288C_YH_gpa1_seq_without_201_nt.fsa', 'w') as file:
    file.write(gpa1_seq_without_201)

## **GPA1-Venus insert**

### Example done with Vojislav
python.exe ./main.py --backbone_path ./Test_examples/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Test_examples/S288C_YHR005C_GPA1_flanking_without_first_30_nt.fsa --linker_path ./Test_examples/long_linker.fsa --modality 5 --enzyme_path ./Test_examples/raw_enzyme_list.txt --popular_enzyme_path ./Test_examples/Popular_enzymes.txt --FPG_paths ./Test_examples/CLB2_3p_labeling/ymNeonGreen_FPG.fsa

*add "--assembled_plasmid_name name_of_result.fsa" to output it to a fasta file*

### Construct the inserts at different positions

### *10th position*

python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_30_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa

In [2]:
gpa1_10_venus = "python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_30_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa"
output = subprocess.check_output(gpa1_10_venus, shell=True, text=True)

with open('Protein_design/GPA1_Venus/result_gpa1_10_venus.txt', 'w') as file:
    file.write(output)

##### Saving the output (additional flag):
python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_30_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa --assembled_plasmid_name gpa1_10_venus.fsa

##### PIPOline 2nd try with removed `\n` from input sequences:
python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YH_gpa1_seq_without_30_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa --assembled_plasmid_name gpa1_10_venus_v2.fsa

### *36th position*

python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_108_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa

In [3]:
gpa1_36_venus = "python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_108_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa"
output = subprocess.check_output(gpa1_36_venus, shell=True, text=True)

with open('Protein_design/GPA1_Venus/result_gpa1_36_venus.txt', 'w') as file:
    file.write(output)

##### Saving the output (additional flag):
python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_108_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa --assembled_plasmid_name gpa1_36_venus.fsa

##### PIPOline 2nd try with removed `\n` from input sequences:
python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YH_gpa1_seq_without_108_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa --assembled_plasmid_name gpa1_36_venus_v2.fsa

### *67th position*

python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_201_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa

In [4]:
gpa1_67_venus = "python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_201_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa"
output = subprocess.check_output(gpa1_67_venus, shell=True, text=True)

with open('Protein_design/GPA1_Venus/result_gpa1_67_venus.txt', 'w') as file:
    file.write(output)

##### Saving the output (additional flag):
python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YHR005C_GPA1_flanking_without_first_201_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa --assembled_plasmid_name gpa1_67_venus.fsa

##### PIPOline 2nd try with removed `\n` from input sequences:
python.exe ./main.py --backbone_path ./Protein_design/GPA1_Venus/pETUL_backbone.fsa --MCS_start_ind 1 --MCS_end_ind 108 --min_homology 150 --alpha 2 --Gene_path ./Protein_design/GPA1_Venus/S288C_YH_gpa1_seq_without_201_nt.fsa --linker_path ./Protein_design/GPA1_Venus/3xSGGGG_linker.fsa --modality 5 --enzyme_path ./raw_enzyme_list.txt --popular_enzyme_path ./Popular_enzymes.txt --FPG_paths ./FPGs/yEVenus.fsa --assembled_plasmid_name gpa1_67_venus_v2.fsa

## **Check the pETUL backbone requirements**

**Vojislav's input:** One thing to be careful about: If you're ordering from a company, the pETUL backbone has to be opened up between XhoI and SacI sites (the company imposes this requirement). PIPOline might suggest different cutsites for digesting the backbone and cloning the inserts in. So, can you make sure that even in the case XhoI and SacI are used, the cutsites used for integration into yeast genome are unique? This is SphI in all of your plasmids. (SphI has to cut the plasmid only once, in order to linearize it for integration into yeast genome. Otherwise, with multiple cutistes, we wouldn't be linearizing the plasmid, but chopping it into pieces).

The company has already removed the terminal 'TCGAG' from XhoI cutsite and 'GAGCT' from SacI cutsite. You could restore XhoI and SacI cutsites by adding 'TCGAG' to the 5' end of your insert and 'GAGCT' to the 3' end. This can be useful in the future in case you want to reuse SacI and XhoI, in case they are unique in the final plasmid.

***1st interpretation:*** That we have to check if one of the cutsite is in the yeast genome

***2nd interpretation:*** That we need to look into the pETUL backbone, or in the inserted piece of DNA such that we dont chop up either one or the other when trying to insert it into the yeast genome

### *read in the yeast genome (S288C)*

##### ORF 3' UTR

In [22]:
# read in text from \Genes\S288C\SGD_all_ORFs_3prime_UTRs.fsa
with open('Genes/S288C/SGD_all_ORFs_3prime_UTRs.fsa', 'r') as file:
    SGD_ORF_3p = file.read()

SGD_ORF_3p[:600]

">sacCer3_ct_Pelechanoonlybased3primeUTRs_3950_YAL067C_id001_three_prime_UTR range=chrI:7013-7235 5'pad=0 3'pad=0 strand=- repeatMasking=none\nATACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGT\nTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATAT\nATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATA\nTCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAA\nAATAATCGATAAAGGCTCATCCG\n>sacCer3_ct_Pelechanoonlybased3primeUTRs_3950_YAL066W_id001_three_prime_UTR range=chrI:10399-10460 5'pad=0 3'pad=0 strand=+ repeatMasking=none\nGAAAATCACAGTACAAAAATTTTGAATTTATGTATAACCGTTTCGCCTGA\nTATATGTAAGAG\n>sacCer3_ct_Pelechanoonl"

##### ORF 5' UTR

In [23]:
# read in text from \Genes\S288C\SGD_all_ORFs_5prime_UTRs.fsa
with open('Genes/S288C/SGD_all_ORFs_5prime_UTRs.fsa', 'r') as file:
    SGD_ORF_5p = file.read()

SGD_ORF_5p[:600]

">sacCer3_ct_PelechanoonlybasedUTRs_1122_YAL067C_id001_five_prime_UTR range=chrI:9016-9049 5'pad=0 3'pad=0 strand=- repeatMasking=none\nCAAAGAGAACTACTGCATATATAAATAACATACA\n>sacCer3_ct_PelechanoonlybasedUTRs_1122_YAL066W_id001_five_prime_UTR range=chrI:9807-10091 5'pad=0 3'pad=0 strand=+ repeatMasking=none\nGTCAAACCAAATGGTTTTTCAGATAAGAAATTGACAGTATCTGAGAATTT\nGCTATCAAAGCTCAGAGGATTTACATATTTTAACGTAATTAAAACATTTT\nTATGTTCGATATATTAGCAAATAGCGTATTAATATACAGCTGTTGCGCTC\nATGGTAAAATTTAGCGATATACTTTGCATCTTGGCTGCAAAGAAGAATGA\nATCGGATATACTATTTTTGATCATAATGACGGACATCATGATATAATAAC\nGTTATACGGATAACTTTATTTCAAAAGCACCATCA\n>sacC"

### *read in the restriction enzymes and sites*

In [8]:
# create a dictionary with raw_enzymes_list.txt, all odd values are keys, all even values are values
enzyme_dict = {}
with open('raw_enzyme_list.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if line != '':
            enzyme_dict[line] = next(file).strip()

In [14]:
# create a df and confirm that there is no nan values or duplicates
enzyme_df = pd.DataFrame.from_dict(enzyme_dict, orient='index')
enzyme_df = enzyme_df.reset_index()
enzyme_df.columns = ['enzyme', 'sequence']
enzyme_df = enzyme_df.dropna()
enzyme_df = enzyme_df.drop_duplicates()
enzyme_df = enzyme_df.reset_index(drop=True)
enzyme_df.head()

Unnamed: 0,enzyme,sequence
0,AatII,GACGTC
1,AccI,GTMKAC
2,Acc65I,GGTACC
3,AciI,CCGC
4,AclI,AACGTT


### ***1st:*** *check if restriction sites are not in yeast genome* **(not useful)**

In [82]:
restr_check = enzyme_df['enzyme'].isin(['SphI', 'XhoI', 'SacI'])
restr_check = enzyme_df[restr_check]
restr_check

Unnamed: 0,enzyme,sequence
242,SacI,GAGCTC
265,SphI,GCATGC
285,XhoI,CTCGAG


##### ORF 3' UTR

In [106]:
# total amount of positions in the genome where the restriction sites are found
SGD_ORF_3p.count('GAGCTC') + SGD_ORF_3p.count('GCATGC') + SGD_ORF_3p.count('CTCGAG')

594

##### ORF 5' UTR

In [109]:
# total amount of positions in the genome where the restriction sites are found
SGD_ORF_5p.count('GAGCTC') + SGD_ORF_5p.count('GCATGC') + SGD_ORF_5p.count('CTCGAG')

687

### ***2nd:*** *check if restriction sites are not represented in the pETUL backbone or the plasmid with the tagged gpa1*

read in pETUL backbone

In [110]:
# read in pETUL_backbone.fsa
with open('Protein_design/GPA1_Venus/pETUL_backbone.fsa', 'r') as file:
    pETUL_backbone = file.read()
pETUL_backbone[:600]

'>pETUL_backbone  (6598 bp)\nGGTACCGGGCCCCCCCTCGAGGTCGACGGTATCGATAAGCTTGATATCGAATTCCTGCAGCCCGGGGGATCCACTAGTTC\nTAGAGCGGCCGCCACCGCGGTGGAGCTCCAGCTTTTGTTCCCTTTAGTGAGGGTTAATTGCGCGCTTGGCGTAATCATGG\nTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATAcGAGCCGGAAGCATAAAGTGTAAAGC\nCTGGGGTGCCTAATGAGTGAGcTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGT\nGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACT\nGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATC\nAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGT\nTTTTCC'

read in the PIPO result of the tagged gpa1

In [112]:
# read in the inserted pieces
with open('Protein_design/GPA1_Venus/gpa1_10_venus.fsa', 'r') as file:
    gpa1_10_venus = file.read()

with open('Protein_design/GPA1_Venus/gpa1_36_venus.fsa', 'r') as file:
    gpa1_36_venus = file.read()

with open('Protein_design/GPA1_Venus/gpa1_67_venus.fsa', 'r') as file:
    gpa1_67_venus = file.read()

gpa1_10_venus[:600], gpa1_67_venus[:600], gpa1_36_venus[:600]


(">Plasmid for N-terminal tagging of S288 with fluorescent proteins. Can be integrated into the budding yeast genome using SphI, SphI-HF®, GCATGC cutsite. Intented to be labeled with ['yEVenus.fsa']. To ensure high-efficency digestion during FPG cloning, there is a placeholder sequence between the gene piece and the linker\nGGTACCGGGCCCCCCCTCGAGGTCGACTTGGTCACAGACATGTTCAATAAATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAATAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCAAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAAATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATC",
 ">Plasmid for N-terminal tagging of S288 with fluorescent proteins. Can be integrated into the budding yeast genome using SphI, SphI-HF®, GCATGC cutsite. Intented to be labeled with ['yEVenus.fsa']. To ensure high-efficency digestion during FPG cloning, there is a placeholder sequence between the gene piece and the linker\nGGTACCGGGCCCCCCCTCGAGATTTGAAACGCATCTTCGTGTTATTTCACCGAAACGCACTCGGCTCA

define restriction enzymes that we want to test

In [124]:
restr_check = enzyme_df[enzyme_df['enzyme'].isin(['SphI', 'XhoI', 'SacI'])] # 'SphI-HF®' is the same as 'SphI'
restr_check

Unnamed: 0,enzyme,sequence
242,SacI,GAGCTC
265,SphI,GCATGC
285,XhoI,CTCGAG


how many times are the restriction site sequences represented in the pETUL backbone or the designed inserts?

In [151]:
# check how many occurences of enzyme_df['sequence'] are in pETUL_backbone or gpa1_10_venus
for index, row in restr_check.iterrows():
    print(row['enzyme'], 'in pETUL_backbone:', pETUL_backbone.count(row['sequence']), '|', row['enzyme'], 'in gpa1_10_venus:', gpa1_10_venus.count(row['sequence']), row['enzyme'], '|', 'in gpa1_36_venus:', gpa1_36_venus.count(row['sequence']), '|', row['enzyme'], 'in gpa1_67_venus:', gpa1_67_venus.count(row['sequence']))
    

SacI in pETUL_backbone: 1 | SacI in gpa1_10_venus: 1 SacI | in gpa1_36_venus: 1 | SacI in gpa1_67_venus: 1
SphI in pETUL_backbone: 0 | SphI in gpa1_10_venus: 2 SphI | in gpa1_36_venus: 2 | SphI in gpa1_67_venus: 2
XhoI in pETUL_backbone: 1 | XhoI in gpa1_10_venus: 1 XhoI | in gpa1_36_venus: 1 | XhoI in gpa1_67_venus: 1


Analyze the 2 present sequences in gpa1_venus inserts

In [165]:
# print all the positions of the restriction sites in the sequence of interest (SOI) +/- 15 nt from the matched site
# There should be more than one occurrence for SphI
for index, row in restr_check.iterrows():
    if row['sequence'] in gpa1_10_venus:
        print('========================================================================')
        enzyme = row['enzyme']
        seq = row['sequence']
        positions = [i for i in range(len(gpa1_10_venus)) if gpa1_10_venus.startswith(seq, i)]
        print(f'Enzyme: {enzyme} | Restriction site: {seq} | Position/s in SOI: {positions}')
        print('------------------------------------------------------------------------')
        for pos in positions:
            print('plotted sequence in SOI +/- 15nt:', gpa1_10_venus[max(0, pos - 15):pos + len(seq) + 15])
        


Enzyme: SacI | Restriction site: GAGCTC | Position/s in SOI: [1820]
------------------------------------------------------------------------
plotted sequence in SOI +/- 15nt: ACCGGACGAGCAAAAGAGCTCCAGCTTTTGTTCCCT
Enzyme: SphI | Restriction site: GCATGC | Position/s in SOI: [137, 1670]
------------------------------------------------------------------------
plotted sequence in SOI +/- 15nt: phI, SphI-HF®, GCATGC cutsite. Inten
plotted sequence in SOI +/- 15nt: AATAAAGATTTGTTTGCATGCAAGAGAATACTGCTA
Enzyme: XhoI | Restriction site: CTCGAG | Position/s in SOI: [338]
------------------------------------------------------------------------
plotted sequence in SOI +/- 15nt: GGTACCGGGCCCCCCCTCGAGGTCGACTTGGTCACA


From the two occurences for the SphI in the insnert, only one looks legitimate at position 1670 of the gpa1_venus inserts, the other position point to the description of the gpa1_venus.fsa results. This means, that we have only one cutsite of `SphI` in the inserted site and none `SphI` in the pETUL backbone, which in result will only linearize and not chop down the plasmid.


In addition it is good that the SphI cutsite is inbetween the SacI and XhoI cutsites, as we will be able to insert the plasmid without chopping down our POI. 

# When checking for the correctness of the input sequence, found that new lines were counted as characters, so the insert sequences got different. Need to repeat the step starting from ***'2nd interpretation'***

Note _v2, as the second fasta files that were created after removing `\n` from the input sequences

In [None]:
# read in the inserted pieces
with open('Protein_design/GPA1_Venus/gpa1_10_venus_v2.fsa', 'r') as file:
    gpa1_10_venus_v2 = file.read()

with open('Protein_design/GPA1_Venus/gpa1_36_venus_v2.fsa', 'r') as file:
    gpa1_36_venus_v2 = file.read()

with open('Protein_design/GPA1_Venus/gpa1_67_venus_v2.fsa', 'r') as file:
    gpa1_67_venus_v2 = file.read()

gpa1_10_venus_v2[:600], gpa1_36_venus_v2[:600], gpa1_67_venus_v2[:600]


(">Plasmid for N-terminal tagging of S288 with fluorescent proteins. Can be integrated into the budding yeast genome using SphI, SphI-HF®, GCATGC cutsite. Intented to be labeled with ['yEVenus.fsa']. To ensure high-efficency digestion during FPG cloning, there is a placeholder sequence between the gene piece and the linker\nGGTACCGGGCCCCCCCTCGAGGTCGACGACATGTTCAATAAATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAATAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCAAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAAATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATCCATAAGCTG",
 ">Plasmid for N-terminal tagging of S288 with fluorescent proteins. Can be integrated into the budding yeast genome using SphI, SphI-HF®, GCATGC cutsite. Intented to be labeled with ['yEVenus.fsa']. To ensure high-efficency digestion during FPG cloning, there is a placeholder sequence between the gene piece and the linker\nGGTACCGGGCCCCCCCTCGAGAGAAATCCATAAGCTGTTTTACTCGACTCAACGTTATAATTAGTATT

how many times are the restriction site sequences represented in the pETUL backbone or the v2 designed inserts?

In [223]:
# check how many occurences of enzyme_df['sequence'] are in pETUL_backbone or gpa1_10_venus
for index, row in restr_check.iterrows():
    print(row['enzyme'], 'in pETUL_backbone:', pETUL_backbone.count(row['sequence']), '|', row['enzyme'], 'in gpa1_10_venus_v2:', gpa1_10_venus_v2.count(row['sequence']), row['enzyme'], '|', 'in gpa1_36_venus_v2:', gpa1_36_venus_v2.count(row['sequence']), '|', row['enzyme'], 'in gpa1_67_venus_v2:', gpa1_67_venus_v2.count(row['sequence']))
    

SacI in pETUL_backbone: 1 | SacI in gpa1_10_venus_v2: 1 SacI | in gpa1_36_venus_v2: 1 | SacI in gpa1_67_venus_v2: 1
SphI in pETUL_backbone: 0 | SphI in gpa1_10_venus_v2: 2 SphI | in gpa1_36_venus_v2: 2 | SphI in gpa1_67_venus_v2: 1
XhoI in pETUL_backbone: 1 | XhoI in gpa1_10_venus_v2: 1 XhoI | in gpa1_36_venus_v2: 1 | XhoI in gpa1_67_venus_v2: 1


In [225]:
# print all the positions of the restriction sites in the sequence of interest (SOI) +/- 15 nt from the matched site
# There should be more than one occurrence for SphI
for index, row in restr_check.iterrows():
    if row['sequence'] in gpa1_10_venus:
        print('========================================================================')
        enzyme = row['enzyme']
        seq = row['sequence']
        positions = [i for i in range(len(gpa1_10_venus)) if gpa1_10_venus.startswith(seq, i)]
        print(f'Enzyme: {enzyme} | Restriction site: {seq} | Position/s in SOI: {positions}')
        print('------------------------------------------------------------------------')
        for pos in positions:
            print('plotted sequence in SOI +/- 15nt:', gpa1_10_venus[max(0, pos - 15):pos + len(seq) + 15])
        


Enzyme: SacI | Restriction site: GAGCTC | Position/s in SOI: [1820]
------------------------------------------------------------------------
plotted sequence in SOI +/- 15nt: ACCGGACGAGCAAAAGAGCTCCAGCTTTTGTTCCCT
Enzyme: SphI | Restriction site: GCATGC | Position/s in SOI: [137, 1670]
------------------------------------------------------------------------
plotted sequence in SOI +/- 15nt: phI, SphI-HF®, GCATGC cutsite. Inten
plotted sequence in SOI +/- 15nt: AATAAAGATTTGTTTGCATGCAAGAGAATACTGCTA
Enzyme: XhoI | Restriction site: CTCGAG | Position/s in SOI: [338]
------------------------------------------------------------------------
plotted sequence in SOI +/- 15nt: GGTACCGGGCCCCCCCTCGAGGTCGACTTGGTCACA


Compare old vs new fasta files (insert results)

In [235]:
# read in gpa1 and v2
with open('Protein_design/GPA1_Venus/gpa1_10_venus.fsa', 'r') as file:
    gpa1_10_venus = file.read()
with open('Protein_design/GPA1_Venus/gpa1_10_venus_v2.fsa', 'r') as file:
    gpa1_10_venus_v2 = file.read()

with open('Protein_design/GPA1_Venus/gpa1_36_venus.fsa', 'r') as file:
    gpa1_36_venus = file.read()
with open('Protein_design/GPA1_Venus/gpa1_36_venus_v2.fsa', 'r') as file:
    gpa1_36_venus_v2 = file.read()

with open('Protein_design/GPA1_Venus/gpa1_67_venus.fsa', 'r') as file:
    gpa1_67_venus = file.read()
with open('Protein_design/GPA1_Venus/gpa1_67_venus_v2.fsa', 'r') as file:
    gpa1_67_venus_v2 = file.read()

print('Insert at 10th position commparison v1 vs v2:', '\n', gpa1_10_venus[:600],'\n', gpa1_10_venus_v2[:600], '\n', '\n', 'Insert at 36th position commparison v1 vs v2:', '\n', gpa1_36_venus[:600], '\n', gpa1_36_venus_v2[:600], '\n','\n', 'Insert at 67th position commparison v1 vs v2:', '\n',  gpa1_67_venus[:600],'\n',  gpa1_67_venus_v2[:600])

Insert at 10th position commparison v1 vs v2: 
 >Plasmid for N-terminal tagging of S288 with fluorescent proteins. Can be integrated into the budding yeast genome using SphI, SphI-HF®, GCATGC cutsite. Intented to be labeled with ['yEVenus.fsa']. To ensure high-efficency digestion during FPG cloning, there is a placeholder sequence between the gene piece and the linker
GGTACCGGGCCCCCCCTCGAGGTCGACTTGGTCACAGACATGTTCAATAAATTGGTTAATAACTGTTATAAAAAATGTATCAATACTTCTTATTCCGAGGGTGAGCTGAATAAGAATGAATCTTCGTGCCTAGACAGATGTGTGGCCAAATATTTTGAGACCAATGTTCAAGTCGGTGAAAACATGCAGAAAATGGGCCAATCATTTAACGCAGCCGGTAAGTTTTAGAAATGTGCATTAAAGCAGTAATGATAAGACGAAAATAAGAAAAGAAATC 
 >Plasmid for N-terminal tagging of S288 with fluorescent proteins. Can be integrated into the budding yeast genome using SphI, SphI-HF®, GCATGC cutsite. Intented to be labeled with ['yEVenus.fsa']. To ensure high-efficency digestion during FPG cloning, there is a placeholder sequence between the gene piece and the linker
GGTACCGGGCCCCCCCTCGAGGTCGA

Version 2 looks different from the previous results, as here it is done computationally we have a higher control I suggest to go for the inserts v2, such that it can be used to reproduce the results. 