In [1]:
def get_gene_coords(gff_path, gene_name):
    """Return the coordinates of the first 'gene' feature whose GFF line mentions gene_name.

    Parameters
    ----------
    gff_path : str
        Path to a tab-separated GFF file.
    gene_name : str
        Text searched for anywhere in the line (plain substring match),
        e.g. 'CPAG_00001'.

    Returns
    -------
    list
        [contig, start, stop, direction]; start and stop are ints taken
        unchanged from the 4th and 5th columns of the matching line.

    Raises
    ------
    ValueError
        If no 'gene' line containing gene_name is found.
    """

    gene_info = None

    # 'with' guarantees the file is closed, and iterating the handle reads one
    # line at a time instead of loading the whole file like readlines() does
    with open(gff_path, 'r') as f:
        for line in f:

            if gene_name not in line:
                continue

            #information fields in GFFs are separated by tabs,
            #split up the information into a list based on where the tabs are
            parsed = line.split('\t')

            # a comment/header line can mention the gene without having the
            # columns indexed below; skip it instead of raising IndexError
            if len(parsed) < 7:
                continue

            # exon/cds lines of the same gene also contain its name
            if parsed[2] != 'gene':
                continue

            print('whole line:')
            print(line)

            #get the parts of the line that we care about
            contig = parsed[0]

            #split() yields strings, so the coordinates are converted to integers
            start, stop = int(parsed[3]), int(parsed[4])

            direction = parsed[6]

            gene_info = [contig, start, stop, direction]

            #we found what we need so we don't need to go through the rest of the file
            #in practice, it's a good idea to go through the rest of the file and
            #check that there aren't duplicate entries
            break

    # fail with a clear message instead of an UnboundLocalError further down
    if gene_info is None:
        raise ValueError(
            "no 'gene' feature matching {!r} found in {}".format(gene_name, gff_path)
        )

    print('contig: {}, start: {}, stop: {}, direction: {}'.format(contig, start, stop, direction))

    return gene_info

In [2]:
# path to the GFF3 annotation file, relative to the notebook's working directory
gff_path = 'Cp3488.genes.gff3'

# look up one gene; the returned [contig, start, stop, direction] list is the cell output
get_gene_coords(gff_path, 'CPAG_00001')

whole line:
Supercontig_1.1	Broad	gene	24526	26741	.	+	.	ID=CPAG_00001;Name=CPAG_00001

contig: Supercontig_1.1, start: 24526, stop: 26741, direction: +


['Supercontig_1.1', 24526, 26741, '+']

In [3]:
# gff_path is assigned again here with the same value used for the previous lookup
gff_path = 'Cp3488.genes.gff3'

# same lookup for a second gene
get_gene_coords(gff_path, 'CPAG_00216')

whole line:
Supercontig_1.1	Broad	gene	832813	833592	.	+	.	ID=CPAG_00216;Name=CPAG_00216

contig: Supercontig_1.1, start: 832813, stop: 833592, direction: +


['Supercontig_1.1', 832813, 833592, '+']

In [4]:
# How long is the gene?

# x is the [contig, start, stop, direction] list returned by the lookup
x = get_gene_coords(gff_path, 'CPAG_00216')

# start and stop are both part of the gene, so the length is stop - start + 1
gene_length = x[2] - x[1] + 1

gene_length

whole line:
Supercontig_1.1	Broad	gene	832813	833592	.	+	.	ID=CPAG_00216;Name=CPAG_00216

contig: Supercontig_1.1, start: 832813, stop: 833592, direction: +


780

This matches what it says on FungiDB - woohoo!

https://fungidb.org/fungidb/app/record/gene/CPAG_00216

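Note: GFF coordinates are 1-based and inclusive at both ends, so the length is stop - start + 1. You need to add 1 because the end coordinate is itself part of the gene (for this + strand gene it is the last base of the stop codon).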

If the sequence is 'ATGTGC', 'A' is base #1 and 'C' is base #6, but 6 - 1 would give a gene length of 5, which is incorrect; 6 - 1 + 1 = 6 is the true length.

In [5]:
#just to clarify what line.split('\t') does

# The tabs are written as explicit \t escapes: the string is the same at runtime,
# but an editor that converts tabs to spaces can no longer silently break it
line = 'Supercontig_1.1\tBroad\tgene\t832813\t833592\t.\t+\t.\tID=CPAG_00216;Name=CPAG_00216'

print('original line: ')
print(line)
print(type(line))

print()

# split('\t') cuts the string at every tab and returns the pieces as a list of strings
parsed = line.split('\t')
print('line split by tabs: ')
print(parsed)
print(type(parsed))

original line: 
Supercontig_1.1	Broad	gene	832813	833592	.	+	.	ID=CPAG_00216;Name=CPAG_00216
<class 'str'>

line split by tabs: 
['Supercontig_1.1', 'Broad', 'gene', '832813', '833592', '.', '+', '.', 'ID=CPAG_00216;Name=CPAG_00216']
<class 'list'>


For the record, you can also do something similar on the command line using: 

grep CPAG_00186 Cp3488.genes.gff3

In [6]:
#you can run command line commands in jupyter by starting the command with "!"
#grep prints every line of the file that contains the given text; in the recorded
#output that is the gene line plus the exon and cds lines that reference it

!grep CPAG_00186 Cp3488.genes.gff3

Supercontig_1.1	Broad	gene	757292	758704	.	-	.	ID=CPAG_00186;Name=CPAG_00186
Supercontig_1.1	Broad	exon	758632	758704	.	-	.	Parent=CPAG_00186
Supercontig_1.1	Broad	exon	757292	758558	.	-	.	Parent=CPAG_00186
Supercontig_1.1	Broad	cds	757295	758506	.	-	0	ID=cds0000186;Parent=CPAG_00186


# GET EXONS

In [7]:
def get_exon_coords(gff_path, gene_name):
    """Collect the coordinates of every 'exon' line that mentions gene_name.

    Parameters
    ----------
    gff_path : str
        Path to a tab-separated GFF file.
    gene_name : str
        Text searched for anywhere in the line (plain substring match),
        e.g. 'CPAG_00001'.

    Returns
    -------
    list of list
        One [contig, start, stop, direction] entry per matching exon line, in
        file order; start and stop are ints. Empty if nothing matches.

    Each match is also printed as a formatted line.
    """

    all_exon_lines = []

    # 'with' guarantees the file is closed; iterating the handle avoids
    # loading the whole file into memory the way readlines() does
    with open(gff_path, 'r') as f:
        for line in f:

            if gene_name not in line:
                continue

            parsed = line.split('\t')

            # skip comment/header lines that lack the columns indexed below,
            # and every feature type other than exon
            if len(parsed) < 7 or parsed[2] != 'exon':
                continue

            contig = parsed[0]
            start, stop = int(parsed[3]), int(parsed[4])
            direction = parsed[6]

            #Print formatted line
            print('contig: {}, start: {}, stop: {}, direction: {}'.format(contig, start, stop, direction))

            all_exon_lines.append([contig, start, stop, direction])

    return all_exon_lines

In [8]:
# list every exon of CPAG_00001; the returned list of coordinate lists is the cell output
get_exon_coords("Cp3488.genes.gff3", "CPAG_00001")

contig: Supercontig_1.1, start: 24526, stop: 24552, direction: +
contig: Supercontig_1.1, start: 24610, stop: 24669, direction: +
contig: Supercontig_1.1, start: 24858, stop: 24957, direction: +
contig: Supercontig_1.1, start: 25032, stop: 25112, direction: +
contig: Supercontig_1.1, start: 25221, stop: 25280, direction: +
contig: Supercontig_1.1, start: 25369, stop: 26125, direction: +
contig: Supercontig_1.1, start: 26180, stop: 26741, direction: +


[['Supercontig_1.1', 24526, 24552, '+'],
 ['Supercontig_1.1', 24610, 24669, '+'],
 ['Supercontig_1.1', 24858, 24957, '+'],
 ['Supercontig_1.1', 25032, 25112, '+'],
 ['Supercontig_1.1', 25221, 25280, '+'],
 ['Supercontig_1.1', 25369, 26125, '+'],
 ['Supercontig_1.1', 26180, 26741, '+']]

# GET CDS 

In [9]:
def get_cds_coords(gff_path, gene_name):
    """Collect the coordinates of every 'cds' line that mentions gene_name.

    Parameters
    ----------
    gff_path : str
        Path to a tab-separated GFF file.
    gene_name : str
        Text searched for anywhere in the line (plain substring match),
        e.g. 'CPAG_00001'.

    Returns
    -------
    list of list
        One [contig, start, stop, direction] entry per matching cds line, in
        file order; start and stop are ints. Empty if nothing matches.

    Each match is also printed as a formatted line. The feature type is
    compared case-sensitively against lowercase 'cds'.
    """

    all_cds_lines = []

    # 'with' guarantees the file is closed; iterating the handle avoids
    # loading the whole file into memory the way readlines() does
    with open(gff_path, 'r') as f:
        for line in f:

            if gene_name not in line:
                continue

            parsed = line.split('\t')

            # skip comment/header lines that lack the columns indexed below,
            # and every feature type other than cds
            if len(parsed) < 7 or parsed[2] != 'cds':
                continue

            contig = parsed[0]
            start, stop = int(parsed[3]), int(parsed[4])
            direction = parsed[6]

            #Print formatted line
            print('contig: {}, start: {}, stop: {}, direction: {}'.format(contig, start, stop, direction))

            all_cds_lines.append([contig, start, stop, direction])

    return all_cds_lines

In [10]:
# list every cds segment of CPAG_00001; the returned list of coordinate lists is the cell output
get_cds_coords("Cp3488.genes.gff3","CPAG_00001")

contig: Supercontig_1.1, start: 24526, stop: 24552, direction: +
contig: Supercontig_1.1, start: 24610, stop: 24669, direction: +
contig: Supercontig_1.1, start: 24858, stop: 24957, direction: +
contig: Supercontig_1.1, start: 25032, stop: 25112, direction: +
contig: Supercontig_1.1, start: 25221, stop: 25280, direction: +
contig: Supercontig_1.1, start: 25369, stop: 25406, direction: +


[['Supercontig_1.1', 24526, 24552, '+'],
 ['Supercontig_1.1', 24610, 24669, '+'],
 ['Supercontig_1.1', 24858, 24957, '+'],
 ['Supercontig_1.1', 25032, 25112, '+'],
 ['Supercontig_1.1', 25221, 25280, '+'],
 ['Supercontig_1.1', 25369, 25406, '+']]

# Exon length

In [11]:

# exons is a list with one [contig, start, stop, direction] entry per exon
exons = get_exon_coords("Cp3488.genes.gff3", 'CPAG_00001')

for x in exons:
    # start and stop are both part of the exon, so the length is stop - start + 1
    exon_length = x[2] - x[1] + 1

    # str.format() substitutes the value for the {} placeholder
    print("The length for the exon is {}".format(exon_length))

contig: Supercontig_1.1, start: 24526, stop: 24552, direction: +
contig: Supercontig_1.1, start: 24610, stop: 24669, direction: +
contig: Supercontig_1.1, start: 24858, stop: 24957, direction: +
contig: Supercontig_1.1, start: 25032, stop: 25112, direction: +
contig: Supercontig_1.1, start: 25221, stop: 25280, direction: +
contig: Supercontig_1.1, start: 25369, stop: 26125, direction: +
contig: Supercontig_1.1, start: 26180, stop: 26741, direction: +
The length for the exon is 27
The length for the exon is 60
The length for the exon is 100
The length for the exon is 81
The length for the exon is 60
The length for the exon is 757
The length for the exon is 562


# CDS length

In [12]:
# cds is a list with one [contig, start, stop, direction] entry per cds segment
cds = get_cds_coords("Cp3488.genes.gff3", 'CPAG_00001')

for x in cds:
    # start and stop are both part of the segment, so the length is stop - start + 1
    cds_length = x[2] - x[1] + 1

    # str.format() substitutes the value for the {} placeholder
    print("The length for the cds is {}".format(cds_length))

contig: Supercontig_1.1, start: 24526, stop: 24552, direction: +
contig: Supercontig_1.1, start: 24610, stop: 24669, direction: +
contig: Supercontig_1.1, start: 24858, stop: 24957, direction: +
contig: Supercontig_1.1, start: 25032, stop: 25112, direction: +
contig: Supercontig_1.1, start: 25221, stop: 25280, direction: +
contig: Supercontig_1.1, start: 25369, stop: 25406, direction: +
The length for the cds is 27
The length for the cds is 60
The length for the cds is 100
The length for the cds is 81
The length for the cds is 60
The length for the cds is 38


# Gene coordinates

In [13]:
# single-gene lookup again, for comparison with the all-genes function defined next
get_gene_coords(gff_path,"CPAG_00001")

whole line:
Supercontig_1.1	Broad	gene	24526	26741	.	+	.	ID=CPAG_00001;Name=CPAG_00001

contig: Supercontig_1.1, start: 24526, stop: 26741, direction: +


['Supercontig_1.1', 24526, 26741, '+']

In [14]:
def get_allgene_coords(gff_path, verbose=True):
    """Collect the coordinates of every 'gene' feature in a GFF file.

    Parameters
    ----------
    gff_path : str
        Path to a tab-separated GFF file.
    verbose : bool, optional
        When True (the default, matching the original behaviour) one formatted
        line is printed per gene. Pass False to collect silently; a whole-genome
        annotation otherwise produces thousands of output lines.

    Returns
    -------
    list of list
        One [contig, start, stop, direction, gene_name] entry per gene line, in
        file order. start and stop are ints. gene_name is the last tab-separated
        column of the line with the line terminator removed (for the file used
        here that is the full attribute text, e.g. 'ID=CPAG_00001;Name=CPAG_00001').
    """

    all_gene_lines = []

    # 'with' guarantees the file is closed; iterating the handle avoids
    # loading the whole file into memory the way readlines() does
    with open(gff_path, 'r') as f:
        for line in f:

            parsed = line.split('\t')

            # skip comment/header lines that lack the columns indexed below,
            # and every feature type other than gene
            if len(parsed) < 7 or parsed[2] != 'gene':
                continue

            contig = parsed[0]
            start, stop = int(parsed[3]), int(parsed[4])
            direction = parsed[6]

            # the last column still carries the line's newline; strip it so the
            # stored value is clean and the printed output is not double-spaced
            gene_name = parsed[-1].rstrip('\r\n')

            if verbose:
                #Print formatted line
                print('contig: {}, start: {}, stop: {}, direction: {}, gene_name: {}'.format(contig, start, stop, direction, gene_name))

            all_gene_lines.append([contig, start, stop, direction, gene_name])

    return all_gene_lines

In [15]:
# collect every gene in the annotation
# NOTE: the function prints one line per gene, so this cell produces very long output
All_genes = get_allgene_coords(gff_path)

contig: Supercontig_1.1, start: 24526, stop: 26741, direction: +, gene_name: ID=CPAG_00001;Name=CPAG_00001

contig: Supercontig_1.1, start: 26900, stop: 29044, direction: -, gene_name: ID=CPAG_00002;Name=CPAG_00002

contig: Supercontig_1.1, start: 29647, stop: 30038, direction: -, gene_name: ID=CPAG_00003;Name=CPAG_00003

contig: Supercontig_1.1, start: 30973, stop: 31753, direction: +, gene_name: ID=CPAG_00004;Name=CPAG_00004

contig: Supercontig_1.1, start: 33423, stop: 34578, direction: +, gene_name: ID=CPAG_00005;Name=CPAG_00005

contig: Supercontig_1.1, start: 34386, stop: 35114, direction: -, gene_name: ID=CPAG_00006;Name=CPAG_00006

contig: Supercontig_1.1, start: 36949, stop: 38359, direction: -, gene_name: ID=CPAG_00007;Name=CPAG_00007

contig: Supercontig_1.1, start: 55900, stop: 56463, direction: -, gene_name: ID=CPAG_00008;Name=CPAG_00008

contig: Supercontig_1.1, start: 56980, stop: 59436, direction: +, gene_name: ID=CPAG_00009;Name=CPAG_00009

contig: Supercontig_1.1, sta


contig: Supercontig_1.1, start: 4917362, stop: 4917922, direction: -, gene_name: ID=CPAG_01681;Name=CPAG_01681

contig: Supercontig_1.1, start: 4919371, stop: 4919815, direction: -, gene_name: ID=CPAG_01682;Name=CPAG_01682

contig: Supercontig_1.1, start: 4919997, stop: 4921732, direction: +, gene_name: ID=CPAG_01683;Name=CPAG_01683

contig: Supercontig_1.1, start: 4922195, stop: 4924336, direction: +, gene_name: ID=CPAG_01684;Name=CPAG_01684

contig: Supercontig_1.1, start: 4924438, stop: 4924801, direction: +, gene_name: ID=CPAG_01685;Name=CPAG_01685

contig: Supercontig_1.1, start: 4924958, stop: 4927191, direction: +, gene_name: ID=CPAG_01686;Name=CPAG_01686

contig: Supercontig_1.1, start: 4927731, stop: 4929274, direction: -, gene_name: ID=CPAG_01687;Name=CPAG_01687

contig: Supercontig_1.1, start: 4929675, stop: 4931027, direction: -, gene_name: ID=CPAG_01688;Name=CPAG_01688

contig: Supercontig_1.1, start: 4931658, stop: 4932904, direction: -, gene_name: ID=CPAG_01689;Name=CPA


contig: Supercontig_1.1, start: 7648723, stop: 7649929, direction: -, gene_name: ID=CPAG_02825;Name=CPAG_02825

contig: Supercontig_1.1, start: 7650272, stop: 7653928, direction: +, gene_name: ID=CPAG_02826;Name=CPAG_02826

contig: Supercontig_1.1, start: 7654089, stop: 7655034, direction: -, gene_name: ID=CPAG_02827;Name=CPAG_02827

contig: Supercontig_1.1, start: 7655322, stop: 7656822, direction: -, gene_name: ID=CPAG_02828;Name=CPAG_02828

contig: Supercontig_1.1, start: 7656885, stop: 7657957, direction: -, gene_name: ID=CPAG_02829;Name=CPAG_02829

contig: Supercontig_1.1, start: 7658496, stop: 7659190, direction: +, gene_name: ID=CPAG_02830;Name=CPAG_02830

contig: Supercontig_1.1, start: 7659933, stop: 7662803, direction: -, gene_name: ID=CPAG_02831;Name=CPAG_02831

contig: Supercontig_1.1, start: 7664360, stop: 7666123, direction: -, gene_name: ID=CPAG_02832;Name=CPAG_02832

contig: Supercontig_1.1, start: 7668487, stop: 7669163, direction: +, gene_name: ID=CPAG_02833;Name=CPA

contig: Supercontig_1.2, start: 4363266, stop: 4367573, direction: -, gene_name: ID=CPAG_04535;Name=CPAG_04535

contig: Supercontig_1.2, start: 4369040, stop: 4371930, direction: +, gene_name: ID=CPAG_04536;Name=CPAG_04536

contig: Supercontig_1.2, start: 4373468, stop: 4375141, direction: +, gene_name: ID=CPAG_04537;Name=CPAG_04537

contig: Supercontig_1.2, start: 4375460, stop: 4377166, direction: +, gene_name: ID=CPAG_04538;Name=CPAG_04538

contig: Supercontig_1.2, start: 4377383, stop: 4380218, direction: +, gene_name: ID=CPAG_04539;Name=CPAG_04539

contig: Supercontig_1.2, start: 4381866, stop: 4384285, direction: +, gene_name: ID=CPAG_04540;Name=CPAG_04540

contig: Supercontig_1.2, start: 4384390, stop: 4385741, direction: -, gene_name: ID=CPAG_04541;Name=CPAG_04541

contig: Supercontig_1.2, start: 4386036, stop: 4389750, direction: +, gene_name: ID=CPAG_04542;Name=CPAG_04542

contig: Supercontig_1.2, start: 4390130, stop: 4391366, direction: -, gene_name: ID=CPAG_04543;Name=CPAG

contig: Supercontig_1.3, start: 3361368, stop: 3364445, direction: +, gene_name: ID=CPAG_05824;Name=CPAG_05824

contig: Supercontig_1.3, start: 3364616, stop: 3366073, direction: +, gene_name: ID=CPAG_05825;Name=CPAG_05825

contig: Supercontig_1.3, start: 3366255, stop: 3369225, direction: -, gene_name: ID=CPAG_05826;Name=CPAG_05826

contig: Supercontig_1.3, start: 3369567, stop: 3370835, direction: -, gene_name: ID=CPAG_05827;Name=CPAG_05827

contig: Supercontig_1.3, start: 3371061, stop: 3372952, direction: +, gene_name: ID=CPAG_05828;Name=CPAG_05828

contig: Supercontig_1.3, start: 3371988, stop: 3376851, direction: -, gene_name: ID=CPAG_05829;Name=CPAG_05829

contig: Supercontig_1.3, start: 3377271, stop: 3380286, direction: -, gene_name: ID=CPAG_05830;Name=CPAG_05830

contig: Supercontig_1.3, start: 3380590, stop: 3385922, direction: +, gene_name: ID=CPAG_05831;Name=CPAG_05831

contig: Supercontig_1.3, start: 3385089, stop: 3385844, direction: -, gene_name: ID=CPAG_05832;Name=CPAG

contig: Supercontig_1.4, start: 3326687, stop: 3327253, direction: -, gene_name: ID=CPAG_07323;Name=CPAG_07323

contig: Supercontig_1.4, start: 3328132, stop: 3330131, direction: -, gene_name: ID=CPAG_07324;Name=CPAG_07324

contig: Supercontig_1.4, start: 3330510, stop: 3330833, direction: -, gene_name: ID=CPAG_07325;Name=CPAG_07325

contig: Supercontig_1.4, start: 3330847, stop: 3331256, direction: +, gene_name: ID=CPAG_07326;Name=CPAG_07326

contig: Supercontig_1.4, start: 3332148, stop: 3333740, direction: +, gene_name: ID=CPAG_07327;Name=CPAG_07327

contig: Supercontig_1.4, start: 3334036, stop: 3334959, direction: -, gene_name: ID=CPAG_07328;Name=CPAG_07328

contig: Supercontig_1.4, start: 3335151, stop: 3338643, direction: +, gene_name: ID=CPAG_07329;Name=CPAG_07329

contig: Supercontig_1.4, start: 3337855, stop: 3338854, direction: -, gene_name: ID=CPAG_07330;Name=CPAG_07330

contig: Supercontig_1.4, start: 3339069, stop: 3340614, direction: +, gene_name: ID=CPAG_07331;Name=CPAG


contig: Supercontig_1.6, start: 680442, stop: 681866, direction: -, gene_name: ID=CPAG_08823;Name=CPAG_08823

contig: Supercontig_1.6, start: 682063, stop: 683169, direction: -, gene_name: ID=CPAG_08824;Name=CPAG_08824

contig: Supercontig_1.6, start: 684677, stop: 688304, direction: +, gene_name: ID=CPAG_08825;Name=CPAG_08825

contig: Supercontig_1.6, start: 688459, stop: 691544, direction: -, gene_name: ID=CPAG_08826;Name=CPAG_08826

contig: Supercontig_1.6, start: 692420, stop: 693529, direction: +, gene_name: ID=CPAG_08827;Name=CPAG_08827

contig: Supercontig_1.6, start: 693870, stop: 696595, direction: -, gene_name: ID=CPAG_08828;Name=CPAG_08828

contig: Supercontig_1.6, start: 696099, stop: 697376, direction: +, gene_name: ID=CPAG_08829;Name=CPAG_08829

contig: Supercontig_1.6, start: 698096, stop: 698787, direction: -, gene_name: ID=CPAG_08830;Name=CPAG_08830

contig: Supercontig_1.6, start: 700100, stop: 700756, direction: +, gene_name: ID=CPAG_08831;Name=CPAG_08831

contig: S

# Biopython

In [16]:
from Bio import SeqIO

In [17]:
# print the ID of every record in the genome FASTA, in file order
for record in SeqIO.parse("Cp3488.fasta", "fasta"):
    print(record.id)

Supercontig_1.1
Supercontig_1.2
Supercontig_1.3
Supercontig_1.4
Supercontig_1.5
Supercontig_1.6


In [18]:
# keep the parser as an iterator; the following cells pull records from it one at a time with next()
cp_fasta = SeqIO.parse("Cp3488.fasta", "fasta")

In [19]:
# first record from the iterator (id 'Supercontig_1.1' in the recorded output)
# NOTE: every next() call advances cp_fasta, so re-running this cell without first
# re-creating cp_fasta stores a later record under this name
supercontig_1 = next(cp_fasta)

In [20]:
# next record from the same iterator; presumably Supercontig_1.2 given the ID order
# printed earlier (not displayed in this notebook, so verify before relying on it)
supercontig_2 = next(cp_fasta)

In [21]:
# display the whole SeqRecord: sequence plus id, name and description
supercontig_1

SeqRecord(seq=Seq('CAAAGCAAGAGCTTGCATTTGCATAACAAGCTTATCATGAAGAGAGATTTCTCT...TTC'), id='Supercontig_1.1', name='Supercontig_1.1', description='Supercontig_1.1', dbxrefs=[])

In [22]:
# .seq is the sequence part of the record on its own
supercontig_1.seq

Seq('CAAAGCAAGAGCTTGCATTTGCATAACAAGCTTATCATGAAGAGAGATTTCTCT...TTC')

In [23]:
# slice out gene CPAG_00216 using the start (832813) and stop (833592) reported by the GFF lookup
# GFF coordinates are 1-based and include the stop base, while Python slices are 0-based and
# exclude the end index, so only the start needs the -1
supercontig_1.seq[832813-1:833592]

Seq('ATGGCATTCCCGATGGAGCGTCTCATCACCAGCCTGACTAAAGGCGCCTCCGTG...TAG')

In [24]:
true = "ATGGCATTCCCGATGGAGCGTCTCATCACCAGCCTGACTAAAGGCGCCTCCGTGGAGACAATATGGACCAACATTACCGGCTCCTTTTCCCCGGCGGTCATTGAATTCTTCGGCACCCTCAGCGTCCAGCTTGTCACCTTCTGGCTCCCATCACTCTTCTTCTTGTCCCTGGACATCTGGGCGCCGTCCTTCTCTAACCGACACAAGCTCCAGCCCATCCCCAAGCAACCGACGAGCAAGGAAATCAAATCCTGCGTTCTCCTCGTCCTGCGGAACCAAATCATCAATTCCATCCTGCACATCATCCTCATCTTCATCTCTCCGCAACGCCCCTACCGAATCGAACCCTCACTCCCTACTCTCCCCGAAATCGCCCGGGATTTTATCATCTCCCTCCTGATCCGCGAAGCCCTATTCTACTACAGCCACCGGCTCCTCCACCACCGCATCTTCTACGCACGAATCCACAAGCTTCACCACCGTTTCACCGCGCCAGTCGCACTCGCAGCCCAATACGCGCATCCGATCGAACACATCGTCGCGAATGTCTTGCCCATAACACTCCCACCGGCGCTACTAAGGAGCCACATCTTGACCTTCTGGACGTTCCTTGCCTACGAGCTGTCCAATACAGCACTTGTCCACAGTGGATATGATTTCTTCAGCGGGATAGCCAAGATGCATGATTTGCATCATGAGAAATTCAATTTGAATTATGGGTCAATTGGATTACTAGATTGGTTTCATGGCACGGATAAACTGCACAAGCGCACTGCGTAG"

In [25]:
# Seq is Biopython's sequence class; wrapping the plain string gives access to sequence methods such as find()
from Bio.Seq import Seq
# `true` is the reference sequence string assigned in the previous cell (presumably copied from FungiDB - confirm)
FungiDB_dna = Seq(true)

In [26]:
# display the Seq repr; long sequences are abbreviated with '...'
FungiDB_dna

Seq('ATGGCATTCCCGATGGAGCGTCTCATCACCAGCCTGACTAAAGGCGCCTCCGTG...TAG')

In [27]:
# locate a 20-base fragment inside the reference sequence
# the recorded result, 23, is the 0-based offset at which the fragment starts
FungiDB_dna.find("CATCACCAGCCTGACTAAAG")

23

In [28]:
# print() shows the complete slice, whereas the bare Seq repr abbreviates it with '...'
# same coordinates as the slice above: start - 1 because Python indexing is 0-based
print(supercontig_1.seq[832813-1:833592])

ATGGCATTCCCGATGGAGCGTCTCATCACCAGCCTGACTAAAGGCGCCTCCGTGGAGACAATATGGACCAACATTACCGGCTCCTTTTCCCCGGCGGTCATTGAATTCTTCGGCACCCTCAGCGTCCAGCTTGTCACCTTCTGGCTCCCATCACTCTTCTTCTTGTCCCTGGACATCTGGGCGCCGTCCTTCTCTAACCGACACAAGCTCCAGCCCATCCCCAAGCAACCGACGAGCAAGGAAATCAAATCCTGCGTTCTCCTCGTCCTGCGGAACCAAATCATCAATTCCATCCTGCACATCATCCTCATCTTCATCTCTCCGCAACGCCCCTACCGAATCGAACCCTCACTCCCTACTCTCCCCGAAATCGCCCGGGATTTTATCATCTCCCTCCTGATCCGCGAAGCCCTATTCTACTACAGCCACCGGCTCCTCCACCACCGCATCTTCTACGCACGAATCCACAAGCTTCACCACCGTTTCACCGCGCCAGTCGCACTCGCAGCCCAATACGCGCATCCGATCGAACACATCGTCGCGAATGTCTTGCCCATAACACTCCCACCGGCGCTACTAAGGAGCCACATCTTGACCTTCTGGACGTTCCTTGCCTACGAGCTGTCCAATACAGCACTTGTCCACAGTGGATATGATTTCTTCAGCGGGATAGCCAAGATGCATGATTTGCATCATGAGAAATTCAATTTGAATTATGGGTCAATTGGATTACTAGATTGGTTTCATGGCACGGATAAACTGCACAAGCGCACTGCGTAG


In [29]:
# Derive the string from the genome slice instead of pasting its printed output:
# the equality check against `true` below then really tests what is in the FASTA
# and cannot be broken by a copy/paste slip.
# Relies on supercontig_1 still holding the first FASTA record (Supercontig_1.1).
my_dna = str(supercontig_1.seq[832813-1:833592])

In [30]:
# NOTE: set() keeps only the distinct characters of each string, so this checks that
# both sequences use the same letters, not that the sequences themselves are equal;
# the direct == comparison of the two strings is the real test
set1 = set(true)
set2 = set(my_dna)
print (set1 == set2)

True


In [31]:
# exact comparison: True only if both strings have the same characters in the same order
true == my_dna

True

In [32]:
# display the string as the cell output (shown in quotes because it is a plain str)
my_dna

'ATGGCATTCCCGATGGAGCGTCTCATCACCAGCCTGACTAAAGGCGCCTCCGTGGAGACAATATGGACCAACATTACCGGCTCCTTTTCCCCGGCGGTCATTGAATTCTTCGGCACCCTCAGCGTCCAGCTTGTCACCTTCTGGCTCCCATCACTCTTCTTCTTGTCCCTGGACATCTGGGCGCCGTCCTTCTCTAACCGACACAAGCTCCAGCCCATCCCCAAGCAACCGACGAGCAAGGAAATCAAATCCTGCGTTCTCCTCGTCCTGCGGAACCAAATCATCAATTCCATCCTGCACATCATCCTCATCTTCATCTCTCCGCAACGCCCCTACCGAATCGAACCCTCACTCCCTACTCTCCCCGAAATCGCCCGGGATTTTATCATCTCCCTCCTGATCCGCGAAGCCCTATTCTACTACAGCCACCGGCTCCTCCACCACCGCATCTTCTACGCACGAATCCACAAGCTTCACCACCGTTTCACCGCGCCAGTCGCACTCGCAGCCCAATACGCGCATCCGATCGAACACATCGTCGCGAATGTCTTGCCCATAACACTCCCACCGGCGCTACTAAGGAGCCACATCTTGACCTTCTGGACGTTCCTTGCCTACGAGCTGTCCAATACAGCACTTGTCCACAGTGGATATGATTTCTTCAGCGGGATAGCCAAGATGCATGATTTGCATCATGAGAAATTCAATTTGAATTATGGGTCAATTGGATTACTAGATTGGTTTCATGGCACGGATAAACTGCACAAGCGCACTGCGTAG'

In [33]:
# print the reference string without quotes for a visual check against my_dna
print(true)

ATGGCATTCCCGATGGAGCGTCTCATCACCAGCCTGACTAAAGGCGCCTCCGTGGAGACAATATGGACCAACATTACCGGCTCCTTTTCCCCGGCGGTCATTGAATTCTTCGGCACCCTCAGCGTCCAGCTTGTCACCTTCTGGCTCCCATCACTCTTCTTCTTGTCCCTGGACATCTGGGCGCCGTCCTTCTCTAACCGACACAAGCTCCAGCCCATCCCCAAGCAACCGACGAGCAAGGAAATCAAATCCTGCGTTCTCCTCGTCCTGCGGAACCAAATCATCAATTCCATCCTGCACATCATCCTCATCTTCATCTCTCCGCAACGCCCCTACCGAATCGAACCCTCACTCCCTACTCTCCCCGAAATCGCCCGGGATTTTATCATCTCCCTCCTGATCCGCGAAGCCCTATTCTACTACAGCCACCGGCTCCTCCACCACCGCATCTTCTACGCACGAATCCACAAGCTTCACCACCGTTTCACCGCGCCAGTCGCACTCGCAGCCCAATACGCGCATCCGATCGAACACATCGTCGCGAATGTCTTGCCCATAACACTCCCACCGGCGCTACTAAGGAGCCACATCTTGACCTTCTGGACGTTCCTTGCCTACGAGCTGTCCAATACAGCACTTGTCCACAGTGGATATGATTTCTTCAGCGGGATAGCCAAGATGCATGATTTGCATCATGAGAAATTCAATTTGAATTATGGGTCAATTGGATTACTAGATTGGTTTCATGGCACGGATAAACTGCACAAGCGCACTGCGTAG
