### Guide to using Jupyter Notebook Christina's DNA Analysis codes.

<font color='green'>**Using functions you need**</font> \
You will need to read cells 1-5 regardless of which functions you want to analyse your DNA with. After that select whichever cell you need. Cells 6 and 7 must be used chronologically. Cells 8 and 9 must be used chronologically. 

<font color='green'>**Keeping Outputs concise**</font> \
Some readable code has been muted (#) to make analysis more streamlined. Delete only the four letters in #CODE, but don't change any of the spacing. Spacing is essential for readability of python scripts.

<font color='green'>**Data Quality checkpoints**</font> \
A number of cells are numbered as .1 and are data quality checkpoints. These are to make sure your imports and code is being read correctly. You don't need to run them if your code is working, but it's always good to double check.

<font color='green'>**Troubleshooting**</font> \
There is a separate Jupyter Notebook called Troubleshooting if you are having issues importing your files.

In [336]:
#1. Import relevant packages. Ensure you have them installed on your Anaconda environment or they will fail to import.

import numpy as np
from Bio import SeqIO
import pandas as pd
import os
import sys

from IPython.display import HTML, display
def set_background(color):    
    script = (
        "var cell = this.closest('.jp-CodeCell');"
        "var editor = cell.querySelector('.jp-Editor');"
        "editor.style.background='{}';"
        "this.parentNode.removeChild(this)"
    ).format(color)
    display(HTML('<img src onerror="{}" style="display:none">'.format(script)))

set_background('#000000')

In [337]:
#2. Import Fasta from Relative Path
    #Tip: Fasta file needs to be in the same folder as your Jupyter Notebook

file_name = 'C1.308.fa'
set_background('#000000')

In [338]:
#2. Import Fasta from Absolute Path
    #Tip: Make sure your file path has .fasta at the end.
    #Example: 'C:\User\chris\c1.308.fasta'

#CODE file_name = 'C:\Benutzer\Christina Bugert\DNA analysis projects\dna_sequences\C1.308.fa'
set_background('#000000')

In [339]:
#3. Open your Sequence file and look at the contents

with open(file_name, 'r') as file:
    sequences = SeqIO.parse(file, 'fasta')
    for seq_record in sequences:
        print(f"ID: {seq_record.id}")
        print(f"Description: {seq_record.description}")
        print(f"Sequence: {seq_record.seq}") #add f before "Sequence to see your whole file

set_background('#000000')

In [340]:
#4. Create a DataFrame from the parsed records

df = pd.DataFrame(fasta_records, columns=['FastaID', 'Description', 'Sequence'])

#Display the DataFrame to verify sorting of the columns.
#CODEprint(df.head())

set_background('#000000')

In [341]:
#5. Data cleansing

#Convert column Sequence to a string
df['Sequence'] = df['Sequence'].astype(str)
#CODEprint(df['Sequence'].dtypes)

# Add Spaces between the nucleotides in your dataframe.
#df['Sequence'] = df['Sequence'].apply(lambda x: ' '.join(list(x)))

#Turn all letters from upper to lowercase
df['Sequence'] = df['Sequence'].str.lower()

#Check your output
#CODEprint(df[f'Sequence'])


set_background('#000000')

In [342]:
#6. DNA sequence count

df['G_count'] = df['Sequence'].str.count('g')
df['A_count'] = df['Sequence'].str.count('a')
df['C_count'] = df['Sequence'].str.count('c')
df['T_count'] = df['Sequence'].str.count('t')

#Calculate sequence length
df['DNAcount'] = df['Sequence'].apply(lambda x: len(x.replace(' ', '')))

#Convert 'G_count' and 'C_count' to integers
df['G_count'] = df['G_count'].astype(int)
df['A_count'] = df['A_count'].astype(int)
df['C_count'] = df['C_count'].astype(int)
df['T_count'] = df['T_count'].astype(int)

#CODEprint(df.dtypes)  # Check the data types after conversion

#Display the updated DataFrame with G and C counts
print(df[['FastaID', 'G_count', 'A_count', 'C_count', 'T_count', 'DNAcount']].head())  # Displaying FastaID, G_count, and C_count columns

  FastaID  G_count  A_count  C_count  T_count  DNAcount
0  C1.308     1921     2552     1724     2290      8487


In [343]:
#6.1 Check quality of sequence counts.

#Add basepairs together: Produce DNAcount2
df['DNAcount2'] = df['G_count'] + df['A_count'] + df['C_count'] + df['T_count']
print(df[['DNAcount', 'DNAcount2']])

# Check if any DNAcount2 value is less than DNAcount
if (df['DNAcount2'] < df['DNAcount']).any():
    print("Go back to step 5 and assess data quality")
else:
    print("Data quality is consistent.")

   DNAcount  DNAcount2
0      8487       8487
Data quality is consistent.


In [344]:
#7. Calculate GC content of sequence

df['GC_percentage'] = ((df['G_count'] + df['C_count']) / df['DNAcount']) * 100
print(df[['FastaID','GC_percentage']])

gc_percentage = df['GC_percentage'].iloc[0]  # Fetch the percentage

# Formatting and printing the output
print(f"The GC Percentage of this sequence is {gc_percentage:.2f} Percent (%).")

  FastaID  GC_percentage
0  C1.308      42.948038
The GC Percentage of this sequence is 42.95 Percent (%).


In [345]:
#8. Export nucleotide content and GC Percentage to a text file. 
#Make sure you've run cells 6 to 7.

#Store the current system output
original_stdout = sys.stdout

#Define the file to which you want to redirect the print statements
file_name2 = 'Nucleotide_GC.txt'

#Open the file in write mode
with open(file_name2, 'w') as f:
    #Redirect sys.stdout to the file
    sys.stdout = f
    #Your code with print statements
    print(df[['FastaID', 'G_count', 'A_count', 'C_count', 'T_count', 'DNAcount', 'GC_percentage']])
    #Restore the original stdout
    sys.stdout = original_stdout

#Confirm that the print statements were redirected by reading the file.
with open(file_name2, 'r') as f:
    content = f.read()
    print(content)


  FastaID  G_count  A_count  C_count  T_count  DNAcount  GC_percentage
0  C1.308     1921     2552     1724     2290      8487      42.948038



In [346]:
#9. Convert DNA nucleotides to RNA

# Assuming your DataFrame is named df and the column containing DNA sequences is 'Sequence'
df['RNASequence'] = df['Sequence'].str.replace('t', 'u', case=False)
df['RNASequence'] = df['RNASequence'].astype(str)

#Make column U_count and count number of U's in the sequence.
df['U_count'] = df['RNASequence'].str.count('u')
df['U_count'] = df['U_count'].astype(int)


In [347]:
#9.1 Check quality of sequence counts

#Make sure none of your data has been lost.
df['RNAcount'] = df['RNASequence'].apply(lambda x: len(x.replace(' ', '')))
print(df['RNAcount'])

if (df['RNAcount'] < df['DNAcount']).any():
    print("Oops, go back to step 7 and assess data quality")
else:
    print("Well done, total number of nucleotides in the sequence is correct.")


# Make sure the number of Uracil and Thymine is the same.
if (df['U_count'] < df['T_count']).any():
    print("Oops, go back to step 7 and assess data quality")
else:
    print("Well done, number of Uracils is consistent.")


0    8487
Name: RNAcount, dtype: int64
Well done, total number of nucleotides in the sequence is correct.
Well done, number of Uracils is consistent.


In [350]:
#10. Export nucleotide content and GC Percentage to a text file. 
#Make sure you've run cells 6 to 7.

#Store the current system output
original_stdout = sys.stdout

#Define the file to which you want to redirect the print statements
file_name3 = 'RNASequence.txt'

#Open the file in write mode
with open(file_name3, 'w') as f:
    #Redirect sys.stdout to the file
    sys.stdout = f
    #Your code with print statements
    print(df[['FastaID', 'G_count', 'A_count', 'C_count', 'U_count', 'RNASequence']])
    #Restore the original stdout
    sys.stdout = original_stdout

#Confirm that the print statements were redirected by reading the file.
with open(file_name2, 'r') as f:
    content = f.read()
    print(content)

  FastaID  G_count  A_count  C_count  T_count  DNAcount  GC_percentage
0  C1.308     1921     2552     1724     2290      8487      42.948038



In [348]:
# Assuming 'rna_sequence' is the variable containing your RNA sequence
file_name4 = 'rna_output.txt'

try:
    # Open the file in write mode
    with open(file_name4, 'w') as file:
        # Iterate through each RNA sequence in the 'RNAsequence' column and write to the file
        for rna_sequence in df['RNAsequence']:
            file.write(str(rna_sequence) + '\n')  # Assuming the sequences are strings
            
    print("RNA sequences successfully exported to", file_name4)
except Exception as e:
    print("An error occurred while exporting RNA sequences:", str(e))


An error occurred while exporting RNA sequences: 'RNAsequence'


In [349]:
#11 Function to translate DNA sequence to amino acids

# Function to perform circular translation
def translate_circular_dna(sequence):
    # Find the index of the first occurrence of 'aug' (start codon)
    start_index = sequence.lower().find('aug')

    if start_index != -1:
        # Circular translation from 'AUG' to the end of the sequence
        translated_seq = translate_dna_to_protein(sequence[start_index:] + sequence[:start_index])
        return translated_seq
    else:
        # If 'AUG' is not found, return a placeholder or handle as needed
        return "No start codon found"

# Applying the circular translation function to the 'Sequence' column
df['Circular_Protein_Sequence'] = df['Sequence'].apply(translate_circular_dna)

# Display the result
print(df['Circular_Protein_Sequence'])

0    No start codon found
Name: Circular_Protein_Sequence, dtype: object
