In [7]:
import os
import tkinter as tk
from collections import Counter
from tkinter import filedialog

from CreateWebLogo import createWebLogo
# Create an explicit, hidden Tk root so that only the directory chooser appears,
# then destroy it so that re-running this cell does not accumulate Tk instances.
dialogRoot = tk.Tk()
dialogRoot.withdraw()
try:
    bioinformaticsDir = filedialog.askdirectory(parent = dialogRoot, title = "Choose Bioinformatics Directory")
finally:
    dialogRoot.destroy()

# A cancelled dialog yields an empty value; joining onto it would silently produce
# paths relative to the current working directory, so fail here with a clear message.
if not bioinformaticsDir:
    raise ValueError("No bioinformatics directory was selected.")

deaminationDeterminationDataDir = os.path.join(bioinformaticsDir, "deamination_determination", "data")

# Cell types handled by this notebook, and the timepoint label used in each one's file names.
cellTypes = ("NHF1", "Arabidopsis", "yeast")
timepointsByCellType = {"NHF1":"1h", "Arabidopsis":"ZT2", "yeast":"20min"}

# Map each cell type to its "general_read_sequences" directory under the data directory.
generalReadSequencesDirectories = dict()
for cellType in cellTypes:
    generalReadSequencesDirectories[cellType] = os.path.join(deaminationDeterminationDataDir, cellType, "general_read_sequences")

In [8]:
# Percentage of C+G among the canonical bases (A, C, G, T) of all read sequences, keyed by cell type.
CG_ContentByCellType = dict()
for cellType in cellTypes:
    readsFilePath = os.path.join(generalReadSequencesDirectories[cellType],
                                 f"{cellType}_CPD_{timepointsByCellType[cellType]}_all_reps_aligned_reads_10bp_expanded_TGG_filtered.bed")

    # Counter.update tallies the characters of each sequence in one call (much faster than a
    # per-character Python loop on files this large), preserves first-seen key order for the
    # printout below, and reports 0 for any base that never occurs instead of raising KeyError.
    nucleotideCounts = Counter()
    with open(readsFilePath, 'r') as readsFile:
        for line in readsFile:
            fields = line.split()
            if not fields: continue  # Tolerate blank lines (e.g. a trailing newline at end of file).
            # The read sequence is the 7th whitespace-delimited field of each row.
            nucleotideCounts.update(fields[6])

    # Only canonical bases enter the denominator; other codes (e.g. N) are counted and
    # printed but excluded from the percentage.
    canonicalNucleotideTotal = (nucleotideCounts["A"] + nucleotideCounts["C"] +
                                nucleotideCounts["G"] + nucleotideCounts["T"])
    if canonicalNucleotideTotal == 0:
        raise ValueError(f"No A, C, G, or T nucleotides were found in {readsFilePath}")

    CG_ContentByCellType[cellType] = ((nucleotideCounts["C"] + nucleotideCounts["G"]) /
                                      canonicalNucleotideTotal * 100)
    print(f"Nucleotide counts for {os.path.basename(readsFilePath)}:")
    for key,value in nucleotideCounts.items():  print(f"\t{key}: {value}")
    print(f"CG content: {CG_ContentByCellType[cellType]}\n")

Nucleotide counts for NHF1_CPD_1h_all_reps_aligned_reads_10bp_expanded_TGG_filtered.bed:
	C: 534080702
	G: 410638437
	A: 476978921
	T: 659649498
	N: 681
CG content: 45.389782949455864

Nucleotide counts for Arabidopsis_CPD_ZT2_all_reps_aligned_reads_10bp_expanded_TGG_filtered.bed:
	T: 980381666
	A: 903671459
	C: 633206373
	G: 516089717
	N: 5544
	S: 28
	W: 90
	R: 18
	M: 13
	Y: 35
	K: 20
CG content: 37.8886837135895

Nucleotide counts for yeast_CPD_20min_all_reps_aligned_reads_10bp_expanded_TGG_filtered.bed:
	G: 253621908
	C: 312220444
	A: 423300044
	T: 516628857
CG content: 37.578241108843905



In [12]:
# Build a sequence logo for every (cell type, strand polarity) pair that is wanted.
# Five prime logos are only made for NHF1; Arabidopsis and yeast get three prime logos only.
for cellType in cellTypes:
    for strandPolarity in ("three_prime", "five_prime"):
        logoWanted = strandPolarity != "five_prime" or cellType not in ("Arabidopsis", "yeast")
        if not logoWanted:
            continue

        print(f"Creating {strandPolarity.replace('_',' ')} sequence logo for {cellType}")

        webLogoInputFileName = (
            f"{cellType}_CPD_{timepointsByCellType[cellType]}_all_reps_{strandPolarity}"
             "_sequence_logo_input_TGG_filtered_nonstandard_nuc_filtered.txt"
        )
        webLogoInputFilePath = os.path.join(generalReadSequencesDirectories[cellType], webLogoInputFileName)

        # The second argument is the CG percentage computed earlier, rounded to the nearest integer.
        # NOTE(review): presumably createWebLogo uses it as the background composition - confirm.
        createWebLogo([webLogoInputFilePath], round(CG_ContentByCellType[cellType]))
    

Creating three prime sequence logo for NHF1
Working with NHF1_CPD_1h_all_reps_three_prime_sequence_logo_input_TGG_filtered_nonstandard_nuc_filtered.txt...
Creating five prime sequence logo for NHF1
Working with NHF1_CPD_1h_all_reps_five_prime_sequence_logo_input_TGG_filtered_nonstandard_nuc_filtered.txt...
Creating three prime sequence logo for Arabidopsis
Working with Arabidopsis_CPD_ZT2_all_reps_three_prime_sequence_logo_input_TGG_filtered_nonstandard_nuc_filtered.txt...
Creating three prime sequence logo for yeast
Working with yeast_CPD_20min_all_reps_three_prime_sequence_logo_input_TGG_filtered_nonstandard_nuc_filtered.txt...
