In [2]:
# Setup
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1 import make_axes_locatable

# All inputs are addressed relative to the working directory, two levels up.
_two_levels_up = os.path.join("..", "..")

# Directory locations.
motif_directory = os.path.join(_two_levels_up, "data/motifs")
tert_motif_directory = os.path.join(_two_levels_up, "data/tertiary_contacts")

# CSV file locations (note: tert_contact_csv_directory is a file path, not a directory).
tert_contact_csv_directory = os.path.join(
    _two_levels_up, "data/out_csvs", "unique_tert_contacts.csv"
)
twoway_csv_path = os.path.join(
    _two_levels_up, "data/out_csvs", "twoway_motif_list.csv"
)


In [3]:
# Plot number of types of motifs

# Define the count_files_with_extension function
def count_files_with_extension(directory, extension):
    """Count the files below ``directory`` whose names end with ``extension``.

    The directory tree is walked recursively with ``os.walk``, so files in
    nested sub-folders are included.

    Args:
        directory: Root folder to search.
        extension: Suffix a file name must end with (e.g. ``".cif"``).

    Returns:
        The number of matching files as an ``int`` (0 when nothing matches).
    """
    matches = 0
    for _dirpath, _dirnames, filenames in os.walk(directory):
        matches += len([name for name in filenames if name.endswith(extension)])
    return matches

# print("Plotting...")
# graphs
# Tally the .cif files beneath each sub-folder of the motif directory,
# grouped by the motif label that the sub-folder is reported under.
folder_counts = dict.fromkeys(("TWOWAY", "NWAY", "HAIRPIN", "HELIX", "SSTRAND"), 0)

# Sub-folders whose exact name maps onto one of the predefined labels.
label_for_folder = {
    "2ways": "TWOWAY",
    "hairpins": "HAIRPIN",
    "helices": "HELIX",
    "sstrand": "SSTRAND",
}

for entry in os.listdir(motif_directory):
    entry_path = os.path.join(motif_directory, entry)
    if not os.path.isdir(entry_path):
        # Plain files at the top level are ignored.
        continue

    n_cif = count_files_with_extension(entry_path, ".cif")
    if entry in label_for_folder:
        folder_counts[label_for_folder[entry]] += n_cif
    elif "ways" in entry:
        # Every other "*ways*" folder is pooled under NWAY.
        folder_counts["NWAY"] += n_cif
    else:
        # Unrecognised folders are reported under their own name.
        folder_counts[entry] = n_cif

# One row per motif label, ordered alphabetically by label.
data = pd.DataFrame(
    {
        "Motif Type": list(folder_counts.keys()),
        "Count": list(folder_counts.values()),
    }
).sort_values("Motif Type")

# Draw the bar chart and write it straight to disk (nothing is displayed).
sns.set_theme(style="white")
fig = plt.figure(figsize=(6, 6), facecolor="white")
plt.rcParams["font.size"] = 20
ax = sns.barplot(
    x="Motif Type",
    y="Count",
    data=data,
    edgecolor="black",
    color=sns.color_palette()[0],
)
ax.set_xlabel("Motif Type")
ax.set_ylabel("Count")
ax.set_title("")
fig.tight_layout()
fig.savefig("figure_2_bar_graph_motif_counts.png", dpi=600)
plt.close(fig)

In [4]:
# Plot number and length of hairpins found

# Each sub-folder of the hairpin directory is named after a hairpin length;
# count the .cif files stored beneath each one.
hairpin_directory = os.path.join(motif_directory, "hairpins")
hairpin_counts = {}
for item_name in os.listdir(hairpin_directory):
    item_path = os.path.join(hairpin_directory, item_name)
    if not os.path.isdir(item_path):
        continue
    try:
        int(item_name)
    except ValueError:
        # Not a length folder (e.g. ".ipynb_checkpoints"); skipping it keeps
        # the numeric sort below from raising.
        continue
    file_count = count_files_with_extension(item_path, ".cif")
    hairpin_counts[item_name] = file_count
if not hairpin_counts:
    raise ValueError(f"no hairpin length folders found in {hairpin_directory!r}")

# Order the folders numerically (a plain string sort would put "10" before "2").
sorted_hairpin_counts = dict(
    sorted(hairpin_counts.items(), key=lambda item: int(item[0]))
)
hairpin_folder_names_sorted = list(sorted_hairpin_counts.keys())
hairpin_file_counts_sorted = list(sorted_hairpin_counts.values())
# Lengths as integers, already in ascending order.
hairpin_bins = [int(name) for name in hairpin_folder_names_sorted]
# Tick every fifth length, starting at the shortest one.
tick_positions = np.arange(min(hairpin_bins), max(hairpin_bins) + 1, 5)

sns.set_theme(style="white")
plt.figure(figsize=(6, 6))
# One unit-wide bin centred on every integer length; the pre-computed counts
# are supplied as weights. rwidth (not the Rectangle property "width") is used
# to narrow the bars: "width" would shrink each bar from its left edge and
# leave it off-centre relative to its tick.
plt.hist(
    hairpin_bins,
    bins=np.arange(min(hairpin_bins) - 0.5, max(hairpin_bins) + 1.5, 1),
    weights=hairpin_file_counts_sorted,
    color=sns.color_palette()[0],
    edgecolor="black",
    align="mid",
    rwidth=0.8,
)
plt.xlabel("Hairpin Length")
plt.ylabel("Count")
plt.title("")  # Hairpins with Given Length
plt.xticks(tick_positions)
plt.tight_layout()  # Adjust layout to prevent clipping of labels
# Save the figure as a PNG file and close it without displaying
plt.savefig("figure_2_hairpin_counts_bar_graph.png", dpi=600)
plt.close()

In [5]:
# Plot number and length of helices found

# Each sub-folder of the helix directory is named after a helix length;
# count the .cif files stored beneath each one.
helix_directory = os.path.join(motif_directory, "helices")

helix_counts = {}
for item_name in os.listdir(helix_directory):
    item_path = os.path.join(helix_directory, item_name)
    if not os.path.isdir(item_path):
        continue
    try:
        int(item_name)
    except ValueError:
        # Not a length folder (e.g. ".ipynb_checkpoints"); skipping it keeps
        # the numeric sort below from raising.
        continue
    file_count = count_files_with_extension(item_path, ".cif")
    helix_counts[item_name] = file_count
if not helix_counts:
    raise ValueError(f"no helix length folders found in {helix_directory!r}")

# Order the folders numerically (a plain string sort would put "10" before "2").
sorted_helix_counts = dict(
    sorted(helix_counts.items(), key=lambda item: int(item[0]))
)
helix_folder_names_sorted = list(sorted_helix_counts.keys())
helix_file_counts_sorted = list(sorted_helix_counts.values())
# Lengths as integers, already in ascending order.
helix_bins = [int(name) for name in helix_folder_names_sorted]
# Tick every fifth length, starting at the shortest one.
tick_positions = np.arange(min(helix_bins), max(helix_bins) + 1, 5)

plt.figure(figsize=(6, 6))
# One unit-wide bin centred on every integer length; the pre-computed counts
# are supplied as weights. rwidth (not the Rectangle property "width") is used
# to narrow the bars: "width" would shrink each bar from its left edge and
# leave it off-centre relative to its tick.
plt.hist(
    helix_bins,
    bins=np.arange(min(helix_bins) - 0.5, max(helix_bins) + 1.5, 1),
    weights=helix_file_counts_sorted,
    color=sns.color_palette()[0],
    edgecolor="black",
    align="mid",
    rwidth=0.8,
)
plt.xlabel("Helix Length")
plt.ylabel("Count")
plt.title("")  # Helices with Given Length
plt.xticks(tick_positions)
plt.tight_layout()  # Adjust layout to prevent clipping of labels
# Save the figure as a PNG file and close it without displaying
plt.savefig("figure_2_helix_counts_bar_graph.png", dpi=600)
plt.close()


In [6]:
# Plot number and length of single strands found

# Each sub-folder of the single-strand directory is named after a strand
# length; count the .cif files stored beneath each one.
sstrand_directory = os.path.join(motif_directory, "sstrand")

sstrand_counts = {}
for item_name in os.listdir(sstrand_directory):
    item_path = os.path.join(sstrand_directory, item_name)
    if not os.path.isdir(item_path):
        continue
    try:
        int(item_name)
    except ValueError:
        # Not a length folder (e.g. ".ipynb_checkpoints"); skipping it keeps
        # the numeric sort below from raising.
        continue
    file_count = count_files_with_extension(item_path, ".cif")
    sstrand_counts[item_name] = file_count
if not sstrand_counts:
    raise ValueError(f"no single-strand length folders found in {sstrand_directory!r}")

# Order the folders numerically (a plain string sort would put "10" before "2").
sorted_sstrand_counts = dict(
    sorted(sstrand_counts.items(), key=lambda item: int(item[0]))
)
sstrand_folder_names_sorted = list(sorted_sstrand_counts.keys())
sstrand_file_counts_sorted = list(sorted_sstrand_counts.values())
# Lengths as integers, already in ascending order.
sstrand_bins = [int(name) for name in sstrand_folder_names_sorted]
# Tick every fifth length, starting at the shortest one.
tick_positions = np.arange(min(sstrand_bins), max(sstrand_bins) + 1, 5)

plt.figure(figsize=(6, 6))
# One unit-wide bin centred on every integer length; the pre-computed counts
# are supplied as weights. rwidth (not the Rectangle property "width") is used
# to narrow the bars: "width" would shrink each bar from its left edge and
# leave it off-centre relative to its tick.
plt.hist(
    sstrand_bins,
    bins=np.arange(min(sstrand_bins) - 0.5, max(sstrand_bins) + 1.5, 1),
    weights=sstrand_file_counts_sorted,
    color=sns.color_palette()[0],
    rwidth=0.8,
    edgecolor="black",
    align="mid",
)
plt.xlabel("Single Strand Length")
plt.ylabel("Count")
plt.title("")  # Single Strands with Given Length
plt.xticks(tick_positions)
plt.tight_layout()  # Adjust layout to prevent clipping of labels
# Save the figure as a PNG file and close it without displaying
plt.savefig("figure_2_sstrand_counts_bar_graph.png", dpi=600)
plt.close()


In [7]:
# Plot heatmap of composition of twoway junctions

df = pd.read_csv(twoway_csv_path)
# df["bridging_nts_0"] = df["bridging_nts_0"] - 2
# df["bridging_nts_1"] = df["bridging_nts_1"] - 2
# Keep only junctions with at least 2 bridging nucleotides on both strands.
df = df[(df["bridging_nts_0"] >= 2) & (df["bridging_nts_1"] >= 2)]
# Count of junctions for every (bridging_nts_0, bridging_nts_1) combination.
twoway_heatmap_df = df.pivot_table(
    index="bridging_nts_0", columns="bridging_nts_1", aggfunc="size", fill_value=0
)

# Flatten the count table into (x, y, weight) triples for hist2d.
x = twoway_heatmap_df.columns.astype(float)
y = twoway_heatmap_df.index.astype(float)
z = twoway_heatmap_df.values
x_mesh, y_mesh = np.meshgrid(x, y)

# Integer values shown on each axis: from the smallest observed value up to
# the largest observed value, capped at 11.
# NOTE(review): the cap (12, exclusive) decides the largest value displayed;
# confirm 11 is the intended limit.
x_range = np.arange(int(x.min()), min(int(x.max()) + 1, 12))
y_range = np.arange(int(y.min()), min(int(y.max()) + 1, 12))

# hist2d interprets a sequence as bin EDGES and its last bin is closed on the
# right. Passing the values themselves would give one cell too few and merge
# the two largest values into a single cell. Use half-integer edges so every
# displayed value v gets its own cell [v - 0.5, v + 0.5) and larger values
# fall outside the histogram instead of on its closing edge.
x_edges = np.arange(x_range.min() - 0.5, x_range.max() + 1.5, 1)
y_edges = np.arange(y_range.min() - 0.5, y_range.max() + 1.5, 1)

sns.set_theme(style="white")
plt.figure(figsize=(7, 6))
plt.rcParams.update({"font.size": 20})
heatmap = plt.hist2d(
    x_mesh.ravel(),
    y_mesh.ravel(),
    weights=z.ravel(),
    bins=[x_edges, y_edges],
    cmap="gray_r",
)

# NOTE(review): x carries bridging_nts_1 and y carries bridging_nts_0 —
# confirm the "Strand 1" / "Strand 2" labels are attached to the intended axes.
plt.xlabel("Strand 1 Nucleotides")
plt.ylabel("Strand 2 Nucleotides")

# Cells are centred on the integers, so ticks sit exactly on the values.
plt.xticks(x_range, [str(value) for value in x_range])
plt.yticks(y_range, [str(value) for value in y_range])

plt.gca().set_aspect("equal", adjustable="box")
# Adjust margins
plt.subplots_adjust(left=0.1, right=0.88, top=0.95, bottom=0.06)

# Attach a colorbar of matching height to the right of the heatmap.
divider = make_axes_locatable(plt.gca())
cax = divider.append_axes("right", size="5%", pad=0.1)
cbar = plt.colorbar(heatmap[3], cax=cax)
cbar.set_label("Count")

plt.savefig("figure_2_twoway_motif_heatmap.png", dpi=600)
plt.close()


In [12]:
# Counting number of x type motifs present
# Edit variables accordingly to count what you need

directory_to_count = os.path.join(motif_directory, '2ways')
# A folder is counted when either of its two sizes exceeds this value.
max_strand_length = 10
file_count = 0

for sub_dir in os.listdir(directory_to_count):
    sub_dir_path = os.path.join(directory_to_count, sub_dir)
    if not os.path.isdir(sub_dir_path):
        # Stray files (e.g. ".DS_Store") are not motif folders.
        continue
    # Folder names are expected to start with two dash-separated integers,
    # e.g. "14-11"; anything else is skipped rather than crashing the cell.
    spl = sub_dir.split("-")
    if len(spl) < 2:
        continue
    try:
        first_size, second_size = int(spl[0]), int(spl[1])
    except ValueError:
        continue
    if first_size > max_strand_length or second_size > max_strand_length:
        print(sub_dir)
        file_count += count_files_with_extension(sub_dir_path, ".cif")

print(file_count)


14-11
21-6
9-11
5-15
10-11
1-13
14-10
8-18
8-20
12-3
8-16
8-11
12-4
14-6
14-1
11-11
11-18
21-10
12-5
12-2
8-21
15-11
14-9
0-23
11-10
4-14
25-10
12-12
16-14
7-18
3-11
12-13
20-5
17-12
13-14
17-8
15-5
13-13
13-7
11-3
13-9
13-12
6-11
6-16
17-9
13-15
15-4
11-5
13-8
11-2
13-6
10-25
14-15
14-12
5-17
25-2
10-13
5-11
10-12
10-15
10-23
5-20
14-13
14-14
10-24
9-24
9-12
9-15
14-2
4-11
11-12
14-5
16-8
4-20
15-13
15-14
21-13
0-17
12-9
8-23
19-10
8-12
12-7
15-15
4-17
11-14
16-9
11-13
8-13
12-6
12-8
8-22
12-16
12-11
7-12
3-14
3-13
12-21
7-22
12-19
12-10
7-14
13-4
11-9
11-7
19-3
2-12
6-22
17-11
6-13
13-10
15-6
11-6
13-5
11-8
15-7
13-11
6-12
15-9
2-13
550
