In [1]:
# Setup
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Input/output locations, all expressed relative to the current working directory.
# The first three are directories; `tert_contact_csv_directory` is, despite its
# name, the path of a single CSV file inside `csv_dir`.
_two_levels_up = os.path.join("..", "..")
motif_directory = os.path.join(_two_levels_up, "data/motifs")
tert_motif_directory = os.path.join(_two_levels_up, "data/tertiary_contacts")
csv_dir = os.path.join(_two_levels_up, "data/out_csvs")
tert_contact_csv_directory = os.path.join(csv_dir, "unique_tert_contacts.csv")


In [2]:
# Plot number of each type of tertiary contacts

# Define the count_files_with_extension function
def count_files_with_extension(directory, extension):
    """Count files below ``directory`` whose names end with ``extension``.

    The directory tree is walked recursively with ``os.walk``, so files in
    nested sub-directories are included.

    Args:
        directory: Root of the tree to search.
        extension: Suffix passed to ``str.endswith`` for every file name
            (e.g. ``".cif"``).

    Returns:
        The number of matching files; 0 when nothing matches.
    """
    matches = 0
    for _root, _dirs, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(extension):
                matches += 1
    return matches


# Count the .cif files (recursively) under every immediate sub-folder of
# `tert_motif_directory`; plain files at the top level are ignored.
tert_folder_counts = {}
for item_name in os.listdir(tert_motif_directory):
    item_path = os.path.join(tert_motif_directory, item_name)
    if os.path.isdir(item_path):
        tert_folder_counts[item_name] = count_files_with_extension(item_path, ".cif")

# Order the bars alphabetically by folder name.  Sorting the dict keys directly
# (instead of unpacking `zip(*sorted(zip(...)))`) also works when no sub-folder
# exists: the result is then two empty tuples rather than a ValueError.
tert_folder_names_sorted = tuple(sorted(tert_folder_counts))
tert_file_counts_sorted = tuple(
    tert_folder_counts[name] for name in tert_folder_names_sorted
)

# Horizontal bar chart: one bar per tertiary-contact folder.
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(6, 6))
ax.barh(
    tert_folder_names_sorted,
    tert_file_counts_sorted,
    edgecolor="black",
    color=sns.color_palette()[0],
    height=0.8,
)
ax.set_xlabel("Count")
ax.set_ylabel("Tertiary Contact Type")
ax.set_title("")  # intentionally untitled
fig.tight_layout()

# Write the figure to disk and close it so it is not rendered inline.
fig.savefig("figure_3_tertiary_motif_counts.png", dpi=600)
plt.close(fig)


In [3]:
# Plot tertiary contact types by residue component

# Read only the two component-type columns, 'type_1' and 'type_2', of the
# unique tertiary contact table.
df = pd.read_csv(tert_contact_csv_directory, usecols=["type_1", "type_2"])

# Sort the two entries of every row so that (a, b) and (b, a) become the same key.
tuples_list = list(map(lambda record: tuple(sorted(record)), df.to_records(index=False)))

# Tally how often each unordered pair occurs.
tuple_counts = Counter(tuples_list)

# Tabulate the tallies and order them by ascending count for plotting.
tuple_counts_df = pd.DataFrame(
    tuple_counts.items(), columns=["Contact Type", "Count"]
).sort_values(by="Count")

# Horizontal bar chart, one bar per contact-type pair (the tuple is shown as text).
fig, ax = plt.subplots(figsize=(6, 6))
ax.barh(
    tuple_counts_df["Contact Type"].astype(str),
    tuple_counts_df["Count"],
    edgecolor="black",
    height=0.8,
    color=sns.color_palette()[0],
)
ax.set_xlabel("Count")
ax.set_ylabel("Tertiary Contact Type")
ax.set_title("")  # intentionally untitled
fig.tight_layout()

# Write the figure to disk and close it so it is not rendered inline.
fig.savefig("figure_3_residue_contact_types.png", dpi=600)
plt.close(fig)



In [4]:
# Setup for next few graphs


def _sequence_field(motif_name):
    """Return the 4th dot-separated field of ``motif_name``, or None if it has fewer than 4 fields."""
    fields = motif_name.split(".")
    return fields[3] if len(fields) > 3 else None


# Load the per-hydrogen-bond table for the tertiary contacts.
unique_tert_contact_df_for_hbonds = pd.read_csv(
    os.path.join(csv_dir, "all_tert_contact_hbonds.csv")
)

# Normalize each motif pair by sorting it, so (motif_1, motif_2) and
# (motif_2, motif_1) are treated as the same contact.  Zipping the two columns
# avoids a slow row-wise `apply` and also works when the table is empty.
unique_tert_contact_df_for_hbonds["motif_pair"] = [
    tuple(sorted(pair))
    for pair in zip(
        unique_tert_contact_df_for_hbonds["motif_1"],
        unique_tert_contact_df_for_hbonds["motif_2"],
    )
]

# Within each motif pair, keep a single row per (res_1, res_2, atom_1, atom_2).
unique_tert_contact_df_for_hbonds = unique_tert_contact_df_for_hbonds.drop_duplicates(
    subset=["motif_pair", "res_1", "res_2", "atom_1", "atom_2"]
)

# Number of remaining rows per motif pair.
grouped_df = (
    unique_tert_contact_df_for_hbonds.groupby("motif_pair")
    .size()
    .reset_index(name="sum_hbonds")
)

# `groupby` already yields one row per motif pair, so no further de-duplication
# is needed.  `assign` builds a new frame, which keeps `grouped_df` untouched and
# avoids writing columns onto a slice (SettingWithCopyWarning).
unique_grouped_df = grouped_df.assign(
    seq_1=grouped_df["motif_pair"].apply(lambda pair: _sequence_field(pair[0])),
    seq_2=grouped_df["motif_pair"].apply(lambda pair: _sequence_field(pair[1])),
)

# Full unique tertiary contact table, used by the motif-length cells below.
unique_tert_contact_df = pd.read_csv(tert_contact_csv_directory)


In [5]:
# Plot lengths and counts of single strands (SSTRAND motifs) in tertiary contacts
#
# Motif names are dot-separated: this cell reads field 0 as the motif type,
# field 2 as the length and field 3 as the sequence.
# NOTE(review): this cell is a near copy of the HELIX and HAIRPIN cells of this
# notebook; consider factoring the shared steps into one function.

# Build one (motif, length, sequence) frame per side of the contact.  Each frame
# is constructed in a single step, so no column is ever assigned onto a filtered
# slice (the pattern that triggers pandas' SettingWithCopyWarning).  Comparing
# through the `.str` accessor makes missing motif names evaluate to False
# instead of raising.
sstrand_frames = []
for motif_column in ("motif_1", "motif_2"):
    motif_names = unique_tert_contact_df[motif_column]
    name_fields = motif_names.str.split(".")
    is_sstrand = name_fields.str[0] == "SSTRAND"
    sstrand_frames.append(
        pd.DataFrame(
            {
                "motif": motif_names[is_sstrand],
                "length": name_fields[is_sstrand].str[2].astype(int),
                "sequence": name_fields[is_sstrand].str[3],
            }
        )
    )
sstrand_tert_contact_df_1, sstrand_tert_contact_df_2 = sstrand_frames

# Stack both sides and keep the first row seen for every distinct sequence
# (this also removes fully identical rows).  "length" is already an integer
# column and "sequence" is a single dot-free field, so neither needs any
# further conversion.
new_tert_df = pd.concat(sstrand_frames, ignore_index=True).drop_duplicates(
    subset=["sequence"], keep="first"
)

# Dump the table next to the other CSVs for debugging.
new_tert_df.to_csv(os.path.join(csv_dir, "sstrand_tert.csv"), index=False)

if new_tert_df.empty:
    raise ValueError("No SSTRAND motifs found in the tertiary contact table")

shortest = int(new_tert_df["length"].min())
longest = int(new_tert_df["length"].max())

# One unit-wide bin centred on every integer length from `shortest` to `longest`
# inclusive, so the minimum length is counted too.  `rwidth` narrows the bars
# around the bin centre, leaving each bar centred on its integer value.
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    new_tert_df["length"],
    bins=np.arange(shortest - 0.5, longest + 1.5, 1),
    edgecolor="black",
    rwidth=0.8,
)
ax.set_xlabel("Length of sstrands in tertiary contacts")
ax.set_ylabel("Count")

# Label every fifth length, starting at the shortest one.
tick_positions = np.arange(shortest, longest + 1, 5)
ax.set_xticks(tick_positions)
ax.set_xticklabels([str(tick) for tick in tick_positions])

# Write the figure to disk and close it so it is not rendered inline.
fig.savefig("figure_3_sstrand_in_tert.png", dpi=600)
plt.close(fig)


In [6]:
# Plot lengths and counts of helices (HELIX motifs) in tertiary contacts
#
# Motif names are dot-separated: this cell reads field 0 as the motif type,
# field 2 as the length and field 3 as the sequence.
# NOTE(review): this cell is a near copy of the SSTRAND and HAIRPIN cells of
# this notebook; consider factoring the shared steps into one function.

# Build one (motif, length, sequence) frame per side of the contact.  Each frame
# is constructed in a single step, so no column is ever assigned onto a filtered
# slice (the pattern that triggers pandas' SettingWithCopyWarning).  Comparing
# through the `.str` accessor makes missing motif names evaluate to False
# instead of raising.
helix_frames = []
for motif_column in ("motif_1", "motif_2"):
    motif_names = unique_tert_contact_df[motif_column]
    name_fields = motif_names.str.split(".")
    is_helix = name_fields.str[0] == "HELIX"
    helix_frames.append(
        pd.DataFrame(
            {
                "motif": motif_names[is_helix],
                "length": name_fields[is_helix].str[2].astype(int),
                "sequence": name_fields[is_helix].str[3],
            }
        )
    )
helix_tert_contact_df_1, helix_tert_contact_df_2 = helix_frames

# Stack both sides and keep the first row seen for every distinct sequence
# (this also removes fully identical rows).  "length" is already an integer
# column and "sequence" is a single dot-free field, so neither needs any
# further conversion.
new_tert_df = pd.concat(helix_frames, ignore_index=True).drop_duplicates(
    subset=["sequence"], keep="first"
)

# Dump the table next to the other CSVs for debugging.
new_tert_df.to_csv(os.path.join(csv_dir, "helix_tert.csv"), index=False)

if new_tert_df.empty:
    raise ValueError("No HELIX motifs found in the tertiary contact table")

shortest = int(new_tert_df["length"].min())
longest = int(new_tert_df["length"].max())

# One unit-wide bin centred on every integer length from `shortest` to `longest`
# inclusive, so the minimum length is counted too.  `rwidth` narrows the bars
# around the bin centre, leaving each bar centred on its integer value.
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    new_tert_df["length"],
    bins=np.arange(shortest - 0.5, longest + 1.5, 1),
    edgecolor="black",
    rwidth=0.8,
)
ax.set_xlabel("Length of helices in tertiary contacts")
ax.set_ylabel("Count")

# Label every fifth length, starting at the shortest one.
tick_positions = np.arange(shortest, longest + 1, 5)
ax.set_xticks(tick_positions)
ax.set_xticklabels([str(tick) for tick in tick_positions])

# Write the figure to disk and close it so it is not rendered inline.
fig.savefig("figure_3_helix_in_tert.png", dpi=600)
plt.close(fig)

In [7]:
# Plot lengths and counts of hairpins (HAIRPIN motifs) in tertiary contacts
#
# Motif names are dot-separated: this cell reads field 0 as the motif type,
# field 2 as the length and field 3 as the sequence.
# NOTE(review): this cell is a near copy of the SSTRAND and HELIX cells of this
# notebook; consider factoring the shared steps into one function.

# Build one (motif, length, sequence) frame per side of the contact.  Each frame
# is constructed in a single step, so no column is ever assigned onto a filtered
# slice (the pattern that triggers pandas' SettingWithCopyWarning).  Comparing
# through the `.str` accessor makes missing motif names evaluate to False
# instead of raising.
hairpin_frames = []
for motif_column in ("motif_1", "motif_2"):
    motif_names = unique_tert_contact_df[motif_column]
    name_fields = motif_names.str.split(".")
    is_hairpin = name_fields.str[0] == "HAIRPIN"
    hairpin_frames.append(
        pd.DataFrame(
            {
                "motif": motif_names[is_hairpin],
                "length": name_fields[is_hairpin].str[2].astype(int),
                "sequence": name_fields[is_hairpin].str[3],
            }
        )
    )
hairpin_tert_contact_df_1, hairpin_tert_contact_df_2 = hairpin_frames

# Stack both sides and keep the first row seen for every distinct sequence
# (this also removes fully identical rows).  "length" is already an integer
# column and "sequence" is a single dot-free field, so neither needs any
# further conversion.
new_tert_df = pd.concat(hairpin_frames, ignore_index=True).drop_duplicates(
    subset=["sequence"], keep="first"
)

# Dump the table next to the other CSVs for debugging.
new_tert_df.to_csv(os.path.join(csv_dir, "hairpin_tert.csv"), index=False)

if new_tert_df.empty:
    raise ValueError("No HAIRPIN motifs found in the tertiary contact table")

shortest = int(new_tert_df["length"].min())
longest = int(new_tert_df["length"].max())

# One unit-wide bin centred on every integer length from `shortest` to `longest`
# inclusive, so the minimum length is counted too.  `rwidth` narrows the bars
# around the bin centre, leaving each bar centred on its integer value.
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    new_tert_df["length"],
    bins=np.arange(shortest - 0.5, longest + 1.5, 1),
    edgecolor="black",
    rwidth=0.8,
)
ax.set_xlabel("Length of hairpins in tertiary contacts")
ax.set_ylabel("Count")

# Label every fifth length, starting at the shortest one.
tick_positions = np.arange(shortest, longest + 1, 5)
ax.set_xticks(tick_positions)
ax.set_xticklabels([str(tick) for tick in tick_positions])

# Write the figure to disk and close it so it is not rendered inline.
fig.savefig("figure_3_hairpin_in_tert.png", dpi=600)
plt.close(fig)

In [8]:
# Plot histogram of H-bonds per tertiary contact
# (`sum_hbonds` holds one count per normalized motif pair).

# Contacts with fewer H-bonds than this are discarded; the same value is the
# first x-axis tick.
MIN_HBONDS = 2
unique_grouped_df = unique_grouped_df[unique_grouped_df["sum_hbonds"] >= MIN_HBONDS]

# Save the filtered table alongside the figure.
unique_grouped_df.to_csv("unique_grouped_df.csv", index=False)

if unique_grouped_df.empty:
    raise ValueError(f"No tertiary contact has at least {MIN_HBONDS} H-bonds")

fewest_hbonds = int(unique_grouped_df["sum_hbonds"].min())
most_hbonds = int(unique_grouped_df["sum_hbonds"].max())

# One unit-wide bin centred on every integer count from the minimum to the
# maximum inclusive, so contacts with exactly the minimum count are plotted too.
# `rwidth` narrows the bars around the bin centre, leaving each bar centred on
# its integer value.
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    unique_grouped_df["sum_hbonds"],
    bins=np.arange(fewest_hbonds - 0.5, most_hbonds + 1.5, 1),
    edgecolor="black",
    rwidth=0.8,
)
ax.set_xlabel("H-bonds per tertiary contact")
ax.set_ylabel("Count")

# Ticks start at the threshold and step every 5 values, up to and including
# the maximum.
adjusted_tick_positions = np.arange(MIN_HBONDS, most_hbonds + 1, 5)
ax.set_xticks(adjusted_tick_positions)
ax.set_xticklabels([str(tick) for tick in adjusted_tick_positions])

# Write the figure to disk and close it so it is not rendered inline.
fig.savefig("figure_3_hbonds_per_tert.png", dpi=600)
plt.close(fig)
