In [1]:
# Setup
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Input locations. Everything lives two directory levels above the working
# directory, so build that prefix once and hang the individual paths off it.
_two_levels_up = os.path.join("..", "..")

# Folders
motif_directory = os.path.join(_two_levels_up, "data/motifs")
tert_motif_directory = os.path.join(_two_levels_up, "data/tertiary_contacts")
csv_dir = os.path.join(_two_levels_up, "data/out_csvs")

# CSV files inside csv_dir (note: tert_contact_csv_directory is a file path,
# despite its name)
tert_contact_csv_directory = os.path.join(csv_dir, "unique_tert_contacts.csv")
twoway_csv_path = os.path.join(csv_dir, "twoway_motif_list.csv")


In [2]:
# Plot number of each type of tertiary contacts

# Helper: recursively count files whose name ends with a given suffix
def count_files_with_extension(directory, extension):
    """Return the number of files under ``directory`` ending in ``extension``.

    The directory tree is traversed with ``os.walk``, so files in nested
    sub-directories are counted as well. Matching is a plain
    ``str.endswith`` test on the file name.
    """
    matches = 0
    for _current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(extension):
                matches += 1
    return matches


# Map every sub-folder of the tertiary contact directory to the number of
# .cif files found anywhere beneath it (plain files at the top level are skipped)
tert_folder_counts = {
    entry: count_files_with_extension(
        os.path.join(tert_motif_directory, entry), ".cif"
    )
    for entry in os.listdir(tert_motif_directory)
    if os.path.isdir(os.path.join(tert_motif_directory, entry))
}

# Order the bars alphabetically by folder name; folder names are unique, so
# sorting the (name, count) pairs sorts by name
tert_folder_names_sorted, tert_file_counts_sorted = zip(
    *sorted(tert_folder_counts.items())
)

# Bar graph of all tertiary contact types
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(6, 6))
ax.barh(
    tert_folder_names_sorted,
    tert_file_counts_sorted,
    height=0.8,
    color=sns.color_palette()[0],
    edgecolor="black",
)
ax.set_xlabel("Count")
ax.set_ylabel("Tertiary Contact Type")
ax.set_title("")  # intentionally left without a title
fig.tight_layout()

# Write the figure to disk and close it without displaying it
fig.savefig("figure_3_tertiary_motif_counts.png", dpi=600)
plt.close(fig)


In [3]:
# Plot tertiary contact types by residue component

# Only the two residue-type columns are needed from the contact table
df = pd.read_csv(tert_contact_csv_directory, usecols=["res_type_1", "res_type_2"])
# Relabel any leftover "aa" entries in either column as "base"; such
# misclassifications are very rare, but this covers our bases
df = df.replace(to_replace={"res_type_1": "aa", "res_type_2": "aa"}, value="base")

# One tuple per contact, sorted so that (a, b) and (b, a) are the same key
tuples_list = [
    tuple(sorted(pair)) for pair in zip(df["res_type_1"], df["res_type_2"])
]

# Tally how often each residue-type pair occurs
tuple_counts = Counter(tuples_list)

# Tabulate the tallies and order them by ascending count for plotting
tuple_counts_df = pd.DataFrame(
    list(tuple_counts.items()), columns=["Contact Type", "Count"]
).sort_values(by="Count")

# Horizontal bar chart, one bar per residue-type pair
fig, ax = plt.subplots(figsize=(6, 6))
ax.barh(
    tuple_counts_df["Contact Type"].astype(str),
    tuple_counts_df["Count"],
    height=0.8,
    color=sns.color_palette()[0],
    edgecolor="black",
)
ax.set_xlabel("Count")
ax.set_ylabel("Tertiary Contact Type")
ax.set_title("")  # intentionally left without a title
fig.tight_layout()

# Write the figure to disk and close it without displaying it
fig.savefig("figure_3_residue_contact_types.png", dpi=600)
plt.close(fig)



In [4]:
# Setup for next few graphs

# Hydrogen-bond table for the unique tertiary contacts
unique_tert_contact_df_for_hbonds = pd.read_csv(
    os.path.join(csv_dir, "unique_tert_contacts_for_hbonds.csv")
)

# Total the 'count' column for every (motif_1, motif_2) pair, i.e. how many
# h-bonds there are per tertiary contact, and store the total as 'sum_hbonds'.
# Grouping already yields exactly one row per pair, so no separate
# de-duplication step is required afterwards.
hbond_counts_in_terts = (
    unique_tert_contact_df_for_hbonds
    .groupby(["motif_1", "motif_2"])["count"]
    .sum()
    .reset_index(name="sum_hbonds")
)

# Plot settings: one tick position per value from the smallest to the largest total
hbond_min = hbond_counts_in_terts["sum_hbonds"].min()
hbond_max = hbond_counts_in_terts["sum_hbonds"].max()
tick_positions = np.arange(hbond_min, hbond_max + 1)

# Full table of unique tertiary contacts, used by the length histograms below
unique_tert_contact_df = pd.read_csv(tert_contact_csv_directory)


In [5]:
# Plot lengths and counts of single strands in tertiary contacts

# Every contact row describes two motifs (the *_1 and *_2 columns). Both halves
# are renamed to the same column names BEFORE being stacked: concatenating them
# under their original names puts the second motif's data into separate
# columns, and dropping those columns afterwards silently discards every
# single strand that sits on the second side of a contact.
motif_cols = ["motif", "type", "res", "seq"]
sstrand_cols_1 = ["motif_1", "type_1", "res_1", "seq_1"]
sstrand_cols_2 = ["motif_2", "type_2", "res_2", "seq_2"]
sstrand_tert_contact_df_1 = unique_tert_contact_df[sstrand_cols_1].rename(
    columns=dict(zip(sstrand_cols_1, motif_cols))
)
sstrand_tert_contact_df_2 = unique_tert_contact_df[sstrand_cols_2].rename(
    columns=dict(zip(sstrand_cols_2, motif_cols))
)

# Stack both sides, keep only single strands, and keep the first occurrence of
# each distinct sequence (side-1 rows come first, as before)
new_tert_df = (
    pd.concat(
        [sstrand_tert_contact_df_1, sstrand_tert_contact_df_2],
        ignore_index=True,
        axis=0,
    )
    .loc[lambda frame: frame["type"] == "SSTRAND"]
    .drop_duplicates(subset=["seq"], keep="first")
)

# The length is the third dot-separated field of the motif name. If the seq
# value itself has more than two dot-separated fields, its third field takes
# precedence. Anything that is not a number becomes NaN.
length_from_motif = new_tert_df["motif"].str.split(".").str[2]
seq_fields = new_tert_df["seq"].astype(str).str.split(".")
new_tert_df = new_tert_df.assign(
    sstrand_length=pd.to_numeric(
        seq_fields.str[2].where(seq_fields.str.len() > 2, length_from_motif),
        errors="coerce",
    )
)

# Print for debug
# new_tert_df.to_csv(os.path.join(csv_dir, "sstrand_tert.csv"), index=False)

# Rows without a usable length cannot be binned
sstrand_lengths = new_tert_df["sstrand_length"].dropna()
if sstrand_lengths.empty:
    raise ValueError(
        "No SSTRAND motifs with a numeric length were found in the tertiary contacts"
    )
shortest, longest = sstrand_lengths.min(), sstrand_lengths.max()
tick_positions = np.arange(shortest, longest + 1)

# Histogram with one unit-wide bin centred on every integer length
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    sstrand_lengths,
    bins=np.arange(shortest - 0.5, longest + 1.5, 1),
    edgecolor="black",
    width=0.8,
)
ax.set_xlabel("Length of sstrands in tertiary contacts")
ax.set_ylabel("Count")
# Label every fifth length on the x-axis
ax.set_xticks(tick_positions[::5])
ax.set_xticklabels([int(tick) for tick in tick_positions[::5]])

# Save the plot as PNG file and close it without displaying it
fig.savefig("figure_3_sstrand_in_tert.png", dpi=600)
plt.close(fig)


In [6]:
# Plot lengths and counts of helices in tertiary contacts

# Every contact row describes two motifs (the *_1 and *_2 columns). Both halves
# are renamed to the same column names BEFORE being stacked: concatenating them
# under their original names puts the second motif's data into separate
# columns, and dropping those columns afterwards silently discards every helix
# that sits on the second side of a contact.
motif_cols = ["motif", "type", "res", "seq"]
helix_cols_1 = ["motif_1", "type_1", "res_1", "seq_1"]
helix_cols_2 = ["motif_2", "type_2", "res_2", "seq_2"]
helix_tert_contact_df_1 = unique_tert_contact_df[helix_cols_1].rename(
    columns=dict(zip(helix_cols_1, motif_cols))
)
helix_tert_contact_df_2 = unique_tert_contact_df[helix_cols_2].rename(
    columns=dict(zip(helix_cols_2, motif_cols))
)

# Stack both sides, keep only helices, and keep the first occurrence of each
# distinct sequence (side-1 rows come first, as before)
new_tert_df = (
    pd.concat(
        [helix_tert_contact_df_1, helix_tert_contact_df_2],
        ignore_index=True,
        axis=0,
    )
    .loc[lambda frame: frame["type"] == "HELIX"]
    .drop_duplicates(subset=["seq"], keep="first")
)

# The length is the third dot-separated field of the motif name. If the seq
# value itself has more than two dot-separated fields, its third field takes
# precedence. Anything that is not a number becomes NaN.
length_from_motif = new_tert_df["motif"].str.split(".").str[2]
seq_fields = new_tert_df["seq"].astype(str).str.split(".")
new_tert_df = new_tert_df.assign(
    helix_length=pd.to_numeric(
        seq_fields.str[2].where(seq_fields.str.len() > 2, length_from_motif),
        errors="coerce",
    )
)

# Print for debug
# new_tert_df.to_csv(os.path.join(csv_dir, "helices_tert.csv"), index=False)

# Rows without a usable length cannot be binned
helix_lengths = new_tert_df["helix_length"].dropna()
if helix_lengths.empty:
    raise ValueError(
        "No HELIX motifs with a numeric length were found in the tertiary contacts"
    )
shortest, longest = helix_lengths.min(), helix_lengths.max()
tick_positions = np.arange(shortest, longest + 1)

# Histogram with one unit-wide bin centred on every integer length
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    helix_lengths,
    bins=np.arange(shortest - 0.5, longest + 1.5, 1),
    edgecolor="black",
    width=0.8,
)
ax.set_xlabel("Length of helices in tertiary contacts")
ax.set_ylabel("Count")
# Label every fifth length on the x-axis
ax.set_xticks(tick_positions[::5])
ax.set_xticklabels([int(tick) for tick in tick_positions[::5]])

# Save the plot as PNG file and close it without displaying it
fig.savefig("figure_3_helices_in_tert.png", dpi=600)
plt.close(fig)


In [7]:
# Plot lengths and counts of hairpins in tertiary contacts

# Every contact row describes two motifs (the *_1 and *_2 columns). Both halves
# are renamed to the same column names and then stacked, so hairpins on either
# side of a contact are included. (Previously the second frame was built from
# the first one by mistake, so side-2 hairpins never reached the plot.)
motif_cols = ["motif", "type", "res", "seq"]
df_cols_1 = ["motif_1", "type_1", "res_1", "seq_1"]
df_cols_2 = ["motif_2", "type_2", "res_2", "seq_2"]
tert_contact_df_1 = unique_tert_contact_df[df_cols_1].rename(
    columns=dict(zip(df_cols_1, motif_cols))
)
tert_contact_df_2 = unique_tert_contact_df[df_cols_2].rename(
    columns=dict(zip(df_cols_2, motif_cols))
)

# Stack both sides, keep only hairpins, and keep the first occurrence of each
# distinct sequence (side-1 rows come first, as before)
new_tert_df = (
    pd.concat([tert_contact_df_1, tert_contact_df_2], ignore_index=True, axis=0)
    .loc[lambda frame: frame["type"] == "HAIRPIN"]
    .drop_duplicates(subset=["seq"], keep="first")
)

# The length is the third dot-separated field of the motif name. If the seq
# value itself has more than two dot-separated fields, its third field takes
# precedence. Anything that is not a number becomes NaN.
length_from_motif = new_tert_df["motif"].str.split(".").str[2]
seq_fields = new_tert_df["seq"].astype(str).str.split(".")
new_tert_df = new_tert_df.assign(
    hairpin_length=pd.to_numeric(
        seq_fields.str[2].where(seq_fields.str.len() > 2, length_from_motif),
        errors="coerce",
    )
)

# Print for debug reasons
# new_tert_df.to_csv(os.path.join(csv_dir, "hairpins_tert.csv"), index=False)

# Rows without a usable length cannot be binned
hairpin_lengths = new_tert_df["hairpin_length"].dropna()
if hairpin_lengths.empty:
    raise ValueError(
        "No HAIRPIN motifs with a numeric length were found in the tertiary contacts"
    )
shortest, longest = hairpin_lengths.min(), hairpin_lengths.max()
# Set tick positions to fit the range of data
tick_positions = np.arange(shortest, longest + 1)

# Histogram with one unit-wide bin centred on every integer length
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    hairpin_lengths,
    bins=np.arange(shortest - 0.5, longest + 1.5, 1),
    edgecolor="black",
    width=0.8,
)
ax.set_xlabel("Length of hairpins in tertiary contacts")
ax.set_ylabel("Count")
# Label every fifth length on the x-axis
ax.set_xticks(tick_positions[::5])
ax.set_xticklabels([int(tick) for tick in tick_positions[::5]])

# Save the plot as PNG file and close it without displaying it
fig.savefig("figure_3_hairpins_in_tert.png", dpi=600)
plt.close(fig)


In [8]:
# Plot histogram
# H-bonds per tertiary contact, using the per-pair totals in hbond_counts_in_terts
hbond_totals = hbond_counts_in_terts["sum_hbonds"]
fewest, most = hbond_totals.min(), hbond_totals.max()

# NOTE(review): the first bin edge is min + 0.5, so contacts whose total equals
# the minimum fall outside every bin and are not drawn. The length histograms
# in this notebook start at min - 0.5 instead; confirm this offset is intended.
hbond_bin_edges = np.arange(fewest + 0.5, most + 1.5, 1)

fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(
    hbond_totals,
    bins=hbond_bin_edges,
    edgecolor="black",
    width=0.8,
)
ax.set_xlabel("H-bonds per tertiary contact")
ax.set_ylabel("Count")

# Ticks start at 2 and advance in steps of 5
adjusted_tick_positions = np.arange(2, most + 1, 5)
ax.set_xticks(adjusted_tick_positions)
ax.set_xticklabels([str(tick) for tick in adjusted_tick_positions])

# Save the plot as PNG file and close it without displaying it
fig.savefig("figure_3_hbonds_per_tert.png", dpi=600)
plt.close(fig)
