# Dataset Statistics

This notebook plots statistics for our RNA inverse design dataset created using [RNASolo](https://rnasolo.cs.put.poznan.pl).
We visualise the diversity of our dataset in terms of sequence length, the number of structures per sequence, and the structural variation (average pairwise RMSD) among the conformations of each sequence.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported automatically before each cell execution.
%load_ext autoreload
%autoreload 2

# Make the parent directory importable (presumably the repository root — confirm).
import sys
sys.path.append('../')

# Load variables from ../.env into the process environment.
# NOTE(review): DATA_PATH, which is read further below, is presumably defined there — confirm.
import dotenv
dotenv.load_dotenv("../.env")

In [None]:
# Sanity check: print DATA_PATH to confirm it is set before the dataset is loaded.
!echo $DATA_PATH

In [None]:
import os
import ast
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import InsetPosition, mark_inset
import seaborn as sns

In [None]:
# Locate the processed dataset through the DATA_PATH environment variable.
DATA_PATH = os.environ.get("DATA_PATH")
if DATA_PATH is None:
    # Fail early with an actionable message rather than an opaque TypeError
    # raised later by os.path.join(None, ...).
    raise EnvironmentError(
        "DATA_PATH is not set; define it in ../.env or export it before running this notebook."
    )

# Mapping whose values are per-sequence records; only the values are needed here.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
seq_to_data = torch.load(os.path.join(DATA_PATH, "processed.pt"))
data_list = list(seq_to_data.values())

df = pd.read_csv(os.path.join(DATA_PATH, "processed_df.csv"))

# The CSV stores list-valued columns as strings; parse the ones used below back into lists.
for list_col in ("rfam_list", "type_list"):
    df[list_col] = df[list_col].apply(ast.literal_eval)


def _most_common(values):
    """Return the most frequent element of ``values``; ties go to the element that occurs first."""
    return max(values, key=values.count)


# Summarise each list by its most common value.
df["type"] = df["type_list"].apply(_most_common)
df["rfam"] = df["rfam_list"].apply(_most_common)
df

In [None]:
# Each row of df is one unique sequence, so the row count is the number of unique sequences.
num_unique_sequences = len(df)
num_unique_sequences

In [None]:
# Total number of structures, summed over all unique sequences.
total_structures = df["num_structures"].sum()
total_structures

In [None]:
# How many sequences fall under each (most common) structure type.
type_counts = df["type"].value_counts()
type_counts

In [None]:
# Number of sequences per rfam family, most frequent family first.
# value_counts() keeps every family paired with its own count. Zipping unique()
# (first-appearance order) with value_counts() (descending-count order) would
# attach counts to the wrong families.
for family, count in df["rfam"].value_counts().items():
    print(f"{family}: {count}")

In [None]:
# Summary statistics of the sequence lengths (one entry per unique sequence).

seq_lens = df["length"].values

len_mean, len_std = np.mean(seq_lens), np.std(seq_lens)
len_max, len_min = np.max(seq_lens), np.min(seq_lens)

print(f"Distribution: {len_mean:.2f} +- {len_std:.2f}")
print(f"Max: {len_max}, Min: {len_min}")

In [None]:
# Histogram of sequence lengths with an inset that zooms in on lengths 0-200.
len_mean, len_std = np.mean(seq_lens), np.std(seq_lens)
len_max, len_min = np.max(seq_lens), np.min(seq_lens)

fig, ax1 = plt.subplots()

# Main panel: full range of sequence lengths.
ax1.hist(seq_lens, bins=100)
ax1.set_xlabel('Sequence length')
ax1.set_ylabel('Frequency')
# The mathtext part is a raw string: "\ " is not a valid Python escape sequence and
# triggers a SyntaxWarning on recent Python versions when written in a normal literal.
ax1.set_title(
    r"$\bf{Histogram \ of \ sequence \ lengths}$"
    + f"\nDistribution: {len_mean:.1f} ± {len_std:.1f}, Max: {len_max}, Min: {len_min}"
)

# Inset axes placed in the upper-right region of ax1; the bounds are
# [x0, y0, width, height] in ax1's axes-fraction coordinates.
# Axes.inset_axes replaces the deprecated InsetPosition locator.
ax2 = ax1.inset_axes([0.4, 0.4, 0.5, 0.5])
# Outline the zoomed region on ax1 and link it to the inset with grey connector lines.
mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.6')

# Inset panel: finer bins, restricted to lengths 0-200.
ax2.hist(seq_lens, bins=1000)
ax2.set_xlim([0, 200])

# Display the plot
# plt.savefig('hist_seq_len.pdf', dpi=300)
plt.show()

In [None]:
# Summary statistics of the number of structures available for each unique sequence.

num_struct_per_seq = df["num_structures"].values

struct_mean, struct_std = np.mean(num_struct_per_seq), np.std(num_struct_per_seq)
struct_max, struct_min = np.max(num_struct_per_seq), np.min(num_struct_per_seq)

print(f"Distribution: {struct_mean} +- {struct_std}")
print(f"Max: {struct_max}, Min: {struct_min}")

In [None]:
# Histogram of the number of structures per unique sequence, with an inset that
# zooms in on sequences having between 2 and 20 structures.
struct_mean, struct_std = np.mean(num_struct_per_seq), np.std(num_struct_per_seq)
struct_max, struct_min = np.max(num_struct_per_seq), np.min(num_struct_per_seq)

fig, ax1 = plt.subplots()

# Main panel, cropped to at most 50 structures per sequence.
# NOTE(review): bins=267 presumably matches the maximum number of structures per sequence — confirm.
ax1.hist(num_struct_per_seq, bins=267)
ax1.set_xlabel('Number of structures per sequence')
ax1.set_ylabel('Frequency')
# The mathtext part is a raw string: "\ " is not a valid Python escape sequence and
# triggers a SyntaxWarning on recent Python versions when written in a normal literal.
ax1.set_title(
    r"$\bf{Histogram \ of \ no. \ of \ structures \ per \ unique \ sequence}$"
    + f"\nDistribution: {struct_mean:.2f} ± {struct_std:.2f}, Max: {struct_max}, Min: {struct_min}"
)
ax1.set_xlim([0, 50])

# Inset axes placed in the upper-right region of ax1; the bounds are
# [x0, y0, width, height] in ax1's axes-fraction coordinates.
# Axes.inset_axes replaces the deprecated InsetPosition locator.
ax2 = ax1.inset_axes([0.4, 0.4, 0.5, 0.5])
# Outline the zoomed region on ax1 and link it to the inset with grey connector lines.
mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.6')

# Inset panel: same binning, restricted to sequences with more than one structure.
ax2.hist(num_struct_per_seq, bins=267)
ax2.set_xlim([2, 20])
ax2.set_ylim([0, 800])
ax2.set_title("Sequences with >1 structure")

# Display the plot
# plt.savefig('hist_num_struct_per_seq.pdf', dpi=300)
plt.show()

In [None]:
# Raw values behind the structures-per-sequence distribution: for each distinct
# number of structures, how many unique sequences have exactly that many.
# Only the first 10 and the last 3 distinct values are printed; a single "..." is
# printed in place of any omitted rows in between.
# (The names avoid shadowing the builtin `bin`.)
struct_values, seq_counts = np.unique(num_struct_per_seq, return_counts=True)

num_head, num_tail = 10, 3
first_tail = len(struct_values) - num_tail

for i, (n_struct, n_seq) in enumerate(zip(struct_values, seq_counts)):
    if i < num_head or i >= first_tail:
        print(f"{n_struct} structures for that sequence -> {n_seq} samples")
    elif i == num_head:
        # First omitted row: emit the ellipsis exactly once.
        print("...")

In [None]:
# Count how often each base occurs across all sequences.
# Pyrimidines: C, U. Purines: A, G.
# Any symbol other than A, G, C or U is tallied under 'other'.

base_counts = dict.fromkeys(['A', 'G', 'C', 'U', 'other'], 0)
for data in tqdm(data_list):
    for base in data['sequence']:
        bucket = base if base in base_counts else 'other'
        base_counts[bucket] += 1

for base, total in base_counts.items():
    print(f"{base}: {total}")

In [None]:
# Distribution of the average pairwise RMSD per sequence.
# Note: avg. RMSD = 0 is omitted; those are sequences with a single structure.

rmsd_per_seq = [x for x in df["mean_rmsd"].values if x > 0.0]
rmsd_mean, rmsd_std = np.mean(rmsd_per_seq), np.std(rmsd_per_seq)
rmsd_max, rmsd_min = np.max(rmsd_per_seq), np.min(rmsd_per_seq)
print(f"Distribution: {rmsd_mean} +- {rmsd_std}")
print(f"Max: {rmsd_max}, Min: {rmsd_min}")

fig, ax1 = plt.subplots()

# Main panel: full RMSD range.
ax1.hist(rmsd_per_seq, bins=50)
ax1.set_xlabel('Avg. pairwise RMSD among structures per sequence (Å)')
ax1.set_ylabel('Frequency')
# The mathtext part is a raw string: "\ " is not a valid Python escape sequence and
# triggers a SyntaxWarning on recent Python versions when written in a normal literal.
ax1.set_title(
    r"$\bf{Histogram \ of \ avg. \ pairwise \ RMSD \ per \ sequence}$"
    + f"\nDistribution: {rmsd_mean:.2f}Å ± {rmsd_std:.2f}, Max: {rmsd_max:.2f}Å, Min: {rmsd_min:.2f}Å"
)

# Inset axes placed in the upper-right region of ax1; the bounds are
# [x0, y0, width, height] in ax1's axes-fraction coordinates.
# Axes.inset_axes replaces the deprecated InsetPosition locator.
ax2 = ax1.inset_axes([0.4, 0.4, 0.5, 0.5])
# Outline the zoomed region on ax1 and link it to the inset with grey connector lines.
mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.6')

# Inset panel: finer bins, restricted to RMSD values between 0 and 5 Å.
ax2.hist(rmsd_per_seq, bins=400)
ax2.set_xlim([0, 5])

# Display the plot
# plt.savefig('hist_rmsd_per_sequence.pdf', dpi=300)
plt.show()

In [None]:
# Bivariate histogram of sequence length (log-scaled x axis) against average pairwise RMSD,
# with step-style marginal histograms.
# Only rows with 0 < mean_rmsd < 7.5 are plotted.
above_zero = df["mean_rmsd"] > 0.0
below_cutoff = df["mean_rmsd"] < 7.5
rmsd_subset = df.loc[above_zero & below_cutoff]

marginal_style = dict(element='step', fill=True)
ax = sns.jointplot(
    data=rmsd_subset,
    x='length',
    y='mean_rmsd',
    kind='hist',
    log_scale=(True, False),
    marginal_kws=marginal_style,
)

# Label the axes through pyplot's current axes.
plt.xlabel('Sequence length (log scale)')
plt.ylabel('Avg. pairwise RMSD among structures (Å)')

# Display the plot
# plt.savefig('bivariate_seq_vs_rmsd.pdf', dpi=300)
plt.show()
