# Stats Circle

This notebook replicates the stats circle plots provided by BGI in their HTML reports using the outputs from STARsolo.

This is not a perfect replication as the analysis pipelines are different, but it is a very close approximation.

Notably, we do not have a count for `non-relevant` HDMIs, so the `relevant` ring simply reuses the count of valid HDMIs.

In [None]:
from math import pi
import numpy as np
import matplotlib as mpl
import pandas as pd
import scipy.io as sio

from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Use an opaque white background for both the figure and the axes area.
plt.rcParams.update({
    "figure.facecolor": "white",
    "axes.facecolor": "white",
})

In [None]:
# Names of the samples to process. The plotting cell reads each sample's
# inputs from output/<sample>/, so fill this in before running it.
samples = []

In [None]:
# Colour palette as 0-255 RGB triples: one inner list per ring of the plot,
# holding one colour per segment of that ring. The plotting cell pairs these
# lists by position with its ring definitions.
group_colours = [
    [(255, 97, 79)],
    [(255, 189, 132), (255, 206, 162)],
    [(162, 210, 135)],
    [(152, 193, 221), (194, 226, 249), (215, 237, 251)],
    [(148, 158, 206), (177, 184, 215)],
    [(247, 188, 255), (237, 210, 240)],
]

# Rescale every channel from 0-255 to the 0-1 floats used when drawing.
group_colours = [
    [tuple(channel / 255 for channel in rgb) for rgb in ring]
    for ring in group_colours
]

In [None]:
# Draw one "stats circle" per sample: concentric rings on a polar axes whose
# segments show each read category as a percentage of that sample's total reads.
#
# Files read for every sample (all below output/<sample>/):
#   <sample>_Log.final.out
#   <sample>_Solo.out/Barcodes.stats
#   <sample>_Solo.out/GeneFull/Features.stats
#   <sample>_Solo.out/GeneFull/Summary.csv
#   <sample>_Solo.out/GeneFull/raw/matrix.mtx


def _read_stats(path):
    """Parse a whitespace-separated ``name count`` file into ``{name: int}``.

    Lines with fewer than two fields (e.g. blank lines) are skipped instead of
    raising an ``IndexError``.
    """
    counts = {}
    with open(path) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue
            counts[fields[0]] = int(fields[1])
    return counts


MAX_COLUMNS = 3  # panels per row when several samples are plotted
PANEL_SIZE = 10  # inches per panel (a single sample gives the original 10x10 figure)
SEGMENT_GAP = (0.5 * pi * 2) / 100  # 0.5% of the circle trimmed off each segment

# Keys of `data` drawn in each ring; paired by position with `group_colours`
# and `group_heights`.
groups = [
    ["total_reads"],
    ["valid_hdmis", "invalid_hdmis"],
    ["relevant"],
    ["mapped", "unmapped", "multimap"],
    ["annotated", "unannotated"],
    ["dedup_umis", "dup_umis"],
]
# Radial centre of each ring. Bars are 1 unit thick and centres are 1.2 apart,
# which leaves a 0.2 gap between neighbouring rings.
group_heights = [
    -0.2,
    1,
    2.2,
    3.4,
    4.6,
    5.8,
]

n_samples = len(samples)
if n_samples == 0:
    print("`samples` is empty: add sample names to it and re-run this cell.")

# One panel per sample. Previously every sample was drawn onto the same axes,
# so later samples painted over earlier ones.
n_cols = min(MAX_COLUMNS, max(n_samples, 1))
n_rows = max((n_samples + n_cols - 1) // n_cols, 1)
fig, axs = plt.subplots(
    n_rows,
    n_cols,
    figsize=(PANEL_SIZE * n_cols, PANEL_SIZE * n_rows),
    subplot_kw=dict(projection="polar"),
    squeeze=False,  # always a 2-D array, even for a single panel
)
axes_flat = axs.ravel()

for ax, sample in zip(axes_flat, samples):
    sample_folder = (
        f"output/{sample}/"
    )

    # Key/value pairs from the "key | value" lines of the main log. None of the
    # plotted values come from it; it is kept available for inspection.
    main_log = {}
    with open(f"{sample_folder}/{sample}_Log.final.out") as f:
        for line in f:
            if "|" in line:
                key, value = line.strip().split("|")[:2]
                main_log[key.strip()] = value.strip()

    bc_stats = _read_stats(f"{sample_folder}/{sample}_Solo.out/Barcodes.stats")
    feature_stats = _read_stats(f"{sample_folder}/{sample}_Solo.out/GeneFull/Features.stats")
    summary = pd.read_csv(f"{sample_folder}/{sample}_Solo.out/GeneFull/Summary.csv", sep=",", index_col=0, header=None)
    mtx = sio.mmread(f"{sample_folder}/{sample}_Solo.out/GeneFull/raw/matrix.mtx")
    dedup_umis = mtx.sum()  # sum of all entries of the raw count matrix

    data = {
        "total_reads": int(summary.loc["Number of Reads"].values[0]),
        "valid_hdmis": bc_stats["yesWLmatchExact"] + bc_stats["yesOneWLmatchWithMM"],
        "invalid_hdmis": bc_stats["noNoWLmatch"] + bc_stats["noUMIhomopolymer"] + bc_stats["noTooManyWLmatches"],
        "discarded_hdmis": 0,
        "not_relevant": 0,
        "mapped": feature_stats["yesWLmatch"] + feature_stats["noNoFeature"],
        "unmapped": feature_stats["noUnmapped"],
        "multimap": feature_stats["MultiFeature"],
        "annotated": feature_stats["yessubWLmatch_UniqueFeature"],
        "unannotated": feature_stats["noNoFeature"],
        "dedup_umis": dedup_umis,
        "dup_umis": feature_stats["yessubWLmatch_UniqueFeature"] - dedup_umis,
    }
    # No separate "relevant" count is available, so reuse the valid-HDMI count.
    data["relevant"] = data["valid_hdmis"]

    if data["total_reads"] == 0:
        raise ValueError(f"{sample}: Summary.csv reports 0 reads, cannot compute percentages")
    # Every count as a percentage of the sample's total reads.
    data_norm = {k: v / data["total_reads"] * 100 for k, v in data.items()}

    for group, g_colours, height in zip(groups, group_colours, group_heights):
        start = 0.0  # angle (radians) at which the next segment begins
        for key, colour in zip(group, g_colours):
            span = (data_norm[key] * pi * 2) / 100  # percentage -> radians
            # Trim the gap off the drawn width, but never below zero: a negative
            # width would draw the bar backwards over the previous segment.
            width = max(span - SEGMENT_GAP, 0)
            ax.barh(height, width, left=start, height=1, color=colour)
            start += span

    ax.spines.clear()
    ax.set_xticks([])
    ax.set_yticks([])
    # `.values[0]` extracts the scalar; calling float() on the one-element
    # Series itself is deprecated in recent pandas.
    saturation = float(summary.loc["Sequencing Saturation"].values[0])
    ax.set_title(f'{sample}\nSaturation {saturation * 100:.4f}%')

# Hide grid cells that have no sample (e.g. 4 samples in a 2x3 grid).
if n_samples:
    for spare_ax in axes_flat[n_samples:]:
        spare_ax.set_visible(False)

plt.tight_layout()
plt.show()