In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

# Extend the module search path before importing the project-local module
# (presumably data_imports lives in ../src -- verify against the repo layout).
sys.path.append('../src')
import data_imports
# Make sure the relative "out" directory exists before anything is written to
# it; exist_ok=True makes re-running this cell a no-op.
Path("out").mkdir(parents=True, exist_ok=True)

# For inline plotting
%matplotlib inline

In [None]:
# Load the biosample table and keep one row per patient: after sorting by
# (patient_id, age_at_diagnosis), drop_duplicates keeps the first row of each
# patient, i.e. the one with the smallest age_at_diagnosis.
df = data_imports.import_biosamples()
df = df.sort_values(['patient_id', 'age_at_diagnosis'])
df = df.drop_duplicates('patient_id')

# Keep diagnoses at or below 20 years. age_at_diagnosis is divided by 365.25 to
# obtain years, so it is treated as days here -- NOTE(review): confirm units.
# .copy() detaches the filtered frame so the column assignments below operate
# on an independent DataFrame instead of triggering SettingWithCopyWarning.
df = df[df['age_at_diagnosis'] <= 365.25 * 20.0].copy()
df['age_years'] = df['age_at_diagnosis'] / 365.25

# Fold the 'intrachromosomal' label into 'chromosomal'; every other value is
# passed through unchanged.
df['amplicon_class'] = df['amplicon_class'].map(
    lambda x: 'chromosomal' if x == 'intrachromosomal' else x
)

print(len(df))  # rows remaining after de-duplication and the age filter
df.head()

In [None]:
# --- Encode amplicon class ---
# Rows without an amplicon class are discarded; the copy keeps the column
# assignment below off the parent frame.
has_class = df['amplicon_class'].notna()
df = df.loc[has_class].copy()

# Integer-encode the class labels.
le = LabelEncoder()
le.fit(df['amplicon_class'])
df['amplicon_class_encoded'] = le.transform(df['amplicon_class'])

# One "Set2" colour per class, paired in the order of the encoder's classes_.
amplicon_classes = le.classes_
palette = sns.color_palette("Set2", n_colors=len(amplicon_classes))
class_color_map = {label: colour for label, colour in zip(amplicon_classes, palette)}

# --- Parameters ---
rolling_window_years = 1   # width of the smoothing window, in years
bin_precision = 1          # decimals kept when binning age: 1 -> 0.1-year bins
volume_bin_size_years = 1  # width of the sample-volume bins, in years

# --- Preprocessing ---
# Bin each sample by rounding its age to `bin_precision` decimals.
df['age_rounded'] = df['age_years'].round(bin_precision)

# Count by age and class (rows: age bin, columns: class), then divide each row
# by its total so every age bin sums to 1.
counts = df.groupby(['age_rounded', 'amplicon_class']).size().unstack(fill_value=0)
proportions = counts.divide(counts.sum(axis=1), axis=0).sort_index()

# Smooth with rolling window.
# The window is a number of age-bin rows. round() protects against float
# truncation (e.g. int(0.29 * 100) == 28), and max(1, ...) keeps the window
# compatible with min_periods=1 when rolling_window_years is smaller than one bin.
# NOTE(review): the window counts rows, not a fixed age span. Age bins with no
# samples are absent from `proportions`, so where the data are sparse a window
# can cover more than `rolling_window_years` -- confirm this is acceptable.
window_size = max(1, round(rolling_window_years * (10 ** bin_precision)))
smoothed = proportions.rolling(window=window_size, center=True, min_periods=1).mean()

# Prepare for regression: the age bins form the single predictor column, and
# each result dict is keyed by class label.
X = smoothed.index.values.reshape(-1, 1)
r2_scores, formulas, predicted_curves, models = {}, {}, {}, {}

# Design matrix with an intercept column for statsmodels; it does not depend on
# the class, so it is built once.
X2 = sm.add_constant(X)

# Fit regression per class: scikit-learn supplies the fitted line, R² and the
# annotation formula; statsmodels supplies the full OLS summary table.
# NOTE(review): the response is a centred rolling mean, so neighbouring points
# share samples, and every age bin carries equal weight regardless of how many
# samples it holds -- read the standard errors / p-values in the OLS summaries
# with that in mind.
for class_label, class_series in smoothed.items():
    y = class_series.values

    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)

    predicted_curves[class_label] = y_pred
    r2_scores[class_label] = r2_score(y, y_pred)
    formulas[class_label] = f"P({class_label}) = {model.intercept_:.4f} + {model.coef_[0]:.4f} * age"

    # compute statistics
    est = sm.OLS(y, X2).fit()
    models[class_label] = est.summary()

# Volume per year bin: floor each age to the start of its bin, then count the
# samples per bin in ascending bin order.
bin_starts = df['age_years'] // volume_bin_size_years * volume_bin_size_years
df['age_year_bin'] = bin_starts.astype(int)
volume = df['age_year_bin'].value_counts().sort_index()

# --- Plotting ---
# Layout, top to bottom: stacked proportions, sample volume, then one
# regression panel per amplicon class. The number of rows (and the figure
# height) follow the number of classes, so gs[i + 2] below cannot run past the
# grid when there are more than three classes, and no empty rows are left when
# there are fewer. With three classes this is the original 14x12, 5-row figure.
n_classes = len(smoothed.columns)
fig = plt.figure(figsize=(14, 6 + 2 * n_classes))
gs = fig.add_gridspec(nrows=2 + n_classes, ncols=1,
                      height_ratios=[3, 1] + [2] * n_classes, hspace=0.5)

# 1. Stacked area plot of the smoothed class proportions
ax1 = fig.add_subplot(gs[0])
stack_colors = [class_color_map[c] for c in smoothed.columns]
ax1.stackplot(smoothed.index,
              [smoothed[col] for col in smoothed.columns],
              labels=smoothed.columns, colors=stack_colors, alpha=1)
ax1.set_ylabel("Proportion")
ax1.set_title("Smoothed Amplicon Class Distribution")
ax1.legend(loc="upper right", title="Class", fontsize=9)
ax1.set_ylim(0, 1)

# 2. Volume bar plot. volume.index holds the start of each bin, so shift by
# half a bin width to centre each bar on its bin.
ax2 = fig.add_subplot(gs[1], sharex=ax1)
ax2.bar(volume.index + volume_bin_size_years / 2, volume.values,
        width=volume_bin_size_years * 0.8,
        color='gray', edgecolor='black')
ax2.set_ylabel("Count")
ax2.set_title("Sample Volume per Age Bin")

# 3+. Individual regressions per class
# The annotation is anchored 60% of the way along the age axis, just above the
# fitted line; the position is the same for every panel.
mid = int(len(smoothed.index) * 0.6)
for i, class_label in enumerate(smoothed.columns):
    ax = fig.add_subplot(gs[i + 2], sharex=ax1)
    color = class_color_map[class_label]

    # Smoothed data
    ax.plot(smoothed.index, smoothed[class_label], color=color, label=f"{class_label} (smoothed)", alpha=1)

    # Linear regression fit
    ax.plot(smoothed.index, predicted_curves[class_label], color=color, lw=2, label="Linear fit")

    # Annotate formula and R²
    ax.text(smoothed.index[mid], predicted_curves[class_label][mid] + 0.04,
            f"{formulas[class_label]}\nR² = {r2_scores[class_label]:.4f}",
            color=color, fontsize=10)

    ax.set_ylabel("Proportion")
    ax.set_title(f"{class_label} — Linear Fit")
    ax.legend()

# Shared cosmetics: drop the top/right spines and fix the x-range to 0-20
# years (the cohort is filtered to diagnoses at or below 20 years earlier in
# the notebook). Only the bottom panel gets an x-axis label.
for ax in fig.axes:
    ax.spines[['right', 'top']].set_visible(False)
    ax.set_xlim(0, 20)
fig.axes[-1].set_xlabel("Age at Diagnosis (years)")

fig.tight_layout()
fig.savefig("./out/all_amplicon_class_linear_panel.png")
fig.savefig("./out/all_amplicon_class_linear_panel.svg")
plt.show()


In [None]:
# Print each class label followed by the OLS summary stored for it.
for class_label, summary in models.items():
    print(class_label)
    print(summary)

In [None]:
# Any association between ecDNA and sex?
# Cross-tabulate sex against the boolean "amplicon class is ecDNA" and run a
# chi-square test of independence on the resulting contingency table.
# scipy.stats is imported with the other dependencies at the top of the notebook.
# Note: chi2_contingency applies Yates' continuity correction by default when
# the table has a single degree of freedom (i.e. a 2x2 table).
contingency_tbl = pd.crosstab(df['sex'], df['amplicon_class'] == 'ecDNA')
scipy.stats.chi2_contingency(contingency_tbl)