## Predictability of Features and Samples

In this notebook we'll ask a basic question:

How predictable are different features and samples in general?

PCA gives the best possible (least-squares) representation of a dataset using up to n orthogonal linear components, so it's the platonic ideal of how well a dataset is represented by a multivariate normal distribution with some covariance matrix.

So let's ask ourselves: how much information can we recover from various scRNAseq datasets if we project them into a lower-dimensional subspace using PCA and then reconstruct them from that projection?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

import pickle 

# Directory holding the pickled aging-brain datasets.
data_location = "../data/aging_brain/"

# Load the young and old datasets. Later cells read `.X` and `.obsm["X_umap"]`
# from `young`, so these are presumably AnnData-like objects (TODO confirm).
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The context managers close each file handle as soon as it has been read.
with open(data_location + "aging_brain_young.pickle",mode='rb') as young_file:
    young = pickle.load(young_file)
with open(data_location + "aging_brain_old.pickle",mode='rb') as old_file:
    old = pickle.load(old_file)


In [None]:
from sklearn.decomposition import PCA

# Fit a 25-component PCA on the young dataset, project into the component
# space and map back to feature space. random_state is fixed so that a
# randomized SVD solver (which scikit-learn may select automatically for
# large inputs) yields the same decomposition on every run.
model = PCA(n_components=25,random_state=0).fit(young.X)
transformed = model.transform(young.X)
recovered = model.inverse_transform(transformed)

# Null model: predict every feature by its mean across samples.
centered = young.X - np.mean(young.X,axis=0)
null_squared_residual = np.power(centered,2)

# PCA model: error left after the round trip through 25 components.
recovered_residual = young.X - recovered
recovered_squared_residual = np.power(recovered_residual,2)

# Per-sample squared error and its fraction of the per-sample null error
# (the per-sample sum is computed once and reused for the ratio).
pca_recovered_per_sample = np.sum(recovered_squared_residual,axis=1)
pca_recovered_fraction_per_sample = pca_recovered_per_sample / np.sum(null_squared_residual,axis=1)
print(np.sum(null_squared_residual))
print(np.sum(recovered_squared_residual))

# Overall fraction of squared error that PCA fails to explain.
print(f"Remaining variance:{(np.sum(recovered_squared_residual) / np.sum(null_squared_residual))}")

In [None]:
# Colour the UMAP embedding by each principal-component score, one figure per
# component; the colour scale is clipped to [-20, 20] for every component.
umap_coordinates = young.obsm["X_umap"].T
for component_index in range(len(transformed.T)):
    component_scores = transformed.T[component_index]
    plt.figure()
    plt.title(component_index)
    plt.scatter(
        *umap_coordinates,
        c=component_scores,
        s=3,
        alpha=.4,
        cmap='bwr',
        vmin=-20,
        vmax=20,
    )
    plt.colorbar()
    plt.show()

# f1 = "Ctsd"
# f2 = "H2-Ab1"

# f1_index = forest.truth_dictionary.feature_dictionary[f1]
# f2_index = forest.truth_dictionary.feature_dictionary[f2]

# for i,component in enumerate(model.components_):
#     print(f"{i}: {f1}:{component[f1_index]},{f2}:{component[f2_index]}")

# plt.figure()
# plt.scatter(model.components_[:,f1_index],model.components_[:,f2_index])
# plt.plot([.2,-.2],[-.2,.2],color='red')
# plt.show()
    

In [None]:
# Squared-error totals under the mean-only null model, per feature (axis 0)
# and per sample (axis 1). The +1 is presumably a pseudocount that keeps the
# ratios below finite when a total is zero.
feature_null = np.sum(null_squared_residual,axis=0) + 1
sample_null = np.sum(null_squared_residual,axis=1) + 1

# The same totals for the PCA reconstruction error.
pca_feature_error = np.sum(recovered_squared_residual,axis=0) + 1
pca_sample_error = np.sum(recovered_squared_residual,axis=1) + 1

# Fraction of the null error that remains after PCA reconstruction.
pca_feature_remaining = pca_feature_error/feature_null
pca_sample_remaining = pca_sample_error / sample_null

# One histogram per view of the remaining fraction.
for panel_title, remaining_fraction in (
    ("Fraction of Variance Unexplained, Per Feature", pca_feature_remaining),
    ("Fraction of Variance Unexplained, Per Sample", pca_sample_remaining),
):
    plt.figure()
    plt.title(panel_title)
    plt.hist(remaining_fraction,bins=50)
    plt.ylabel("Frequency")
    plt.xlabel("Fraction of Variance Unexplained")
    plt.show()

# Overall (un-smoothed) fraction of squared error left unexplained.
print(f"PCA Variance Unexplained:{np.sum(recovered_squared_residual)/np.sum(null_squared_residual)}")

In [None]:
# Same comparison as the squared-error cell, but using absolute (L1) errors.
# NOTE: this rebinds feature_null, sample_null and the pca_*_error /
# pca_*_remaining names, so cells that run after this one see the L1 values.

# Total absolute deviation from the feature means (the null model), per
# feature (axis 0) and per sample (axis 1). The +1 is presumably a pseudocount
# that keeps the ratios below finite when a total is zero.
feature_null = np.sum(np.abs(centered),axis=0) + 1
sample_null = np.sum(np.abs(centered),axis=1) + 1

# Total absolute PCA reconstruction error and its fraction of the null total.
pca_feature_error = np.sum(np.abs(recovered_residual),axis=0) + 1
pca_feature_remaining = pca_feature_error/feature_null

pca_sample_error = np.sum(np.abs(recovered_residual),axis=1) + 1
pca_sample_remaining = pca_sample_error / sample_null


# These ratios are built from absolute errors, not squared ones, so the
# figures are labelled as absolute deviation rather than variance.
plt.figure()
plt.title("Fraction of Absolute Deviation Unexplained, Per Feature")
plt.hist(pca_feature_remaining,bins=50)
plt.ylabel("Frequency")
plt.xlabel("Fraction of Absolute Deviation Unexplained")
plt.show()

plt.figure()
plt.title("Fraction of Absolute Deviation Unexplained, Per Sample")
plt.hist(pca_sample_remaining,bins=50)
plt.ylabel("Frequency")
plt.xlabel("Fraction of Absolute Deviation Unexplained")
plt.show()

In [None]:
import sys
# Alternative absolute source location, currently disabled:
# sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# Add ../src to the module search path; presumably this is where the
# tree_reader and lumberjack modules imported below live.
sys.path.append('../src')
import tree_reader as tr 
import lumberjack

# Re-assigns the same data directory string used by the loading cell.
data_location = "../data/aging_brain/"

# Load a previously saved forest through tree_reader's Forest.load.
forest = tr.Forest.load(data_location + 'scanpy_cmp_aging_brain_true_l1')
# Bare last expression: the notebook displays the forest's `arguments` attribute.
forest.arguments

In [None]:
# Residuals obtained by calling .residuals() on the forest's `young_predicitons` attribute.
# NOTE(review): `young_predicitons` looks like a misspelling of `young_predictions`, and no
# visible cell assigns this attribute on `forest` — confirm the name against tree_reader.
forest_residuals = forest.young_predicitons.residuals()

In [None]:
# Later cells read feature_mean, forest_feature_remaining and
# forest_sample_remaining. They were previously defined only inside
# commented-out code, so a fresh Restart & Run All failed with NameError.

# Mean of each feature across samples; used to colour the Forest-vs-PCA scatter.
feature_mean = np.mean(young.X,axis=0)

# Forest error as a fraction of the mean-only null error, per feature (axis 0)
# and per sample (axis 1), with the same +1 smoothing as the PCA cells.
# Absolute errors are used so the values are comparable with pca_*_remaining
# as left by the absolute-error PCA cell that runs before this one.
# NOTE(review): assumes forest_residuals has the same (samples, features)
# layout as young.X — confirm. Switch to squared errors here if the PCA
# quantities being compared are the squared-error ones.
forest_abs_residuals = np.abs(forest_residuals)

forest_feature_error = np.sum(forest_abs_residuals,axis=0) + 1
forest_feature_remaining = forest_feature_error / (np.sum(np.abs(centered),axis=0) + 1)

forest_sample_error = np.sum(forest_abs_residuals,axis=1) + 1
forest_sample_remaining = forest_sample_error / (np.sum(np.abs(centered),axis=1) + 1)

# Earlier squared-error exploration, kept inactive for reference:

# forest_squared_residuals = np.power(forest_residuals,2)

# forest_feature_error = np.sum(forest_squared_residuals,axis=0) + 1
# forest_feature_remaining = forest_feature_error/feature_null

# forest_sample_error = np.sum(forest_squared_residuals,axis=1) + 1
# forest_sample_remaining = forest_sample_error/sample_null


# plt.figure()
# plt.title("Fraction of Variance Unexplained, Per Feature")
# plt.hist(forest_feature_remaining,bins=50)
# plt.ylabel("Frequency")
# plt.xlabel("Fraction of Variance Unexplained")
# plt.show()

# plt.figure()
# plt.title("Fraction of Variance Unexplained, Per Sample")
# plt.hist(forest_sample_remaining,bins=50)
# plt.ylabel("Frequency")
# plt.xlabel("Fraction of Variance Unexplained")
# plt.show()

# print(f"Forest Variance Unexplained:{np.sum(forest_squared_residuals)/np.sum(null_squared_residual)}")


# delta_sort = np.argsort(pca_feature_remaining-forest_feature_remaining)

# print(f"PCA best:{forest.output_features[delta_sort[:20]]}")
# print(f"Forest best:{forest.output_features[delta_sort[-20:]]}")

# for fb in delta_sort[-20:]:
#     print(f"Forest best: {forest.output_features[fb]}")
#     print(f"Forest: {forest_feature_remaining[fb]}")
#     print(f"PCA:{pca_feature_remaining[fb]}")

# ctsd_index = forest.truth_dictionary.feature_dictionary["Ctsd"]

# print(forest_feature_remaining[ctsd_index])
# print(pca_feature_remaining[ctsd_index])

# Column index of H2-Ab1 according to the forest's feature dictionary.
h2_index = forest.truth_dictionary.feature_dictionary["H2-Ab1"]

# PCA reconstruction residual for H2-Ab1, drawn on the forest's t-SNE coordinates.
plt.figure()
plt.scatter(*forest.tsne_coordinates.T,c=recovered_residual[:,h2_index],s=2,cmap='bwr')
plt.colorbar()
plt.show()

# Cat 1a,:  S100a9, S100a8, Wfdc21,Retnlg, Lcn2,Ngp,Camp,Mmp8,Hp, Ltf, Slpi, Trem3
# Cat 1b: Plac8, 
# Cat 1c: H2-Eb1,H2-Aa,H2-Ab1,


# Cat 2a: Slc22a6,Slc6a13,Fmod

# Cat3: Myoc

In [None]:
# End points of the red y = x reference line; points below it are
# features/samples where the forest leaves less error than PCA.
unit_diagonal = [0,1]

# Per-feature comparison, coloured by feature_mean.
plt.figure(figsize=(4,4))
plt.scatter(pca_feature_remaining,forest_feature_remaining,s=3,c=feature_mean)
plt.colorbar(label="Mean Expression")
plt.plot(unit_diagonal,unit_diagonal,color='red')
plt.title("Fraction of Variance Unexplained Per Feature, Forest Vs PCA")
plt.xlabel("PCA FVU")
plt.ylabel("Forest FVU")
plt.show()

# Per-sample comparison.
plt.figure(figsize=(4,4))
plt.scatter(pca_sample_remaining,forest_sample_remaining,s=3)
plt.plot(unit_diagonal,unit_diagonal,color='red')
plt.title("Fraction of Variance Unexplained Per Sample, Forest Vs PCA")
plt.xlabel("PCA FVU")
plt.ylabel("Forest FVU")
plt.show()


In [None]:
# Per-sample difference between the forest and PCA remaining-error fractions,
# drawn on the UMAP embedding. The colour scale is clipped to [-0.5, 0.5];
# negative values are samples where the forest leaves less error than PCA.
sample_fvu_difference = forest_sample_remaining-pca_sample_remaining
umap_coordinates = young.obsm["X_umap"].T

plt.figure()
plt.title("Forest Error Vs PCA Error")
plt.scatter(
    *umap_coordinates,
    s=2,
    c=sample_fvu_difference,
    cmap='seismic',
    vmin=-.5,
    vmax=.5,
)
plt.colorbar(label="Forest FVU - PCA FVU")
plt.show()

In [None]:
# Look up one gene's column via the forest's feature dictionary and print its
# remaining-error fraction: forest value first, then PCA.
gene = "Hp"
gene_index = forest.truth_dictionary.feature_dictionary[gene]

for remaining_by_feature in (forest_feature_remaining, pca_feature_remaining):
    print(remaining_by_feature[gene_index])