## Simulations

In this notebook we wish to run a simulation demonstrating some of the basic claims we make regarding the random 
forest. 

The key claims we would like to demonstrate are thus:

- A dataset can have hierarchical behavior
    - an RF will identify such hierarchical structure
    - an RF will capture local changes in covariance etc
    
    - A PCA CANNOT capture some of the effects that we will identify as local in distinct PCs.

- When a dataset undergoes changes in population prevalence, we identify this as a shift in factor values

- When a dataset undergoes a change in population behavior we identify this as a shift in predictive power

To reflect a hierarchical structure with meaningful local behavior, we will need several features that have different means among different clusters but, importantly, also interact with each other. To approximately reflect the behavior of single-cell data, we will draw from a mixture of multi-dimensional Gaussians with known covariance and then randomly sample over the draw according to a Poisson distribution.

Let's operate on 10 features total. 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from scipy.stats import multivariate_normal,norm,beta
from sklearn.datasets import make_blobs
from sklearn.manifold import TSNE


In [None]:
# Macro-structure: one covariance shared by all three clusters, plus one mean
# vector per cluster.
#
# Intended layout (10 features): 5 features with a mean and covariance,
# 3 features with only a mean, and two features of pure noise.
# NOTE(review): as written, features 5-8 all differ in mean between clusters
# and only feature 9 has the same mean everywhere -- confirm this is intended.

# Mixing matrix: the off-diagonal entries couple features 0-4 with each other,
# while features 5-9 are only scaled by their diagonal entry.
macro_cov = np.array([
    [ 1,  0,  1,  0,  2,  0,  0,  0,  0,  0],
    [ 0,  1, -1,  0,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  1,  1,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  2,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  2,  0,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  0,  3,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
])

# A @ A.T is symmetric positive semi-definite by construction, which is what
# a covariance matrix has to be.
macro_cov = macro_cov @ macro_cov.T

# Per-cluster means, laid out as: features 0-4 | features 5-7 | features 8-9.
macro_mean_1 = [2, 3, 1, 1, 0,   0, 1, 2,   3, 3]
macro_mean_2 = [0, -1, 1, 0, 0,   -3, 2, 2,   3, 3]
macro_mean_3 = [0, -1, 1, 0, 0,   -3, 0, 4,   1, 3]




In [None]:
# Echo the final covariance matrix (the mixing matrix times its transpose).
macro_cov

In [None]:
# Draw the three macro clusters (1000 / 1500 / 500 samples). Each has its own
# mean vector; all of them share the same covariance matrix.
macro_samples_1 = multivariate_normal(mean=macro_mean_1, cov=macro_cov).rvs(size=1000)
macro_samples_2 = multivariate_normal(mean=macro_mean_2, cov=macro_cov).rvs(size=1500)
macro_samples_3 = multivariate_normal(mean=macro_mean_3, cov=macro_cov).rvs(size=500)

# Stack the clusters row-wise, in cluster order, into one sample-by-feature matrix.
coordinates = np.concatenate(
    (macro_samples_1, macro_samples_2, macro_samples_3),
    axis=0,
)

In [None]:

# Embed the macro-only dataset with t-SNE (default settings) and plot it.
t_coordinates = TSNE().fit_transform(coordinates)

plt.figure()
plt.scatter(*t_coordinates.T)
# Render explicitly, like the other plotting cells in this notebook; without
# this the cell also echoes the repr of the artist returned by plt.scatter.
plt.show()

In [None]:
# Local effect: a similar but not identical covariance matrix is used for a
# smaller part of the dataset. Both matrices below are built from a mixing
# matrix A as A @ A.T, which keeps them symmetric positive semi-definite.

# First local variant.
micro_cov_1 = np.array([
    [ 1,  0,  1,  0,  2,  0,  0,  2,  0,  0],
    [ 0,  1, -1,  0,  0,  0,  0,  0,  3,  0],
    [ 0,  0,  1,  1,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  2,  0,  0,  0,  2,  0,  0],
    [ 0,  0,  0,  0,  1,  0,  1,  2,  0,  0],
    [ 0,  0,  0,  0,  0,  2,  1,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  1,  2,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  0,  3,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
])
micro_cov_1 = micro_cov_1 @ micro_cov_1.T

# Second local variant.
micro_cov_2 = np.array([
    [ 1,  0,  1,  0,  2,  0,  0, -1,  0,  0],
    [ 0,  1, -1,  0,  0,  0,  0,  0,  3,  0],
    [ 0,  0,  1,  1,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  2,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  0,  1,  0,  1,  2,  0,  0],
    [ 0,  0,  0,  0,  0,  2,  1,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  1, -1,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  2,  2,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  0,  3,  0],
    [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
])
micro_cov_2 = micro_cov_2 @ micro_cov_2.T

# Re-draw the three clusters with the same means as before. Cluster 1 keeps
# the shared macro covariance; clusters 2 and 3 each get a local covariance.
micro_samples_1 = multivariate_normal(mean=macro_mean_1, cov=macro_cov).rvs(size=1000)
micro_samples_2 = multivariate_normal(mean=macro_mean_2, cov=micro_cov_1).rvs(size=1500)
micro_samples_3 = multivariate_normal(mean=macro_mean_3, cov=micro_cov_2).rvs(size=500)



In [None]:
# Stack the locally perturbed clusters row-wise (cluster 1, then 2, then 3).
coordinates = np.concatenate(
    (micro_samples_1, micro_samples_2, micro_samples_3),
    axis=0,
)

# Embed with t-SNE (default settings) and plot the uncoloured embedding.
t_coordinates = TSNE().fit_transform(coordinates)

plt.figure()
plt.scatter(*t_coordinates.T)
plt.show()

In [None]:
# One float label per sample, in stacking order: 1000 samples of cluster 1,
# then 1500 of cluster 2, then 500 of cluster 3 (3000 in total).
colors = np.repeat([1.0, 2.0, 3.0], [1000, 1500, 500])

In [None]:

# Same t-SNE embedding, coloured by the known cluster label of each sample.
plt.figure()
plt.scatter(*t_coordinates.T, c=colors)
plt.show()

In [None]:
import sys
sys.path.append('../src/')
import tree_reader as tr 
import lumberjack

In [None]:
# Options forwarded verbatim (and in this order) to lumberjack.fit. Their
# meaning is defined by the lumberjack module, which is not visible here.
# The boolean-like switches are passed as the strings 'true' / 'false';
# the alternative run used reduce_input='false', reduce_output='false'.
forest_options = dict(
    trees=300,
    ifs=5,
    ofs=5,
    braids=1,
    ss=200,
    leaves=10,
    depth=3,
    norm='l1',
    sfr=0,
    reduce_input='true',
    reduce_output='true',
)

# Fit the forest on the simulated sample-by-feature matrix.
forest = lumberjack.fit(coordinates, **forest_options)

In [None]:
# Attach the t-SNE embedding computed earlier to the forest object.
# NOTE(review): presumably consumed by the forest's plotting / summary code -- confirm.
forest.tsne_coordinates = t_coordinates

In [None]:
# Run the forest's split interpretation step with these settings.
# NOTE(review): parameter semantics are defined by the project API, not visible here.
forest.interpret_splits(mode='additive_mean',metric='euclidean',pca=0,k=200,depth=2,relatives=True)

In [None]:
# Build the forest's maximum spanning tree in 'samples' mode.
# NOTE(review): presumably run after interpret_splits so it can use its result -- confirm.
forest.maximum_spanning_tree(mode='samples')

In [None]:
# Produce the HTML tree summary for the fitted forest (n=5; meaning of n is set by the project API).
forest.html_tree_summary(n=5)

## On The Basis of Component Vectors

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import multivariate_normal,norm,beta
from sklearn.datasets import make_blobs
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale


In [None]:
# Macro-structure on the basis of component vectors: a single factor that
# applies to every sample. Its loadings are drawn per sample from a
# multivariate normal, and its scores follow a bimodal normal distribution.

# Mean of the additive per-feature noise (1 for each of the 10 features).
global_noise = [1] * 10

# Mean loading of the global factor on each of the 10 features.
loading_means_global = [1, 0, -2, 3, 5, 0, 0, 2, 3, 3]

# Ground-truth scores: column 0 is the global factor; columns 1 and 2 are
# left at zero here.
true_factor_scores = np.zeros((10000, 3))

# Per-sample noise and loadings (isotropic, variance 1/10 and 1/3).
# Deterministic alternatives tried earlier: tiling the mean loadings for every
# sample instead of drawing them, and all-zero or beta(.5, .5) scores.
noise = multivariate_normal(mean=global_noise, cov=np.identity(10) / 10).rvs(size=10000)
loadings = multivariate_normal(mean=loading_means_global, cov=np.identity(10) / 3).rvs(size=10000)

# Bimodal scores: narrow normal noise (sd 1/3) centred on 2 for the first
# 3000 samples and on 5 for the remaining 7000.
score_draws = norm().rvs(10000) / 3 + np.repeat([2.0, 5.0], [3000, 7000])

true_factor_scores[:, 0] = score_draws

# Each sample's loadings are scaled by that sample's score (the column slice
# broadcasts across the 10 features), then the noise is added.
coordinates = loadings * true_factor_scores[:, :1] + noise


In [None]:

# Heatmap of the simulated matrix (rows = samples, columns = features).
# aspect='auto' stretches the tall matrix to fill the axes.
plt.figure()
plt.imshow(
    coordinates,
    aspect='auto',
    interpolation='none',
)
plt.colorbar()
plt.show()


In [None]:
# Two local factors, each active only on a contiguous block of samples:
# factor 2 on rows 3000-7999 and factor 3 on rows 8000-9999.

# Mean loadings of each local factor on the 10 features.
local_loading_means_1 = [0, 2, 0, 0, 0, 1, 3, 1, 0, 2]
local_loading_means_2 = [0, -2, 0, -2, 0, 1, 3, 3, 3, 0]

# Per-sample loadings (isotropic, variance 1/10). Tiling the mean loadings
# for every sample is the deterministic alternative that was tried.
local_loadings_1 = multivariate_normal(mean=local_loading_means_1, cov=np.identity(10) / 10).rvs(size=5000)
local_loadings_2 = multivariate_normal(mean=local_loading_means_2, cov=np.identity(10) / 10).rvs(size=2000)

# Scores: U-shaped beta draws rescaled to [3, 6] and sorted ascending, so
# scores increase with row index inside each block. Earlier alternatives
# were norm() + 1 and an unscaled beta(.5, .5).
true_factor_scores[3000:8000, 1] = np.sort(beta(.1, .1).rvs(5000) * 3 + 3)
true_factor_scores[8000:, 2] = np.sort(beta(.3, .3).rvs(2000) * 3 + 3)

# Scale each sample's local loadings by its score (the column slice
# broadcasts across the 10 features).
local_coordinates_1 = true_factor_scores[3000:8000, 1:2] * local_loadings_1
local_coordinates_2 = true_factor_scores[8000:, 2:3] * local_loadings_2

# Add the local contributions on top of the global structure, in place.
coordinates[3000:8000] += local_coordinates_1
coordinates[8000:] += local_coordinates_2


In [None]:
# local_loadings_1
# coordinates = scale(coordinates,axis=0)

In [None]:
from scipy.cluster.hierarchy import linkage,dendrogram

# sample_agglomeration = dendrogram(linkage(coordinates, metric='cosine', method='average'), no_plot=True)['leaves']

# Heatmaps of the full simulated matrix and then of the ground-truth factor
# scores, one figure each (rows = samples).
for heatmap_matrix in (coordinates, true_factor_scores):
    plt.figure()
    plt.imshow(heatmap_matrix, aspect='auto', interpolation='none')
    plt.colorbar()
    plt.show()

# plt.figure()
# plt.imshow(coordinates[sample_agglomeration],aspect='auto',interpolation='none')
# plt.show()

In [None]:
# Inspect the ground-truth scores on either side of the two block boundaries
# (rows 3000 and 8000). A notebook cell only echoes its final expression, so
# both windows are printed explicitly; otherwise the first one is discarded.
print(true_factor_scores[2995:3005])
print(true_factor_scores[7995:8005])

In [None]:
# Embed the component-vector dataset with t-SNE (default settings) and plot
# the uncoloured embedding.
t_coordinates = TSNE().fit_transform(coordinates)

plt.figure()
plt.scatter(*t_coordinates.T)
plt.show()



In [None]:
# Colour the t-SNE embedding by each ground-truth factor score, one figure
# per factor. Factors 2 and 3 pin the colour scale to [3, 6]; factor 1 uses
# the automatic colour range.
factor_panels = (
    ("True Factor 1 Scores", 0, {}),
    ("True Factor 2 Scores", 1, {'vmin': 3, 'vmax': 6}),
    ("True Factor 3 Scores", 2, {'vmin': 3, 'vmax': 6}),
)

for panel_title, factor_column, color_limits in factor_panels:
    plt.figure()
    plt.title(panel_title)
    plt.scatter(
        *t_coordinates.T,
        c=true_factor_scores[:, factor_column],
        cmap='bwr',
        **color_limits
    )
    plt.colorbar()
    plt.show()

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with default settings on the simulated matrix, as the comparison
# for the forest-based analysis.
model = PCA().fit(coordinates)

In [None]:
# Echo the fraction of variance explained by each principal component.
model.explained_variance_ratio_

In [None]:
# Echo the principal axes (one row per component, one column per feature).
model.components_

In [None]:
# Project every sample onto the fitted principal components.
pct = model.transform(coordinates)
# Echo the shape of the projected matrix as a sanity check.
pct.shape

In [None]:
# Colour the t-SNE embedding by the scores of the first four principal
# components, one figure per component.
pc_titles = ("PC1 Scores", "PC2 Scores", "PC3 Scores", "PC4 Scores")

for pc_column, pc_title in enumerate(pc_titles):
    plt.figure()
    plt.title(pc_title)
    plt.scatter(*t_coordinates.T, c=pct[:, pc_column], cmap='bwr')
    plt.colorbar()
    plt.show()

In [None]:
# Compare the two local ground-truth factors (score columns 1 and 2) with
# the third and fourth principal components (PCA columns 2 and 3): one
# scatter per pairing, in the order (1,2), (1,3), (2,2), (2,3).
for factor_column in (1, 2):
    for pc_column in (2, 3):
        plt.figure()
        plt.scatter(true_factor_scores[:, factor_column], pct[:, pc_column])
        plt.show()


In [None]:
import sys
sys.path.append('../src/')
import tree_reader as tr 
import lumberjack

In [None]:
# Options forwarded verbatim (and in this order) to lumberjack.fit. Their
# meaning is defined by the lumberjack module, which is not visible here.
# The boolean-like switches are passed as the strings 'true' / 'false';
# the alternative run used reduce_input='false', reduce_output='false'.
forest_options = dict(
    trees=300,
    ifs=8,
    ofs=8,
    braids=1,
    ss=1000,
    leaves=10,
    depth=4,
    norm='l1',
    sfr=0,
    reduce_input='true',
    reduce_output='true',
)

# Fit the forest on the component-vector simulation.
forest = lumberjack.fit(coordinates, **forest_options)

In [None]:
# Attach the t-SNE embedding of this dataset to the forest object.
forest.tsne_coordinates = t_coordinates
# Reset split clusters before interpreting.
# NOTE(review): presumably so re-running this cell starts from a clean state -- confirm.
forest.reset_split_clusters()
# Run the split interpretation step; parameter semantics are defined by the
# project API and are not visible here.
forest.interpret_splits(mode='additive_mean',metric='cosine',depth=4,pca=3,k=500,relatives=True)
# Build the forest's maximum spanning tree in 'samples' mode.
forest.maximum_spanning_tree(mode='samples')

In [None]:
# Produce the HTML tree summary for the fitted forest (n=5; meaning of n is set by the project API).
forest.html_tree_summary(n=5)