In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd  
import pyreadr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from load_data import *
from pred_score import *

In [2]:
#Fixing seed to get reproducible results
np.random.seed(3)

# Load all data/info for AE4

In [3]:
#AE4
AE4_df = pyreadr.read_r('../data/family_datasets/data_norm/AE4_scran_norm.rds')
AE4_df = AE4_df[None]

data_families_interest_AE4 = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_AE4_nocellcyclesplit.RData')
data_families_interest_AE4 = data_families_interest_AE4['fois_1']

data_families_info_AE4 = pyreadr.read_r('../data/family_datasets/family_info/family_info_AE4_nocellcyclesplit.RData')
data_families_info_AE4 = data_families_info_AE4['family_info_1']

#Conversion into array 
data_families_interest_AE4 = np.array(data_families_interest_AE4)
data_families_info_AE4 = np.array(data_families_info_AE4)

In [4]:
#Norm data with only the cells belonging to the family of interest
AE4_df = select_family_interest_norm_data(data_families_info_AE4, data_families_interest_AE4, AE4_df)
y_AE4 = AE4_df[1]
y_AE4 = y_AE4[:,0]
y_AE4 = y_AE4.astype(np.int32)
AE4_df = AE4_df[0]
y_AE4 = pd.DataFrame(y_AE4, index = AE4_df.columns)

cells_interest = AE4_df.columns
len(cells_interest)

1473

In [5]:
#Only keep optimized subset of genes for expression 
gene_interest = np.squeeze(pd.read_csv ('../data/optimized_subsets/AE4genes_best.csv'))
AE4_df = AE4_df.loc[gene_interest,:]

In [6]:
#Load data and only keep cells of interest
velocity_length = pyreadr.read_r('../data/AE4_for_PCA_based_hclust/Lengths_AE4_under10_allgenes.RData')
velocity_length = velocity_length['Lengths_1']
velocity_length = velocity_length[velocity_length.columns.intersection(cells_interest)] 

velocity_angle = pyreadr.read_r('../data/AE4_for_PCA_based_hclust/Angles_AE4_under10_allgenes.RData')
velocity_angle  = velocity_angle['Angles_1']
velocity_angle = velocity_angle[velocity_angle.columns.intersection(cells_interest)] 

cyclone_cell_cycle = pd.read_csv ('../data/AE4_for_PCA_based_hclust/cyclone_AE4.csv')
cyclone_cell_cycle = cyclone_cell_cycle[©] 

revelio_cell_cycle  = pd.read_csv ('../data/AE4_for_PCA_based_hclust/revelio.csv')
revelio_cell_cycle = revelio_cell_cycle.set_index('Unnamed: 0')
revelio_cell_cycle = revelio_cell_cycle.T[revelio_cell_cycle.index.intersection(cells_interest)]

#Fuse cell cycle info together 

complexity = pyreadr.read_r('../data/AE4_for_PCA_based_hclust/Complexity_AE4.RData')
complexity = complexity['Genenumber_1']
complexity = complexity[complexity.columns.intersection(cells_interest)]

momenti = pyreadr.read_r('../data/AE4_for_PCA_based_hclust/Momentum_AE4.RData')
momenti = momenti['momentum']
momenti = momenti[momenti.columns.intersection(cells_interest)]

intron = pd.read_csv ('../data/AE4_for_PCA_based_hclust/introns_AE4.csv')
intron = intron.set_index('Unnamed: 0')
intron = intron[intron.columns.intersection(cells_interest)]

#Remove cells without info 
print(AE4_df.shape)
cells_interest = intron.columns
AE4_df = AE4_df[cells_interest]
y_AE4 = np.squeeze(np.array(y_AE4.T[cells_interest].T))
print(AE4_df.shape, y_AE4.shape)

(180, 1473)
(180, 1469) (1469,)


In [7]:
cells_interest = cyclone_cell_cycle.columns.intersection(cells_interest)

cyclone = pd.DataFrame(0, columns = cells_interest, index = np.unique(cyclone_cell_cycle))
revelio = pd.DataFrame(0, columns = cells_interest, index = np.unique(revelio_cell_cycle))

for cell in cells_interest:
    phase = cyclone_cell_cycle[cell][0]
    cyclone[cell][phase] = 1
    
for cell in revelio_cell_cycle.columns.intersection(cells_interest):
    phase = revelio_cell_cycle[cell][0]
    revelio[cell][phase] = 1    

# Fuse all info about AE4

In [8]:
AE4_cyclone = pd.concat([AE4_df,velocity_length, velocity_angle, complexity, momenti, intron, cyclone], axis=0)
AE4_revelio = pd.concat([AE4_df,velocity_length, velocity_angle, complexity, momenti, intron, revelio], axis=0)
AE4 = pd.concat([AE4_df,velocity_length, velocity_angle, complexity, momenti, intron], axis=0)

In [9]:
#Standardize the data to have mean = 0 and var = 1
AE4_std = StandardScaler().fit_transform(AE4)
print(AE4_std.shape)

pca = PCA()
pca.fit(AE4_std.T)
var_explained = pca.explained_variance_ratio_

(50762, 1469)


In [10]:
var_explained

array([4.51596749e-01, 6.65427397e-02, 3.77007452e-02, ...,
       2.79075937e-06, 2.55148820e-06, 1.96165667e-30])

In [11]:
model = FamiliesClusters(np.unique(y_AE4),compute_precision,True)
pred = model.fit_predict(pca.components_,y_AE4)
print(model.score_, model.recovery)

0.0037413148049171567 1.0
