In [187]:
import pandas as pd
import numpy as np
import scanpy
from scipy.spatial.distance import cosine
from sklearn.preprocessing import StandardScaler



In [151]:
# TODO: validate the constructor arguments
# TODO: check that there are no columns or rows consisting entirely of zeros
# TODO: ensure that factor variables are NOT stored as integer or real (float) values

class condPCA(object):
    """
    Preprocessing steps for a count matrix and its metadata: load both from
    disk, normalize the counts, standardize counts and covariates, and build
    the covariate matrix (with intercept) used for regression.

    Typical call order: ``Normalize()`` -> ``Standardize()`` -> ``fit()``.
    """

    def __init__(self, count_matrix_path, metadata_path, object_columns, vars_to_regress=True):
        """
        Parameters
        ----------
        count_matrix_path:
            Path to the count matrix (must already be QC'd); read with
            ``scanpy.read``. Expected orientation is cells x genes.

        metadata_path:
            Path to a tab-separated metadata file whose first row is the header
            and whose first column is the index. Should contain cell type
            labels in a column named "celltype".

        object_columns:
            columns that will be one hot encoded/columns that are factors

        vars_to_regress:
            ``True`` (default) to regress out every metadata column, or a list
            of metadata column names to regress out.

        Raises
        ------
        KeyError
            If ``vars_to_regress`` names a column that is not in the metadata.
        """
        self.count_matrix = scanpy.read(count_matrix_path) # cells x genes
        self.metadata = pd.read_csv(metadata_path, sep='\t', header=0, index_col=0)
        self.vars_to_regress = self._resolve_vars_to_regress(vars_to_regress, self.metadata.columns)

        # one hot encode necessary metadata variables
        self.object_columns = object_columns # obtain columns that must be one hot encoded
        # Cast factor columns to object so pd.get_dummies treats them as
        # categorical even when their values look numeric (e.g. batch ids).
        self.metadata[self.object_columns] = self.metadata[self.object_columns].astype(object)

    @staticmethod
    def _resolve_vars_to_regress(vars_to_regress, available_columns):
        """
        Turn the ``vars_to_regress`` argument into a pandas Index of column names.

        Only the literal ``True`` means "all columns". A plain truthiness test
        would also match any non-empty list and silently discard the caller's
        selection, so identity comparison is used instead.
        """
        if vars_to_regress is True:
            return available_columns

        selected = pd.Index(vars_to_regress)
        missing = [name for name in selected if name not in available_columns]
        if missing:
            raise KeyError("vars_to_regress not found in metadata columns: {}".format(missing))
        return selected

    def Normalize(self):
        """
        Normalize count data in place so every cell (row) sums to 10,000.
        """
        scanpy.pp.normalize_total(self.count_matrix, target_sum = 10000)

    def Standardize(self):
        """
        Standardize count data AND metadata (column-wise z-scores).

        Sets ``self.standardized_count_data`` and ``self.standardized_metadata``.
        ``self.metadata`` is overwritten with the subset, dummy-encoded frame,
        so this method is meant to be called once per instance.

        A column with zero standard deviation divides by zero and yields
        NaN/inf values in the standardized output.
        """
        # Standardize count data
        counts = self.count_matrix.X
        mean_count = np.mean(counts, axis=0)
        std_count = np.std(counts, axis=0)
        self.standardized_count_data = (counts - mean_count) / std_count

        # Standardize metadata
        self.metadata = self.metadata[self.vars_to_regress] # subset to only variables that you want to regress out
        # Convert factor covariates to dummy variables, dropping one level per
        # factor. pandas may warn here when an object column looks numeric
        # (e.g. batch); that column is still one hot encoded.
        self.metadata = pd.get_dummies(self.metadata, drop_first=True)
        mean = np.mean(self.metadata, axis=0) # mean of each metadata column
        std = np.std(self.metadata, axis=0) # standard deviation of each metadata column

        self.standardized_metadata = (self.metadata - mean) / std

    def fit(self):
        """
        Build the covariate matrix: a leading column of ones (intercept)
        followed by the standardized metadata columns.

        The result is stored in ``self.standardized_data``. It is rebuilt from
        ``self.standardized_metadata`` on every call, so calling ``fit`` more
        than once does not stack extra intercept columns.

        Raises
        ------
        AttributeError
            If ``Standardize()`` has not been called yet.
        """
        if not hasattr(self, "standardized_metadata"):
            raise AttributeError("standardized_metadata is not set; call Standardize() before fit()")

        n_rows = self.standardized_metadata.shape[0]
        intercept = np.ones((n_rows, 1))
        self.standardized_data = np.c_[intercept, self.standardized_metadata] # append ones to standardized meta for intercept
        

In [152]:
# Build the condPCA object from the test count matrix and metadata, marking
# Batch, Sex and celltype as factor columns, then run the preprocessing steps.
# NOTE(review): these are absolute, machine-specific paths.
test = condPCA(
    count_matrix_path="/Users/shayecarver/condPCA/final_method/test_matrix.txt",
    metadata_path="/Users/shayecarver/condPCA/final_method/test_metadata.txt",
    object_columns=['Batch', 'Sex', 'celltype'],
)
test.Normalize()
test.Standardize()

  self.metadata = pd.get_dummies(self.metadata, drop_first=True) # Convert factor covariates to dummy variables dropping one column


In [169]:

# Shape of the standardized count matrix produced by the class
np.shape(test.standardized_count_data)

(5000, 2000)

In [160]:
# Load the tab-separated scale_counts.txt table (first row = header, first column = index)
# NOTE(review): presumably the externally scaled counts used as the comparison target — confirm.
df = pd.read_csv(
    "/Users/shayecarver/condPCA/final_method/scale_counts.txt",
    sep='\t',
    header=0,
    index_col=0,
)

In [168]:
# Shape of the transposed table loaded from scale_counts.txt
df.T.to_numpy().shape

(5000, 2000)

In [188]:


# Cross-check: standardize the same count matrix with scikit-learn's StandardScaler
scaler = StandardScaler()

# Learn the per-column statistics, then apply them to the same data
scaled_data = scaler.fit(test.count_matrix.X).transform(test.count_matrix.X)

In [189]:
# Display the StandardScaler output (compare with test.standardized_count_data)
scaled_data

array([[-0.16112103, -0.14701271, -0.11706316, ..., -0.11785106,
        -0.17115065, -0.3545562 ],
       [-0.16112103, -0.14701271, -0.11706316, ..., -0.11785106,
        -0.17115065,  0.84766704],
       [-0.16112103, -0.14701271, -0.11706316, ..., -0.11785106,
        -0.17115065,  1.3561352 ],
       ...,
       [-0.16112103, -0.14701271, -0.11706316, ..., -0.11785106,
        -0.17115065,  2.8036437 ],
       [-0.16112103, -0.14701271, -0.11706316, ..., -0.11785106,
        -0.17115065, -0.3545562 ],
       [-0.16112103, -0.14701271, -0.11706316, ..., -0.11785106,
         0.12553231,  0.04144233]], dtype=float32)

In [190]:
# Display the class's standardized counts (compare with the StandardScaler output)
test.standardized_count_data

array([[-0.1611161 , -0.14700814, -0.11706336, ..., -0.11785103,
        -0.17114605, -0.35456222],
       [-0.1611161 , -0.14700814, -0.11706336, ..., -0.11785103,
        -0.17114605,  0.8476815 ],
       [-0.1611161 , -0.14700814, -0.11706336, ..., -0.11785103,
        -0.17114605,  1.3561585 ],
       ...,
       [-0.1611161 , -0.14700814, -0.11706336, ..., -0.11785103,
        -0.17114605,  2.8036914 ],
       [-0.1611161 , -0.14700814, -0.11706336, ..., -0.11785103,
        -0.17114605, -0.35456222],
       [-0.1611161 , -0.14700814, -0.11706336, ..., -0.11785103,
         0.12552899,  0.04144307]], dtype=float32)

In [183]:
#cosine_similarity = 1 - cosine(df.T.values, pd.DataFrame(test.standardized_count_data).values)

# Pearson correlation between column 4 of the transposed scale_counts table
# and column 4 of the class's standardized counts
reference_col = df.T.values[:, 4]
computed_col = pd.DataFrame(test.standardized_count_data).values[:, 4]
np.corrcoef(reference_col, computed_col)


array([[1.        , 0.85052399],
       [0.85052399, 1.        ]])

test.