# Normalize Counts
Andrew E. Davidson
aedavids@ucsc.edu
2/23/24

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

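Normalizes the GTEx/TCGA group-by-gene count matrix by applying precomputed DESeq2 normalization scaling factors (one estimated size factor per sample), then saves the scaled matrix.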

ref:

https://bioconductor.org/packages/release/bioc/html/DESeq2.html
Love, M. I., Huber, W. & Anders, S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol 15, 550 (2014).


In [1]:
import ipynbname
from IPython.display import display
import pandas as pd
# display all rows
#pd.set_option('display.max_rows', None)

# https://joelmccune.com/pandas-dataframe-to-markdown/
#from pandas.io.clipboards import to_clipboard

#import pathlib as pl
import pprint as pp
import numpy as np
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
dataRoot = "/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets"

def loadCSV(path : str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame and report progress on stdout.

    arguments:
        path: location of the CSV file to read

    returns:
        the DataFrame produced by pd.read_csv(path)

    side effects:
        prints the path before reading and the (rows, columns) shape after
    """
    # echo the path first so it is visible even if the read is slow or fails
    print(path)
    loadedDF = pd.read_csv(path)
    print(loadedDF.shape)

    return loadedDF

In [3]:
%%time
groupbyPath = f'{dataRoot}/GTEx_TCGA_Groupby.csv'
groupbyDF   = loadCSV( groupbyPath )
print( f'groupbyDF.shape : {groupbyDF.shape}' )

/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_Groupby.csv
(74777, 26338)
groupbyDF.shape : (74777, 26338)
CPU times: user 9min 27s, sys: 35 s, total: 10min 2s
Wall time: 10min 6s


In [4]:
%%time
estimatedSizeFactorsPath = f'{dataRoot}/GTEx_TCGA_GroupbyEstimatedSizeFactors.csv'
scalingFactorsDF = loadCSV( estimatedSizeFactorsPath )
print( f'scalingFactorsDF.shape : {scalingFactorsDF.shape}' )

/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_GroupbyEstimatedSizeFactors.csv
(26337, 2)
scalingFactorsDF.shape : (26337, 2)
CPU times: user 16.3 ms, sys: 7.46 ms, total: 23.7 ms
Wall time: 23.1 ms


## scale the groupby data frame

In [5]:
# make sure we do not lose the geneId info when we multiply:
# move geneId out of the data columns and into the row index.
print(f'groupbyDF.shape {groupbyDF.shape}' )

# Guarded so the cell can be re-run: groupbyDF is reassigned from itself, and a
# second unguarded set_index('geneId') raises KeyError because the column is gone.
if 'geneId' in groupbyDF.columns:
    groupbyDF = groupbyDF.set_index('geneId')
elif groupbyDF.index.name != 'geneId':
    raise KeyError("groupbyDF has neither a 'geneId' column nor a 'geneId' index")
print(f'groupbyDF.shape {groupbyDF.shape}' )

groupbyDF.shape (74777, 26338)
groupbyDF.shape (74777, 26337)


In [6]:
%%time
groupbyTransposeDF   = groupbyDF.transpose(copy=True)
print( f'groupbyTransposeDF.shape : {groupbyTransposeDF.shape}' )

groupbyTransposeDF.shape : (26337, 74777)
CPU times: user 2.31 s, sys: 4.52 s, total: 6.82 s
Wall time: 6.82 s


In [7]:
# make sure we do not lose sampleId info when we multiply:
# move sampleId out of the data columns and into the row index.
print(f'scalingFactorsDF.shape {scalingFactorsDF.shape}' )

# Guarded so the cell can be re-run: scalingFactorsDF is reassigned from itself, and a
# second unguarded set_index('sampleId') raises KeyError because the column is gone.
if 'sampleId' in scalingFactorsDF.columns:
    scalingFactorsDF = scalingFactorsDF.set_index('sampleId')
elif scalingFactorsDF.index.name != 'sampleId':
    raise KeyError("scalingFactorsDF has neither a 'sampleId' column nor a 'sampleId' index")
print(f'scalingFactorsDF.shape {scalingFactorsDF.shape}' )

scalingFactorsDF.shape (26337, 2)
scalingFactorsDF.shape (26337, 1)


In [8]:
# Element-wise multiplication: every count in a sample's row is multiplied by that
# sample's scaling factor.
#
# Factors are matched to rows by sampleId label. Multiplying by the bare .values array
# pairs rows purely by position, so a different sample order in the two CSV files would
# silently scale samples with the wrong factor.
#
# NOTE(review): DESeq2's counts(dds, normalized=TRUE) divides raw counts by the size
# factors. Multiplying is only equivalent if the loaded file stores reciprocal
# (multiplicative) factors -- TODO confirm how GTEx_TCGA_GroupbyEstimatedSizeFactors.csv
# was produced.
sampleScalingFactors = scalingFactorsDF.iloc[:, 0]

# fail loudly instead of producing NaN rows when a sample has no factor
unmatchedSampleIds = groupbyTransposeDF.index.difference(sampleScalingFactors.index)
if len(unmatchedSampleIds) > 0:
    raise ValueError(
        f'{len(unmatchedSampleIds)} samples have no scaling factor, '
        f'e.g. {list(unmatchedSampleIds[:5])}'
    )

# reorder the factors to the row order of the count matrix, then scale row by row
alignedScalingFactors = sampleScalingFactors.reindex(groupbyTransposeDF.index)
transposedScaledDF = groupbyTransposeDF.mul(alignedScalingFactors, axis=0)
print(f'transposedScaledDF.shape {transposedScaledDF.shape}' )

transposedScaledDF.shape (26337, 74777)


In [9]:
%%time
# transpose so that shape is the same as the original groupBy dataFrame
scaledGroupbyDF = transposedScaledDF.transpose()
print(f'scaledGroupbyDF.shape {scaledGroupbyDF.shape}' )

scaledGroupbyDF.shape (74777, 26337)
CPU times: user 19 ms, sys: 277 µs, total: 19.3 ms
Wall time: 18.3 ms


In [10]:
%%time

aedwip save in HDF format. csv is very slow see csv2hdf5.ipynb
normalizedGroupbyHDF5_DF = pd.read_hdf(normalizedGroupbyHDF5Path)
print( f'normalizedGroupbyHDF5_DF.shape : {normalizedGroupbyHDF5_DF.shape}' )

# save
# the index is the geneId
scaledGroupbyPath = f'{dataRoot}/GTEx_TCGA_NormalizedGroupby.csv'
scaledGroupbyDF.to_csv( scaledGroupbyPath, index=True)

CPU times: user 44min 9s, sys: 15.4 s, total: 44min 24s
Wall time: 54min 27s
