# TODO does how to correct for PCR in RNA expriment


In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
transcriptCountsNP = np.array( 
                    [
                        np.power( [2, 4, 8, 16], 1),
                        np.power( [2, 4, 8, 16], 2),
                        np.power( [2, 4, 8, 16], 3),
                        np.power( [2, 4, 8, 16], 4),

                    ])
transcriptCountCols = ["g1", "g2", "g3", "g4"]
PCR_transcriptCountsDF = pd.DataFrame( 
                    transcriptCountsNP, 
                    index = ["groundTruth", "cycle1", "cycle2", "cycle3"],
                    columns = transcriptCountCols
                 )

PCR_transcriptCountsDF

Unnamed: 0,g1,g2,g3,g4
groundTruth,2,4,8,16
cycle1,4,16,64,256
cycle2,8,64,512,4096
cycle3,16,256,4096,65536


In [3]:
byCols = 1
rowSumsSeries = PCR_transcriptCountsDF.sum(axis=byCols) 
byIndex = 0
propDistributionDF = PCR_transcriptCountsDF.loc[:, transcriptCountCols].div( rowSumsSeries, axis=byIndex )

propDistributionDF

Unnamed: 0,g1,g2,g3,g4
groundTruth,0.066667,0.133333,0.266667,0.533333
cycle1,0.011765,0.047059,0.188235,0.752941
cycle2,0.001709,0.013675,0.109402,0.875214
cycle3,0.000229,0.003662,0.058595,0.937514


# Normalization Overiew

In [4]:
def calculateScalingFactor(intensityDF : pd.DataFrame,
                          verbose : bool = False) -> pd.Series :
    # print("\n********** intensity")
    # print(intensityDF)
    
    # remove rows with zeros. log is undefined
    byRows = 1
    selectRowsWithoutZeros = (intensityDF != 0).all(axis=byRows)
    if verbose:
        print("\n********** selectRowsWithoutZeros")
        display(selectRowsWithoutZeros)
    
    intensityDF = intensityDF.loc[selectRowsWithoutZeros, :]
    if verbose:
        print("\n********** intensity")
        print(intensityDF)
    
    # natural log
    # smooths over outliers
    logIntensityDF = np.log(intensityDF)
    if verbose:
        print("\n************ logIntensityDF ")
        display(logIntensityDF)   
    
    # calculate row average of logs. ie geometric means
    # geometric mean is not swayed by outliers. It will 
    # always be <= the arithmetic mean
    rowMeansSeries = logIntensityDF.mean(axis=byRows)
    if verbose:
        print("\n************ rowMeansSeries ")
        display(rowMeansSeries)   
    
    # find metabolites that are signifigantly larger or smaller than 
    # the average subtracting averages from intensities
    # log(a) - log(b) = log(a/b)
    byCols= 0
    outlierDF = logIntensityDF.subtract(rowMeansSeries, axis=byCols)
    if verbose:
        print("\n************ outlierDF ")
        print(type(outlierDF))
        display(outlierDF)   
    
    # calculate the scan/sample median log(estimated scaling factors)
    medianDF = outlierDF.median(axis=byCols)
    if verbose:
        print("\n************ medianDF ")
        print(type(medianDF))
        display(medianDF)      
    
    # convert back to base 10
    extimatedScalingFactorsSeries = np.exp(medianDF)
    # print("\n************ extimatedScalingFactorsSeries ")
    # print(type(extimatedScalingFactorsSeries))
    # display(extimatedScalingFactorsSeries)  
    
    return extimatedScalingFactorsSeries

## Case 1 : Library Size Normalization
No biologic difference between samples. Experiments designed to create different number of reads

In [5]:
def createLibSizeTestData() -> pd.DataFrame :
    '''
    The samples are identical. The only different was number of reads produced
    '''
    readsNP = np.array([30, 24, 0, 563, 5, 13]).reshape(6,1)

    byCols = 1
    countsNP = np.append(readsNP, 
                        readsNP * 2., 
                        axis=byCols)

    # use a data frame to make it easier to understand the data
    retDF = pd.DataFrame(countsNP, 
                             columns=["replicate1", "replicate2"],
                             index = ["transcript1", 
                                      "transcript2", 
                                      "transcript3", 
                                      "transcript4", 
                                      "transcript5",
                                      "transcript6",
                                     ])
                         
    return retDF
    
libSizeTestcountsDF = createLibSizeTestData()
libSizeTestcountsDF

Unnamed: 0,replicate1,replicate2
transcript1,30.0,60.0
transcript2,24.0,48.0
transcript3,0.0,0.0
transcript4,563.0,1126.0
transcript5,5.0,10.0
transcript6,13.0,26.0


In [6]:
def testLibarySizeNormalization():
    libSizeTestcountsDF = createLibSizeTestData()
    print("**** raw counts")
    display(libSizeTestcountsDF)
    
    extimatedScalingFactorsSeries = calculateScalingFactor(libSizeTestcountsDF)
    print(f'extimatedScalingFactorsSeries :\n{extimatedScalingFactorsSeries}')
    
    normalizedDF = libSizeTestcountsDF / extimatedScalingFactorsSeries
    
    print("\n**** normalized counts. replicate1 and replicate2 should be identical")
    display(normalizedDF)
    #display(np.isclose(normalizedDF['scan1'], normalizedDF['scan2']))
    np.testing.assert_allclose( normalizedDF['replicate1'],
                                normalizedDF['replicate2'],
                                err_msg="ERROR testLibarySizeNormalization() failed ") 

testLibarySizeNormalization()

**** raw counts


Unnamed: 0,replicate1,replicate2
transcript1,30.0,60.0
transcript2,24.0,48.0
transcript3,0.0,0.0
transcript4,563.0,1126.0
transcript5,5.0,10.0
transcript6,13.0,26.0


extimatedScalingFactorsSeries :
replicate1    0.707107
replicate2    1.414214
dtype: float64

**** normalized counts. replicate1 and replicate2 should be identical


Unnamed: 0,replicate1,replicate2
transcript1,42.426407,42.426407
transcript2,33.941125,33.941125
transcript3,0.0,0.0
transcript4,796.202236,796.202236
transcript5,7.071068,7.071068
transcript6,18.384776,18.384776


## Case 2: Library Compositions
Samples are biologically different. One sample has more targets. This causes the sample with fewer samples to have arifically high counts.

Example: 
- A drop out expereiment
- RNA collected from different cell types

In [7]:
def createLibCompositionTestData() -> pd.DataFrame :
    '''
    step 1)
    Create two samples. One sample represents a health control, the other a toxic exposure.
    The only difference is the sample with the toxic exposure has an additional target
    
    To make it easy to evaluate the normlization algorithym the toxic's target has a very high intensity
    
    step 2) simulate the raw intensities
    both sample will contain the same number of ions.
    '''
    tokinIntensity = 563
    toxinIntensityNP = np.array([30, 24, 0, tokinIntensity, 5, 13]).reshape(6,1)
    # print(f'\n****** total intensity\n{np.sum(toxinIntensityNP)}')
    
    # normal is missing toxin_a_mz_103, idx = 3
    # simulate where extra 563 ions will be distributed
    controlIntensityNP = np.array([30, 24, 0, 0, 5, 13]).reshape(6,1)
    propDistNP = (controlIntensityNP / np.sum(controlIntensityNP)).reshape(1,6)
    #print(f'\n****** propDistNP\n{propDistNP}')
    
    intensity = tokinIntensity * propDistNP
    # print(f'\n****** intensity\n{intensity}')
    # print(np.sum(intensity))
    
    controlRawIntensity = controlIntensityNP  + intensity.reshape(6,1)
    # print(f'\n****** controlRawIntensity\n{controlRawIntensity}')
    # print(np.sum(controlRawIntensity))

    byCols = 1
    scansNP = np.append(toxinIntensityNP, 
                        controlRawIntensity, 
                        axis=byCols)

    # use a data frame to make it easier to under the data
    retDF = pd.DataFrame(scansNP, 
                             columns=["sample1", "sample2"],
                             index = ["metabolite_1_mz_34.5", 
                                      "metabolite_2_mz_67.0", 
                                      "metabolite_3_mz_89.5", 
                                      "toxin_a_mz_103.1", 
                                      "metabolite_5_mz_121.5",
                                      "metabolite_6_mz_137.5",
                                     ])
                         
    return retDF

createLibCompositionTestData()

Unnamed: 0,sample1,sample2
metabolite_1_mz_34.5,30.0,264.583333
metabolite_2_mz_67.0,24.0,211.666667
metabolite_3_mz_89.5,0.0,0.0
toxin_a_mz_103.1,563.0,0.0
metabolite_5_mz_121.5,5.0,44.097222
metabolite_6_mz_137.5,13.0,114.652778


In [8]:
def testLibaryCompisionNormalization():
    rawIntensitiesDF = createLibCompositionTestData()
    print("**** raw intensities")
    display(rawIntensitiesDF)
    
    extimatedScalingFactorsSeries = calculateScalingFactor(rawIntensitiesDF)
    print("\n**** extimatedScalingFactorsSeries")
    display(extimatedScalingFactorsSeries)
    
    normalizedDF = rawIntensitiesDF / extimatedScalingFactorsSeries
    print("\n**** normalized intensities")
    print("target found in boths samples should have identical intensities")
    display(normalizedDF)
    
    # test the intersection of mz targets is the same after normalization
    byRows = 1
    selectRowsWithoutZeros = (normalizedDF != 0).all(axis=byRows)
    testResultsDF = normalizedDF.loc[selectRowsWithoutZeros, :]
    np.testing.assert_allclose( testResultsDF['sample1'],
                                testResultsDF['sample2'],
                                err_msg="ERROR testLibaryCompisionNormalization() failed ")     

testLibaryCompisionNormalization()

**** raw intensities


Unnamed: 0,sample1,sample2
metabolite_1_mz_34.5,30.0,264.583333
metabolite_2_mz_67.0,24.0,211.666667
metabolite_3_mz_89.5,0.0,0.0
toxin_a_mz_103.1,563.0,0.0
metabolite_5_mz_121.5,5.0,44.097222
metabolite_6_mz_137.5,13.0,114.652778



**** extimatedScalingFactorsSeries


sample1    0.336728
sample2    2.969755
dtype: float64


**** normalized intensities
target found in boths samples should have identical intensities


Unnamed: 0,sample1,sample2
metabolite_1_mz_34.5,89.092648,89.092648
metabolite_2_mz_67.0,71.274119,71.274119
metabolite_3_mz_89.5,0.0,0.0
toxin_a_mz_103.1,1671.972035,0.0
metabolite_5_mz_121.5,14.848775,14.848775
metabolite_6_mz_137.5,38.606814,38.606814


# Evaluate normalization on PCR products

In [9]:
def testPCRNormalization(df : pd.DataFrame, 
                         idx1 : str, 
                         idx2 : str) -> bool :
    '''
    TODO
    '''
    testDF =  PCR_transcriptCountsDF.loc[ [idx1, idx2] ]
    extimatedScalingFactorsSeries = calculateScalingFactor( testDF )
    print(f'extimatedScalingFactorsSeries :\n{extimatedScalingFactorsSeries}')

    print("\n**** normalized counts. replicate1 and replicate2 should be identical")    
    normalizedDF = testDF / extimatedScalingFactorsSeries
    display(normalizedDF)

    if normalizedDF.loc[idx1, :].equ normalizedDF.loc[idx2, :] :
        ret = True
        print(f'test passed: replicants are identical')
    else:
        ret = False
        print(f'test failed: replicants are not identical')

    return ret

In [10]:
testPCRNormalization(PCR_transcriptCountsDF, "groundTruth",  "cycle1")

extimatedScalingFactorsSeries :
g1    0.210224
g2    0.594604
g3    1.681793
g4    4.756828
dtype: float64

**** normalized counts. replicate1 and replicate2 should be identical


Unnamed: 0,g1,g2,g3,g4
groundTruth,9.513657,6.727171,4.756828,3.363586
cycle1,19.027314,26.908685,38.054628,53.817371


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
testPCRNormalization(PCR_transcriptCountsDF,  "cycle1" , "cycle2")

In [None]:
testPCRNormalization(PCR_transcriptCountsDF,  "cycle2" , "cycle3")

In [None]:
# # 1 cycle of PCR
# cycle1TestDF =  PCR_transcriptCountsDF.loc[ ["groundTruth", "cycle1"] ]
# extimatedScalingFactorsSeries = calculateScalingFactor( cycle1TestDF )
# print(f'cycle1ScalingFactors :\n{cycle1ScalingFactors}')
# normalizedDF = cycle1TestDF / extimatedScalingFactorsSeries

# print("\n**** normalized counts. replicate1 and replicate2 should be identical")
# display(normalizedDF)
# #display(np.isclose(normalizedDF['scan1'], normalizedDF['scan2']))
# # np.testing.assert_allclose( normalizedDF.loc['groundTruth', :],
# #                             normalizedDF.loc['cycle1', :],
# #                             err_msg="ERROR testLibarySizeNormalization() failed ") 
# # pd.testing.assert_series_equal(normalizedDF.loc['groundTruth', :], 
# #                               normalizedDF.loc['cycle1', :],
# #                              ) 
# normalizedDF.loc['groundTruth', :] == normalizedDF.loc['cycle1', :]