# Create Fractions Matrix Overview
```
Andrew E. Davidson
aedavids@ucsc.edu
9/10/22
```

In [1]:
import numpy as np
import pandas as pd

# use display() to render an HTML version of a data frame
# useful if the DataFrame output is not generated by the last line of the cell
from IPython.display import display

## Create sample data

In [2]:
# Toy expression matrix: one row per gene, one column per sample.
# The s1 and s1b values are chosen so that expected results are easy to
# verify by hand. We assume signature genes have already been selected
# and the samples have already been scaled.
#
# geneId is used as the index: it makes the later join easier, and after
# a transpose the index values become the column names.
groupByGeneDF = pd.DataFrame(
    {
        "s1": [1, 2, 3, 4],
        "s1b": [1.5, 2.5, 3.5, 4.5],
        "s2": [10, 20, 30, 40],
        "s3": [100, 200, 300, 400],
    },
    index=pd.Index(["g1", "g2", "g3", "g4"], name="geneId"),
)

display(groupByGeneDF)

# Sample metadata: maps each sample_id to its category.
# s1 and s1b share category c1.
colDataDF = pd.DataFrame(
    [
        ("s1", "c1"),
        ("s1b", "c1"),
        ("s2", "c2"),
        ("s3", "c3"),
    ],
    columns=["sample_id", "category"],
)
colDataDF

Unnamed: 0_level_0,s1,s1b,s2,s3
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
g1,1,1.5,10,100
g2,2,2.5,20,200
g3,3,3.5,30,300
g4,4,4.5,40,400


Unnamed: 0,sample_id,category
0,s1,c1
1,s1b,c1
2,s2,c2
3,s3,c3


In [3]:
# Transpose so that each row is a sample and the geneId index values
# become the column names. Request a copy so we do not accidentally
# change the original groupByGeneDF.
transposeGroupByDF = groupByGeneDF.transpose(copy=True)

# Attach each sample's category: the transposed frame is keyed by its
# index (the sample names), colDataDF by its 'sample_id' column.
# The inner join keeps only samples present in both frames.
joinDF = transposeGroupByDF.merge(
    colDataDF,
    how="inner",
    left_index=True,
    right_on="sample_id",
)

display(joinDF)

Unnamed: 0,g1,g2,g3,g4,sample_id,category
0,1.0,2.0,3.0,4.0,s1,c1
1,1.5,2.5,3.5,4.5,s1b,c1
2,10.0,20.0,30.0,40.0,s2,c2
3,100.0,200.0,300.0,400.0,s3,c3


## Create the Fractions Matrix

In [4]:
# Distinct category labels in sorted order; they are used as the
# per-type columns of the fractions matrix.
df = joinDF[["category"]].drop_duplicates().sort_values(by="category")
listOfTypes = df["category"].tolist()
listOfTypes

['c1', 'c2', 'c3']

In [5]:
# Build the fractions matrix: one row per sample, one column per type.
# Each sample belongs to exactly one category, so its row is a one-hot
# vector (1.0 in the column of its category, 0.0 elsewhere).
#
# The matrix is built in one vectorized step instead of appending a row
# at a time with .loc inside an iterrows() loop, which re-allocates the
# frame on every append and starts from columns with no defined dtype.
numTypes = len(listOfTypes)

# Column position of each sample's category within listOfTypes.
# get_indexer returns -1 for labels that are not in the index; it needs
# unique labels, which holds because listOfTypes comes from drop_duplicates().
typeIdx = pd.Index(listOfTypes).get_indexer(joinDF['category'])
if (typeIdx < 0).any():
    # keep failing loudly on an unknown category, as list.index() did
    unknownTypes = joinDF.loc[typeIdx < 0, 'category'].unique().tolist()
    raise ValueError("categories not in listOfTypes: {}".format(unknownTypes))

# Row i of the identity matrix is the one-hot vector for type i, so
# indexing the identity by typeIdx yields one float row per sample.
oneHot = np.eye(numTypes)[typeIdx]

fractionDF = pd.DataFrame(oneHot, columns=listOfTypes)
# to_numpy() inserts the ids by position, avoiding index alignment
# between joinDF's index and the new frame's default 0..n-1 index.
fractionDF.insert(0, 'sample_id', joinDF['sample_id'].to_numpy())

fractionDF

Unnamed: 0,sample_id,c1,c2,c3
0,s1,1.0,0.0,0.0
1,s1b,1.0,0.0,0.0
2,s2,0.0,1.0,0.0
3,s3,0.0,0.0,1.0
