# Create output transformation matrix for DHS733

This matrix converts accessibility values for n_biosamples (733) biosamples into values for n_unique_biosamples (261) unique biosample names by averaging the accessibilities of all biosamples that share the same biosample name.

Dimensions will be n_biosamples x n_unique_biosamples (733 x 261)

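Every column should sum to 1: the column for a given unique biosample name holds 1/k in each of the k rows whose biosample has that name, and 0 everywhere else.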

In [1]:
import os
import sys

import numpy
import pandas

# Relative path that is added to the module search path below and used as the
# prefix for data paths.
# NOTE(review): presumably the repository root as seen from this notebook's directory — confirm.
BASE_DIR = '../'
# Must run before the import that follows: it puts BASE_DIR on sys.path so
# that the `src` package can be resolved.
sys.path.append(BASE_DIR)
import src.definitions

# Location of the DHS733 biosample metadata file, formed by joining BASE_DIR
# with the path constant from src.definitions.
# NOTE(review): assumes src.definitions stores this path relative to BASE_DIR — confirm.
DHS733_BIOSAMPLE_META_PATH = os.path.join(BASE_DIR, src.definitions.DHS733_BIOSAMPLE_META_PATH)

In [2]:
# Read the tab-separated metadata file, dropping the last line of the file.
# skipfooter is only implemented by pandas' Python parser. Naming the engine
# explicitly avoids the ParserWarning that pandas emits when it has to fall
# back from the default C engine; the parsed result is the same as before.
biosample_metadata_df = pandas.read_csv(
    DHS733_BIOSAMPLE_META_PATH,
    sep='\t',
    skipfooter=1,
    engine='python',
)
biosample_metadata_df

  biosample_metadata_df = pandas.read_csv(DHS733_BIOSAMPLE_META_PATH, sep='\t', skipfooter=1)


Unnamed: 0,library order,Biosample name,Vocabulary representative,DCC Experiment ID,DCC Library ID,DCC Biosample ID,DCC File ID,Altius Aggregation ID,Altius Library ID,Altius Biosample ID,...,Library cleanup,DNaseI units/mL,Amount Nucleic Acid (ng),Nuclei count,Protease inhibitor,Library sequencing date,Reads used,DCC SPOT score,Per-biosample peaks,DHSs in Index
0,1,GM06990,,ENCSR000EMQ,ENCLB435ZZZ,ENCBS057ENC,ENCFF983CTQ,AG5636,LN1203,DS7748,...,Sucrose,,50,,,2009-02-23,142681590,0.6790,83639,82918
1,2,HepG2,,ENCSR000ENP,ENCLB480ZZZ,ENCBS114ENC,ENCFF419JVG,AG5635,LN1207,DS7764,...,Sucrose,,50,,,2009-02-23,138826342,0.5858,89748,89235
2,3,hTH1,,ENCSR000EQC,ENCLB591ZZZ,ENCBS345AAA,ENCFF575KOF,AG5634,LN1222,DS7840,...,Sucrose,6.0,534.9,,,2007-06-06,149158633,0.6470,94360,93665
3,4,Hela,,ENCSR000ENO,ENCLB479ZZZ,ENCBS890POO,ENCFF503PAE,AG4219,LN1264,DS8200,...,new Sucrose,4.0,50,,,2007-08-24,23372724,0.6444,59098,59024
4,5,CACO2,,ENCSR000EMI,ENCLB422ZZZ,ENCBS391ENC,ENCFF977BRD,AG4218,LN1269,DS8235,...,Sucrose,8.0,1,,,2007-09-05,22760059,0.7190,29894,29724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,729,fUmbilical_cord,,ENCSR512CWR,ENCLB771UER,ENCBS518LEK,ENCFF267RUD,AG7441,LN45036A,DS24820A,...,,,0.9,1260000.0,A+Sucrose,2017-02-17,195057523,0.5652,113517,112469
729,730,fBone_femur,Musculoskeletal,ENCSR805XIF,ENCLB236BWV,ENCBS337FPV,ENCFF604WIO,AG7442,LN45038B,DS36206B,...,,,8.8,1050000.0,A+Sucrose,2017-02-17,252066174,0.5823,146918,145356
730,731,fLiver,,ENCSR562FNN,ENCLB638FEH,ENCBS275VNY,ENCFF795ZXN,AG7443,LN45070C,DS37372C,...,,,4.48,2140000.0,A+Sucrose,,190541422,0.3703,76639,75369
731,732,fPlacenta,,ENCSR552RKI,ENCLB423VBC,ENCBS565KNL,ENCFF084UVH,AG8805,LN45072C,DS37386C,...,,,1.325,1050000.0,A+Sucrose,,203699532,0.3869,107611,106022


In [3]:
# Collect the distinct biosample names.
# Series.unique() returns values in order of first appearance (it does not
# sort), so the order below follows the row order of the metadata file.
# The column order of the transformation matrix is derived from this array.
biosample_name_column = biosample_metadata_df['Biosample name']
unique_biosample_names = biosample_name_column.unique()
print(f'Number of unique biosample names: {len(unique_biosample_names)}')

Number of unique biosample names: 261


In [4]:
# Generate transformation matrix
# Rows follow the row order of biosample_metadata_df; columns follow the order
# of unique_biosample_names. Column i holds 1/k in each of the k rows whose
# biosample name equals the i-th unique name and 0 elsewhere.
biosample_name_column = biosample_metadata_df['Biosample name']
transf_mat = numpy.zeros((len(biosample_metadata_df), len(unique_biosample_names)))
for i, unique_biosample_name in enumerate(unique_biosample_names):
    # Take positional row numbers from the boolean mask instead of DataFrame
    # index labels: labels only coincide with positions for a default
    # RangeIndex, and transf_mat is addressed by position.
    name_mask = (biosample_name_column == unique_biosample_name).to_numpy()
    biosample_indices = numpy.flatnonzero(name_mask)
    transf_mat[biosample_indices, i] = 1 / len(biosample_indices)
# check that every column sums to 1 (printed for inspection, then enforced so
# that a malformed matrix is never written to disk)
column_sums = transf_mat.sum(axis=0)
print(column_sums)
assert numpy.allclose(column_sums, 1), 'every column of transf_mat must sum to 1'
# save as npy
numpy.save('dhs733_nonredundant_transformation_matrix.npy', transf_mat)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


# Dataframe / biosample metadata

In [5]:
# Build one row per unique biosample name, listing the metadata index labels
# and the Altius Biosample IDs of every biosample that carries that name.
name_column = biosample_metadata_df['Biosample name']
rows_per_unique_name = [
    biosample_metadata_df[name_column == biosample_name]
    for biosample_name in unique_biosample_names
]
biosample_idx_per_unique_name = [
    rows.index.tolist() for rows in rows_per_unique_name
]
biosample_altius_ids_per_unique_name = [
    rows['Altius Biosample ID'].tolist() for rows in rows_per_unique_name
]

# Assemble dataframe; `columns` pins the column order explicitly.
nonredundant_biosample_df = pandas.DataFrame(
    {
        'Biosample name': unique_biosample_names,
        'Original biosample indices': biosample_idx_per_unique_name,
        'Altius Biosample IDs': biosample_altius_ids_per_unique_name,
    },
    columns=['Biosample name', 'Original biosample indices', 'Altius Biosample IDs'],
)

# Save as a tab-separated file without the row index, then display the frame.
nonredundant_biosample_df.to_csv('dhs733_nonredundant_biosample_metadata.tsv', sep='\t', index=False)
nonredundant_biosample_df

Unnamed: 0,Biosample name,Original biosample indices,Altius Biosample IDs
0,GM06990,"[0, 706]","[DS7748, DS7784]"
1,HepG2,"[1, 597, 598, 705]","[DS7764, DS24845A, DS24838A, DS7768]"
2,hTH1,"[2, 300, 340]","[DS7840, DS17592, DS18015]"
3,Hela,[3],[DS8200]
4,CACO2,"[4, 5]","[DS8235, DS8416]"
...,...,...,...
256,fBone_arm_right,[720],[DS39410B]
257,fBone_leg_left,[721],[DS39413A]
258,fBone_leg_right,[722],[DS39417A]
259,fBone_femur,[729],[DS36206B]
