### TF - motif bias matrix

1. downloading gene - motif name mapping in here: https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/snapshots/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl
2. then load the motif -> gene enrichment mapping in here: https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/the hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.scores.feather
3. then we replace the motif name with gene names
4. then we take the max enrichment if same gene-motif
5. then renormalize this per gene and save it as our new attention bias

----

### PPI bias matrix

1. we get the human PPI matrix from stringdb here: https://string-db.org/cgi/download?sessionId=bwVBywlOX6i8&species_text=Homo+sapiens
2. we get the same one for other species (e.g. mus musculus): https://stringdb-downloads.org/download/protein.links.v12.0/10090.protein.links.v12.0.txt.gz
3. we open them and use ensembl's biomart to map ENSP to ENSG. we merge all similar pairs
4. we map it to a grn with associated scores.
5. we use them as our new PPI bias matrix

In [1]:
import pandas as pd
from scdataloader.utils import load_genes
from scdataloader.utils import getBiomartTable
import numpy as np
from tqdm import tqdm
import torch
from scipy import sparse

%load_ext autoreload
%autoreload 2


  from anndata import __version__ as anndata_version
  from anndata import __version__ as anndata_version


[92m→[0m connected lamindb: jkobject/scprint2


In [2]:
genedf = load_genes()

In [7]:
LOC = "../../data/main/" #"./data/main/"

In [4]:
da = pd.read_feather(LOC+'main_scenic+_database.feather')
da = da.set_index('motifs')


tbl = pd.read_csv(LOC+'motifs-v10-nr.hgnc-m0.00001-o0.0.tbl'
, sep='\t', header=None, names=['motif_id','motif_name',
'motif_description','source_name','source_version','gene_name','motif_similarity_qvalue','similar_motif_id','similar_motif_description','orthologous_identity','orthologous_gene_name','orthologous_species','description'])
tbl = tbl.set_index('motif_id')
tbl = tbl[tbl.gene_name.isin(genedf.symbol)]


FileNotFoundError: [Errno 2] No such file or directory: './data/main/main_scenic+_database.feather'

In [None]:
rn = {}
toadd = {}
for i in tqdm.tqdm(da.index):
    try:
        res = tbl.loc[i, 'gene_name']
    except KeyError:
        da = da.drop(i)
    if type(res) != str:
        for v in res.values[1:]:
            toadd.update({v: da.loc[i].values})
        res = res.values[0]
    rn.update({i: res})

In [None]:
da = da.rename(index=rn)
da = pd.concat([da, pd.DataFrame(toadd, index=da.columns).T])
da = da.loc[~da.index.duplicated(keep='first')]


In [None]:
da.to_parquet(LOC+'main_scenic+.parquet')

In [8]:
da = pd.read_parquet(LOC+'main_scenic+.parquet')

In [None]:
biomart = getBiomartTable(attributes=['ensembl_gene_id', "hgnc_symbol"], bypass_attributes=True)

downloading gene names from biomart
['ensembl_gene_id', 'hgnc_symbol']


In [28]:
mp = {}
for v, k in biomart.iloc[:,:2].values:
    if v is np.nan or k is np.nan:
        continue
    if k in mp:
        if v not in mp[k]:
            mp[k]+=[v]
    else:
        mp[k] = [v]


In [14]:
MAX=5
MIN=0

In [15]:
da = MIN + (da - da.min(1).values[:, None]) * (MAX - MIN) / (da.max(1) - da.min(1)).values[:, None]


In [29]:
new_index = []
new_data = []

# Remap indices
for idx in da.index:
    if idx in mp:
        for mapped_idx in mp[idx]:
            new_index.append(mapped_idx)
            new_data.append(da.loc[idx].values)
    else:
        new_index.append(idx)
        new_data.append(da.loc[idx].values)

# Remap columns
new_columns_set = set()
remapped_data = []

for row in tqdm(new_data):
    new_row = []
    for col, value in zip(da.columns, row):
        if col in mp:
            for mapped_col in mp[col]:
                new_row.append(value)
                new_columns_set.add(mapped_col)
        else:
            new_row.append(value)
            new_columns_set.add(col)
    remapped_data.append(new_row)

new_columns = list(new_columns_set)

new_da = pd.DataFrame(remapped_data, index=new_index, columns=new_columns)


100%|██████████| 1692/1692 [00:11<00:00, 144.60it/s]


ValueError: 30498 columns passed, passed data had 30500 columns

In [30]:
new_columns

['ENSG00000180016',
 'ENSG00000136040',
 'LOC101926933',
 'ENSG00000007350',
 'ENSG00000163539',
 'ENSG00000276221',
 'ENSG00000204478',
 'ENSG00000231676',
 'ENSG00000163964',
 'ENSG00000275136',
 'ENSG00000119682',
 'ENSG00000116685',
 'LOC105377143',
 'FAM231B',
 'ENSG00000284979',
 'ENSG00000108797',
 'HSPC324',
 'ENSG00000154001',
 'LOC100506207',
 'LOC100506368',
 'ENSG00000138136',
 'ENSG00000200367',
 'C7orf55',
 'ENSG00000196544',
 'ENSG00000236751',
 'LOC100505918',
 'ENSG00000213401',
 'ENSG00000090097',
 'ENSG00000284013',
 'ENSG00000175416',
 'ENSG00000166136',
 'ENSG00000064042',
 'ENSG00000223638',
 'SPACA6P-AS',
 'ENSG00000147378',
 'FAM159B',
 'HCG16',
 'ENSG00000008988',
 'ENSG00000211590',
 'ENSG00000146535',
 'ENSG00000221649',
 'ENSG00000052126',
 'ENSG00000187733',
 'ENSG00000134248',
 'ENSG00000108829',
 'ENSG00000166793',
 'ENSG00000229027',
 'ENSG00000184434',
 'ENSG00000073792',
 'ENSG00000291865',
 'ENSG00000144218',
 'ENSG00000118369',
 'ENSG00000281934',
 '

In [None]:
new_da

Unnamed: 0,CADM2-AS2,PHGR1,LOC101926933,MIR3119-1,TLN2,VSTM2A,ZDHHC24,PQBP1,LOC105377143,FAM231B,ADCY3,RPS15,PBDC1,HSPC324,APOBEC4,UTP18,LOC100506207,LOC100506368,SNX29P2,C7orf55,MIR5007,IGF2BP3,LOC100505918,SPACA6P-AS,IKZF4,FAM159B,HCG16,MIR4769,CD70,MIOX,PRNT,NEO1,CD55,DAND5,CCDC158,MVB12A,SH2D1A,DEFB112,RLIM,TP53BP1,EPB41L2,UCP2,ZNF546,DDC-AS1,MAP4K4,LINC00572,PPARD,CHKB-CPT1B,LOC101929372,IGFL3,BCOR,LINC01257,LOC101928551,ATXN1L,FLJ38576,TTC7A,FAM159A,MIR6770-3,BMP8A,L3MBTL4-AS1,SNORD3A,LOC102723885,MANBA,LOC152225,ARHGAP35,ELP4,DAB2IP,LOC100505920,LOC101928307,GOLGA6L17P,KNG1,DZANK1,DLX5,SMS,YBEY,OR51G1,MIR638,COL3A1,DSTN,MIR4259,MIR548AB,NLGN4X,SHISA9,USP3,RAP2B,MIR6792,ADAMTS18,TMTC2,LINC01048,S100A1,ROGDI,MIR466,TAF4,ATG4C,HRH3,FGFBP2,ALG10B,FRAT1,AGTPBP1,LRP5L,...,LOC100507412,PLIN4,ACKR2,PCDHA9,MIR4477B,LINC01184,SLC16A4,MBD4,ERICH3,MIR181A2,MMP25-AS1,CELA3B,MAFF,GLIS2,KCTD14,MIR6828,GTF3C6,LINC00308,LOC105372280,LOC105373100,SULT1E1,SRM,CHAD,LINC01204,FUT4,SRD5A1,TAS2R5,TMEM132E,TTC9,ATG13,PRDM4,SNORD112,C16orf59,ITGAD,LOC283683,SLC6A17,NCK2,ATL3,MIR1244-4,WFDC10B,NFASC,SNORA59B,UEVLD,SOX8,MEIOB,WIPF2,KIF3B,GSTA1,ZNF525,HOXA5,MYO1D,SNORD109B,TMEM189-UBE2V1,ARRDC1,MUC7,RPP14,TNS3,JCHAIN,LINC01508,SAMD1,TTTY23B,OR5M1,ZC3HC1,SNORD31,MUM1L1,UQCRBP1,RNF175,LRRTM4,SLC15A5,MIR3668,SRC,LOC101928812,C6orf141,ATAD1,GEMIN6,LINC01467,SNORD86,CDK2,MEF2A,LOC101929161,WDR43,FLJ31104,NFYC,CLTC,KPNA6,LOC101928435,ST3GAL5-AS1,ZNF527,CMSS1,H1FX,FAP,C2CD4D,MDGA1,TCEB1,CLASP2,PTGS1,LOC105373876,MIR4713,NRIP1,CES1
RBPJ,0.060400,0.060800,0.197400,0.128600,0.082600,0.149600,0.095200,0.150200,0.125000,0.119600,0.123600,0.147600,0.181400,0.126000,0.090200,0.083600,0.177000,0.156400,0.086200,0.097400,0.100200,0.105400,0.093000,0.087600,0.130600,0.135800,0.076400,0.150000,0.230000,0.106400,0.125400,0.141200,0.079000,0.069000,0.154000,0.117000,0.121800,0.162000,0.161800,0.143400,0.109000,0.164800,0.094600,0.081600,0.146800,0.116800,0.063400,0.102400,0.131800,0.145000,0.202000,0.133000,0.153200,0.089200,0.116800,0.094800,0.191400,0.140200,0.082000,0.076000,0.175000,0.172800,0.128400,0.089000,0.136200,0.130400,0.110600,0.151200,0.112400,0.146600,0.079400,0.138400,0.097200,0.086600,0.111200,0.081800,0.074200,0.067000,0.070000,0.099800,0.187200,0.142800,0.078800,0.116400,0.139400,0.086400,0.136400,0.067200,0.178400,0.187400,0.254000,0.126400,0.122400,0.107600,0.115000,0.155000,0.066200,0.122400,0.108400,0.143600,...,0.172400,0.128600,0.124000,0.091200,0.065200,0.116600,0.187400,0.073800,3.040000,0.074000,0.166400,0.071200,0.069800,0.124800,0.148600,0.124400,0.119000,0.108400,0.100600,0.085000,0.137200,0.131400,0.100800,0.131000,0.068800,0.152600,0.236000,0.222000,0.117000,0.070400,0.068200,0.058600,0.122800,0.093800,0.126600,0.127400,0.136000,0.162400,0.188000,0.222000,0.129800,0.126200,0.070400,0.110600,0.129000,0.143000,0.076000,0.091600,0.111400,0.262000,0.115000,0.085200,0.121200,0.131200,0.085600,0.076600,0.078200,0.143400,0.176200,0.198200,0.078400,0.111400,0.101800,0.131000,0.166400,0.154200,0.083600,0.194000,0.145200,0.154800,0.127200,0.116000,0.125400,0.103200,0.114400,0.190800,0.978000,0.146800,0.128400,0.240000,0.142200,0.092000,0.090000,0.111400,0.166000,0.061800,0.256000,0.122000,0.148600,0.119800,0.086400,0.141000,0.226000,0.177200,0.135400,0.067000,0.081400,0.151200,0.131000,0.131400
FOXC2,0.387407,0.362222,0.851852,0.650370,0.445185,0.371852,0.304444,0.429630,0.537778,0.396296,0.337778,0.371111,0.785185,0.400741,0.478519,0.400000,0.403704,0.619259,0.242222,0.325185,0.417778,0.370370,0.814815,0.560741,0.339259,0.628889,0.419259,0.434815,0.390370,0.208889,0.358519,0.268889,0.540000,0.962963,0.529630,0.837037,0.408889,0.417037,0.415556,0.340741,0.495556,0.443704,0.532593,0.599259,0.512593,0.455556,1.429630,0.038074,1.718518,0.455556,0.384444,0.442963,0.388889,0.544444,0.393333,0.242222,0.662222,0.440741,0.911111,0.343704,0.718518,0.282222,0.520741,0.356296,0.382222,0.393333,0.601481,0.301481,0.405926,0.377778,0.445185,0.711852,0.474074,0.658519,0.630370,0.508148,0.488148,0.524444,0.320000,0.557778,0.197778,0.453333,0.556296,0.275556,0.537037,0.197037,0.504444,0.300000,0.522222,1.281482,0.339259,0.434815,0.465185,0.354815,0.401481,0.345926,0.321481,0.426667,0.351111,0.471111,...,0.593333,0.422222,0.549630,0.499259,0.286667,0.302963,0.365926,0.245185,0.640000,0.263704,0.408148,0.731852,0.685926,0.762963,0.312593,0.859259,0.209630,0.514815,0.279259,0.320000,0.417778,0.933333,0.503704,0.308889,0.387407,0.518519,0.428148,0.562963,0.368148,0.396296,0.341482,0.405926,0.289630,0.431852,0.285185,0.277037,0.320741,0.337037,0.582222,0.447407,0.582963,0.417778,0.181481,0.464444,0.532593,0.353333,0.369630,0.307407,0.367407,0.319259,0.357037,0.291852,0.468148,0.572593,0.422963,0.308148,0.297778,0.554074,0.721481,0.600741,0.337037,0.499259,0.436296,0.690370,0.374074,0.402222,0.288889,0.437778,0.551111,0.705926,0.438519,0.380000,0.397037,0.424444,0.251111,1.340741,0.539259,0.386667,0.458519,0.243704,0.725185,0.199259,0.080741,0.374815,0.335556,0.372593,0.322963,0.254074,0.588148,0.437037,0.325185,0.276296,0.279259,0.297037,0.325926,0.367407,0.807407,0.428889,0.344444,0.412593
YY1,0.039437,0.016690,0.402817,0.323944,0.024859,0.299296,0.294366,0.282394,0.286620,0.290141,0.283803,0.299296,0.283803,0.287324,0.316197,0.309155,0.302113,0.282394,0.290845,0.309155,0.318310,0.319718,0.303521,0.280986,0.277465,0.025352,0.283099,0.283099,0.301408,0.297887,0.510563,0.296479,0.286620,0.310563,0.287324,0.307746,0.356338,0.310563,0.310563,0.292958,0.287324,0.295070,0.572535,0.309859,0.596479,0.338732,0.280986,0.351408,0.283803,0.438028,0.514789,0.307746,0.294366,0.295775,0.302817,0.306338,0.316197,0.288732,0.294366,0.306338,0.411972,0.316901,0.303521,0.363380,0.290141,0.286620,0.295775,0.318310,0.285211,0.297887,0.289437,0.292254,0.281690,0.302817,0.488028,0.290845,0.289437,0.282394,0.288732,0.300000,0.294366,0.314789,0.297887,0.296479,0.321127,0.302817,0.032394,0.311268,0.558451,0.295070,0.550704,0.290845,0.283803,0.289437,0.311268,0.273239,0.287324,0.288028,0.287324,0.016479,...,0.293662,0.283803,0.291549,0.012113,0.278873,0.502817,0.280282,0.301408,0.291549,0.591549,0.322535,0.291549,0.279577,0.292254,0.281690,0.284507,0.501408,0.538732,0.334507,0.300704,0.276761,0.291549,0.278873,0.295070,0.291549,0.010845,0.311268,0.009930,0.291549,0.284507,0.022606,0.008521,0.302113,0.309859,0.319718,0.285211,0.290845,0.303521,0.024648,0.031620,0.288732,0.461268,0.279577,0.288028,0.711268,0.293662,0.643662,0.300000,0.326761,0.315493,0.298592,0.528873,0.292958,0.293662,0.291549,0.297887,0.302817,0.287324,0.285915,0.318310,0.288028,0.557042,0.279577,0.545775,0.278873,0.285211,0.283803,0.287324,0.296479,0.304930,0.547887,0.321127,0.280282,0.300704,0.288028,0.547183,0.450704,0.304225,0.301408,0.278873,0.554225,0.311972,0.309859,0.282394,0.576761,0.328873,0.290141,0.016268,0.312676,0.288028,0.280986,0.304930,0.299296,0.289437,0.010423,0.283099,0.291549,0.322535,0.291549,0.657042
NR2E1,0.677861,0.096393,0.424751,0.297886,0.250000,0.294776,0.438433,0.139303,0.373134,0.263060,0.295398,0.442786,0.393035,0.291667,0.262438,0.347015,0.438433,0.235075,0.322761,0.416667,0.289801,0.541045,0.238806,0.572139,0.269279,0.092662,0.095149,0.236940,0.424129,0.340796,0.297264,0.286070,0.518035,0.206468,0.493781,0.233831,0.284826,0.526741,0.526741,0.336443,0.342662,0.276119,0.400498,0.288557,0.439677,0.452736,0.362562,0.000000,0.424129,0.251244,0.671642,0.383706,0.097637,0.277363,0.218284,0.267413,0.883085,0.589552,0.248134,0.294776,0.552861,0.117537,0.315298,0.060261,0.176617,0.371269,0.284826,0.283582,0.380597,0.346393,0.358831,0.102612,0.294776,0.411070,0.268657,0.268657,0.330224,0.039614,0.075871,0.257463,0.424751,0.287935,0.273010,0.250622,0.368781,0.055473,0.353234,0.449005,0.345149,0.430970,0.086443,0.292289,0.049192,0.321517,0.263060,0.404851,0.367537,0.342662,0.340796,0.250000,...,0.158582,0.320895,0.294776,0.282338,0.329602,0.328358,0.312811,0.061194,0.358209,0.258706,0.287313,0.371891,0.315920,0.387438,0.384328,0.570274,0.062189,0.287935,0.272388,0.305348,0.366294,0.322139,0.311567,0.281716,0.304726,0.287935,0.269901,0.360075,0.275497,0.069030,0.087065,0.062127,0.378109,0.483209,0.266169,0.259328,0.305348,0.314055,0.501244,0.479478,0.491915,0.315920,0.148632,0.394279,0.274876,0.470771,0.282960,0.218905,0.610696,0.387438,0.100124,0.471393,0.279229,0.423507,0.238184,0.031405,0.321517,0.143657,0.435945,0.258085,0.268657,0.271144,0.268657,0.634328,0.317786,0.337065,0.306592,0.366915,0.269901,0.303483,0.245025,0.312811,0.041853,0.269901,0.634328,0.267413,0.447139,0.452736,0.341418,0.470149,0.291045,0.074627,0.063433,0.210821,0.330846,0.282960,0.294776,0.050684,0.280473,0.394901,0.226368,0.212687,0.296020,0.294776,0.347637,0.329602,0.250622,0.388060,0.297886,0.431592
BCL6B,0.183955,0.179851,0.402985,0.414179,0.159328,0.432836,0.988806,0.194403,0.470149,0.507463,0.544776,0.582090,0.304851,0.600746,0.787313,0.488806,0.249627,0.317537,0.216791,0.328358,0.325373,0.313433,0.372761,0.578358,0.347388,0.260821,0.250373,0.297015,0.417910,0.202239,0.288806,0.192164,0.187687,0.641791,0.832090,0.578358,0.589552,0.309702,0.268284,0.578358,0.582090,0.768657,0.291045,0.207090,0.421642,0.302239,0.388060,0.131343,0.690298,0.615672,1.022388,0.466418,0.188433,0.306343,0.421642,0.169030,0.421642,0.462687,0.701493,0.552239,0.360821,0.364552,0.597015,0.370522,0.243657,0.514925,0.645522,0.257836,0.485075,1.194030,0.395522,0.548507,0.278358,0.481343,0.305224,0.253358,0.263806,0.276493,0.533582,0.511194,0.214925,0.325746,0.458955,0.410448,0.518657,0.347761,0.349254,0.183955,0.809702,0.652985,0.481343,0.690298,0.340299,0.245522,0.447761,0.455224,0.204478,0.238060,0.578358,0.417910,...,0.255597,0.247761,0.216045,1.197761,0.260821,0.267910,0.432836,0.199627,0.529851,0.125000,0.320149,0.563433,0.444030,0.391791,0.425373,0.492537,0.191045,0.388060,0.187687,0.921642,0.462687,0.286194,0.244030,0.341418,0.451493,0.342910,0.247388,0.372761,0.406716,0.503731,0.406716,0.282463,0.310448,0.200000,0.809702,0.824627,0.318284,0.344030,0.384328,0.354851,0.563433,0.285075,0.263806,0.329478,0.477612,0.254478,0.492537,0.245522,0.462687,0.264552,0.421642,0.514925,0.809702,0.333955,0.196642,0.791045,0.239179,0.574627,0.339179,1.350746,0.380597,0.514925,0.414179,0.507463,0.295896,0.402985,0.308209,0.369776,0.462687,0.354478,0.320522,0.391791,0.406716,0.440298,0.263433,0.373134,1.085821,0.384328,0.399254,0.150373,0.604478,0.200746,0.286940,1.261194,1.000000,0.388060,0.289552,0.201493,0.195522,0.466418,0.250746,0.342910,0.548507,0.406716,0.320149,0.310821,0.936567,0.347761,0.522388,0.828358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCF24,0.077598,0.077714,0.083603,0.085104,0.000000,0.049076,0.030139,0.029792,0.034296,0.030139,0.024827,0.032102,0.044688,0.090878,0.034296,0.025173,0.094111,0.034180,0.018822,0.032679,0.036952,0.024134,0.087760,0.025289,0.000000,0.014319,0.026443,0.094804,0.040069,0.000000,0.107044,0.029446,0.000000,0.021132,0.081409,0.029561,0.036259,0.049192,0.049192,0.037413,0.086605,0.021709,0.021709,0.027252,0.029215,0.026328,0.000000,0.033372,0.048961,0.048845,0.027367,0.029792,0.024134,0.083949,0.029215,0.016628,0.087760,0.042494,0.030831,0.040762,0.034758,0.035681,0.022055,0.034411,0.033834,0.000000,0.092494,0.025635,0.025058,0.030370,0.026097,0.032679,0.032564,0.032910,0.031409,0.023210,0.020901,0.021940,0.037413,0.045497,0.024018,0.079561,0.022864,0.000000,0.084988,0.000000,0.030485,0.093533,0.039607,0.033834,0.006224,0.023441,0.025635,0.086952,0.029792,0.026674,0.000000,0.027598,0.032794,0.075404,...,0.033834,0.028753,0.031524,0.011259,0.000000,0.022055,0.071132,0.036490,0.033834,0.035566,0.023903,0.172055,0.029561,0.031986,0.000258,0.024480,0.090531,0.028406,0.029215,0.090185,0.029792,0.030600,0.024480,0.030716,0.027252,0.027252,0.075289,0.035219,0.024942,0.086952,0.016397,0.028406,0.074596,0.109353,0.031062,0.031986,0.016166,0.025173,0.079215,0.081409,0.028637,0.087182,0.000000,0.060855,0.022171,0.044804,0.037760,0.030139,0.043764,0.030254,0.030716,0.019861,0.032448,0.089607,0.034758,0.016051,0.084411,0.025866,0.024596,0.026328,0.000000,0.027367,0.031755,0.076905,0.027829,0.038799,0.084296,0.051386,0.091224,0.078522,0.021709,0.093418,0.025866,0.035797,0.029908,0.023672,0.089838,0.077252,0.000000,0.024018,0.026905,0.000000,0.024711,0.036259,0.034642,0.000000,0.021247,0.000000,0.018707,0.018245,0.071709,0.086721,0.034065,0.030139,0.021594,0.000000,0.036490,0.000000,0.020439,0.045612
GTF2A2,0.115323,0.120565,0.265726,0.150403,0.127823,0.184677,0.215726,0.077823,0.152016,0.355645,0.152419,0.183065,0.275000,0.101210,0.354436,0.202419,0.156048,0.148387,0.149597,0.228226,0.150806,0.143952,0.212903,0.251613,0.137097,0.105645,0.175000,0.131855,0.208065,0.110081,0.227419,0.061694,0.121774,0.164516,0.243548,0.329032,0.258871,0.415323,0.415323,0.183468,0.147984,0.207258,0.194355,0.204839,0.181048,0.116935,0.131855,0.124194,0.143145,0.199194,1.052419,0.245565,0.193952,0.229435,0.198790,0.165726,0.127016,0.143548,0.254435,0.192339,0.104032,0.166129,0.224194,0.199597,0.109677,0.145161,0.206855,0.120565,0.215323,0.228629,0.245968,0.160081,0.120968,0.127419,0.202823,0.240726,0.204032,0.227823,0.144758,0.287097,0.101210,0.229032,0.372984,0.153226,0.175000,0.128226,0.140726,0.182258,0.277823,0.127016,0.162500,0.173387,0.111290,0.141935,0.310081,0.135887,0.070565,0.178629,0.164919,0.233065,...,0.176210,0.148790,0.224597,0.163710,0.143952,0.150806,0.161694,0.094355,0.206855,0.131048,0.226613,0.110887,0.122581,0.156452,0.197177,0.166935,0.133065,0.160484,0.168145,0.116935,0.223790,0.180242,0.158871,0.066129,0.193548,0.157661,0.262903,0.230242,0.113710,0.267339,0.280645,0.076210,0.241532,0.159677,0.195565,0.199194,0.147984,0.208468,0.213306,0.149194,0.178226,0.296371,0.142742,0.155645,0.126210,0.139919,0.137097,0.166935,0.183065,0.182661,0.213306,0.169355,0.107258,0.194355,0.110484,0.302016,0.101613,0.128226,0.147177,0.335484,0.204839,0.175403,0.174597,0.342339,0.264919,0.122984,0.154032,0.157661,0.366532,0.156452,0.235081,0.184677,0.095968,0.182258,0.156855,0.201613,0.293145,0.206048,0.119758,0.102419,0.212097,0.096371,0.157258,0.166129,0.431452,0.102016,0.114516,0.145161,0.175806,0.148790,0.062903,0.186290,0.162903,0.145161,0.143952,0.177823,0.303226,0.224597,0.160484,0.612903
ZFP91-CNTF,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.541237,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.342784,0.000000,0.143557,0.000000,0.000000,0.329897,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.265464,0.466495,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.940722,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.994845,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.822165,0.023995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.896907,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.048711,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033247,0.000000,0.211340,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.682990,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.053608,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.219588,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LDB1,0.055594,0.018007,0.155944,0.110490,0.021399,0.118182,0.131469,0.090909,0.115385,0.132517,0.133566,0.128322,0.089860,0.118881,0.070629,0.170979,0.154895,0.074126,0.153147,0.139860,0.093007,0.092308,0.157343,0.119930,0.053846,0.095105,0.129021,0.068182,0.081818,0.122028,0.115734,0.064336,0.108042,0.117133,0.120979,0.129720,0.161538,0.101748,0.101748,0.107692,0.131119,0.116783,0.176923,0.078322,0.202797,0.134965,0.143357,0.140559,0.141608,0.110140,0.101748,0.093706,0.082517,0.128322,0.147203,0.089510,0.092657,0.112238,0.179021,0.118531,0.117133,0.119580,0.146504,0.090210,0.133916,0.062937,0.128322,0.148951,0.084615,0.107343,0.133217,0.147203,0.075175,0.227622,0.146504,0.101748,0.077273,0.076923,0.067483,0.114685,0.109091,0.140909,0.108042,0.088112,0.077273,0.111888,0.147902,0.061189,0.117483,0.091608,0.199650,0.268182,0.297552,0.105594,0.129021,0.170629,0.006888,0.083217,0.144056,0.150699,...,0.096853,0.110140,0.155245,0.076573,0.129021,0.093357,0.120979,0.105245,0.076923,0.107343,0.129371,0.167832,0.032657,0.165734,0.115734,0.136713,0.094056,0.086713,0.122028,0.108392,0.087063,0.138112,0.082168,0.097203,0.097203,0.098601,0.144406,0.143007,0.114685,0.167133,0.122378,0.025035,0.137413,0.137762,0.090210,0.090559,0.112587,0.085315,0.110490,0.120629,0.161538,0.146504,0.085315,0.059091,0.134965,0.204196,0.150699,0.156294,0.135664,0.110140,0.126224,0.129021,0.212937,0.134965,0.056993,0.117832,0.142657,0.128671,0.121329,0.117483,0.111888,0.135315,0.160839,0.085315,0.153147,0.100000,0.134965,0.151748,0.147902,0.166434,0.113636,0.100350,0.144056,0.078671,0.054895,0.102098,0.156643,0.100000,0.090210,0.056294,0.103846,0.095455,0.117832,0.097902,0.125874,0.084615,0.136713,0.104545,0.136364,0.060839,0.098601,0.123427,0.081469,0.120629,0.081818,0.066084,0.213287,0.109441,0.125524,0.116084


In [18]:
# Ensure new_da is a square matrix with the union of both columns and indices
all_labels = sorted(set(new_da.index).union(set(new_da.columns)))
new_da = new_da.reindex(index=all_labels, columns=all_labels, fill_value=0)
new_da


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,AA06,AAAS,AACS,AACSP1,AADAC,AADACL2,AADACL2-AS1,AADACL3,AADACL4,AADACP1,AADAT,AAED1,AAGAB,AAK1,AAMDC,AAMP,AANAT,AAR2,AARD,AARS,AARS2,AARSD1,AASDH,AASDHPPT,AASS,AATBC,AATF,AATK,AATK-AS1,ABALON,ABAT,ABCA1,ABCA10,ABCA11P,ABCA12,ABCA13,ABCA17P,ABCA2,ABCA3,ABCA4,ABCA5,ABCA6,ABCA7,ABCA8,ABCA9,ABCA9-AS1,ABCB1,ABCB10,ABCB11,ABCB4,ABCB5,ABCB6,ABCB7,ABCB8,ABCB9,ABCC1,ABCC10,ABCC11,ABCC12,ABCC13,ABCC2,ABCC3,ABCC4,ABCC5,ABCC5-AS1,ABCC6,ABCC6P1,ABCC6P2,ABCC8,ABCC9,ABCD1,ABCD2,ABCD3,ABCD4,ABCE1,ABCF1,ABCF2,ABCF3,ABCG1,ABCG2,ABCG4,ABCG5,ABCG8,ABHD1,ABHD10,ABHD11,ABHD11-AS1,ABHD12,ABHD12B,ABHD13,...,ZNF84,ZNF841,ZNF843,ZNF844,ZNF845,ZNF846,ZNF85,ZNF850,ZNF852,ZNF853,ZNF860,ZNF862,ZNF865,ZNF875,ZNF876P,ZNF878,ZNF879,ZNF880,ZNF883,ZNF888,ZNF890P,ZNF891,ZNF90,ZNF91,ZNF92,ZNF93,ZNF98,ZNF99,ZNFX1,ZNHIT1,ZNHIT2,ZNHIT3,ZNHIT6,ZNRD1,ZNRD1ASP,ZNRF1,ZNRF2,ZNRF2P1,ZNRF2P2,ZNRF3,ZNRF3-AS1,ZNRF4,ZP1,ZP2,ZP3,ZP4,ZPBP,ZPBP2,ZPLD1,ZPR1,ZRANB1,ZRANB2,ZRANB2-AS1,ZRANB2-AS2,ZRANB3,ZRSR2,ZSCAN1,ZSCAN10,ZSCAN12,ZSCAN12P1,ZSCAN16,ZSCAN16-AS1,ZSCAN18,ZSCAN2,ZSCAN20,ZSCAN21,ZSCAN22,ZSCAN23,ZSCAN25,ZSCAN26,ZSCAN29,ZSCAN30,ZSCAN31,ZSCAN32,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSCAN5C,ZSCAN9,ZSWIM1,ZSWIM2,ZSWIM3,ZSWIM4,ZSWIM5,ZSWIM6,ZSWIM7,ZSWIM8,ZSWIM8-AS1,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
A1BG,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
A1BG-AS1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
A1CF,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
A2M,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
A2M-AS1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
ZYG11B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
ZYX,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
ZZEF1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000


In [19]:
new_da = new_da.loc[new_da.index.str.contains('ENSG'),new_da.index.str.contains('ENSG')]

In [22]:
new_da

In [None]:
new_da = new_da.T + new_da

## protein

In [17]:
string = pd.read_csv('../data/main/9606.protein.links.v12.0.txt.gz', sep=' ')

In [20]:
del string

In [18]:
string.protein1 = string.protein1.str.split('.').str[1]
string.protein2 = string.protein2.str.split('.').str[1]

In [10]:
rel = {}
for a, b in string.iloc[:,:2].values:
    a, b = rn.get(a, ""), rn.get(b, "")
    if a and b:
        if a in rel:
            rel[a].add(b)
        else:
            rel[a] = set([b])  
        if b in rel:
            rel[b].add(a)
        else:
            rel[b] = set([a])
len(rel)

19193

In [26]:
res = pd.DataFrame(data=np.zeros((len(rel), len(rel))), columns=list(rel.keys()), index=list(rel.keys()))

In [12]:
res.drop(columns=[''], inplace=True)
res.drop(index=[''], inplace=True)

In [25]:
for i,j in tqdm.tqdm(rel.items()):
    res.loc[i,list(j)] = 1


19193it [00:20, 917.76it/s] 


In [None]:
res.mean().mean()

0.03608647264434039

In [None]:
res.to_parquet('../data/main/stringdb_bias.parquet')

In [21]:
res = pd.read_parquet("../data/main/stringdb_bias.parquet")

In [30]:
res

Unnamed: 0,ENSG00000075292,ENSG00000172531,ENSG00000180745,ENSG00000156886,ENSG00000185069,ENSG00000125817,ENSG00000177459,ENSG00000074201,ENSG00000109390,ENSG00000165066,...,ENSG00000152592,ENSG00000164778,ENSG00000130957,ENSG00000165084,ENSG00000114487,ENSG00000178093,ENSG00000203950,ENSG00000126953,ENSG00000185985,ENSG00000180210
ENSG00000075292,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000172531,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ENSG00000180745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000156886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000185069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000178093,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000203950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000126953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
ENSG00000185985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
res + res.T

In [29]:
res.sum(1)

ENSG00000075292    1221.0
ENSG00000172531    3264.0
ENSG00000180745     466.0
ENSG00000156886     287.0
ENSG00000185069     671.0
                    ...  
ENSG00000178093    1031.0
ENSG00000203950       0.0
ENSG00000126953     852.0
ENSG00000185985     719.0
ENSG00000180210    1276.0
Length: 19193, dtype: float64

In [66]:
# # Align the indices and columns of res with da, and vice versa, filling missing values with 0
all_indices = sorted(set(new_da.index).union(set(res.index)))
# 
da_aligned = new_da.reindex(index=all_indices, columns=all_indices, fill_value=0)
res_aligned = res.reindex(index=all_indices, columns=all_indices, fill_value=0)
# 
# # Add res to da
da = da_aligned + res_aligned

In [67]:
da.shape

(28206, 28206)

In [68]:
del new_da
del res

In [16]:
genes = torch.load('../data/temp/vbd8bavn/epoch=17-step=90000.ckpt')['hyper_parameters']['genes']

In [None]:
biomart = getBiomartTable(attributes=['mmusculus_homolog_ensembl_gene', 'ensembl_gene_id'], bypass_attributes=True)
biomart

downloading gene names from biomart
['mmusculus_homolog_ensembl_gene', 'ensembl_gene_id']


Unnamed: 0,mmusculus_homolog_ensembl_gene,ensembl_gene_id
0,,ENSG00000210049
1,,ENSG00000211459
2,,ENSG00000210077
3,,ENSG00000210082
4,,ENSG00000209082
...,...,...
77642,,ENSG00000232679
77643,ENSMUSG00000088001,ENSG00000200033
77644,,ENSG00000228437
77645,,ENSG00000229463


In [None]:
biomart[~biomart.mmusculus_homolog_ensembl_gene.isna()]

Unnamed: 0,mmusculus_homolog_ensembl_gene,ensembl_gene_id
5,ENSMUSG00000064341,ENSG00000198888
9,ENSMUSG00000064345,ENSG00000198763
15,ENSMUSG00000064351,ENSG00000198804
18,ENSMUSG00000064354,ENSG00000198712
20,ENSMUSG00000064356,ENSG00000228253
...,...,...
77627,ENSMUSG00000032913,ENSG00000198799
77639,ENSMUSG00000102439,ENSG00000143631
77640,ENSMUSG00000049133,ENSG00000143520
77641,ENSMUSG00000039384,ENSG00000143507


In [79]:
mp = {} 
for k,v in biomart.iloc[:,[0,2]].values:
    if v is np.nan or k is np.nan:
        continue
    if k in mp:
        if v not in mp[k]:
            mp[k]+=[v]
    else:
        mp[k] = [v]

In [81]:
# Ensure da contains exactly the same genes as in genes (dropping or filling empty with 0)
da = da.reindex(index=genes, columns=genes, fill_value=0)
# Convert da to a sparse array
da = sparse.csr_matrix(da.values)


In [None]:
biomart_filtered = biomart[~biomart.mmusculus_homolog_ensembl_gene.isna()].iloc[:, [0, 2]]
biomart_filtered = biomart_filtered.loc[~biomart_filtered.duplicated(keep=False)]

biomart_filtered[biomart_filtered.mmusculus_homolog_ensembl_gene.isin(set(genes)) & biomart_filtered.ensembl_gene_id.isin(set(genes))]
biomart_filtered


Unnamed: 0,ensembl_gene_id,mmusculus_homolog_ensembl_gene
5,ENSG00000198888,ENSMUSG00000064341
9,ENSG00000198763,ENSMUSG00000064345
15,ENSG00000198804,ENSMUSG00000064351
18,ENSG00000198712,ENSMUSG00000064354
20,ENSG00000228253,ENSMUSG00000064356
...,...,...
76056,ENSG00000116771,ENSMUSG00000040706
76059,ENSG00000252417,ENSMUSG00002075659
76062,ENSG00000215695,ENSMUSG00000078515
76063,ENSG00000215695,ENSMUSG00000040715


In [None]:
# Create a mapping from gene to its index in the genes list
gene_to_index = {gene: idx for idx, gene in enumerate(genes)}

# Replace values in biomart_filtered with their locations in "genes"
biomart_filtered = biomart_filtered.applymap(lambda x: gene_to_index.get(x, x))

# Filter out rows where any of the values are not in the gene_to_index mapping
biomart_filtered = biomart_filtered[biomart_filtered.applymap(lambda x: isinstance(x, int)).all(axis=1)]


  biomart_filtered = biomart_filtered.applymap(lambda x: gene_to_index.get(x, x))
  biomart_filtered = biomart_filtered[biomart_filtered.applymap(lambda x: isinstance(x, int)).all(axis=1)]


In [None]:
da[biomart_filtered.values[:,1][:, None], biomart_filtered.values[:,1]] = da[biomart_filtered.values[:,0][:, None], biomart_filtered.values[:,0]]

  self._set_arrayXarray_sparse(i, j, x)


In [None]:
sparse.save_npz("bias_sparse.npz", da)


In [25]:
sparse.save_npz("bias_sparse.npz", da)
