# MaveDB Score Set Breakdown
This notebook computes the summary statistics used to generate the Sankey plot in Figure 2a.

## Load Relevant Libraries
Run the cell below to load the libraries that are used in this analysis and to set the `GENE_NORM_DB_URL` environment variable.

In [1]:
from os import environ

import pandas as pd

from dcd_mapping.lookup import _normalize_gene

# Set the database URL under GENE_NORM_DB_URL: a PostgreSQL database named
# "gene_normalizer" on localhost:5432, connecting as user "postgres".
# NOTE(review): this assignment runs after `dcd_mapping.lookup` has been imported;
# confirm the library reads GENE_NORM_DB_URL lazily rather than at import time.
environ["GENE_NORM_DB_URL"] = "postgresql://postgres@localhost:5432/gene_normalizer"

## Load Metadata 
Run the cell below to load metadata information for the examined MaveDB score sets.

In [2]:
# Metadata export for the examined score sets; the first CSV column holds the row index.
metadata_csv = "analysis_files/mave_metadata_20241114.csv"
mave_dat = pd.read_csv(metadata_csv, index_col=0)
mave_dat.head()

Unnamed: 0,urn,target_sequence,target_sequence_type,target,target_type,uniprot_id,organism
0,urn:mavedb:00000001-a-1,ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCAT...,dna,UBE2I,Protein coding,P63279,Homo sapiens
1,urn:mavedb:00000001-a-2,ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCAT...,dna,UBE2I,Protein coding,P63279,Homo sapiens
2,urn:mavedb:00000001-a-3,ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCAT...,dna,UBE2I,Protein coding,P63279,Homo sapiens
3,urn:mavedb:00000001-a-4,ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCAT...,dna,UBE2I,Protein coding,P63279,Homo sapiens
4,urn:mavedb:00000001-b-1,ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATA...,dna,SUMO1,Protein coding,P63165,Homo sapiens


### Compute the Number of Score Sets with DNA Target Sequences

In [3]:
# Keep only the score sets whose target sequence type is recorded as "dna".
is_dna_target = mave_dat["target_sequence_type"].eq("dna")
dna_ts = mave_dat.loc[is_dna_target]
f"The number of score sets with DNA target sequences is: {dna_ts.shape[0]}"

'The number of score sets with DNA target sequences is: 482'

### Compute the Number of Score Sets with Protein Target Sequences

In [4]:
# Keep only the score sets whose target sequence type is recorded as "protein".
is_protein_target = mave_dat["target_sequence_type"].eq("protein")
protein_ts = mave_dat.loc[is_protein_target]
f"The number of score sets with protein target sequences is: {protein_ts.shape[0]}"

'The number of score sets with protein target sequences is: 582'

### Compute the Number of Protein Coding Score Sets

In [5]:
# Score sets whose target type is exactly "Protein coding".
is_protein_coding = mave_dat["target_type"].eq("Protein coding")
coding_ss = mave_dat[is_protein_coding]
f"The number of protein coding score sets: {coding_ss.shape[0]}"

'The number of protein coding score sets: 1023'

### Compute the Number of Regulatory/Other Noncoding Score Sets

In [6]:
# Everything that is not labelled "Protein coding" (including rows with a missing
# target type) is counted as regulatory/other noncoding.
is_not_protein_coding = mave_dat["target_type"].ne("Protein coding")
noncoding_ss = mave_dat[is_not_protein_coding]
f"The number of regulatory/other noncoding score sets: {noncoding_ss.shape[0]}"

'The number of regulatory/other noncoding score sets: 41'

### Compute the Number of Protein Coding Score Sets with UniProt IDs

In [7]:
# Count the protein coding score sets whose `uniprot_id` is not missing.
uniprot_id_count = coding_ss["uniprot_id"].notna().sum()
f"{uniprot_id_count} score sets have UniProt IDs"

'897 score sets have UniProt IDs'

### Compute the Number of Protein Coding Score Sets with Protein Target Sequences

In [8]:
# Protein coding score sets whose target sequence is given as a protein sequence.
coding_protein_count = int(coding_ss["target_sequence_type"].eq("protein").sum())
f"The number of score sets with protein target sequences is: {coding_protein_count}"

'The number of score sets with protein target sequences is: 582'

### Compute the Number of Coding Score Sets with DNA Target Sequences

In [9]:
# Protein coding score sets whose target sequence is given as a DNA sequence.
coding_dna_count = int(coding_ss["target_sequence_type"].eq("dna").sum())
f"The number of coding score sets with DNA target sequences is: {coding_dna_count}"

'The number of coding score sets with DNA target sequences is: 441'

### Compute the Number of Regulatory/Other Noncoding Score Sets with DNA Target Sequences

In [10]:
# Regulatory/other noncoding score sets whose target sequence is given as DNA.
noncoding_dna_count = int(noncoding_ss["target_sequence_type"].eq("dna").sum())
f"The number of regulatory/other noncoding score sets with DNA target sequences is: {noncoding_dna_count}"

'The number of regulatory/other noncoding score sets with DNA target sequences is: 41'

### Compute the Average Length of Protein Target Sequences

In [11]:
# Mean length of the target sequences of protein coding score sets that are
# stored as protein sequences.
is_protein_sequence = coding_ss["target_sequence_type"].eq("protein")
protein_filter = coding_ss[is_protein_sequence]
seqlist = protein_filter["target_sequence"].tolist()
total_residues = sum(len(sequence) for sequence in seqlist)
f"The average length of protein target sequences is: {total_residues / len(seqlist)} amino acids"

'The average length of protein target sequences is: 105.39003436426117 amino acids'

### Compute the Average Length of Protein Coding DNA Target Sequences

In [12]:
# Mean length of the target sequences of protein coding score sets that are
# stored as DNA sequences.
is_dna_sequence = coding_ss["target_sequence_type"].eq("dna")
dna_filter = coding_ss[is_dna_sequence]
seqlist = dna_filter["target_sequence"].tolist()
total_nucleotides = sum(len(sequence) for sequence in seqlist)
f"The average length of protein coding DNA target sequences is: {total_nucleotides / len(seqlist)} nucleotides"

'The average length of protein coding DNA target sequences is: 635.2448979591836 nucleotides'

### Compute the Average Length of Regulatory/Other Noncoding DNA Target Sequences

In [13]:
# Mean length of the target sequences of regulatory/other noncoding score sets
# that are stored as DNA sequences.
is_dna_sequence = noncoding_ss["target_sequence_type"].eq("dna")
dna_filter = noncoding_ss[is_dna_sequence]
seqlist = dna_filter["target_sequence"].tolist()
total_nucleotides = sum(len(sequence) for sequence in seqlist)
f"The average length of regulatory/other noncoding DNA target sequences is: {total_nucleotides / len(seqlist)} nucleotides"

'The average length of regulatory/other noncoding DNA target sequences is: 353.2682926829268 nucleotides'

### Compute the Number of Protein Coding Score Sets with Gene Symbols/Aliases

In [14]:
# A target counts as a gene symbol/alias when it contains no space.
# "Glycophorin A" is the one space-containing target that is still counted here.
td = [
    target
    for target in coding_ss["target"]
    if not (" " in target and target != "Glycophorin A")
]
f"The number of protein coding score sets with gene symbols/aliases is: {len(td)}"

'The number of protein coding score sets with gene symbols/aliases is: 348'

### Compute the Number of Protein Coding Score Sets with Descriptive Targets

In [15]:
# A target counts as descriptive when it contains a space, except for
# "Glycophorin A", which is excluded from this group.
td = []
for target in coding_ss["target"]:
    if " " not in target:
        continue
    if target != "Glycophorin A":
        td.append(target)
f"The number of protein coding score sets with descriptive targets is: {len(td)}"

'The number of protein coding score sets with descriptive targets is: 675'

### Compute the Number of Regulatory/Other Noncoding Score Sets with Descriptive Targets

In [16]:
# Noncoding targets are counted as descriptive when they contain a space.
td = list(filter(lambda target: " " in target, noncoding_ss["target"]))
f"The number of regulatory/other noncoding score sets with descriptive targets is: {len(td)}"

'The number of regulatory/other noncoding score sets with descriptive targets is: 41'

### Compute the Number of Unique Genes that are Mapped Across Score Sets

In [17]:
# Resolve every score set target to one gene symbol, then count the distinct symbols.
#
# Each target string is split on whitespace. Two tokens are mapped by hand
# ("Minigene" -> WT1, "hYAP65" -> YAP1). Otherwise the first token for which
# `_normalize_gene` returns a truthy result supplies the symbol through its
# `.label`. If no token normalizes, the first token of the target is used as is.
targets = list(mave_dat["target"])

# Lookup results keyed by token, so a token shared by many score sets is passed
# to `_normalize_gene` only once (and never twice for the same hit).
normalized_tokens = {}

unique_genes = []
for gene in targets:
    comp = gene.split()
    if "Minigene" in comp:
        unique_genes.append("WT1")
    elif "hYAP65" in comp:
        unique_genes.append("YAP1")
    else:
        for ele in comp:
            if ele not in normalized_tokens:
                normalized_tokens[ele] = _normalize_gene(ele)
            hit = normalized_tokens[ele]
            if hit:
                unique_genes.append(hit.label)
                break
        else:
            # Reached only when the loop finished without `break`, i.e. no token
            # normalized. A target with no tokens at all (empty or whitespace-only)
            # contributes nothing instead of reading a stale or undefined flag.
            if comp:
                unique_genes.append(comp[0])
f"The number of unique gene symbols across examined score set targets is: {len(set(unique_genes))}"

'The number of unique gene symbols across examined score set targets is: 526'

In [18]:
# Show the distinct symbols collected from the score set targets.
unique_gene_symbols = set(unique_genes)
unique_gene_symbols

{'ABL1',
 'ABLIM2',
 'ABLIM3',
 'ACE2',
 'ADAR',
 'ADARB1',
 'ADNP',
 'AHI1',
 'AICDA',
 'ALDOB',
 'ALX1',
 'ALX3',
 'ALX4',
 'AMFR',
 'ANK1',
 'ANK2',
 'ANK3',
 'ANKS1B',
 'ANKS6',
 'APBB3',
 'APP',
 'AR',
 'ARAF',
 'ARHGAP35',
 'ARHGEF28',
 'ARID2',
 'ARK2C',
 'ARX',
 'ASB14',
 'ASPA',
 'ASPM',
 'ASZ1',
 'ATP7B',
 'ATXN7',
 'AVIL',
 'Aβ42',
 'BAG3',
 'BAG5',
 'BAZ2B',
 'BCL11A',
 'BLK',
 'BPTF',
 'BRAF',
 'BRCA1',
 'BRD4',
 'BTK',
 'CABP2',
 'CALM1',
 'CALML4',
 'CAPN15',
 'CASK',
 'CBL',
 'CBS',
 'CBX4',
 'CBX5',
 'CBX7',
 'CCAR1',
 'CCNF',
 'CCR5',
 'CD2AP',
 'CD86',
 'CHD3',
 'CHD4',
 'CHD5',
 'CHD7',
 'CHEK2',
 'CIDEB',
 'CKS1B',
 'CNKSR2',
 'COMT_ROI1_2',
 'CPA2',
 'CPEB2',
 'CRX',
 'CRYBA1',
 'CRYBA4',
 'CRYBB1',
 'CRYBB2',
 'CRYBG2',
 'CRYGA',
 'CRYGB',
 'CRYGC',
 'CRYGD',
 'CRYGS',
 'CSDE1',
 'CUL9',
 'CUX1',
 'CUX2',
 'CXCR4',
 'CYP2C19',
 'CYP2C9',
 'DBT',
 'DDHD2',
 'DDX3X',
 'DIDO1',
 'DLG3',
 'DLG4',
 'DMBX1',
 'DMTF1',
 'DNAJA1',
 'DNAJA3',
 'DNAJB1',
 'DNAJB11',
 'DNAJ

### Generate Summary Statistics Table

In [19]:
# Build the summary table from the frames and lists computed earlier in this
# notebook rather than from hand-copied numbers, so the table cannot drift from
# the cells above when the metadata file changes.


def _mean_target_length(score_sets, sequence_type):
    """Return the mean target sequence length for one sequence type.

    Args:
        score_sets: DataFrame with `target_sequence_type` and `target_sequence` columns.
        sequence_type: Value of `target_sequence_type` to select (e.g. "dna").

    Returns:
        The mean length rounded to two decimals, or NaN when no row matches.
    """
    matches = score_sets["target_sequence_type"] == sequence_type
    sequences = list(score_sets.loc[matches, "target_sequence"])
    if not sequences:
        return float("nan")
    return round(sum(map(len, sequences)) / len(sequences), 2)


# Same rule as the descriptive-target cells: a target is descriptive when it
# contains a space; "Glycophorin A" is counted as a symbol/alias for coding sets.
coding_descriptive_count = sum(
    " " in target and target != "Glycophorin A" for target in coding_ss["target"]
)
noncoding_descriptive_count = sum(" " in target for target in noncoding_ss["target"])

params = {
    "Protein Coding Score Sets": len(coding_ss),
    "Regulatory/Other Noncoding Score Sets": len(noncoding_ss),
    "Score Sets with DNA Target Sequences": len(dna_ts),
    "Score Sets with Protein Target Sequences": len(protein_ts),
    "Average Length of Protein Target Sequences": _mean_target_length(coding_ss, "protein"),
    "Average Length of Protein Coding DNA Sequences": _mean_target_length(coding_ss, "dna"),
    "Average Length of Regulatory/Other DNA Sequences": _mean_target_length(noncoding_ss, "dna"),
    "Protein Coding Score Sets with Gene Symbols/Aliases": len(coding_ss) - coding_descriptive_count,
    "Protein Coding Score Sets with Descriptive Targets": coding_descriptive_count,
    "Regulatory/Other Noncoding Score Sets with Descriptive Targets": noncoding_descriptive_count,
    "Unique Targets": len(set(unique_genes)),
}
pd.DataFrame.from_dict(params, orient="index", columns=["Value"])

Unnamed: 0,Value
Protein Coding Score Sets,1023.0
Regulatory/Other Noncoding Score Sets,41.0
Score Sets with DNA Target Sequences,482.0
Score Sets with Protein Target Sequences,582.0
Average Length of Protein Target Sequences,105.39
Average Length of Protein Coding DNA Sequences,635.24
Average Length of Regulatory/Other DNA Sequences,353.27
Protein Coding Score Sets with Gene Symbols/Aliases,348.0
Protein Coding Score Sets with Descriptive Targets,675.0
Regulatory/Other Noncoding Score Sets with Descriptive Targets,41.0
