# Extract phyloP scores
Get phyloP scores for all sites in canonical CDS.
Use Zoonomia scores for 241 mammalian species.

## Preliminaries
Download data from UKB RAP storage

In [None]:
%%bash
# Fetch the input files from UKB RAP project storage into the local working tree.
#
# Abort on the first failing command: bash exits with the status of the *last*
# command, so without this a failed first download would not be flagged as an
# error if the second download succeeded.
set -euo pipefail

# Make sure the local target directories exist before downloading into them.
mkdir -p ../data ../outputs

# -f (--overwrite) allows the cell to be re-run when the files are already present locally.
dx download -f -o ../data/ data/241-mammalian-2020v2.bigWig
dx download -f -o ../outputs/ outputs/gencode_v39_canonical_cds_chr.bed

Install pyBigWig into the conda environment.

In [3]:
# Install pyBigWig with the %conda line magic rather than a %%bash subshell, so the
# package is installed into the environment of the running kernel (a subshell uses
# whichever conda is first on PATH, which is not guaranteed to be the kernel's).
# Channel order is unchanged: conda-forge first, then bioconda.
%conda install --yes --channel conda-forge --channel bioconda pybigwig

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done




  current version: 23.1.0
  latest version: 23.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.0





# All requested packages already installed.



In [6]:
# Import packages
import pyBigWig
import numpy as np
import pandas as pd

## Script

In [23]:
# Open the bigWig file holding the phyloP scores and show its header summary
phylop_bigwig_path = "../data/241-mammalian-2020v2.bigWig"
bw = pyBigWig.open(phylop_bigwig_path)
bw.header()

{'version': 4,
 'nLevels': 10,
 'nBasesCovered': 2852623265,
 'minVal': -20,
 'maxVal': 9,
 'sumData': 365889578,
 'sumSquared': 4523134155}

In [118]:
# Load the canonical CDS regions. The BED file has no header row, so all six
# columns are named explicitly and only the coordinate columns are kept.
bed_columns = ["chr", "start", "end", "id", "score", "strand"]
bed = pd.read_csv(
    "../outputs/gencode_v39_canonical_cds_chr.bed",
    sep="\t",
    header=None,
    names=bed_columns,
    usecols=bed_columns[:3],
)

# Exclude mitochondrial regions
is_nuclear = bed["chr"].ne("chrM")
bed = bed.loc[is_nuclear]

bed.head(3)

Unnamed: 0,chr,start,end
0,chr1,65564,65573
1,chr1,69036,70005
2,chr1,450742,451678


In [125]:
# Get phyloP annotations for each site in a CDS
%%time
## Extract annotations
phylop = bed.apply(lambda x: bw.intervals(x["chr"], x["start"], x["end"]), axis=1)

## Reformat the data
phylop.index = bed["chr"]
phylop = phylop.explode().dropna()
phylop = pd.DataFrame([[*a] for a in phylop.values], columns=["start","end","phylop"], index=phylop.index)
phylop = phylop.reset_index(drop=False).drop("start", axis=1).rename(columns={"end":"pos"})

# Print summary statistics
print(f"{len(phylop)} sites successfully annotated with phyloP scores.")

CPU times: user 1min 6s, sys: 6.88 s, total: 1min 12s
Wall time: 1min 12s


In [128]:
# Write the per-site phyloP table as a tab-separated file, without the row index
phylop.to_csv(
    "../outputs/phylop_all_sites.tsv",
    sep="\t",
    index=False,
)

In [129]:
%%bash
# Copy the per-site phyloP table to the project's outputs/ folder
dx upload ../outputs/phylop_all_sites.tsv --destination outputs/

ID                    file-GX4vbb0JvP05P515vJ6qYZ19
Class                 file
Project               project-GQFJfPjJvP02pk9yZpqJ0yjJ
Folder                /outputs
Name                  phylop_all_sites.tsv
State                 closing
Visibility            visible
Types                 -
Properties            -
Tags                  -
Outgoing links        -
Created               Wed Jun 14 14:12:36 2023
Created by            alexander.blakes
 via the job          job-GX4p5g0JvP06BpqYQJv688Yy
Last modified         Wed Jun 14 14:12:41 2023
Media type            
archivalState         "live"
cloudAccount          "cloudaccount-dnanexus"
