This notebook processes the `genosnp` file with data from the "Ice Age" paper into a sensible table that can be used by tools other than just ADMIXTOOLS.

In [1]:
import pandas as pd
import numpy as np
import pybedtools
from pybedtools import BedTool

In [2]:
import matplotlib.pyplot as plt

In [3]:
# render matplotlib figures inline in the notebook output
%matplotlib inline

# apply the 'ggplot' style sheet to figures drawn from here on
plt.style.use('ggplot')

### Read the positions of sites from the archaic admixture array

In [4]:
# load the tab-separated table of array sites and keep only the coordinates
snp_pos = pd.read_csv('../clean_data/ice_age.tsv', sep='\t').loc[:, ['chrom', 'pos']]

In [5]:
# number of sites that were loaded
len(snp_pos)

945357

In [6]:
# preview the first few sites (chromosome and position)
snp_pos.head()

Unnamed: 0,chrom,pos
0,1,847983
1,1,853089
2,1,853596
3,1,854793
4,1,858862


# Additional SNP annotations

We don't want to rely on the B value annotations alone. How about using more direct measures of each site's proximity to functional sequence?

* distance to the nearest coding region
* amount of coding sequence in an X bp window around each site
* phyloP score

## Exons

Create array sites BED object:

In [7]:
# describe each site as a 1 bp interval: start = pos - 1, end = pos
snp_pos['start'] = snp_pos['pos'].sub(1)
snp_pos['end'] = snp_pos['pos']

# sorted BED object holding one interval per site
snp_bed = BedTool.from_dataframe(snp_pos.loc[:, ['chrom', 'start', 'end']]).sort()

Download the genome annotations

In [8]:
# Ensembl release 75 (GRCh37) annotation read straight from the FTP server;
# the first 5 lines of the file are skipped and the nine columns are
# named explicitly because the file itself carries no column header
gtf = pd.read_csv(
    'ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz',
    sep='\t',
    header=None,
    names=['chrom', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
    skiprows=5,
    skipinitialspace=True,
    compression='gzip',
    low_memory=False,
)

In [9]:
# preview the first few annotation records
gtf.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attribute
0,1,pseudogene,gene,11869,14412,.,+,.,"gene_id ""ENSG00000223972""; gene_name ""DDX11L1""..."
1,1,processed_transcript,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
2,1,processed_transcript,exon,11869,12227,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
3,1,processed_transcript,exon,12613,12721,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
4,1,processed_transcript,exon,13221,14409,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."


Subset to exon annotations only and create a BED object

In [10]:
# keep exon records of protein-coding source whose end - start exceeds 10,
# then sort and merge them into a set of non-overlapping intervals
exons = BedTool.from_dataframe(
    gtf.loc[
        (gtf['source'] == "protein_coding")
        & (gtf['feature'] == "exon")
        & (gtf['end'] - gtf['start'] > 10)
    ]
).sort().merge().sort()

In [11]:
# total coverage of the merged exon intervals, as reported by pybedtools
exons.total_coverage()

81532972

### Distance to the nearest exon

In [12]:
# for every site find the closest exon (t='first', d=True are passed on to
# bedtools closest); the resulting table is given pybedtools' default column
# names, and the column called 'thickStart' is later used as the distance
closest = snp_bed.closest(exons, t='first', d=True).to_dataframe()

In [13]:
# attach the closest-exon result to the site table by interval coordinates
# and give the distance column a meaningful name
snp_pos = (
    snp_pos
    .merge(closest, on=["chrom", "start", "end"])
    .loc[:, ["chrom", "pos", "start", "end", "thickStart"]]
    .rename(columns={"thickStart": "exon_distance"})
)

In [14]:
# preview the table with the new exon_distance column
snp_pos.head()

Unnamed: 0,chrom,pos,start,end,exon_distance
0,1,847983,847982,847983,12277
1,1,853089,853088,853089,7171
2,1,853596,853595,853596,6664
3,1,854793,854792,854793,5467
4,1,858862,858861,858862,1398


### Calculate the density of exon sequence in windows upstream/downstream of each SNP

In [17]:
# Carry the SNP position through bedtools as a 4th BED column. The summed
# overlaps can then be matched back to the SNP table by (chrom, pos) instead
# of relying on both tables happening to be in the same row order.
snp_sites = BedTool.from_dataframe(snp_pos[['chrom', 'start', 'end', 'pos']])
snp_keys = pd.MultiIndex.from_arrays([snp_pos['chrom'], snp_pos['pos']],
                                     names=['chrom', 'pos'])

# flank specifies how far upstream/downstream to extend the window
# Window size will be (2 * flank + 1) bp
for flank in [10000, 25000, 50000, 100000]:
    # generate the BED object of windows flanking both sides of all SNPs
    snp_windows = snp_sites.slop(b=flank, genome='hg19')

    # for each exon overlapping a given window, count the number of bases
    # of the overlap -- one row for each potential exon
    # If there are no exons overlapping a window, report 0 bp overlap
    # Columns are named explicitly: 4 window columns, 3 exon columns and
    # the overlap size appended by wao=True
    exon_overlaps = snp_windows.intersect(exons, wao=True).to_dataframe(
        names=['chrom', 'start', 'end', 'pos',
               'exon_chrom', 'exon_start', 'exon_end', 'overlap'])

    # summing the overlaps per SNP gives the total amount of coding
    # sequence inside the window around that SNP
    exon_total = exon_overlaps.groupby(['chrom', 'pos'])['overlap'].sum()

    # look the totals up by key, in the row order of snp_pos
    exon_total = exon_total.reindex(snp_keys)
    if exon_total.isnull().any():
        raise ValueError('some SNPs have no window in the bedtools output '
                         '(flank = {})'.format(flank))

    snp_pos['exon_density_' + str(flank)] = exon_total.values / (2 * flank + 1)

  if self.run_code(code, result):


In [18]:
# preview the table with the exon_density_* columns added
snp_pos.head()

Unnamed: 0,chrom,pos,start,end,exon_distance,exon_density_10000,exon_density_25000,exon_density_50000,exon_density_100000
0,1,847983,847982,847983,12277,0.0,0.01352,0.062229,0.07256
1,1,853089,853088,853089,7171,0.015749,0.029219,0.080659,0.076115
2,1,853596,853595,853596,6664,0.015749,0.036199,0.080659,0.076115
3,1,854793,854792,854793,5467,0.015749,0.053019,0.080659,0.076115
4,1,858862,858861,858862,1398,0.027549,0.072479,0.093319,0.07868


## phyloP annotation

### Process all phyloP BED files in parallel

In [21]:
import functools
import random
import os
from multiprocessing import Pool

In [22]:
def process_chromosome(chrom, snps, window_size):
    """Average the phyloP scores found around every SNP of one chromosome.

    Parameters
    ----------
    chrom : int
        Chromosome number; used both to filter ``snps`` and to build the
        path of the per-chromosome phyloP BED file.
    snps : pandas.DataFrame
        SNP table with at least the columns ``chrom``, ``start`` and ``end``.
    window_size : int
        Passed to ``BedTool.window`` as ``w``; also used to name the
        resulting score column.

    Returns
    -------
    pandas.DataFrame
        One row per SNP interval that had phyloP records in its window, with
        columns ``chrom`` (carrying the 'chr' prefix), ``start``, ``end``
        and ``phylop_<window_size>`` (mean of the windowed scores).
    """
    interval_columns = ['chrom', 'start', 'end']
    score_column = 'phylop_' + str(window_size)

    # the phyloP files name chromosomes with a 'chr' prefix, so add it here
    chrom_snps = snps.query("chrom == {}".format(chrom)).copy()
    chrom_snps['chrom'] = "chr" + chrom_snps.chrom.astype(str)
    chrom_snp_bed = BedTool.from_dataframe(chrom_snps[interval_columns])

    phylop = BedTool("/mnt/expressions/benjamin_vernot/phyloP_no_human/Compara.36_eutherian_mammals_EPO_LOW_COVERAGE.chr" + str(chrom) + "_.phyloP_no_human.SS.bed")

    # pair every SNP with the phyloP records lying in the window around it
    phylop_windows = chrom_snp_bed.window(phylop, w=window_size)

    # the phyloP value sits in the column pybedtools calls 'thickStart';
    # average it per SNP interval and give the column its final name
    window_records = phylop_windows.to_dataframe()
    phylop_snps = (
        window_records
        .groupby(interval_columns)['thickStart']
        .mean()
        .reset_index()
        .rename(columns={'thickStart': score_column})
    )

    # delete the temporary files backing the two intermediate BED objects
    os.remove(chrom_snp_bed.fn)
    os.remove(phylop_windows.fn)

    return phylop_snps[interval_columns + [score_column]]

In [23]:
def process_all_chromosomes(snp_pos, window_size, cores):
    """Average phyloP around every SNP on chromosomes 1-22, in parallel.

    Parameters
    ----------
    snp_pos : pandas.DataFrame
        SNP table handed unchanged to ``process_chromosome`` for every
        chromosome.
    window_size : int
        Window size forwarded to ``process_chromosome``; the score column
        of the result is named ``phylop_<window_size>``.
    cores : int
        Number of worker processes in the multiprocessing pool.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom`` (integer, 'chr' prefix removed), ``pos`` (the BED
        ``end`` coordinate) and ``phylop_<window_size>``. Rows follow the
        shuffled chromosome order, so sort the result if order matters.
    """
    score_column = "phylop_" + str(window_size)

    # create a partial function used to process a given chromosome's phyloP BED file
    fn = functools.partial(process_chromosome, snps=snp_pos, window_size=window_size)

    # shuffle the chromosomes before handing them to the workers
    chroms = list(range(1, 23))
    random.shuffle(chroms)

    # create a pool of workers that will process each chromosome
    with Pool(processes=cores) as pool:
        # map the processing function across all chromosomes
        phylop_snps_per_chrom = pool.map(fn, chroms)

        # concatenate the results into a single dataframe
        phylop_snps = pd.concat(phylop_snps_per_chrom)

        # convert from the BED coordinates into a normal table format
        phylop_snps = phylop_snps.rename(columns={"end": "pos"})[["chrom", "pos", score_column]]

        # remove the 'chr' prefix of chromosome IDs and convert ONLY the
        # chrom column to integers; casting the whole frame would also
        # touch the floating-point phyloP column
        chrom_ids = phylop_snps["chrom"].replace(to_replace='chr', value='', regex=True)
        phylop_snps = phylop_snps.assign(chrom=chrom_ids.astype(int))

        # pybedtools.cleanup() leaves crap behind, let's delete tmp files directly
        # NOTE(review): this glob is not limited to files created by this
        # call -- it matches every /tmp/pybedtools.* file
        os.system('rm /tmp/pybedtools.*')

    return phylop_snps

In [24]:
# per-SNP phyloP averages with window_size=1000, using 15 worker processes
phylop_1k = process_all_chromosomes(snp_pos, window_size=1000, cores=15)

In [25]:
# per-SNP phyloP averages with window_size=10000, using 3 worker processes
# NOTE(review): fewer workers than for the 1000 window -- presumably to limit memory use; confirm
phylop_10k = process_all_chromosomes(snp_pos, window_size=10000, cores=3)

In [26]:
# per-SNP phyloP averages with window_size=20000, using a single worker process
phylop_20k = process_all_chromosomes(snp_pos, window_size=20000, cores=1)

In [29]:
# order each phyloP table by chromosome and position
phylop_1k, phylop_10k, phylop_20k = (
    table.sort_values(by=["chrom", "pos"])
    for table in (phylop_1k, phylop_10k, phylop_20k)
)

#### Save the phylop averages on the drive as a backup

In [30]:
# write every phyloP table to its own tab-separated backup file;
# missing values are written as '-'
for _path, _table in [('../tmp/phylop_1k.tsv', phylop_1k),
                      ('../tmp/phylop_10k.tsv', phylop_10k),
                      ('../tmp/phylop_20k.tsv', phylop_20k)]:
    _table.to_csv(_path, sep='\t', index=False, na_rep='-')

### Join the phyloP average tables with the rest of the data

In [22]:
# outer-join the three phyloP tables onto the site table (joined on the
# columns they share) and drop the BED helper columns
merged = (
    snp_pos
    .merge(phylop_1k, how="outer")
    .merge(phylop_10k, how="outer")
    .merge(phylop_20k, how="outer")
    .drop(["start", "end"], axis=1)
)

In [23]:
# preview the final table with all annotation columns
merged.head()

Unnamed: 0,chrom,pos,exon_distance,exon_density_10000,exon_density_25000,exon_density_50000,exon_density_100000,phylop_1000,phylop_10000,phylop_20000
0,1,847983,12277,0.0,0.01352,0.062229,0.07256,-0.176853,-0.152668,-0.11263
1,1,853089,7171,0.015749,0.029219,0.080659,0.076115,-0.145938,-0.161705,-0.152555
2,1,853596,6664,0.015749,0.036199,0.080659,0.076115,-0.252323,-0.167846,-0.15523
3,1,854793,5467,0.015749,0.053019,0.080659,0.076115,-0.280611,-0.186692,-0.159135
4,1,858862,1398,0.027549,0.072479,0.093319,0.07868,-0.157188,-0.206906,-0.163019


# Output the table of additional annotations to a tab-separated file

In [24]:
# write the merged annotation table as TSV without the index; missing values are written as '-'
merged.to_csv('../clean_data/additional_annotations.tsv', sep='\t', index=False, na_rep='-')