This notebook processes the `genosnp` file with data from the "Ice Age" paper into a sensible table, which can be used by other things than just ADMIXTOOLS etc.

In [1]:
import pandas as pd
import numpy as np
from pybedtools import BedTool

In [2]:
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

plt.style.use('ggplot')

### Read the table of individuals

In [4]:
!head ../raw_data/ice_age/individuals

Index	Sample.ID	Name.In.Paper	Date	Listed.in.TableS1	Used.In.Regression
0	AG2	AfontovaGora2	16710	1	1
1	AfontovaGora3	AfontovaGora3	17000	1	1
2	Altai	Altai	70000	0	0
3	B_Australian-4	Australian	0	1	0
4	Chimp	Chimp	0	0	0
5	Continenza	Continenza	10860	1	1
6	B_Dai-4	Dai	0	1	1
7	Denisova	Denisovan	70000	0	0
8	B_Dinka-3	Dinka	0	0	0


In [5]:
individuals = pd.read_table('../raw_data/ice_age/individuals',
                            names=['sample_id', 'name', 'date',
                                     'in_table_s1', 'in_regression'],
                            skiprows=1, index_col=0)
individuals.head()

Unnamed: 0,sample_id,name,date,in_table_s1,in_regression
0,AG2,AfontovaGora2,16710,1,1
1,AfontovaGora3,AfontovaGora3,17000,1,1
2,Altai,Altai,70000,0,0
3,B_Australian-4,Australian,0,1,0
4,Chimp,Chimp,0,0,0


Replace unallowed characters with underscores:

In [6]:
individuals.sample_id = individuals.sample_id.apply(lambda s: s.replace('-', '_'))

### Read the positions of sites from the archaic admixture array

In [7]:
array_sites = pd.read_table('../raw_data/ice_age/filter.gz', names=['chrom', 'end', 'set_1', 'set_2'])

In [8]:
array_sites.head()

Unnamed: 0,chrom,end,set_1,set_2
0,1,847983,1,1
1,1,853089,1,1
2,1,853596,1,1
3,1,854793,1,1
4,1,858862,0,0


### Process the genosnp file

In [9]:
genosnp = pd.concat([pd.read_table('../raw_data/ice_age/genosnp_first_500000.gz', sep=' ', header=None, na_values='-'),
                     pd.read_table('../raw_data/ice_age/genosnp_last_454456.gz', sep=' ', header=None, na_values='-')])
genosnp.columns = ['chrom', 'pos', 'id', 'genpos', 'allele0', 'allele1', 'bin', 'gt_string', 'bvalue']

In [10]:
genosnp.head()

Unnamed: 0,chrom,pos,id,genpos,allele0,allele1,bin,gt_string,bvalue
0,1,847983,1_847983,0.00848,T,C,8.0,901009000990900900000900000009099999909000010009,958
1,1,853089,1_853089,0.008531,C,G,8.0,991009000999990000900900000009099999909000990009,958
2,1,853596,1_853596,0.008536,G,A,8.0,901019000999990900990900009009999999909000990009,958
3,1,854793,1_854793,0.008548,C,A,8.0,991009000999990900900900009001099999909000990009,958
4,1,858862,1_858862,0.008589,C,G,7.0,990009000999990900990900900009099999909000000009,952


Convert the concatenated string of randomly called alleles to a DataFrame of integer values:

In [11]:
random_calls = pd.DataFrame([list(s) for s in list(genosnp.gt_string)], 
                            columns=individuals.sample_id.values, dtype=int,
                            index=genosnp.index)

In [12]:
random_calls.head()

Unnamed: 0,AG2,AfontovaGora3,Altai,B_Australian_4,Chimp,Continenza,B_Dai_4,Denisova,B_Dinka_3,Dolni13,...,Pavlov1,B_Sardinian_3,Stuttgart,Ust_Ishim,Villabruna,Vi_merge,B_Yoruba_3,Bichon,KK1,SATP
0,9,0,1,0,0,9,0,0,0,9,...,9,0,0,0,0,1,0,0,0,9
1,9,9,1,0,0,9,0,0,0,9,...,9,0,0,0,9,9,0,0,0,9
2,9,0,1,0,1,9,0,0,0,9,...,9,0,0,0,9,9,0,0,0,9
3,9,9,1,0,0,9,0,0,0,9,...,9,0,0,0,9,9,0,0,0,9
4,9,9,0,0,0,9,0,0,0,9,...,9,0,0,0,0,0,0,0,0,9


Combine both into one table of randomly called alleles and their positions, B-values, etc...:

In [13]:
snp_data = pd.concat([genosnp[['chrom', 'pos', 'bvalue']],
                      random_calls], axis=1, ignore_index=True)
snp_data.columns = ['chrom', 'pos', 'bvalue'] + list(individuals.sample_id)

Subset to the `783,974` sites used in the Ice Age paper:

In [14]:
snp_data = snp_data[(array_sites.set_1 == 1).values]

Subset to sites which have a B-value assigned:

In [15]:
snp_data = snp_data[snp_data.bvalue != -1]
len(snp_data)

783747

Check the result:

In [16]:
snp_data.head()

Unnamed: 0,chrom,pos,bvalue,AG2,AfontovaGora3,Altai,B_Australian_4,Chimp,Continenza,B_Dai_4,...,Pavlov1,B_Sardinian_3,Stuttgart,Ust_Ishim,Villabruna,Vi_merge,B_Yoruba_3,Bichon,KK1,SATP
0,1,847983,958,9,0,1,0,0,9,0,...,9,0,0,0,0,1,0,0,0,9
1,1,853089,958,9,9,1,0,0,9,0,...,9,0,0,0,9,9,0,0,0,9
2,1,853596,958,9,0,1,0,1,9,0,...,9,0,0,0,9,9,0,0,0,9
3,1,854793,958,9,9,1,0,0,9,0,...,9,0,0,0,9,9,0,0,0,9
5,1,867151,946,9,9,1,0,9,9,0,...,9,0,0,0,0,9,0,0,0,0


# Additional SNP annotations

We don't want to rely just on the B value annotations. Since they are used as a measure of proximity to functional regions, how about using a more direct measure of this proximity?

* distance to the nearest exon
* amount of coding sequence in a X bp window around each site
* phyloP score

## Exonic annotations

Create array sites BED object:

In [17]:
snp_bed = snp_data[['chrom', 'pos']].copy()
snp_bed['start'] = snp_bed.pos - 1
snp_bed.rename(columns={'pos': 'end'}, inplace=True)

snps = BedTool.from_dataframe(snp_bed[['chrom', 'start', 'end']]).sort()

Download the genome annotations and subset to exons only:

In [18]:
gtf = pd.read_table('ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz',
                    header=None, sep='\t', skipinitialspace=True, skiprows=5, compression='gzip',
                    names=['chrom', 'source', 'feature', 'start', 'end',
                           'score', 'strand', 'frame', 'attribute'], low_memory=False)

In [19]:
gtf.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attribute
0,1,pseudogene,gene,11869,14412,.,+,.,"gene_id ""ENSG00000223972""; gene_name ""DDX11L1""..."
1,1,processed_transcript,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
2,1,processed_transcript,exon,11869,12227,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
3,1,processed_transcript,exon,12613,12721,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
4,1,processed_transcript,exon,13221,14409,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."


In [20]:
gtf.feature.value_counts()

exon              1306656
CDS                791856
UTR                304070
transcript         215170
stop_codon          73411
start_codon         73358
gene                63677
Selenocysteine        114
Name: feature, dtype: int64

Create exons BED object:

In [21]:
exons = BedTool.from_dataframe(
    gtf[(gtf.source == "protein_coding") &
        (gtf.feature == "exon")]
).sort().merge()

### Distance to the nearest exon

Find the distance of each site from the admixture array to the nearest exon:

In [22]:
distances = snps.closest(exons, t='first', d=True).to_dataframe()[[-1]].values
snp_data['exon_distance'] = distances

### Amount of coding sequence in a 100 kb window upstream/downstream of the SNP

How far upstream/downstream to extend the window? Window size will be `(2 * flank + 1)` bp.

In [23]:
flank = 100000

Generate the BED object of windows flanking both sides of all SNPs:

In [24]:
snp_windows = snps.slop(b=flank, genome='hg19')

For each exon overlapping a given window, count the number of bases of the overlap. One row for each potential exon. If there are no exons overlapping a window, report 0 bp overlap.

In [25]:
exon_overlaps = snp_windows.intersect(exons, wao=True)   \
                           .to_dataframe()[['chrom', 'start', 'end', 'thickStart']]

  if self.run_code(code, result):


`thickStart` contains the number of bases overlapping with each exon in a window around each SNP. Summing up these column for each unique region using `groupby` (i.e. window around each SNP) gives the total number of coding sequence surrounding each SNP.

In [26]:
total_overlaps = exon_overlaps.groupby(['chrom', 'start', 'end'])['thickStart'].sum().reset_index()['thickStart']

Save the proportion of each window occupied by exonic sequence:

In [27]:
snp_data['exon_overlap'] = total_overlaps / (2 * flank + 1)

## phyloP annotation

Conservation score calculated without human data (more suitable to study Nea. introgression).

Create array sites BED object:

In [28]:
snp_bed = snp_data[['chrom', 'pos']].copy()
snp_bed['start'] = snp_bed.pos - 1
snp_bed.rename(columns={'pos': 'end'}, inplace=True)

# chromosome IDs need a 'chr' prefix to match phyloP data
snp_bed['chrom'] = "chr" + snp_bed.chrom.astype(str)

snps = BedTool.from_dataframe(snp_bed[['chrom', 'start', 'end']]).sort()

Size of the window around each SNP, in which to calculate average phyloP score:

In [29]:
WINDOW_SIZE = 1000

In [30]:
def process_chromosome(chrom, snps, window_size):
    phylop = BedTool("/mnt/expressions/benjamin_vernot/phyloP_no_human/Compara.36_eutherian_mammals_EPO_LOW_COVERAGE.chr" + str(chrom) + "_.phyloP_no_human.SS.bed")
    
    # get all sites surrounding each SNP from the archaic admixture array
    phylop_windows = snps.window(phylop, w=window_size).to_dataframe()
    
    # calculate the average phyloP score of each SNP (based on phyloP values in windows around them)
    avg_phylop = phylop_windows.groupby(['chrom', 'start', 'end'])['thickStart'].mean().reset_index()
    del phylop_windows

    # rename the fields to make join with the original archaic admixture array SNP table possible
    avg_phylop = avg_phylop.rename(columns={'end': 'pos', 'thickStart': 'phylop'})[['chrom', 'pos', 'phylop']]

    # remove the 'chr' prefix of chromosome IDs
    avg_phylop.chrom.replace(to_replace='chr', value='', regex=True, inplace=True)
    avg_phylop['chrom'] = avg_phylop.astype(int)
    
    return avg_phylop

### Process all phyloP BED files in parallel

In [31]:
import functools

# create a partial function used to process a given chromosome's phyloP BED file
fn = functools.partial(process_chromosome, snps=snps, window_size=WINDOW_SIZE)

In [32]:
import random
chroms = [chrom for chrom in range(1, 23)]
random.shuffle(chroms)

In [33]:
from multiprocessing import Pool

# create a pool of 22 workers that will process each chromosome
with Pool(processes=10) as pool:
    # map the processing function across all chromosomes
    phylop_snps_per_chrom = pool.map(fn, chroms)

    # concatenate the results into a single dataframe
    phylop_snps = pd.concat(phylop_snps_per_chrom)

### Join the phyloP averages with the rest of the data

In [34]:
snp_data = snp_data.merge(phylop_snps, how='left', on=['chrom', 'pos'])

In [35]:
snp_data.query("chrom == 21").head()

Unnamed: 0,chrom,pos,bvalue,AG2,AfontovaGora3,Altai,B_Australian_4,Chimp,Continenza,B_Dai_4,...,Ust_Ishim,Villabruna,Vi_merge,B_Yoruba_3,Bichon,KK1,SATP,exon_distance,exon_overlap,phylop
764619,21,15499641,929,9,9,1,0,0,9,0,...,0,9,1,0,0,0,0,69083,0.0,0.030273
764620,21,15499796,929,9,9,1,0,0,9,0,...,0,9,9,0,1,0,0,68987,0.0,0.020055
764621,21,15500371,929,9,9,1,0,0,9,0,...,0,9,9,0,0,0,9,68859,0.0,-0.023501
764622,21,15502912,929,9,9,1,0,0,9,0,...,0,1,1,0,1,0,0,63949,0.0,0.024461
764623,21,15507966,929,9,0,1,0,0,9,0,...,0,9,9,0,1,0,0,62056,0.0,0.054961


# Output the processed genosnp table in a tab-separated file

In [36]:
snp_data[['chrom', 'pos', 'bvalue', 'phylop', 'exon_distance', 'exon_overlap'] +
         list(individuals.sample_id)].to_csv('../clean_data/ice_age.tsv', sep='\t', index=False, na_rep='-')