# Generate Archaic Dataframe and Splice Altering List

This notebook gathers the SpliceAI archaic annotations, combines the InDel and SNV data, adds additional information including the distribution of alternate alleles, gene constraint, mean gene conservation, and removes invariant sites.

Load library. Set the number of columns to be displayed to a large value.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)

Load indels.

In [2]:
indels_header = ['chrom','pos','ref_allele','altai_gt','denisovan_gt','vindija_gt','alt_allele','annotation','ag_delta','al_delta','dg_delta','dl_delta','ag_pos','al_pos','dg_pos','dl_pos']
indels = pd.read_csv('../../data/spliceai_outputs/spliceai_archaic_indels.txt', sep = '\t', names = indels_header)
indels.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,ag_pos,al_pos,dg_pos,dl_pos
0,chr1,739130,T,0/0,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,-7,-36,-35,-31
1,chr1,739130,T,0/1,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,-31,-36,-7,-31
2,chr1,867993,GTTTC,1/1,./.,./.,G,SAMD11,0.0,0.06,0.0,0.0,-48,11,29,-7
3,chr1,868522,A,1/1,./.,1/1,AG,SAMD11,0.0,0.0,0.0,0.0,-33,2,-10,2
4,chr1,869121,T,./.,1/1,./.,TG,SAMD11,0.0,0.0,0.01,0.0,48,-36,-36,-41
5,chr1,869244,C,./.,1/1,./.,CAG,SAMD11,0.0,0.0,0.0,0.0,-3,33,23,33
6,chr1,870339,C,0/0,./.,./.,CA,SAMD11,0.0,0.0,0.0,0.01,-33,-5,-6,-34
7,chr1,870339,C,1/1,./.,./.,CG,SAMD11,0.0,0.0,0.0,0.0,-33,-5,-6,-34
8,chr1,872534,A,1/1,1/1,./.,AG,SAMD11,0.0,0.0,0.0,0.0,1,-32,-32,17
9,chr1,872776,GCC,./.,./.,./.,G,SAMD11,0.01,0.0,0.0,0.0,-14,18,-19,-14


Add a column for Chagyrskaya and the variant type.

In [3]:
indels['chagyrskaya_gt'] = './.'
indels['variant_type'] = 'indel'

Reorder columns so that the dataframe matches the SNV dataframe.

In [4]:
indels = indels[['chrom','pos','ref_allele','altai_gt','chagyrskaya_gt','denisovan_gt','vindija_gt','alt_allele','annotation','ag_delta','al_delta','dg_delta','dl_delta','ag_pos','al_pos','dg_pos','dl_pos','variant_type']]
indels.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,ag_pos,al_pos,dg_pos,dl_pos,variant_type
0,chr1,739130,T,0/0,./.,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,-7,-36,-35,-31,indel
1,chr1,739130,T,0/1,./.,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,-31,-36,-7,-31,indel
2,chr1,867993,GTTTC,1/1,./.,./.,./.,G,SAMD11,0.0,0.06,0.0,0.0,-48,11,29,-7,indel
3,chr1,868522,A,1/1,./.,./.,1/1,AG,SAMD11,0.0,0.0,0.0,0.0,-33,2,-10,2,indel
4,chr1,869121,T,./.,./.,1/1,./.,TG,SAMD11,0.0,0.0,0.01,0.0,48,-36,-36,-41,indel
5,chr1,869244,C,./.,./.,1/1,./.,CAG,SAMD11,0.0,0.0,0.0,0.0,-3,33,23,33,indel
6,chr1,870339,C,0/0,./.,./.,./.,CA,SAMD11,0.0,0.0,0.0,0.01,-33,-5,-6,-34,indel
7,chr1,870339,C,1/1,./.,./.,./.,CG,SAMD11,0.0,0.0,0.0,0.0,-33,-5,-6,-34,indel
8,chr1,872534,A,1/1,./.,1/1,./.,AG,SAMD11,0.0,0.0,0.0,0.0,1,-32,-32,17,indel
9,chr1,872776,GCC,./.,./.,./.,./.,G,SAMD11,0.01,0.0,0.0,0.0,-14,18,-19,-14,indel


Now load the SNVs.

In [5]:
snvs_header = ['chrom','pos','ref_allele','altai_gt','chagyrskaya_gt','denisovan_gt','vindija_gt','alt_allele','annotation','ag_delta','al_delta','dg_delta','dl_delta','ag_pos','al_pos','dg_pos','dl_pos']
snvs = pd.read_csv('../../data/spliceai_outputs/spliceai_archaic_snvs.txt', sep = '\t', names = snvs_header)
snvs.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,ag_pos,al_pos,dg_pos,dl_pos
0,chr10,180887,C,0/0,./.,./.,0/0,A,ZMYND11,0.0,0.0,0.0,0.0,40,42,40,-2
1,chr10,180998,G,0/0,./.,./.,0/0,A,ZMYND11,0.0,0.0,0.0,0.0,2,12,35,0
2,chr10,181000,T,0/0,./.,./.,0/0,A,ZMYND11,0.0,0.0,0.01,0.0,-7,10,33,-2
3,chr10,182007,C,0/0,0/0,./.,./.,T,ZMYND11,0.0,0.0,0.0,0.0,-6,-24,40,-24
4,chr10,182073,C,1/1,0/0,0/0,0/0,T,ZMYND11,0.01,0.0,0.02,0.0,12,-46,11,2
5,chr10,183370,C,0/0,0/0,1/1,0/0,T,ZMYND11,0.0,0.0,0.0,0.0,-48,-30,-12,37
6,chr10,184616,T,1/1,1/1,0/0,1/1,G,ZMYND11,0.0,0.0,0.0,0.0,9,24,-1,3
7,chr10,185171,G,1/1,1/1,1/1,1/1,T,ZMYND11,0.01,0.0,0.0,0.0,38,1,37,-47
8,chr10,185264,G,0/0,0/0,1/1,0/0,A,ZMYND11,0.0,0.01,0.0,0.0,2,19,2,19
9,chr10,185358,C,1/1,1/1,1/1,1/1,T,ZMYND11,0.0,0.0,0.0,0.0,-41,-25,-29,-11


In [6]:
snvs['variant_type'] = 'snv'

Concat the dataframes. 

In [7]:
data = pd.concat([indels, snvs], ignore_index = True)
data = data.sort_values(by = ['chrom', 'pos'])
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,ag_pos,al_pos,dg_pos,dl_pos,variant_type
0,chr1,739130,T,0/0,./.,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,-7,-36,-35,-31,indel
1,chr1,739130,T,0/1,./.,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,-31,-36,-7,-31,indel
1169649,chr1,861808,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.01,0.0,-29,-27,24,-20,snv
1169650,chr1,861808,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,48,-14,-46,-14,snv
1169651,chr1,862072,C,1/1,1/1,0/0,0/1,T,SAMD11,0.0,0.0,0.0,0.0,3,28,-38,41,snv
1169652,chr1,862072,C,1/1,1/1,0/0,0/1,T,AL645608.1,0.0,0.0,0.0,0.0,-6,-50,24,-48,snv
1169653,chr1,862093,T,0/0,0/0,1/1,0/0,C,SAMD11,0.0,0.0,0.0,0.0,7,14,-15,34,snv
1169654,chr1,862093,T,0/0,0/0,1/1,0/0,C,AL645608.1,0.0,0.0,0.0,0.0,-34,48,49,3,snv
1169655,chr1,862124,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.0,0.0,-24,26,-11,3,snv
1169656,chr1,862124,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,17,-14,-28,35,snv


In [8]:
len(data)

2269396

Now let's add columns to designate whether at least one alternate allele is present per individual using a boolean (>= 1 alternate allele present = TRUE). 

In [9]:
data['altai_gt_boolean'] = data['altai_gt'].replace({'./.':'FALSE', '0/0':'FALSE', '1/1':'TRUE', '1/0':'TRUE', '0/1':'TRUE'})
data['chagyrskaya_gt_boolean'] = data['chagyrskaya_gt'].replace({'./.':'FALSE', '0/0':'FALSE', '1/1':'TRUE', '1/0':'TRUE', '0/1':'TRUE'})
data['denisovan_gt_boolean'] = data['denisovan_gt'].replace({'./.':'FALSE', '0/0':'FALSE', '1/1':'TRUE', '1/0':'TRUE', '0/1':'TRUE'})
data['vindija_gt_boolean'] = data['vindija_gt'].replace({'./.':'FALSE', '0/0':'FALSE', '1/1':'TRUE', '1/0':'TRUE', '0/1':'TRUE'})

Now that we know which archaics have at least one alternate allele, let's assign a distribution variable that indicates which archaics have one or two alternate alleles. 

In [10]:
def distribution(data):
    
    if (data['altai_gt_boolean'] == 'TRUE') & (data['chagyrskaya_gt_boolean'] == 'FALSE') & (data['denisovan_gt_boolean'] == 'FALSE') & (data['vindija_gt_boolean'] == 'FALSE'):
        return 'Altai'
    elif (data['altai_gt_boolean'] == 'FALSE') & (data['chagyrskaya_gt_boolean'] == 'TRUE') & (data['denisovan_gt_boolean'] == 'FALSE') & (data['vindija_gt_boolean'] == 'FALSE'):
        return 'Chagyrskaya'
    elif (data['altai_gt_boolean'] == 'FALSE') & (data['chagyrskaya_gt_boolean'] == 'FALSE') & (data['denisovan_gt_boolean'] == 'TRUE') & (data['vindija_gt_boolean'] == 'FALSE'):
        return 'Denisovan'
    elif (data['altai_gt_boolean'] == 'FALSE') & (data['chagyrskaya_gt_boolean'] == 'FALSE') & (data['denisovan_gt_boolean'] == 'FALSE') & (data['vindija_gt_boolean'] == 'TRUE'):
        return 'Vindija'
    elif (data['altai_gt_boolean'] == 'TRUE') & (data['chagyrskaya_gt_boolean'] == 'TRUE') & (data['denisovan_gt_boolean'] == 'FALSE') & (data['vindija_gt_boolean'] == 'TRUE'):
        return 'Neanderthal'
    elif (data['altai_gt_boolean'] == 'FALSE') & (data['chagyrskaya_gt_boolean'] == 'TRUE') & (data['denisovan_gt_boolean'] == 'FALSE') & (data['vindija_gt_boolean'] == 'TRUE'):
        return 'Late Neanderthal'
    elif (data['altai_gt_boolean'] == 'TRUE') & (data['chagyrskaya_gt_boolean'] == 'FALSE') & (data['denisovan_gt_boolean'] == 'TRUE') & (data['vindija_gt_boolean'] == 'TRUE') & (data['variant_type'] == 'indel'):
        return 'Shared'
    elif (data['altai_gt_boolean'] == 'TRUE') & (data['chagyrskaya_gt_boolean'] == 'TRUE') & (data['denisovan_gt_boolean'] == 'TRUE') & (data['vindija_gt_boolean'] == 'TRUE') & (data['variant_type'] == 'snv'):
        return 'Shared'
    else:
        return 'Other'
    
data['distribution'] = data.apply(distribution, axis = 1)

Some rows may actually be invariant, likely because we set low quality genotypes to missing and/or invariant sites were included in the original data. We need to remove these rows.

In [11]:
data_drop_indices = data[ (data['altai_gt_boolean'] == 'FALSE') & (data['chagyrskaya_gt_boolean'] == 'FALSE') & (data['denisovan_gt_boolean'] == 'FALSE') & (data['vindija_gt_boolean'] == 'FALSE')].index
data.drop(data_drop_indices, inplace = True)

In [12]:
len(data)

2107054

In multiple downstream analyses, we will need the maximum delta value among AG, AL, DG, and DL. Let's add a row with that information now.

In [13]:
data['delta_max'] = data[['ag_delta','al_delta','dg_delta','dl_delta']].max(axis = 1)

Let's reorder the columns a bit so that value occurs near the other deltas. 

In [14]:
data = data[['chrom','pos','ref_allele','altai_gt','chagyrskaya_gt','denisovan_gt','vindija_gt','alt_allele','annotation','ag_delta','al_delta','dg_delta','dl_delta','delta_max','ag_pos','al_pos','dg_pos','dl_pos','variant_type','altai_gt_boolean','chagyrskaya_gt_boolean','denisovan_gt_boolean','vindija_gt_boolean','distribution']]
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,variant_type,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution
0,chr1,739130,T,0/0,./.,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-7,-36,-35,-31,indel,False,False,False,True,Vindija
1,chr1,739130,T,0/1,./.,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-31,-36,-7,-31,indel,True,False,False,False,Altai
1169649,chr1,861808,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,snv,False,False,True,False,Denisovan
1169650,chr1,861808,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,snv,False,False,True,False,Denisovan
1169651,chr1,862072,C,1/1,1/1,0/0,0/1,T,SAMD11,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,snv,True,True,False,True,Neanderthal
1169652,chr1,862072,C,1/1,1/1,0/0,0/1,T,AL645608.1,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,snv,True,True,False,True,Neanderthal
1169653,chr1,862093,T,0/0,0/0,1/1,0/0,C,SAMD11,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,snv,False,False,True,False,Denisovan
1169654,chr1,862093,T,0/0,0/0,1/1,0/0,C,AL645608.1,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,snv,False,False,True,False,Denisovan
1169655,chr1,862124,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,snv,False,False,True,False,Denisovan
1169656,chr1,862124,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,snv,False,False,True,False,Denisovan


Let's load some data on constraint at the gene-level and add four relevant ones to our dataframe: 1) missense observed/expected, 2) loss-of-function observed/expected, 3) missense z-score and 4)loss-of-function z-score.

In [15]:
constraint = pd.read_csv('../../data/annotations/gnomad.v2.1.1.lof_metrics.by_gene.txt', header = 0, sep = '\t')
constraint.head(10)

Unnamed: 0,gene,transcript,obs_mis,exp_mis,oe_mis,mu_mis,possible_mis,obs_mis_pphen,exp_mis_pphen,oe_mis_pphen,possible_mis_pphen,obs_syn,exp_syn,oe_syn,mu_syn,possible_syn,obs_lof,mu_lof,possible_lof,exp_lof,pLI,pNull,pRec,oe_lof,oe_syn_lower,oe_syn_upper,oe_mis_lower,oe_mis_upper,oe_lof_lower,oe_lof_upper,constraint_flag,syn_z,mis_z,lof_z,oe_lof_upper_rank,oe_lof_upper_bin,oe_lof_upper_bin_6,n_sites,classic_caf,max_af,no_lofs,obs_het_lof,obs_hom_lof,defined,p,exp_hom_lof,classic_caf_afr,classic_caf_amr,classic_caf_asj,classic_caf_eas,classic_caf_fin,classic_caf_nfe,classic_caf_oth,classic_caf_sas,p_afr,p_amr,p_asj,p_eas,p_fin,p_nfe,p_oth,p_sas,transcript_type,gene_id,transcript_level,cds_length,num_coding_exons,gene_type,gene_length,exac_pLI,exac_obs_lof,exac_exp_lof,exac_oe_lof,brain_expression,chromosome,start_position,end_position
0,MED13,ENST00000397786,871,1117.8,0.77921,5.6e-05,14195,314.0,529.75,0.59273,6708.0,422,387.53,1.089,1.9e-05,4248,0.0,5e-06,1257.0,98.429,1.0,8.9436e-40,1.8383e-16,0.0,1.005,1.18,0.736,0.824,0.0,0.03,,-1.3765,2.6232,9.1935,0.0,0.0,0.0,2.0,1.2e-05,8e-06,124782.0,3.0,0.0,124785.0,1.2e-05,1.8e-05,0.0,0.0,0.0,0.0,9.3e-05,9e-06,0.0,0.0,0.0,0.0,0.0,0.0,9.3e-05,9e-06,0.0,0.0,protein_coding,ENSG00000108510,2,6522,30,protein_coding,122678,1.0,0.0,64.393,0.0,,17,60019966,60142643
1,NIPBL,ENST00000282516,846,1441.5,0.58688,7.4e-05,18540,158.0,543.1,0.29092,7135.0,496,495.01,1.002,2.5e-05,5211,1.0,9e-06,1781.0,150.32,1.0,2.9773e-59,3.5724e-24,0.006653,0.93,1.079,0.554,0.621,0.001,0.032,,-0.035119,5.5737,11.286,1.0,0.0,0.0,2.0,1.2e-05,8e-06,125693.0,3.0,0.0,125696.0,1.2e-05,1.8e-05,0.0,0.0,9.9e-05,0.0,0.0,0.0,0.0,6.5e-05,0.0,0.0,9.9e-05,0.0,0.0,0.0,0.0,6.5e-05,protein_coding,ENSG00000164190,2,8412,46,protein_coding,189655,1.0,1.0,110.57,0.009044,,5,36876861,37066515
2,SMC3,ENST00000361804,178,630.07,0.28251,3.2e-05,8109,21.0,182.52,0.11506,2197.0,215,203.25,1.0578,1e-05,2091,0.0,5e-06,937.0,79.49,1.0,2.7853e-32,2.1914e-13,0.0,0.946,1.184,0.249,0.32,0.0,0.037,,-0.64776,6.3999,8.2618,2.0,0.0,0.0,8.0,3.2e-05,4e-06,125731.0,8.0,0.0,125739.0,3.2e-05,0.000127,0.0,0.0,9.9e-05,5.4e-05,0.0,4.4e-05,0.0,3.3e-05,0.0,0.0,9.9e-05,5.4e-05,0.0,4.4e-05,0.0,3.3e-05,protein_coding,ENSG00000108055,2,3651,29,protein_coding,36946,1.0,0.0,58.523,0.0,,10,112327449,112364394
3,CNOT1,ENST00000317147,561,1295.9,0.4329,6.9e-05,15670,51.0,290.68,0.17545,3560.0,470,456.03,1.0306,2.4e-05,4564,1.0,7e-06,1440.0,125.03,1.0,2.9924e-49,4.5628999999999995e-20,0.007998,0.955,1.112,0.403,0.464,0.002,0.038,,-0.5141,7.2546,10.279,3.0,0.0,0.0,5.0,2e-05,4e-06,125740.0,4.0,0.0,125744.0,1.6e-05,3.2e-05,0.0,2.9e-05,0.0,5.5e-05,0.0,2.6e-05,0.0,0.0,0.0,2.9e-05,0.0,5.4e-05,0.0,1.8e-05,0.0,0.0,protein_coding,ENSG00000125107,2,7128,48,protein_coding,109936,1.0,3.0,90.13,0.033285,,16,58553855,58663790
4,RLF,ENST00000372771,669,972.87,0.68766,4.7e-05,12682,107.0,321.14,0.33319,4151.0,358,352.62,1.0153,1.7e-05,3482,0.0,4e-06,1024.0,73.222,1.0,8.4055e-30,2.2842e-12,0.0,0.93,1.108,0.645,0.733,0.0,0.04,,-0.22518,3.462,7.9294,4.0,0.0,0.0,1.0,4e-06,4e-06,125122.0,1.0,0.0,125123.0,4e-06,2e-06,6.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,protein_coding,ENSG00000117000,2,5742,8,protein_coding,79549,1.0,0.0,43.607,0.0,,1,40627045,40706593
5,PCF11,ENST00000298281,574,783.46,0.73265,3.9e-05,10106,130.0,264.36,0.49176,3521.0,298,272.69,1.0928,1.3e-05,3026,0.0,5e-06,862.0,74.559,1.0,2.4872e-30,1.3855e-12,0.0,0.993,1.203,0.683,0.785,0.0,0.04,,-1.205,2.6592,8.0014,5.0,0.0,0.0,4.0,1.6e-05,4e-06,124625.0,3.0,0.0,124628.0,1.2e-05,1.8e-05,0.0,0.0,0.0,0.000111,0.0,1.8e-05,0.0,0.0,0.0,0.0,0.0,5.6e-05,0.0,1.8e-05,0.0,0.0,protein_coding,ENSG00000165494,2,4665,16,protein_coding,30464,1.0,1.0,48.16,0.020764,,11,82868030,82898493
6,FNDC3B,ENST00000336824,551,689.52,0.79911,3.8e-05,7833,75.0,160.48,0.46735,1794.0,274,265.93,1.0304,1.7e-05,2360,0.0,4e-06,805.0,69.811,1.0,1.8792000000000001e-28,8.1792e-12,0.0,0.933,1.139,0.744,0.857,0.0,0.042,,-0.38908,1.8745,7.7425,6.0,0.0,0.0,2.0,8e-06,4e-06,125745.0,2.0,0.0,125747.0,8e-06,8e-06,0.0,2.9e-05,0.0,0.0,0.0,9e-06,0.0,0.0,0.0,2.9e-05,0.0,0.0,0.0,9e-06,0.0,0.0,protein_coding,ENSG00000075420,2,3612,25,protein_coding,362038,1.0,0.0,49.447,0.0,,3,171757418,172119455
7,TAF1,ENST00000276072,326,748.3,0.43565,6e-05,12563,82.0,312.02,0.2628,5365.0,272,250.43,1.0861,1.8e-05,3522,0.0,5e-06,1203.0,68.906,1.0,4.288100000000001e-28,1.1477e-11,0.0,0.983,1.201,0.397,0.477,0.0,0.043,,-1.0712,5.486,7.6921,7.0,0.0,0.0,2.0,9.3e-05,8.7e-05,125612.0,14.0,3.0,125629.0,6.8e-05,0.000575,0.0,0.0,0.000535,0.000794,0.0,1.2e-05,0.0,5.2e-05,0.0,0.0,0.000397,0.000598,0.0,9e-06,0.0,3.3e-05,protein_coding,ENSG00000147133,2,5679,38,protein_coding,166111,1.0,2.0,50.8,0.03937,,X,70586114,70752224
8,RSF1,ENST00000308488,620,756.71,0.81933,4e-05,9552,206.0,310.49,0.66348,3722.0,259,265.17,0.97673,1.3e-05,2603,0.0,4e-06,846.0,67.83,1.0,1.1427e-27,1.7162e-11,0.0,0.882,1.083,0.766,0.876,0.0,0.044,,0.29786,1.7661,7.6318,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,protein_coding,ENSG00000048649,2,4323,16,protein_coding,161023,1.0,0.0,46.996,0.0,,11,77371041,77532063
9,NCKAP1,ENST00000360982,291,580.53,0.50127,2.9e-05,7483,56.0,154.07,0.36346,2017.0,194,197.96,0.98,1e-05,2051,0.0,4e-06,850.0,67.231,1.0,1.9726000000000002e-27,2.1473e-11,0.0,0.871,1.104,0.455,0.552,0.0,0.044,,0.22117,4.2701,7.5981,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,protein_coding,ENSG00000061676,2,3402,32,protein_coding,129744,1.0,0.0,50.723,0.0,,2,183773843,183903586


Now add the missense and loss-of-function z-scores to the main dataframe.

In [16]:
mis_oe = pd.Series(constraint.oe_mis.values, index = constraint.gene).to_dict()
lof_oe = pd.Series(constraint.oe_lof.values, index = constraint.gene).to_dict()
mis_z = pd.Series(constraint.mis_z.values, index = constraint.gene).to_dict()
lof_z = pd.Series(constraint.lof_z.values, index = constraint.gene).to_dict()

In [17]:
data['mis_oe'] = data['annotation'].map(mis_oe)
data['lof_oe'] = data['annotation'].map(lof_oe)
data['mis_z'] = data['annotation'].map(mis_z)
data['lof_z'] = data['annotation'].map(lof_z)

In [18]:
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,variant_type,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,mis_oe,lof_oe,mis_z,lof_z
0,chr1,739130,T,0/0,./.,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-7,-36,-35,-31,indel,False,False,False,True,Vindija,,,,
1,chr1,739130,T,0/1,./.,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-31,-36,-7,-31,indel,True,False,False,False,Altai,,,,
1169649,chr1,861808,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,snv,False,False,True,False,Denisovan,1.5082,0.89656,-3.4361,0.47484
1169650,chr1,861808,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,snv,False,False,True,False,Denisovan,1.2217,0.73515,-0.64548,0.49579
1169651,chr1,862072,C,1/1,1/1,0/0,0/1,T,SAMD11,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,snv,True,True,False,True,Neanderthal,1.5082,0.89656,-3.4361,0.47484
1169652,chr1,862072,C,1/1,1/1,0/0,0/1,T,AL645608.1,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,snv,True,True,False,True,Neanderthal,1.2217,0.73515,-0.64548,0.49579
1169653,chr1,862093,T,0/0,0/0,1/1,0/0,C,SAMD11,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,snv,False,False,True,False,Denisovan,1.5082,0.89656,-3.4361,0.47484
1169654,chr1,862093,T,0/0,0/0,1/1,0/0,C,AL645608.1,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,snv,False,False,True,False,Denisovan,1.2217,0.73515,-0.64548,0.49579
1169655,chr1,862124,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,snv,False,False,True,False,Denisovan,1.5082,0.89656,-3.4361,0.47484
1169656,chr1,862124,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,snv,False,False,True,False,Denisovan,1.2217,0.73515,-0.64548,0.49579


In [19]:
len(data)

2107054

Let's add per variant phyloP data.

In [20]:
variant_phyloP_header = ['chrom','start','pos','phyloP']
variant_phyloP = pd.read_csv('../../data/annotations/all_variant_sites_hg19_phyloP.bed', sep = '\t', names = variant_phyloP_header)
variant_phyloP.head(10)

Unnamed: 0,chrom,start,pos,phyloP
0,chr1,739129,739130,0.155
1,chr1,739129,739130,0.155
2,chr1,861807,861808,-0.683
3,chr1,861807,861808,-0.683
4,chr1,862071,862072,0.197
5,chr1,862071,862072,0.197
6,chr1,862092,862093,-1.042
7,chr1,862092,862093,-1.042
8,chr1,862123,862124,-3.75
9,chr1,862123,862124,-3.75


In [21]:
data = pd.merge(data, variant_phyloP[['chrom','pos','phyloP']], on = ['chrom','pos'], how = 'left', indicator = False)
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,alt_allele,annotation,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,variant_type,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,mis_oe,lof_oe,mis_z,lof_z,phyloP
0,chr1,739130,T,0/0,./.,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-7,-36,-35,-31,indel,False,False,False,True,Vindija,,,,,0.155
1,chr1,739130,T,0/0,./.,./.,0/1,TA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-7,-36,-35,-31,indel,False,False,False,True,Vindija,,,,,0.155
2,chr1,739130,T,0/1,./.,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-31,-36,-7,-31,indel,True,False,False,False,Altai,,,,,0.155
3,chr1,739130,T,0/1,./.,./.,0/0,TAA,AL669831.1,0.0,0.0,0.0,0.0,0.0,-31,-36,-7,-31,indel,True,False,False,False,Altai,,,,,0.155
4,chr1,861808,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,snv,False,False,True,False,Denisovan,1.5082,0.89656,-3.4361,0.47484,-0.683
5,chr1,861808,A,0/0,0/0,1/1,0/0,G,SAMD11,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,snv,False,False,True,False,Denisovan,1.5082,0.89656,-3.4361,0.47484,-0.683
6,chr1,861808,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,snv,False,False,True,False,Denisovan,1.2217,0.73515,-0.64548,0.49579,-0.683
7,chr1,861808,A,0/0,0/0,1/1,0/0,G,AL645608.1,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,snv,False,False,True,False,Denisovan,1.2217,0.73515,-0.64548,0.49579,-0.683
8,chr1,862072,C,1/1,1/1,0/0,0/1,T,SAMD11,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,snv,True,True,False,True,Neanderthal,1.5082,0.89656,-3.4361,0.47484,0.197
9,chr1,862072,C,1/1,1/1,0/0,0/1,T,SAMD11,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,snv,True,True,False,True,Neanderthal,1.5082,0.89656,-3.4361,0.47484,0.197


In [22]:
len(data)

2358420

In [23]:
data = data.drop_duplicates()
len(data)

2107041

Let's reorder the dataframe a bit to group similar columns.

In [24]:
data = data[['chrom','pos','ref_allele','alt_allele','variant_type','altai_gt','chagyrskaya_gt','denisovan_gt','vindija_gt','altai_gt_boolean','chagyrskaya_gt_boolean','denisovan_gt_boolean','vindija_gt_boolean','distribution','annotation','mis_oe','mis_z','lof_oe','lof_z','phyloP','ag_delta','al_delta','dg_delta','dl_delta','delta_max','ag_pos','al_pos','dg_pos','dl_pos']]

Now let's export this dataframe as we will use it again in downstream analyses.

In [25]:
data.to_csv('../../data/dataframes/archaic_data_with_constraint.txt', sep = '\t', header = True, index = False)