# Generate Archaic Dataframe with Ancestral Alleles, Allele Frequencies, and Introgressed Variants and Get sQTLs

Now that we have some additional information on ancestral alleles, allele frequencies in 1KG, and which variants appear introgressed in modern humans let's add that data to our dataframe. We will also get sQTL data from GTEx from variants.

Load library.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)

In [2]:
cd ../../data/dataframes

/wynton/group/capra/projects/archaic_splicing/data/dataframes


Load data.

In [3]:
data = pd.read_csv('archaic_data_with_constraint.txt', sep = '\t', header = 0)
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos
0,chr1,739130,T,TA,indel,0/0,./.,./.,0/1,False,False,False,True,Vindija,AL669831.1,,,,,0.155,0.0,0.0,0.0,0.0,0.0,-7,-36,-35,-31
1,chr1,739130,T,TAA,indel,0/1,./.,./.,0/0,True,False,False,False,Altai,AL669831.1,,,,,0.155,0.0,0.0,0.0,0.0,0.0,-31,-36,-7,-31
2,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20
3,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14
4,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41
5,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48
6,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34
7,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3
8,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3
9,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35


In [4]:
len(data)

2107041

# Drop InDels and X Chromosome

Let's go ahead and drop the InDels and the X chromosome here.

In [5]:
data = data[data['variant_type'] == 'snv']
data = data[data['chrom'] != 'chrX']

In [6]:
len(data)

1607350

# Ancestral Alleles

Load ancestral alleles. Some entries are duplicated that will mess with the merge so we drop duplicates.

In [7]:
ancestral_header = ['chrom','start','pos','ancestral_allele']
ancestral = pd.read_csv('../archaic_variants/variant_sites_hg19_with_ancestral.bed', sep = '\t', names = ancestral_header)
ancestral = ancestral.drop_duplicates()
ancestral.head(10)

Unnamed: 0,chrom,start,pos,ancestral_allele
0,chr1,739129,739130,.
2,chr1,861807,861808,A
4,chr1,862071,862072,C
6,chr1,862092,862093,C
8,chr1,862123,862124,G
10,chr1,862382,862383,C
12,chr1,862388,862389,G
14,chr1,863123,863124,G
16,chr1,863842,863843,C
18,chr1,863862,863863,G


In [8]:
len(ancestral)

2025313

Merge and check the dataframe len to make sure we haven't lost any or unintentionally added variants. 

In [9]:
data = pd.merge(data, ancestral[['chrom','pos','ancestral_allele']], on = ['chrom','pos'], how = 'left')
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C


In [10]:
len(data)

1607350

Let's add a column indicating whether the variant is the ancestral or derived allele regardless of confidence. We'll use a function and temporarily create a column to get around that case indicates confidence.

In [11]:
data['temp'] = data['ancestral_allele'].str.upper()

def anc_dev(data):
    if (data['ancestral_allele'] == '-') | (data['ancestral_allele'] == '.') | (data['ancestral_allele'] == 'N'):
        return '.'
    elif (data['temp'] == data['alt_allele']):
        return 'ancestral'
    elif (data['temp'] != data['alt_allele']):
        return 'derived'

data['anc_dev'] = data.apply(anc_dev, axis = 1)
data.drop(['temp'], axis = 1)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,anc_dev
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.50820,-3.43610,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,derived
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.22170,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.00,0.0,0.00,48,-14,-46,-14,A,derived
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.50820,-3.43610,0.89656,0.47484,0.197,0.0,0.0,0.00,0.0,0.00,3,28,-38,41,C,derived
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.22170,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.00,0.0,0.00,-6,-50,24,-48,C,derived
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.50820,-3.43610,0.89656,0.47484,-1.042,0.0,0.0,0.00,0.0,0.00,7,14,-15,34,C,ancestral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607345,chr9,141016791,C,T,snv,0/1,0/0,0/0,0/0,True,False,False,False,Altai,CACNA1B,0.64737,4.51650,0.15813,7.84720,0.561,0.0,0.0,0.00,0.0,0.00,4,-23,-12,22,C,derived
1607346,chr9,141017291,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,CACNA1B,0.64737,4.51650,0.15813,7.84720,0.556,0.0,0.0,0.00,0.0,0.00,7,1,24,-8,C,derived
1607347,chr9,141017344,G,A,snv,0/0,0/0,0/0,1/1,False,False,False,True,Vindija,CACNA1B,0.64737,4.51650,0.15813,7.84720,0.561,0.0,0.0,0.00,0.0,0.00,4,-46,-29,36,G,derived
1607348,chr9,141017352,C,T,snv,1/1,1/1,1/1,1/1,True,True,True,True,Shared,CACNA1B,0.64737,4.51650,0.15813,7.84720,-0.469,0.0,0.0,0.00,0.0,0.00,25,-4,-37,28,T,ancestral


In [12]:
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived


# Modern Human Allele Frequencies

Now for 1000 Genomes allele frequencies. We will also add a column indicating whether or not the variant is present in 1000 Genomes.

In [13]:
frequencies_header = ['chrom','start','pos','ref_allele','alt_allele','1KG_allele_count','1KG_allele_number','1KG_allele_frequency','1KG_EAS_AF','1KG_EUR_AF','1KG_AFR_AF','1KG_AMR_AF','1KG_SAS_AF']
frequencies = pd.read_csv('../archaic_variants_in_humans/allele_frequencies_hg19.bed', sep = '\t', names = frequencies_header)
frequencies = frequencies.drop(['start'], axis = 1)
frequencies.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF
0,chr1,861808,A,G,3479,5096,0.68,0.55,0.97,0.34,0.83,0.88
1,chr1,862072,C,T,9,5096,0.0,0.01,0.0,0.0,0.0,0.0
2,chr1,862093,T,C,3484,5096,0.68,0.55,0.97,0.34,0.83,0.88
3,chr1,862124,A,G,3485,5096,0.68,0.55,0.97,0.34,0.83,0.88
4,chr1,862383,C,T,3476,5096,0.68,0.55,0.97,0.34,0.83,0.88
5,chr1,862389,A,G,3482,5096,0.68,0.55,0.97,0.34,0.83,0.88
6,chr1,863124,G,T,3473,5096,0.68,0.55,0.97,0.34,0.83,0.88
7,chr1,863843,C,T,1090,5096,0.21,0.43,0.02,0.35,0.13,0.08
8,chr1,863863,G,A,1281,5096,0.25,0.43,0.02,0.47,0.15,0.08
9,chr1,863978,G,A,497,5096,0.1,0.15,0.01,0.18,0.11,0.03


In [14]:
len(frequencies)

1234950

In [15]:
data = pd.merge(data, frequencies, on = ['chrom','pos','ref_allele','alt_allele'], how = 'left', indicator = True)
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,_merge
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,both
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,both
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,both


In [16]:
len(data)

1607350

In [17]:
data['present_in_1KG'] = data['_merge'].replace({'both':'yes', 'left_only':'no'})
data = data.drop(columns = ['_merge'])
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes


## Non-ASW AFR Allele Frequencies

In [18]:
AFR_frequencies_header = ['chrom','start','pos','ref_allele','alt_allele','non_ASW_AFR_allele_count','non_ASW_AFR_allele_number']
AFR_frequencies = pd.read_csv('../archaic_variants_in_humans/non_ASW_AFR_allele_frequencies_hg19.bed', sep = '\t', names = AFR_frequencies_header)
AFR_frequencies['1KG_non_ASW_AFR_AF'] = AFR_frequencies['non_ASW_AFR_allele_count'] / AFR_frequencies['non_ASW_AFR_allele_number']
AFR_frequencies.head(10)

Unnamed: 0,chrom,start,pos,ref_allele,alt_allele,non_ASW_AFR_allele_count,non_ASW_AFR_allele_number,1KG_non_ASW_AFR_AF
0,chr1,861807,861808,A,G,322,1026,0.31384
1,chr1,862071,862072,C,T,0,1026,0.0
2,chr1,862092,862093,T,C,324,1026,0.315789
3,chr1,862123,862124,A,G,325,1026,0.316764
4,chr1,862382,862383,C,T,319,1026,0.310916
5,chr1,862388,862389,A,G,325,1026,0.316764
6,chr1,863123,863124,G,T,316,1026,0.307992
7,chr1,863842,863843,C,T,362,1026,0.352827
8,chr1,863862,863863,G,A,489,1026,0.476608
9,chr1,863977,863978,G,A,185,1026,0.180312


In [19]:
AFR_frequencies = AFR_frequencies.drop(['start','non_ASW_AFR_allele_count','non_ASW_AFR_allele_number'], axis = 1)
AFR_frequencies.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,1KG_non_ASW_AFR_AF
0,chr1,861808,A,G,0.31384
1,chr1,862072,C,T,0.0
2,chr1,862093,T,C,0.315789
3,chr1,862124,A,G,0.316764
4,chr1,862383,C,T,0.310916
5,chr1,862389,A,G,0.316764
6,chr1,863124,G,T,0.307992
7,chr1,863843,C,T,0.352827
8,chr1,863863,G,A,0.476608
9,chr1,863978,G,A,0.180312


In [20]:
data = pd.merge(data, AFR_frequencies, on = ['chrom','pos','ref_allele','alt_allele'], how = 'left')
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG,1KG_non_ASW_AFR_AF
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916


## Vernot et al. 2016 Introgression

Now let's add a column to reflect whether a variant is an introgressed tag SNP determined using the S* method from Vernot et al. 2016.

Load the data.

In [21]:
vernot_header = ['chrom','start','pos','Vernot_ancestral_allele','Vernot_derived_allele','Vernot_ancestral_derived_code','Vernot_AFA_AF','Vernot_AFR_AF','Vernot_AMR_AF','Vernot_EAS_AF','Vernot_EUR_AF','Vernot_PNG_AF','Vernot_SAS_AF','Vernot_Neanderthal_base','Vernot_Denisovan_base','Vernot_haplotype_tag'] # let's call 'stop' 'pos' here because we switching from half open to fully closed coordinates
vernot = pd.read_csv('../introgression/sorted_no_dups_archaic_introgressed_tag_snps.bed', sep = '\t', names = vernot_header)
vernot.head(10)

Unnamed: 0,chrom,start,pos,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Neanderthal_base,Vernot_Denisovan_base,Vernot_haplotype_tag
0,chr1,2270126,2270127,C,T,1,0.0,0.00198,0.0,0.00595,0.0,0.0,0.00102,T,C,chr1_2270126_2300081
1,chr1,2273653,2273654,T,A,1,0.0,0.0,0.0,0.00595,0.0,0.0,0.00102,A,T,chr1_2270126_2300081
2,chr1,2285111,2285112,G,A,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.01329,A,G,chr1_2270126_2300081
3,chr1,2292649,2292650,A,C,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.00102,C,A,chr1_2270126_2300081
4,chr1,2299058,2299059,C,T,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.00102,T,C,chr1_2270126_2300081
5,chr1,2300080,2300081,C,T,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.00102,T,C,chr1_2270126_2300081
6,chr1,2855996,2855997,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.07407,0.07055,A,G,chr1_2855996_2879424
7,chr1,2858726,2858727,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.07407,0.07055,A,G,chr1_2855996_2879424
8,chr1,2861780,2861781,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.07407,0.07055,G/A,G,chr1_2855996_2879424
9,chr1,2863783,2863784,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.03704,0.06953,A,G,chr1_2855996_2879424


Let's split the Neanderthal and Denisovan base columns because some of these have two alleles separated by a slash.

In [22]:
vernot[['Vernot_Neanderthal_base_1','Vernot_Neanderthal_base_2']] = vernot['Vernot_Neanderthal_base'].str.split(pat = '/', expand = True)
vernot.head(10)

Unnamed: 0,chrom,start,pos,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Neanderthal_base,Vernot_Denisovan_base,Vernot_haplotype_tag,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2
0,chr1,2270126,2270127,C,T,1,0.0,0.00198,0.0,0.00595,0.0,0.0,0.00102,T,C,chr1_2270126_2300081,T,
1,chr1,2273653,2273654,T,A,1,0.0,0.0,0.0,0.00595,0.0,0.0,0.00102,A,T,chr1_2270126_2300081,A,
2,chr1,2285111,2285112,G,A,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.01329,A,G,chr1_2270126_2300081,A,
3,chr1,2292649,2292650,A,C,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.00102,C,A,chr1_2270126_2300081,C,
4,chr1,2299058,2299059,C,T,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.00102,T,C,chr1_2270126_2300081,T,
5,chr1,2300080,2300081,C,T,1,0.0,0.0,0.0,0.00694,0.0,0.0,0.00102,T,C,chr1_2270126_2300081,T,
6,chr1,2855996,2855997,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.07407,0.07055,A,G,chr1_2855996_2879424,A,
7,chr1,2858726,2858727,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.07407,0.07055,A,G,chr1_2855996_2879424,A,
8,chr1,2861780,2861781,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.07407,0.07055,G/A,G,chr1_2855996_2879424,G,A
9,chr1,2863783,2863784,G,A,1,0.0,0.0,0.00144,0.06448,0.0,0.03704,0.06953,A,G,chr1_2855996_2879424,A,


Drop the original columns.

In [23]:
vernot = vernot.drop(['Vernot_Neanderthal_base'], axis = 1)

Now merge. Pandas is going to panic because there are duplicate positions per chromosome and during it's panic, it will start duplicating rows. We'll remove these post-merge.

In [24]:
data = pd.merge(data, vernot, on = ['chrom','pos'], how = 'left')
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG,1KG_non_ASW_AFR_AF,start,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Denisovan_base,Vernot_haplotype_tag,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,


In [25]:
len(data)

1674361

Drop duplicate rows.

In [26]:
data = data.drop_duplicates(['chrom','pos','ref_allele','alt_allele','annotation'])
len(data)

1607350

Let's save two copies of the dataframe: 1) where only the ref allele matches a tag SNP and 2) where either the ref or alt allele matches a tag SNP.

In [27]:
introgressed_ref_tag = data[(data['ref_allele'] == data['Vernot_Neanderthal_base_1']) | (data['ref_allele'] == data['Vernot_Neanderthal_base_2'])]
introgressed_ref_tag = introgressed_ref_tag[introgressed_ref_tag['present_in_1KG'] == 'yes']

In [28]:
len(introgressed_ref_tag)

7977

In [29]:
introgressed_ref_tag.to_csv('introgressed_ref_tag.txt', sep = '\t', header = False, index = False)

Now let's write a function to designate if the reference or alternate allele for each variant is a Neanderthal tag SNP from Vernot et al. 2016.

In [30]:
def Vernot_introgressed(data):
    if (data['ref_allele'] == data['Vernot_Neanderthal_base_1']) or (data['ref_allele'] == data['Vernot_Neanderthal_base_2']) or (data['alt_allele'] == data['Vernot_Neanderthal_base_1']) or (data['alt_allele'] == data['Vernot_Neanderthal_base_2']):
        return 'yes'
    else:
        return 'no'

data['Vernot_introgressed'] = data.apply(Vernot_introgressed, axis = 1)

In [31]:
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG,1KG_non_ASW_AFR_AF,start,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Denisovan_base,Vernot_haplotype_tag,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2,Vernot_introgressed
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,,no
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,,no
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,,no
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,,no
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,,no
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,,no
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,,no
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,,no
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,,no
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,,no


Check the length to make sure we haven't lost any variants along the way.

In [32]:
len(data)

1607350

To make our future lives easier, we also need to add a column that designates the allele's origin (i.e., when did it evolve and did it stay "there"). 

In [33]:
def Vernot_allele_origin(data):
    if (data['present_in_1KG'] == 'no'):
        return 'archaic-specific'
    elif (data['present_in_1KG'] == 'yes') & (data['Vernot_introgressed'] == 'yes'):
        return 'introgressed'
    elif (data['present_in_1KG'] == 'yes') & (data['Vernot_introgressed'] == 'no'):
        return 'ancient'

data['Vernot_allele_origin'] = data.apply(Vernot_allele_origin, axis = 1)

Now let's add a fourth category to designate variants that may not actually be ancient but occur in at least one 1KG human and at least one archaic. We'll call them low-confidence ancient.

In [34]:
allele_frequencies = data[(data['Vernot_allele_origin']=='ancient')]
allele_frequencies = allele_frequencies[['1KG_non_ASW_AFR_AF','1KG_AMR_AF','1KG_EAS_AF','1KG_EUR_AF','1KG_SAS_AF']]
allele_frequencies.head(10)

Unnamed: 0,1KG_non_ASW_AFR_AF,1KG_AMR_AF,1KG_EAS_AF,1KG_EUR_AF,1KG_SAS_AF
0,0.31384,0.83,0.55,0.97,0.88
1,0.31384,0.83,0.55,0.97,0.88
2,0.0,0.0,0.01,0.0,0.0
3,0.0,0.0,0.01,0.0,0.0
4,0.315789,0.83,0.55,0.97,0.88
5,0.315789,0.83,0.55,0.97,0.88
6,0.316764,0.83,0.55,0.97,0.88
7,0.316764,0.83,0.55,0.97,0.88
8,0.310916,0.83,0.55,0.97,0.88
9,0.310916,0.83,0.55,0.97,0.88


In [35]:
ancient_indexes = allele_frequencies[(allele_frequencies >= 0.05).sum(axis=1) >= 2].index
shared_non_introgressed_indexes = allele_frequencies.drop(ancient_indexes).index

In [36]:
data.loc[shared_non_introgressed_indexes,'Vernot_allele_origin']='low-confidence ancient'

Drop unnecessary columns.

In [37]:
data.drop(['start'], axis = 1)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG,1KG_non_ASW_AFR_AF,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Denisovan_base,Vernot_haplotype_tag,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2,Vernot_introgressed,Vernot_allele_origin
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.50820,-3.43610,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.313840,,,,,,,,,,,,,,,no,ancient
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.22170,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.00,0.0,0.00,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.313840,,,,,,,,,,,,,,,no,ancient
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.50820,-3.43610,0.89656,0.47484,0.197,0.0,0.0,0.00,0.0,0.00,3,28,-38,41,C,C,derived,9.0,5096.0,0.00,0.01,0.00,0.00,0.00,0.00,yes,0.000000,,,,,,,,,,,,,,,no,low-confidence ancient
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.22170,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.00,0.0,0.00,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.00,0.01,0.00,0.00,0.00,0.00,yes,0.000000,,,,,,,,,,,,,,,no,low-confidence ancient
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.50820,-3.43610,0.89656,0.47484,-1.042,0.0,0.0,0.00,0.0,0.00,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,no,ancient
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674356,chr9,141016791,C,T,snv,0/1,0/0,0/0,0/0,True,False,False,False,Altai,CACNA1B,0.64737,4.51650,0.15813,7.84720,0.561,0.0,0.0,0.00,0.0,0.00,4,-23,-12,22,C,C,derived,,,,,,,,,no,,,,,,,,,,,,,,,,no,archaic-specific
1674357,chr9,141017291,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,CACNA1B,0.64737,4.51650,0.15813,7.84720,0.556,0.0,0.0,0.00,0.0,0.00,7,1,24,-8,C,C,derived,,,,,,,,,no,,,,,,,,,,,,,,,,no,archaic-specific
1674358,chr9,141017344,G,A,snv,0/0,0/0,0/0,1/1,False,False,False,True,Vindija,CACNA1B,0.64737,4.51650,0.15813,7.84720,0.561,0.0,0.0,0.00,0.0,0.00,4,-46,-29,36,G,G,derived,7.0,5096.0,0.00,0.01,0.00,0.00,0.00,0.00,yes,0.000000,,,,,,,,,,,,,,,no,low-confidence ancient
1674359,chr9,141017352,C,T,snv,1/1,1/1,1/1,1/1,True,True,True,True,Shared,CACNA1B,0.64737,4.51650,0.15813,7.84720,-0.469,0.0,0.0,0.00,0.0,0.00,25,-4,-37,28,T,T,ancestral,4273.0,5096.0,0.84,0.80,0.89,0.84,0.87,0.80,yes,0.848928,,,,,,,,,,,,,,,no,ancient


# Browning et al. 2018

Now let's add the introgressed variants identified by the Sprime method from Browning et al. 2018.

In [38]:
browning_header = ['chrom','pos','ref_allele','alt_allele','Browning_ref_alt']
browning = pd.read_csv('../introgression/Browning_et_al_2018_Neanderthal_introgressed_variants.txt', sep = '\t', names = browning_header)
browning.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,Browning_ref_alt
0,chr1,1894604,C,T,1
1,chr1,1898797,G,A,1
2,chr1,1899929,C,T,1
3,chr1,1902906,T,G,1
4,chr1,1903373,C,T,1
5,chr1,1904910,G,A,1
6,chr1,1905395,G,T,1
7,chr1,1907597,G,A,1
8,chr1,1912485,A,G,1
9,chr1,1914609,A,T,1


In [39]:
data = pd.merge(data, browning, on=['chrom','pos','ref_allele','alt_allele'], how = 'left', indicator=True)
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG,1KG_non_ASW_AFR_AF,start,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Denisovan_base,Vernot_haplotype_tag,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2,Vernot_introgressed,Vernot_allele_origin,Browning_ref_alt,_merge
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,,no,ancient,,left_only
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,,no,ancient,,left_only
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,,no,low-confidence ancient,,left_only
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,,no,low-confidence ancient,,left_only
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,,no,ancient,,left_only
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,,no,ancient,,left_only
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,,no,ancient,,left_only
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,,no,ancient,,left_only
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,,no,ancient,,left_only
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,,no,ancient,,left_only


In [40]:
len(data)

1607350

In [41]:
data.rename(columns={'_merge': 'Browning_introgressed'}, inplace=True)
data['Browning_introgressed'] = data['Browning_introgressed'].replace({'both':'yes', 'left_only':'no'})
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos,ancestral_allele,temp,anc_dev,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,present_in_1KG,1KG_non_ASW_AFR_AF,start,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Denisovan_base,Vernot_haplotype_tag,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2,Vernot_introgressed,Vernot_allele_origin,Browning_ref_alt,Browning_introgressed
0,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,,no,ancient,,no
1,chr1,861808,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14,A,A,derived,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.31384,,,,,,,,,,,,,,,,no,ancient,,no
2,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,,no,low-confidence ancient,,no
3,chr1,862072,C,T,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48,C,C,derived,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,yes,0.0,,,,,,,,,,,,,,,,no,low-confidence ancient,,no
4,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,,no,ancient,,no
5,chr1,862093,T,C,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3,C,C,ancestral,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.315789,,,,,,,,,,,,,,,,no,ancient,,no
6,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,,no,ancient,,no
7,chr1,862124,A,G,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35,G,G,ancestral,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.316764,,,,,,,,,,,,,,,,no,ancient,,no
8,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,,no,ancient,,no
9,chr1,862383,C,T,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46,C,C,derived,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,yes,0.310916,,,,,,,,,,,,,,,,no,ancient,,no


In [42]:
def Browning_allele_origin(data):
    if (data['present_in_1KG'] == 'no'):
        return 'archaic-specific'
    elif (data['present_in_1KG'] == 'yes') & (data['Browning_introgressed'] == 'yes'):
        return 'introgressed'
    elif (data['present_in_1KG'] == 'yes') & (data['Browning_introgressed'] == 'no'):
        return 'ancient'

data['Browning_allele_origin'] = data.apply(Browning_allele_origin, axis = 1)

In [43]:
allele_frequencies = data[(data['Browning_allele_origin']=='ancient')]
allele_frequencies = allele_frequencies[['1KG_non_ASW_AFR_AF','1KG_AMR_AF','1KG_EAS_AF','1KG_EUR_AF','1KG_SAS_AF']]
allele_frequencies.head(10)

Unnamed: 0,1KG_non_ASW_AFR_AF,1KG_AMR_AF,1KG_EAS_AF,1KG_EUR_AF,1KG_SAS_AF
0,0.31384,0.83,0.55,0.97,0.88
1,0.31384,0.83,0.55,0.97,0.88
2,0.0,0.0,0.01,0.0,0.0
3,0.0,0.0,0.01,0.0,0.0
4,0.315789,0.83,0.55,0.97,0.88
5,0.315789,0.83,0.55,0.97,0.88
6,0.316764,0.83,0.55,0.97,0.88
7,0.316764,0.83,0.55,0.97,0.88
8,0.310916,0.83,0.55,0.97,0.88
9,0.310916,0.83,0.55,0.97,0.88


In [44]:
ancient_indexes = allele_frequencies[(allele_frequencies >= 0.05).sum(axis=1) >= 2].index
shared_non_introgressed_indexes = allele_frequencies.drop(ancient_indexes).index

In [45]:
data.loc[shared_non_introgressed_indexes,'Browning_allele_origin']='low-confidence ancient'

In [46]:
len(data)

1607350

# New Allele Frequencies

As noted above, some introgressed variants are in fact the reference allele. Let's recalculate the 1KG allele frequencies to reflect the introgressed allele. For Vernot, let's take the average of the five 1KG superpopulations. For Browning, we can easily use the ref_alt column to calculate at all loci.

In [47]:
data['Vernot_introgressed_AF'] = (data['Vernot_AFR_AF'] + data['Vernot_AMR_AF'] + data['Vernot_EAS_AF'] + data['Vernot_EUR_AF'] + data['Vernot_SAS_AF'])/5

In [48]:
def Browning_introgressed_AF(data):
    if (data['Browning_ref_alt'] == 0):
        return 1-(data['1KG_allele_frequency'])
    else:
        return data['1KG_allele_frequency']

data['Browning_introgressed_AF'] = data.apply(Browning_introgressed_AF, axis = 1)

Let's reorder some columns.

In [49]:
data = data[['chrom','pos','ref_allele','alt_allele','ancestral_allele','anc_dev','variant_type','altai_gt','chagyrskaya_gt','denisovan_gt','vindija_gt','altai_gt_boolean','chagyrskaya_gt_boolean','denisovan_gt_boolean','vindija_gt_boolean','distribution','present_in_1KG','1KG_allele_count','1KG_allele_number','1KG_allele_frequency','1KG_EAS_AF','1KG_EUR_AF','1KG_AFR_AF','1KG_AMR_AF','1KG_SAS_AF','1KG_non_ASW_AFR_AF','Vernot_introgressed','Vernot_ancestral_allele','Vernot_derived_allele','Vernot_ancestral_derived_code','Vernot_AFA_AF','Vernot_AFR_AF','Vernot_AMR_AF','Vernot_EAS_AF','Vernot_EUR_AF','Vernot_PNG_AF','Vernot_SAS_AF','Vernot_Denisovan_base','Vernot_Neanderthal_base_1','Vernot_Neanderthal_base_2','Vernot_haplotype_tag','Vernot_allele_origin','Vernot_introgressed_AF','Browning_introgressed','Browning_allele_origin','Browning_ref_alt','Browning_introgressed_AF','annotation','mis_oe','mis_z','lof_oe','lof_z','phyloP','ag_delta','al_delta','dg_delta','dl_delta','delta_max','ag_pos','al_pos','dg_pos','dl_pos']]
data.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,ancestral_allele,anc_dev,variant_type,altai_gt,chagyrskaya_gt,denisovan_gt,vindija_gt,altai_gt_boolean,chagyrskaya_gt_boolean,denisovan_gt_boolean,vindija_gt_boolean,distribution,present_in_1KG,1KG_allele_count,1KG_allele_number,1KG_allele_frequency,1KG_EAS_AF,1KG_EUR_AF,1KG_AFR_AF,1KG_AMR_AF,1KG_SAS_AF,1KG_non_ASW_AFR_AF,Vernot_introgressed,Vernot_ancestral_allele,Vernot_derived_allele,Vernot_ancestral_derived_code,Vernot_AFA_AF,Vernot_AFR_AF,Vernot_AMR_AF,Vernot_EAS_AF,Vernot_EUR_AF,Vernot_PNG_AF,Vernot_SAS_AF,Vernot_Denisovan_base,Vernot_Neanderthal_base_1,Vernot_Neanderthal_base_2,Vernot_haplotype_tag,Vernot_allele_origin,Vernot_introgressed_AF,Browning_introgressed,Browning_allele_origin,Browning_ref_alt,Browning_introgressed_AF,annotation,mis_oe,mis_z,lof_oe,lof_z,phyloP,ag_delta,al_delta,dg_delta,dl_delta,delta_max,ag_pos,al_pos,dg_pos,dl_pos
0,chr1,861808,A,G,A,derived,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.31384,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,SAMD11,1.5082,-3.4361,0.89656,0.47484,-0.683,0.0,0.0,0.01,0.0,0.01,-29,-27,24,-20
1,chr1,861808,A,G,A,derived,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3479.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.31384,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-0.683,0.0,0.0,0.0,0.0,0.0,48,-14,-46,-14
2,chr1,862072,C,T,C,derived,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,yes,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,no,,,,,,,,,,,,,,,low-confidence ancient,,no,low-confidence ancient,,0.0,SAMD11,1.5082,-3.4361,0.89656,0.47484,0.197,0.0,0.0,0.0,0.0,0.0,3,28,-38,41
3,chr1,862072,C,T,C,derived,snv,1/1,1/1,0/0,0/1,True,True,False,True,Neanderthal,yes,9.0,5096.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,no,,,,,,,,,,,,,,,low-confidence ancient,,no,low-confidence ancient,,0.0,AL645608.1,1.2217,-0.64548,0.73515,0.49579,0.197,0.0,0.0,0.0,0.0,0.0,-6,-50,24,-48
4,chr1,862093,T,C,C,ancestral,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.315789,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.042,0.0,0.0,0.0,0.0,0.0,7,14,-15,34
5,chr1,862093,T,C,C,ancestral,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3484.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.315789,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.042,0.0,0.0,0.0,0.0,0.0,-34,48,49,3
6,chr1,862124,A,G,G,ancestral,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.316764,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,SAMD11,1.5082,-3.4361,0.89656,0.47484,-3.75,0.0,0.0,0.0,0.0,0.0,-24,26,-11,3
7,chr1,862124,A,G,G,ancestral,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3485.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.316764,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-3.75,0.0,0.0,0.0,0.0,0.0,17,-14,-28,35
8,chr1,862383,C,T,C,derived,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.310916,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,SAMD11,1.5082,-3.4361,0.89656,0.47484,-1.932,0.0,0.0,0.0,0.0,0.0,22,-26,-21,29
9,chr1,862383,C,T,C,derived,snv,0/0,0/0,1/1,0/0,False,False,True,False,Denisovan,yes,3476.0,5096.0,0.68,0.55,0.97,0.34,0.83,0.88,0.310916,no,,,,,,,,,,,,,,,ancient,,no,ancient,,0.68,AL645608.1,1.2217,-0.64548,0.73515,0.49579,-1.932,0.0,0.0,0.0,0.0,0.0,-23,7,28,46


Save the dataframe for later analysis.

In [50]:
data.to_csv('archaic_data_with_constraint_moderns_introgression.txt', sep = '\t', header = True, index = False)

# sQTLs

Now we need to determine which variants are sQTLs in GTEx. First, load the long-form file we generated in the last notebook. Add a 'value' column to help generate the pivot table.

In [51]:
sQTLs_header = ['chrom_pos','ref_allele','alt_allele','tissue']
sQTLs = pd.read_csv('../GTEx_sQTLs/concat_sQTLs.txt', sep = '\t', names = sQTLs_header)
sQTLs['value'] = 1
sQTLs.head(10)

Unnamed: 0,chrom_pos,ref_allele,alt_allele,tissue,value
0,chr1_739465,TTTTG,T,Adipose_Subcutaneous,1
1,chr1_763097,C,T,Adipose_Subcutaneous,1
2,chr1_763107,A,G,Adipose_Subcutaneous,1
3,chr1_767270,T,C,Adipose_Subcutaneous,1
4,chr1_767578,T,C,Adipose_Subcutaneous,1
5,chr1_774708,C,A,Adipose_Subcutaneous,1
6,chr1_774815,A,G,Adipose_Subcutaneous,1
7,chr1_775065,A,G,Adipose_Subcutaneous,1
8,chr1_775962,A,G,Adipose_Subcutaneous,1
9,chr1_777135,T,TC,Adipose_Subcutaneous,1


Make the pivot table.

In [52]:
sQTLs_pivot = sQTLs.pivot_table(index = ['chrom_pos','ref_allele','alt_allele'], columns = 'tissue', values = 'value')
sQTLs_pivot.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,tissue,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
chrom_pos,ref_allele,alt_allele,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
chr10_100000235,C,T,1.0,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,1.0,,,
chr10_100000943,G,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,
chr10_100002628,A,C,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,1.0,,,
chr10_100004827,A,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
chr10_100005358,G,C,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,,,
chr10_100005711,G,A,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,,,
chr10_100006780,C,T,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,,,
chr10_100007241,C,T,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
chr10_100008640,A,G,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,1.0,1.0,,,,,1.0,,,
chr10_100009013,G,A,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,


Get the number of collated sQTLs.

In [53]:
len(sQTLs_pivot)

2027766

In [54]:
sQTLs_pivot = sQTLs_pivot.reset_index()
sQTLs_pivot.head(10)

tissue,chrom_pos,ref_allele,alt_allele,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
0,chr10_100000235,C,T,1.0,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,1.0,,,
1,chr10_100000943,G,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,
2,chr10_100002628,A,C,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,1.0,,,
3,chr10_100004827,A,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
4,chr10_100005358,G,C,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,,,
5,chr10_100005711,G,A,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,,,
6,chr10_100006780,C,T,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,,,
7,chr10_100007241,C,T,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
8,chr10_100008640,A,G,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,1.0,1.0,,,,,1.0,,,
9,chr10_100009013,G,A,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,


Now load the variant index from the previous notebook.

In [55]:
variants_header = ['chrom_pos']
variants = pd.read_csv('../GTEx_sQTLs/hg38_chrom_pos_index.txt', sep = '\t', names = variants_header)
variants.head(10)

Unnamed: 0,chrom_pos
0,chr1_803750
1,chr1_803750
2,chr1_926428
3,chr1_926428
4,chr1_926692
5,chr1_926692
6,chr1_926713
7,chr1_926713
8,chr1_926744
9,chr1_926744


Merge the data.

In [56]:
sQTLs = pd.merge(variants, sQTLs_pivot, on = ['chrom_pos'], how = 'left', indicator = True)
sQTLs.head(10)

Unnamed: 0,chrom_pos,ref_allele,alt_allele,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,_merge
0,chr1_803750,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
1,chr1_803750,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
2,chr1_926428,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
3,chr1_926428,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
4,chr1_926692,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
5,chr1_926692,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
6,chr1_926713,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
7,chr1_926713,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
8,chr1_926744,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
9,chr1_926744,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both


Subset to just sQTLs that are archaic variants.

In [57]:
sQTLs = sQTLs[sQTLs['_merge'] == 'both']
sQTLs.head(10)

Unnamed: 0,chrom_pos,ref_allele,alt_allele,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,_merge
2,chr1_926428,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
3,chr1_926428,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
6,chr1_926713,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
7,chr1_926713,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
8,chr1_926744,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
9,chr1_926744,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
10,chr1_927003,C,T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
11,chr1_927003,C,T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both
12,chr1_927009,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,both
13,chr1_927009,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,both


In [58]:
len(sQTLs)

276762

Now split our silly index column.

In [59]:
sQTLs[['chrom','pos']] = sQTLs['chrom_pos'].str.split('_', expand = True)
sQTLs.head(10)

Unnamed: 0,chrom_pos,ref_allele,alt_allele,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,_merge,chrom,pos
2,chr1_926428,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,926428
3,chr1_926428,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,926428
6,chr1_926713,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,926713
7,chr1_926713,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,926713
8,chr1_926744,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,926744
9,chr1_926744,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,926744
10,chr1_927003,C,T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,927003
11,chr1_927003,C,T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,both,chr1,927003
12,chr1_927009,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,both,chr1,927009
13,chr1_927009,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,both,chr1,927009


Drop the two columns we don't need.

In [60]:
sQTLs.drop(['_merge', 'chrom_pos'], axis = 1)

Unnamed: 0,ref_allele,alt_allele,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,chrom,pos
2,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr1,926428
3,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr1,926428
6,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr1,926713
7,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr1,926713
8,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr1,926744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2048570,T,TG,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,chr9,138111537
2048593,C,T,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,chr9,138117129
2048603,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr9,138121151
2048606,T,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,chr9,138121810


Reorder columns.

In [61]:
sQTLs = sQTLs[['chrom','pos','ref_allele','alt_allele','Adipose_Subcutaneous','Adipose_Visceral_Omentum','Adrenal_Gland','Artery_Aorta','Artery_Coronary','Artery_Tibial','Brain_Amygdala','Brain_Anterior_cingulate_cortex_BA24','Brain_Caudate_basal_ganglia','Brain_Cerebellar_Hemisphere','Brain_Cerebellum','Brain_Cortex','Brain_Frontal_Cortex_BA9','Brain_Hippocampus','Brain_Hypothalamus','Brain_Nucleus_accumbens_basal_ganglia','Brain_Putamen_basal_ganglia','Brain_Spinal_cord_cervical_c-1','Brain_Substantia_nigra','Breast_Mammary_Tissue','Cells_Cultured_fibroblasts','Cells_EBV-transformed_lymphocytes','Colon_Sigmoid','Colon_Transverse','Esophagus_Gastroesophageal_Junction','Esophagus_Mucosa','Esophagus_Muscularis','Heart_Atrial_Appendage','Heart_Left_Ventricle','Kidney_Cortex','Liver','Lung','Minor_Salivary_Gland','Muscle_Skeletal','Nerve_Tibial','Ovary','Pancreas','Pituitary','Prostate','Skin_Not_Sun_Exposed_Suprapubic','Skin_Sun_Exposed_Lower_leg','Small_Intestine_Terminal_Ileum','Spleen','Stomach','Testis','Thyroid','Uterus','Vagina','Whole_Blood']]
sQTLs = sQTLs.fillna(0)
sQTLs.head(10)

Unnamed: 0,chrom,pos,ref_allele,alt_allele,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,Brain_Cerebellum,Brain_Cortex,Brain_Frontal_Cortex_BA9,Brain_Hippocampus,Brain_Hypothalamus,Brain_Nucleus_accumbens_basal_ganglia,Brain_Putamen_basal_ganglia,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Breast_Mammary_Tissue,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Colon_Sigmoid,Colon_Transverse,Esophagus_Gastroesophageal_Junction,Esophagus_Mucosa,Esophagus_Muscularis,Heart_Atrial_Appendage,Heart_Left_Ventricle,Kidney_Cortex,Liver,Lung,Minor_Salivary_Gland,Muscle_Skeletal,Nerve_Tibial,Ovary,Pancreas,Pituitary,Prostate,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
2,chr1,926428,A,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,chr1,926428,A,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,chr1,926713,T,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,chr1,926713,T,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,chr1,926744,A,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,chr1,926744,A,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10,chr1,927003,C,T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11,chr1,927003,C,T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12,chr1,927009,A,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,chr1,927009,A,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


And export this lovely new dataframe. We'll lift this over in the next, very short notebook.

In [62]:
sQTLs.to_csv('../GTEx_sQTLs/sQTLs_hg38.txt', sep = '\t', header = False, index = False)