In [1]:
import numpy as np
import pandas as pd

Load the catalog of fixed, non-synonymous, modern-human-specific differences and keep only the sites that are not on chromosome X:

In [9]:
# Tab-separated catalog file; loaded with pandas' default tab delimiter.
catalog_path = '/mnt/expressions/mp/Projects/catalogs/final_version/HumDer_SNCs/fixed/fixed_HumDer_SNCs_merged_genic_ccds_nonsyn.tsv'

# Header in the file -> shorter lower-case name used in this notebook.
column_renames = {
    'Chrom': 'chrom',
    'Pos': 'pos',
    'Consequence': 'consequence',
    'Ancestor': 'ancestral',
    'States': 'archaics',
}

# Columns retained for the analysis, in display order.
kept_columns = ['chrom', 'pos', 'consequence', 'ancestral', 'archaics']

catalog = pd.read_table(catalog_path).rename(columns=column_renames)

# Discard chromosome X records, then keep only the columns listed above.
# NOTE(review): only "X" is filtered here -- confirm the file has no Y/MT rows
# if the result is meant to be strictly autosomal.
catalog = catalog.query('chrom != "X"')[kept_columns]

Split the column of archaic states (e.g. 'A/A,A/H,A/A,A/A', in the order Vindija, El Sidrón, Altai, Denisovan) into one column per individual, then drop the original column together with the El Sidrón column:

In [10]:
# Labels for the comma-separated fields of 'archaics', in the order in which
# they appear inside each string.
archaic_names = ['Vindija', 'ElSidron', 'Altai', 'Denisovan']

# Vectorised split into one column per field. This replaces a row-wise
# `.apply(lambda x: pd.Series(x.split(',')))`, which builds a Series per row
# and raises on a missing (non-string) value.
archaic_states = (catalog['archaics']
                  .str.split(',', expand=True)
                  .rename(columns=dict(enumerate(archaic_names))))

# Attach the per-individual columns (aligned on the existing index), then
# remove the packed source column and the ElSidron column.
catalog = (catalog
           .join(archaic_states)
           .drop(labels=['archaics', 'ElSidron'], axis=1))

In [11]:
# Peek at the first five rows of the catalog.
catalog.head(n=5)

Unnamed: 0,chrom,pos,consequence,ancestral,Vindija,Altai,Denisovan
5,1,79095493,NON_SYNONYMOUS_CODING,A,A/A,A/A,A/A
6,1,79106805,"NON_SYNONYMOUS_CODING,SPLICE_SITE",A,A/A,A/A,A/A
7,1,118558632,NON_SYNONYMOUS_CODING,C,A/A,A/A,A/A
8,1,118634297,NON_SYNONYMOUS_CODING,C,A/A,A/A,A/A
9,1,153751869,NON_SYNONYMOUS_CODING,T,./.,A/A,A/A


In [13]:
# Numeric value assigned to each genotype string; './.' becomes NaN.
# NOTE(review): presumably 'A' and 'H' denote the ancestral-like and the
# modern-human-like allele, making the value a per-site allele fraction --
# confirm against the catalog documentation.
genotype_codes = {'A/A': 0.0, 'A/H': 0.5, 'H/H': 1.0, './.': np.nan}
sample_columns = ['Vindija', 'Altai', 'Denisovan']

# Fail loudly on any non-missing value outside the mapping instead of letting
# it slip through. This also trips if the cell is re-run on columns that have
# already been converted to numbers.
observed = pd.unique(catalog[sample_columns].values.ravel())
unexpected = [g for g in observed if pd.notnull(g) and g not in genotype_codes]
if unexpected:
    raise ValueError('Unexpected genotype values: {}'.format(unexpected))

# Explicit per-column mapping plus an explicit float dtype, rather than relying
# on `DataFrame.replace` to silently downcast object columns to float.
numeric_genotypes = (catalog[sample_columns]
                     .apply(lambda calls: calls.map(genotype_codes))
                     .astype(float))

# The overlapping original (string) columns on the left receive the "_GT"
# suffix; the numeric columns keep the plain individual names.
catalog = catalog.join(numeric_genotypes, lsuffix="_GT")

In [14]:
# Show a bounded preview rather than rendering the whole catalog as cell output.
catalog.head(10)

Unnamed: 0,chrom,pos,consequence,ancestral,Vindija_GT,Altai_GT,Denisovan_GT,Vindija,Altai,Denisovan
5,1,79095493,NON_SYNONYMOUS_CODING,A,A/A,A/A,A/A,0.0,0.0,0.0
6,1,79106805,"NON_SYNONYMOUS_CODING,SPLICE_SITE",A,A/A,A/A,A/A,0.0,0.0,0.0
7,1,118558632,NON_SYNONYMOUS_CODING,C,A/A,A/A,A/A,0.0,0.0,0.0
8,1,118634297,NON_SYNONYMOUS_CODING,C,A/A,A/A,A/A,0.0,0.0,0.0
9,1,153751869,NON_SYNONYMOUS_CODING,T,./.,A/A,A/A,,0.0,0.0
10,1,158648210,NON_SYNONYMOUS_CODING,C,A/A,A/A,A/A,0.0,0.0,0.0
11,1,204966474,NON_SYNONYMOUS_CODING,A,A/A,A/A,A/A,0.0,0.0,0.0
12,1,245582905,NON_SYNONYMOUS_CODING,G,./.,A/A,A/A,,0.0,0.0
13,2,40657356,NON_SYNONYMOUS_CODING,A,A/A,A/A,A/A,0.0,0.0,0.0
14,2,73438011,NON_SYNONYMOUS_CODING,A,A/A,A/A,A/A,0.0,0.0,0.0
