In [1]:
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy.random as npr
import re
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
# Table S4 (germline de novo mutations), downloaded as a TSV.
# The first line of the file is a title/description, so it is skipped.
denovo = pd.read_csv('table_s4_germline_de_novo.tsv', sep='\t', skiprows=1)
# `both` is 1 for rows where the `heuristic` and `opl` flags are both 1, else 0.
called_by_both = (denovo['heuristic'] == 1) & (denovo['opl'] == 1)
denovo['both'] = called_by_both.astype(int)
denovo.head(2)

Unnamed: 0,FID,mother_id,individual_id,tissue,position,MAF,heuristic,opl,opl_posterior_prob_de_novo,both
0,F156,F156m1,F156m1c1,bl,13951,0.05747,1,0,4.3e-05,0
1,F157,F157m1,F157m1c2,bl,16240,0.06672,1,0,0.344165,0


In [3]:
# Load in information on all 345 individuals (one row per tissue sample).
fam = pd.read_csv('famfile.tsv', sep='\t')
# Convert mother's age at birth from days to years.
fam['age_birth'] /= 365.0
# Single-argument print(...) with str.format produces the same text whether it
# is parsed as a Python 2 print statement or a Python 3 function call.
print('num tissue samples: {}'.format(fam.shape[0]))
# Assumes exactly two tissue rows per individual; floor division keeps the
# count an integer even under `from __future__ import division`.
print('num individuals: {}'.format(fam.shape[0] // 2))
fam.head(2)

num tissue samples: 690
num individuals: 345


Unnamed: 0,fqid,FID,mother_id,individual_id,level,tissue,tissue_id,Sex,fam_str,fam_cat,mot_cat,age_collection,age_birth
0,TR1329M_S13,F117,F117m1,F117m1c1,c1,bl,F117m1c1_bl,Female,0-0-1-2,m1c2,,6620,21.024658
1,TR1329M_S6,F117,F117m1,F117m1c1,c1,ch,F117m1c1_ch,Female,0-0-1-2,m1c2,,6620,21.024658


In [4]:
# Collapse the tissue-level rows to one row per individual: from here on only
# per-individual information (the age of every individual) is needed.
fam = fam.drop_duplicates(subset=['individual_id'])
assert len(fam) == 345

In [13]:
# Discard individuals at the top of their pedigree. An individual is a head of
# family when the first character of `level` equals the first character of
# `fam_cat` (fam_cat[0] == level[0]); everyone else has a sequenced mother.
# The comparison is vectorised with the `.str` accessor instead of a row-wise apply.
filt = fam['level'].str[0] != fam['fam_cat'].str[0]
# Single-argument print(...) with str.format emits the same text on Python 2 and 3.
print('number of individuals whose mothers were sequenced (345 individuals - 96 heads of family): {}'.format(filt.sum()))
# .copy() makes the subset an independent frame rather than a slice of `fam`.
fam_have_mother = fam.loc[filt, :].copy()
# Make sure we have mother's age at birth for all these individuals.
assert fam_have_mother['age_birth'].isnull().sum() == 0

number of individuals whose mothers were sequenced (345 individuals - 96 heads of family): 249


In [6]:
# Build the regression table: one row per individual with a sequenced mother,
# holding the mother's age at birth plus three per-individual mutation counts.
regdat = fam_have_mother[['individual_id', 'age_birth']].set_index('individual_id')

# Sum the 0/1 flag columns of `denovo` per individual in a single groupby.
# NOTE(review): every row of `denovo` for an individual contributes to the sum;
# if the same mutation can be listed once per tissue it is counted per row --
# confirm that this is intended.
per_individual = denovo.groupby('individual_id')[['heuristic', 'opl', 'both']].sum()
heur_counts = per_individual['heuristic']
opl_counts = per_individual['opl']
intersection_counts = per_individual['both']

# Align each count series to the individuals in `regdat`; individuals with no
# row in `denovo` get a count of 0.
for column, counts in [('heur', heur_counts),
                       ('opl', opl_counts),
                       ('intersection', intersection_counts)]:
    regdat[column] = counts.reindex(regdat.index, fill_value=0).astype(int)
regdat.head(2)

Unnamed: 0_level_0,age_birth,heur,opl,intersection
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F117m1c1,21.024658,0,0,0
F294m1c1,36.673973,0,2,0


In [16]:
# Perform the Poisson regression for the OPL mutations:
# per-individual `opl` count ~ intercept + mother's age at birth.
# GLM() has no `link` parameter -- the link function belongs to the family --
# so the former `link=` keyword never took effect. Poisson() already defaults
# to the log link, so dropping the keyword leaves the fitted model unchanged.
sm.GLM(regdat['opl'], sm.add_constant(regdat['age_birth']),
       family=sm.families.Poisson()).fit().summary()

0,1,2,3
Dep. Variable:,opl,No. Observations:,249.0
Model:,GLM,Df Residuals:,247.0
Model Family:,Poisson,Df Model:,1.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-324.11
Date:,"Thu, 11 Apr 2019",Deviance:,463.59
Time:,11:11:13,Pearson chi2:,548.0
No. Iterations:,5,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.6327,0.426,-3.836,0.000,-2.467,-0.798
age_birth,0.0419,0.014,3.053,0.002,0.015,0.069


In [15]:
# Perform the Poisson regression for the heuristic mutations:
# per-individual `heur` count ~ intercept + mother's age at birth.
# GLM() has no `link` parameter -- the link function belongs to the family --
# so the former `link=` keyword never took effect. Poisson() already defaults
# to the log link, so dropping the keyword leaves the fitted model unchanged.
sm.GLM(regdat['heur'], sm.add_constant(regdat['age_birth']),
       family=sm.families.Poisson()).fit().summary()

0,1,2,3
Dep. Variable:,heur,No. Observations:,249.0
Model:,GLM,Df Residuals:,247.0
Model Family:,Poisson,Df Model:,1.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-312.59
Date:,"Thu, 11 Apr 2019",Deviance:,453.47
Time:,11:11:04,Pearson chi2:,581.0
No. Iterations:,5,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2845,0.439,-2.928,0.003,-2.144,-0.425
age_birth,0.0276,0.014,1.921,0.055,-0.001,0.056


In [14]:
# Perform the Poisson regression for the intersection mutations (rows flagged
# by both `heuristic` and `opl`):
# per-individual `intersection` count ~ intercept + mother's age at birth.
# GLM() has no `link` parameter -- the link function belongs to the family --
# so the former `link=` keyword never took effect. Poisson() already defaults
# to the log link, so dropping the keyword leaves the fitted model unchanged.
sm.GLM(regdat['intersection'], sm.add_constant(regdat['age_birth']),
       family=sm.families.Poisson()).fit().summary()

0,1,2,3
Dep. Variable:,intersection,No. Observations:,249.0
Model:,GLM,Df Residuals:,247.0
Model Family:,Poisson,Df Model:,1.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-238.35
Date:,"Thu, 11 Apr 2019",Deviance:,363.66
Time:,11:10:49,Pearson chi2:,571.0
No. Iterations:,6,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.3654,0.558,-4.236,0.000,-3.460,-1.271
age_birth,0.0485,0.018,2.712,0.007,0.013,0.084
