In [None]:
%run IMPORT.ipynb

In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib notebook

In [None]:
from itertools import product

### Parse data and save

In [None]:
# Build the sample-QC table, index it by the first column of the .fam file,
# pull out columns 25..64 as the PCA table and cache both as pickles.
index = pd.read_csv('data/ukbxxxx_cal_chr10_v2_sxxxxxx.fam', sep=' ', header=None)
sampleqc = (
    pd.read_csv('data/ukb_sqc_v2.txt', sep=' ', header=None)
    .drop([0, 1], axis=1)
)
# NOTE(review): assumes the QC file and the .fam file list samples in the same order - confirm.
sampleqc['index'] = index[0].values
sampleqc = sampleqc.set_index('index')

# The 40 selected columns are renamed PC0 .. PC39.
pca = sampleqc[range(25,65)]
pca.columns = ['PC%i' % i for i in range(pca.shape[1])]

pca.to_pickle('data/pca.pkl')
sampleqc.to_pickle('data/sampleqc.pkl')

In [None]:
def parse_mort_data_and_baseline_info():
    """Build the per-participant baseline table from the main UKB extract.

    Side effect: writes column '54-0.0' to 'data/assess_center.pkl'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'eid', with columns
        'birth' - date built from year ('34-0.0') and month ('52-0.0'), day fixed to 15;
        'sex'   - column '31-0.0';
        't1'    - age in years at the date in '53-0.0';
        't2'    - age in years at the date in '40000-0.0', or at the latest
                  '40000-0.0' date found in the data where it is missing;
        'tf'    - age in years at the latest '40000-0.0' date found in the data;
        'event' - True where '40000-0.0' is present.
    """
    days_per_year = 365.25

    ukb10772 = pd.read_csv('data/ukbxxxxx.csv.gz',
            index_col=0, engine='c', quotechar='\"', quoting=1, memory_map=False, low_memory=False,
            usecols=['eid','31-0.0','34-0.0','52-0.0','53-0.0','54-0.0','21000-0.0','40000-0.0'],
                           dtype={'34-0.0':str,'52-0.0':str},
                           parse_dates=['53-0.0','40000-0.0'])

    ukb10772['54-0.0'].to_pickle('data/assess_center.pkl')

    # Only year and month are available, so the day is fixed to the 15th.
    birth = pd.to_datetime(ukb10772['34-0.0']+'-'+ukb10772['52-0.0']+'-15')
    sex = pd.Series(ukb10772['31-0.0'], name='sex')

    deathtime = ukb10772['40000-0.0']
    event = deathtime.notnull()
    # Rows without a date are filled with the latest date present in the column.
    last_date = deathtime.max()
    deathtime = deathtime.fillna(last_date)

    def age_in_years(delta):
        # Vectorised timedelta -> years conversion; NaT becomes NaN.
        return delta.dt.days / days_per_year

    t1 = age_in_years(ukb10772['53-0.0'] - birth)
    t2 = age_in_years(deathtime - birth)
    tf = age_in_years(last_date - birth)

    return pd.concat([birth, sex, t1, t2, tf, event], axis=1, keys=['birth', 'sex', 't1', 't2', 'tf', 'event'])

# Persist the table: the "Load data" section reads it back from 'data/baseline.pkl'.
baseline = parse_mort_data_and_baseline_info()
baseline.to_pickle('data/baseline.pkl')
baseline.head()

In [None]:
# Column '21000-0.0' is loaded as the `race` Series and its codes are replaced
# by the 'meaning' text from coding1001.tsv; codes absent from that table stay as they are.
race = pd.read_csv('data/ukbxxxxx.csv.gz',
                  index_col=0, engine='c', quotechar='\"', quoting=1, memory_map=False, usecols=['eid','21000-0.0'])

race = race.rename(columns={'21000-0.0':'race'})['race']
coding1001 = pd.read_csv('data/coding1001.tsv', sep='\t').set_index('coding')['meaning']

# Series.items() is available on both old and new pandas; iteritems() was removed in pandas 2.0.
for key, value in coding1001.items():
    race.loc[race == key] = value

In [None]:
race.to_pickle('data/race.pkl')

### Load data

In [None]:
# Reload the cached tables from data/.
# NOTE: unpickling can execute arbitrary code, so only load pickles produced by this project.
sampleqc = pd.read_pickle('data/sampleqc.pkl')
pca = pd.read_pickle('data/pca.pkl')
assess_center = pd.read_pickle('data/assess_center.pkl')
baseline = pd.read_pickle('data/baseline.pkl')
race = pd.read_pickle('data/race.pkl')

### Disease incidence data generation

In [None]:
def e(field, instances, arrays):
    """Expand a field id into its column names '<field>-<instance>.<array>'.

    Names are listed instance-major: for each instance in ``range(instances)``
    every array index in ``range(arrays)`` is emitted in order.
    """
    names = []
    for instance in range(instances):
        for array in range(arrays):
            names.append('%s-%i.%i' % (field, instance, array))
    return names

def make_ukbdf():
    """Load two UKB extracts and join them column-wise on their shared index.

    The first read is restricted to the columns listed below; the second read
    loads every column of its file. Only index values present in both frames
    are kept, with the columns of the second frame placed first.
    """
    wanted = (['eid'] +
              e('4056',3,1) +    # stroke
              e('3894',3,1) +    # heart attack / MI
              e('20001',3,6) +   # diseases
              e('20002',3,29) +  # diseases
              e('40008',32,1) +  # cancer
              e('40006',32,1) +  # cancer icd
              e('40012',32,1) +  # cancer icd
              e('40007',3,1))    # death
    # Not requested here: ['40000-0.0','40001-0.0','40002-0.0']

    read_opts = dict(index_col=0, engine='c', quotechar='\"', quoting=1, memory_map=False)

    ukb1 = pd.read_csv('data/ukbxxxxxx.csv.gz', low_memory=False, usecols=wanted, **read_opts)
    ukb2 = pd.read_csv('data/ukbxxxxxx.csv.gz', **read_opts)

    shared = ukb1.index.intersection(ukb2.index)
    return pd.concat([ukb2.loc[shared,:], ukb1.loc[shared,:]], axis=1)


def parse_ICD10_data():
    """Return the earliest age at which each participant has each 3-character ICD-10 code.

    Reads 'data/hesin.tsv' and 'data/hesin_diag10.tsv', dates every record
    with the 'epistart' that hesin.tsv holds for its 'record_id', and turns
    that date into an age using the module-level ``baseline['birth']``.

    Returns
    -------
    pandas.DataFrame
        Columns 'eid', 'first3' (first three characters of 'diag_icd10') and
        'age' (minimum age in years over all matching records).
        Rows for eid 2715388 are removed.
    """
    hesin = pd.read_csv('data/hesin.tsv', sep='\t',
                        usecols=['eid','record_id','diag_icd10','epistart']).dropna(how='any')
    hesin_diag10 = pd.read_csv('data/hesin_diag10.tsv', sep='\t',
                               usecols=['eid','record_id','diag_icd10']).dropna(how='any')

    # Episode start date per record; unparseable dates become NaT.
    epistart = pd.to_datetime(hesin.set_index('record_id')['epistart'], errors='coerce')

    datasets = []
    for dataset in [hesin, hesin_diag10]:
        dataset = dataset.set_index('eid')
        dataset['birth'] = baseline.loc[dataset.index,'birth']
        dataset = dataset.reset_index().set_index('record_id')
        dataset['epistart'] = epistart[dataset.index]
        dataset['age'] = (dataset['epistart'] - dataset['birth']).dt.days / 365.25
        # Keep only eid -> (diag_icd10, age); rows with a missing value are dropped.
        dataset = dataset.reset_index().set_index('eid').drop(
            ['epistart','birth','record_id'], axis=1).dropna()
        datasets.append(dataset)

    icd10 = pd.concat(datasets).reset_index().drop_duplicates()
    icd10['first3'] = icd10['diag_icd10'].str[:3]

    icdsource = icd10[['eid','age','first3']].groupby(['eid','first3']).min().reset_index()

    # NOTE(review): the reason for excluding this single eid is not recorded here.
    return icdsource[icdsource['eid']!=2715388]


EXCLUDE_C44 = False

def make_ICD_dfs():
    """Build age-at-first-event tables from clinical and from self-reported data.

    Uses the module-level ``descodes`` (disease name -> dict with 'icd' and
    'sr' code lists), ``baseline`` and ``EXCLUDE_C44``.

    Returns
    -------
    (icd_df, sr_df) : tuple of pandas.DataFrame
        Both share the index of ``make_ukbdf()`` and have one column per key
        of ``descodes`` plus 'cancer' and 'death'.

        icd_df: per disease, the minimum age among rows of ``parse_ICD10_data()``
        whose 'first3' is in ``codes['icd']``. 'cancer' is the minimum of the
        '40008' ages (kept where the matching '40012' value is > 2) and of the
        ages of codes containing 'C'. 'death' is ``baseline['t2']`` where
        ``baseline['event']`` is True. With ``EXCLUDE_C44`` set, entries
        containing 'C44' are left out of both cancer sources.

        sr_df: per disease, the minimum non-negative '20009' age whose matching
        '20002' code is in ``codes['sr']``. 'cancer' is the row minimum of the
        '20007' columns. 'death' is always NaN.
    """
    icdsource = parse_ICD10_data()
    ukbdf = make_ukbdf()

    disease_names = list(descodes.keys())
    icd_df = pd.DataFrame(index=ukbdf.index, columns=disease_names)

    for desease_name, codes in descodes.items():

        ill = icdsource.loc[icdsource['first3'].isin(codes['icd']),['eid','age']].groupby('eid').min()['age']
        icd_df.loc[ill.index,desease_name] = ill.values

    # Cancer-registry ages are kept only where the matching '40012' value is > 2.
    # NOTE(review): presumably '40012' is a tumour-behaviour code - confirm against the data dictionary.
    reg_age_cols = e('40008',32,1)
    reg_keep = (ukbdf[e('40012',32,1)]>2).rename(columns=dict(zip(e('40012',32,1),reg_age_cols)))
    icd_is_cancer = icdsource['first3'].str.contains('C')

    if EXCLUDE_C44:
        cancericd = ukbdf[e('40006',32,1)]
        not_c44 = (~pd.concat([cancericd[col].astype(str).str.contains('C44') for col in cancericd.columns],
                              axis=1)).rename(columns=dict(zip(e('40006',32,1),reg_age_cols)))
        reg_keep = reg_keep&not_c44
        icd_is_cancer = icd_is_cancer&(~icdsource['first3'].str.contains('C44'))

    cancer_reg = ukbdf[reg_age_cols][reg_keep].min(1)

    ccc = pd.concat([cancer_reg,
                     icdsource.loc[icd_is_cancer, ['eid','age']].groupby('eid').min()['age']],
                    axis=1, keys=['CR','ICD'])

    cancer_comb = ccc.min(1).dropna()

    mort = baseline[['t1','t2','event']]

    # reindex instead of .loc: participants without any cancer record are absent
    # from cancer_comb, and .loc raises KeyError for missing labels on pandas >= 1.0.
    icd_df['cancer'] = cancer_comb.reindex(icd_df.index)
    died = mort['event']==True
    icd_df.loc[died,'death'] = mort.loc[died,'t2']

    srdeseases = ukbdf[e('20002',3,29)].rename(columns=dict(zip(e('20002',3,29),e('20009',3,29))))
    srages = ukbdf[e('20009',3,29)]
    # Negative ages become missing; mask() returns a new frame instead of writing into a slice.
    srages = srages.mask(srages<0)

    sr_df = pd.DataFrame(index=ukbdf.index, columns=disease_names)

    for desease_name, codes in descodes.items():

        ill = srages[srdeseases.isin(codes['sr'])].min(1)
        sr_df.loc[ill.index,desease_name] = ill.values

    sr_df['cancer'] = ukbdf[e('20007',3,6)].min(1)
    sr_df['death'] = np.nan

    return icd_df, sr_df

icd_df, sr_df = make_ICD_dfs()
# Persist both tables: a later cell reloads them from these paths.
icd_df.to_pickle('data/icd_df.pkl')
sr_df.to_pickle('data/sr_df.pkl')

In [None]:
# Samples with QC column 23 equal to 1, column 2 equal to 'UKBB' and column 24 equal to 1.
flag23_index = sampleqc.index[sampleqc[23]==1]
ukbb_index = sampleqc.index[sampleqc[2]=='UKBB']
flag24_index = sampleqc.index[sampleqc[24]==1]
clean_index = flag23_index.intersection(ukbb_index).intersection(flag24_index)

In [None]:
# NOTE(review): this cell reassigns clean_index, replacing the value computed in the previous cell.
white_labels = ['British', 'Any other white background', 'Irish', 'White']
clean_index = race.index[race.isin(white_labels)].intersection(sampleqc.index[sampleqc[24]==1])

In [None]:
# Only the index is of interest; the dtype is pinned because the default dtype of a
# data-less Series differs between pandas versions (float64 before 2.0, object after).
pd.Series(index=clean_index, dtype=float).to_pickle('data/clean_index.pkl')

In [None]:
icd_df = pd.read_pickle('data/icd_df.pkl')
sr_df = pd.read_pickle('data/sr_df.pkl')

In [None]:
icd_df.loc[clean_index,:].notnull().sum()

In [None]:
def get_comb_df(setlist, clean_index):
    """Earliest-event table for a group of conditions.

    Reads the module-level ``icd_df``, ``sr_df`` and ``baseline``.

    Parameters
    ----------
    setlist : list of column names present in both ``icd_df`` and ``sr_df``.
    clean_index : index labels of the rows to return.

    Returns
    -------
    pandas.DataFrame with columns
        'tdiag' - row minimum of ``icd_df[setlist]``, falling back to the row
                  minimum of ``sr_df[setlist]`` where the former is missing;
                  negative values are set to NaN;
        'event' - True where 'tdiag' is present;
        't2'    - the smaller of 'tdiag' and ``baseline['t2']``;
        'tf'    - ``baseline['tf']``.
    """
    first_age = icd_df[setlist].min(1)
    sr_first_age = sr_df[setlist].min(1)

    # Use the self-reported age only where no clinical age exists.
    sr_only = sr_first_age[sr_first_age.notnull()&first_age.isnull()]
    first_age[sr_only.index] = sr_only.values
    first_age[first_age<0] = np.nan

    table = pd.concat([first_age, first_age.notnull(), baseline['t2'], baseline['tf']],
                      axis=1, keys=['tdiag','event','t2','tf'])
    table = table.loc[clean_index,:]
    # Follow-up ends at diagnosis when there is one.
    table.loc[:,'t2'] = table.loc[:,['tdiag','t2']].min(1)
    return table

In [None]:
# Column groups analysed together: one group per disease, plus 'death', 'cancer'
# and the combined 'healthspan' group.
sets = {name:[name] for name in descodes.keys()}
sets['death'] = ['death']
sets['cancer'] = ['cancer']
# list(...) keeps the concatenation valid on Python 3, where keys() is a view and not a list.
sets['healthspan'] = list(descodes.keys()) + ['death','cancer']

In [None]:
get_comb_df(sets['death'], baseline.index).to_pickle('data/lifespan_500k.pkl')
get_comb_df(sets['healthspan'], baseline.index).to_pickle('data/healthspan_500k.pkl')

### Plots generation

#### Incidence rates and slopes generation

In [None]:
from scipy.stats import binom

In [None]:
# Width of the age bins, in years.
td = 2.

# label -> DataFrame with one row per age bin (columns x, res, Nd, Na, std1, std2, std4, std5).
plotdata = {}

for label, setlist in sets.items():

    comb_df = get_comb_df(setlist, clean_index)

    if label == 'death':
        comb_df['t1'] = baseline.loc[comb_df.index,'t1']
    else:
        comb_df['t1'] = 0

    xrange_ = np.arange(comb_df['t2'].min(),comb_df['t2'].max(),td)

    rows = []

    for a in xrange_:
        # Rows with t1 <= a and tf >= a+td, i.e. the whole bin [a, a+td) lies between t1 and tf.
        folslice = comb_df[(comb_df['tf'] >= (a+td))&(comb_df['t1']<=a)]
        # Events inside the bin.
        Nd = np.sum((folslice['event'] == True) & (folslice['t2'] >= a) & (folslice['t2'] < (a+td)))
        # At risk: event at or after the bin start, or event-free past the bin end.
        Na = np.sum(((folslice['event'] == True) & (folslice['t2'] >= a))|
                    ((folslice['event'] == False) & (folslice['t2'] >= (a+td))))
        rate = np.divide(float(Nd),float(Na))/td
        dNd = Nd**0.5
        dNa = Na**0.5
        # NOTE(review): the binomial quantiles take the per-year rate as the success
        # probability; confirm this is the intended scale when td != 1.
        binom95ci = binom.ppf([0.025,0.975], Na, rate)
        rows.append({'x': a,
                     'res': rate,
                     'Nd': float(Nd),
                     'Na': float(Na),
                     'std1': np.sqrt((dNd/Na)**2+(dNa*Nd/Na**2)**2)/td,
                     'std2': np.sqrt((dNd/Na)**2-(dNa*Nd/Na**2)**2)/td,
                     'std4': binom95ci[0]/Na,
                     'std5': binom95ci[1]/Na})

    # Build the frame once instead of enlarging it cell by cell.
    res = pd.DataFrame(rows, columns=['x','res','Nd','Na','std1','std2','std4','std5'])

    plotdata[label] = res

#### Incidence rates plotting

In [None]:
# One sheet per entry of plotdata; the std1/std2 columns are left out of the export.
with pd.ExcelWriter('data/supdata1.xlsx') as writer:

    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    for key, value in plotdata.items():

        value.drop(['std1','std2'], axis=1).rename(
        columns={'x':'age','res':'logIR','std4':'logIR 2.5% CI',
                 'std5':'logIR 97.5% CI'}).set_index('age').to_excel(writer, sheet_name=key)

In [None]:
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
rcParams['font.size'] = 8

In [None]:
fig = pl.figure(figsize=(6,6))
ax = fig.add_subplot(111)
# label -> line colour; reused by the pie charts in later cells.
colors = {}

for label, res in plotdata.items():

    lw = 3 if label == 'healthspan' else 2
    res_ = res.copy()
    # Blank out whole rows for bins with 30 or fewer events.
    res_[res_['Nd']<=30] = np.nan
    plot = ax.plot(res_['x'], res_['res'], lw=lw, label=label)
    colors[label] = plot[0].get_color()
    ax.fill_between(res_['x'], res_['std4'], res_['std5'],
                    color=colors[label], alpha=0.4, linewidth=0.0)

for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# Everything goes through the explicit Axes object rather than the pyplot state machine.
ax.set_yscale('log')
ax.set_ylim(bottom=1e-5)
ax.legend()
ax.set_xlabel('Age, years')
ax.set_ylabel('log Incidence rate, cases per population per year');

In [None]:
fig.savefig('data/DiseasesIncidence.pdf', dpi=300)
fig.savefig('data/DiseasesIncidence.png', dpi=300)

#### Pie charts data generation

In [None]:
def generate_data_for_piecharts():
    """Count which condition comes first for each participant.

    Reads the module-level ``icd_df``, ``sr_df`` and ``sets``.

    Returns
    -------
    icd_minmask : boolean DataFrame, True where a cell of ``icd_df`` equals the
        row minimum over ``sets['healthspan']``.
    sr_minmask : the same for ``sr_df``.
    totaldfpie : Series of counts per condition name. Each participant
        contributes the name of one minimum column from ``icd_df``, or from
        ``sr_df`` when no ``icd_df`` column qualifies. Ties are resolved by
        ``sample(n=1, random_state=42)``.
    """
    comb_df = icd_df[sets['healthspan']].min(1)
    sr_comb = sr_df[sets['healthspan']].min(1)

    icd_minmask = icd_df==pd.concat([comb_df]*icd_df.shape[1], keys=icd_df.columns, axis=1)
    sr_minmask = sr_df==pd.concat([sr_comb]*sr_df.shape[1], keys=sr_df.columns, axis=1)

    mindes = {}

    for title, maskdf in {'sr':sr_minmask, 'icd':icd_minmask}.items():

        # Frame of the same shape whose every cell holds its own column name.
        labeldf = maskdf.copy()
        for col in labeldf.columns:
            labeldf[col] = col

        # Keep the names only at minimum cells; drop rows with no minimum at all.
        nonnadf = labeldf[maskdf].dropna(how='all')

        # The Series holds column names, so the dtype is stated explicitly.
        sampleser = pd.Series(index=nonnadf.index, dtype=object)
        for ix, row in nonnadf.iterrows():
            sampleser[ix] = row.dropna().sample(n=1, random_state=42).values[0]
        mindes[title] = sampleser

    mindesdf = pd.DataFrame(index=icd_minmask.index, columns=['sr','icd'])
    mindesdf.loc[mindes['sr'].index,'sr'] = mindes['sr']
    mindesdf.loc[mindes['icd'].index,'icd'] = mindes['icd']

    # The self-reported name is used only where there is no clinical one.
    add = mindesdf.loc[mindesdf['sr'].notnull()&mindesdf['icd'].isnull(),'sr']

    totaldf = mindesdf['icd'].copy()
    totaldf[add.index] = add.values

    totaldfpie = totaldf.dropna().value_counts()

    return icd_minmask, sr_minmask, totaldfpie

icd_minmask, sr_minmask, totaldfpie = generate_data_for_piecharts()

#### Resulting pie chart plot

In [None]:
# Pie chart of the first-condition counts, coloured like the incidence curves.
pie_labels = totaldfpie.index
pie_colors = [colors[label] for label in pie_labels]
pl.figure()
pl.pie(totaldfpie.values, labels=pie_labels, autopct='%1.1f%%', colors=pie_colors)
pl.axis('equal');
# pl.title(title)

In [None]:
# Event counts and percentage shares per condition for the three data sources.
incid = {'Combined data':totaldfpie,
         'Clinical information data':icd_minmask.sum(),
         'Self-reported data':sr_minmask.sum()}
# Iterate over a snapshot of the items because the dict is updated inside the loop;
# items() also works on Python 3, where iteritems() no longer exists.
for key, value in list(incid.items()):
    incid[key] = pd.concat([value, (100.*value/value.sum()).round(decimals=1)], axis=1,
                           keys=['events','percentage'])
supdata2 = pd.concat(incid, axis=1).sort_values([('Combined data','events')], ascending=False)[
    ['Clinical information data','Self-reported data','Combined data']]
supdata2.to_excel('data/supdata2.xlsx')

#### Supplementary pie charts and plots

In [None]:
# Histogram of how many columns tie for the row minimum, per data source.
pl.figure()
(icd_minmask).sum(1).hist(label='icd');
(sr_minmask).sum(1).hist(label='sr');
pl.legend()

renamedict = {'sr':'Self-reported data', 'icd':'Clinical information data'}
startangles = {'sr':0, 'icd':20}

for title, maskdf in {'sr':sr_minmask, 'icd':icd_minmask}.items():
    sumdf = maskdf.sum()
    sumdf = sumdf[sumdf>0]
    total = sumdf.sum()
    pl.figure()
    # autopct receives the wedge percentage; it is converted back into a count.
    # `lambda p` replaces the Python 2-only `lambda(p)` tuple-parameter syntax.
    pl.pie(sumdf.values, labels=sumdf.index, colors=[colors[label] for label in sumdf.index],
           autopct=lambda p, total=total: '{:.0f}'.format(p * total / 100),
          pctdistance=0.9, rotatelabels=False, labeldistance=1.1, startangle=startangles[title])
    pl.axis('equal')
    pl.title(renamedict[title], y=1.1)
#     pl.tight_layout()

### LH optimization

In [None]:
pd.set_option('display.max_rows', 50)

In [None]:
# Participants whose race label is one of the four listed values.
index_slice = race.index[race.isin(['British', 'Any other white background', 'Irish', 'White'])]

In [None]:
# `sets` has no 'healthspan_wo_death' entry, so looking it up raised KeyError.
# The group is derived here from the healthspan group by leaving out 'death'.
# NOTE(review): confirm this matches the intended definition of the group.
healthspan_wo_death = [name for name in sets['healthspan'] if name != 'death']
morths = get_comb_df(healthspan_wo_death, clean_index)
morths['t1'] = 0

In [None]:
morths.event.sum()

In [None]:
morths = morths[morths['t2']!=0]

In [None]:
# NOTE(review): df_main is first assigned in the next cell, so on a fresh kernel this line
# raises NameError unless IMPORT.ipynb provides df_main; dfindex is not used in the cells shown here.
dfindex = df_main.index

In [None]:
# Design matrix: sex, PCA columns, assess_center dummies, dummies of sampleqc column 3
# (without the columns whose name contains 'UKBiLEVEAX') and the morths columns other
# than 'tdiag'/'tf', restricted to clean_index with incomplete rows dropped.
batch_dummy = pd.get_dummies(sampleqc.loc[:,3])
bileve_columns = batch_dummy.columns[batch_dummy.columns.str.contains('UKBiLEVEAX')]
batch_dummy = batch_dummy.drop(bileve_columns, axis=1)

survival_columns = morths.drop(['tdiag','tf'], axis=1, errors='ignore').astype(float)

df_main = pd.concat([baseline['sex'], pca, pd.get_dummies(assess_center), batch_dummy, survival_columns],
                    axis=1)
df_main = df_main.loc[clean_index,:].dropna()

In [None]:
# df_main = pd.concat([df_main[['sex']+covariates2.tolist()], df_main[['t1','t2','event']]], axis=1)
# df_main = pd.concat([df_main[['sex']], df_main[['t1','t2','event']]], axis=1)
# Centre and scale, in place, every column except the last three.
# NOTE(review): the last three columns are presumably the survival columns taken from morths - confirm the column order.
# A column with zero standard deviation turns into NaN here (0/0).
df_main.iloc[:,:-3] -= df_main.iloc[:,:-3].mean(0)
df_main.iloc[:,:-3] /= df_main.iloc[:,:-3].std(0)

In [None]:
df_main = df_main.dropna(axis=1)

In [None]:
betas = optimize_cox_gomperz(df_main)

In [None]:
df_main.to_pickle('data/df_slice_andersen_list.pkl')
betas.to_pickle('data/betas_andersen_list.pkl')

In [None]:
# Difference between the same expression evaluated with +betas['sex'] and with -betas['sex'];
# algebraically it reduces to 2*betas['sex']/betas['gamma'].
# NOTE(review): 0.577 looks like the Euler-Mascheroni constant - confirm.
(np.log(betas['gamma']/betas['M0'])-0.577+betas['sex'])/betas['gamma']-\
(np.log(betas['gamma']/betas['M0'])-0.577-betas['sex'])/betas['gamma']

### Replication

#### Dataset composition

In [None]:
# Overlap of the three sample-QC selections (columns 23, 2 and 24).
venn_sets = (
    set(sampleqc.index[sampleqc[23]==1].tolist()),
    set(sampleqc.index[sampleqc[2]=='UKBB']),
    set(sampleqc.index[sampleqc[24]==1].tolist()),
)
venn_labels = ('british ancestry','not UK Bileve','PCA cohort (QC passed)')
pl.figure()
venn3(subsets=venn_sets, set_labels=venn_labels)

#### COJO SNPs and healthspan data generation

In [None]:
# One block of columns per *.raw file in the working directory; the first five columns
# after the index are skipped and the last two characters of each column name are cut off.
# The file list is sorted so the column order does not depend on directory listing order,
# and sep=r'\s+' replaces the deprecated delim_whitespace=True.
cojodata = pd.concat([pd.read_csv(f, sep=r'\s+', index_col=0).iloc[:,5:] \
           for f in sorted(glob('*.raw'))], axis=1)
cojodata.columns = [col[:-2] for col in cojodata.columns]
# %store cojodata

In [None]:
# NOTE(review): clean_index and morths are reassigned here, replacing the values used in the cells above.
clean_index = sampleqc.index[sampleqc[24]==1]
# list(...) keeps the concatenation valid on Python 3, where keys() is a view and not a list.
morths = get_comb_df(list(descodes.keys()) + ['death','cancer'], clean_index)
morths['t1'] = 0

#### Parsing race data and composing cohorts for replication

In [None]:
# cohort name -> index of sample ids.
cohorts = {'gwas':sampleqc.index[sampleqc[23]==1].intersection(
                  sampleqc.index[sampleqc[2]=='UKBB'].intersection(
                  sampleqc.index[sampleqc[24]==1]))}

cohorts['british'] = sampleqc.index[sampleqc[23]==1].intersection(
                      sampleqc.index[sampleqc[2]=='UKBL'].intersection(
                      sampleqc.index[sampleqc[24]==1]))

# Everyone with column 24 == 1 who is not in the GWAS cohort. Index.difference gives
# the same members as the former set subtraction, but in a deterministic (sorted) order.
cohorts['replication'] = sampleqc.index[sampleqc[24]==1].difference(cohorts['gwas'])

# Replication sub-cohorts by race label.
race_groups = [('white', ['British','Any other white background','Irish','White']),
               ('african', ['African']),
               ('indian', ['Indian','Pakistani','Bangladeshi']),
               ('chinese', ['Chinese']),
               ('carribean', ['Caribbean'])]
for group_name, race_labels in race_groups:
    cohorts[group_name] = race.index[race.isin(race_labels)].intersection(cohorts['replication'])

In [None]:
replication_cohorts = ['white','african','indian','chinese','carribean']

In [None]:
from functools import reduce

# Union of all replication cohorts, written as two identical id columns.
replication_indexes = [cohorts[elem] for elem in replication_cohorts]
fullcohort = reduce(lambda left, right: left.union(right), replication_indexes).tolist()
np.savetxt('data/replication_cohort.txt', np.column_stack((fullcohort, fullcohort)), fmt='%s')

#### Run replication pipelines

In [None]:
morths = pd.read_pickle('data/healthspan_500k.pkl')
morths['t1'] = 0

In [None]:
# Replication design matrix: sex, race dummies, a 'ukbb' flag (sampleqc column 2 == 'UKBB'),
# PCA columns, assess_center dummies, dummies of sampleqc column 3 and the morths columns
# other than 'tdiag'/'tf'.
batch_dummy = pd.get_dummies(sampleqc.loc[:,3])
race_dummy = pd.get_dummies(race)
ukbb_flag = pd.Series((sampleqc.loc[:,2]=='UKBB').astype(float),name='ukbb')
survival_part = morths.drop(['tdiag','tf'], axis=1, errors='ignore').astype(float)

df_main_ = pd.concat([baseline['sex'], race_dummy, ukbb_flag, pca,
                      pd.get_dummies(assess_center), batch_dummy, survival_part],
                     axis=1)

In [None]:
# Run the replication pipeline once per cohort; each result is indexed as
# (element 0, element 1) and the parts are concatenated with the cohort name as outer column level.
# NOTE(review): `res` and `betas` are reassigned here, replacing the values from earlier cells.
res = [replication_pipeline(df_main_, cojodata, cohorts[cohort_name]) for cohort_name in replication_cohorts]

gwas_parts = [elem[0] for elem in res]
beta_parts = [elem[1] for elem in res]
gwas_replication = pd.concat(gwas_parts, axis=1, keys=replication_cohorts)
betas_replication = pd.concat(beta_parts, axis=1, keys=replication_cohorts)

# Per-cohort 'beta' and 'sigma' columns, with the inner column level removed.
every_cohort = slice(None)
betas = gwas_replication.loc[:,(every_cohort,'beta')]
betas.columns = betas.columns.droplevel(1)
sigmas = gwas_replication.loc[:,(every_cohort,'sigma')]
sigmas.columns = sigmas.columns.droplevel(1)

In [None]:
# Append the meta-analysis columns to the per-cohort results, drop the 'Nd'
# sub-columns and give the remaining sub-columns their display names.
renamedic = {'beta':'Beta','sigma':'SE','p':'P-value'}
replication_ma = pd.concat([metaanalysis(betas, sigmas)], axis=1, keys=['meta-analysis'])

replication_table = pd.concat([gwas_replication, replication_ma], axis=1)
replication_table = replication_table.drop(['Nd'], axis=1, level=1).rename(columns=renamedic, level=1)

In [None]:
# NOTE(review): snporder is first assigned in the next cell, so on a fresh kernel this line
# raises NameError unless IMPORT.ipynb provides it.
pd.read_csv('data/suptable1.tsv', sep='\t', index_col=0, header=[0,1]).loc[snporder,:]

In [None]:
snporder = ['rs12134662','rs10197246','rs12203592','rs1049053','rs10455872','rs140570886',
            'rs7859727','rs34872471','rs2860197','rs79820308','rs1126809','rs4784227','rs4268748','rs159428']
replication_table.loc[snporder,:]

In [None]:
replication_pipeline(df_main_, cojodata, cohorts['gwas'])