In [1]:
import os
import sys
# Append the parent directory to the module search path; presumably this is what
# lets `scripts.settings` (imported in the next cell) resolve.
# NOTE(review): '..' is relative to the current working directory, so this assumes
# the notebook is launched from a sub-folder of the repository — confirm.
sys.path.append('..')

In [2]:
import scripts.settings as settings

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas_plink import read_plink1_bin, write_plink1_bin

In [5]:
# Seeded NumPy generator so that any random draws made with `rng` are reproducible.
SEED = 123
rng = np.random.default_rng(SEED)

### Identify sequenced cell lines with open-access genotypes

In [6]:
# Read the tab-separated metadata table at settings.ENDO_META_PATH and reduce it to
# one row per unique (donor, donor_short_id, donor_long_id) combination.
id_columns = ['donor', 'donor_short_id', 'donor_long_id']
endo_meta = pd.read_csv(settings.ENDO_META_PATH, sep='\t')
donor_ids = endo_meta[id_columns].drop_duplicates()

In [7]:
# Display the de-duplicated donor ID table (donor, short ID and long ID columns).
donor_ids

Unnamed: 0,donor,donor_short_id,donor_long_id
21843_1#10,joxm,joxm_1,HPSI0114i-joxm_1
21843_1#100,fafq,fafq_1,HPSI0314i-fafq_1
21843_1#102,wuye,wuye_2,HPSI1013i-wuye_2
21843_1#105,iisa,iisa_3,HPSI0114i-iisa_3
21843_1#114,lexy,lexy_1,HPSI0114i-lexy_1
...,...,...,...
23794_1#110,nudd,nudd_1,HPSI0413i-nudd_1
24252_1#10,bokz,bokz_5,HPSI0814i-bokz_5
24229_3#100,zagm,zagm_1,HPSI1013i-zagm_1
24229_3#106,fejf,fejf_2,HPSI0513i-fejf_2


In [8]:
# Load the PLINK genotype data located at settings.HIPSCI_GENO_PATH.
# In the cells below, G is indexed as G[sample, variant] and its `sample` and
# `chrom` coordinates are used for filtering.
G = read_plink1_bin(settings.HIPSCI_GENO_PATH)

Mapping files: 100%|██████████| 3/3 [00:43<00:00, 14.51s/it]


In [9]:
# IDs of every sample present in the loaded genotype data.
open_access = pd.Series(G.sample.values)

# Keep only the donors whose long ID appears among the genotyped samples.
has_genotype = donor_ids['donor_long_id'].isin(open_access)
donor_ids = donor_ids.loc[has_genotype]

### Generate genotype subsets

In [10]:
# Filter thresholds used in this cell.
MIN_ALLELE_FREQ = 0.02   # lower bound on (sum of genotype values) / (2 * number of samples)
MIN_GENOTYPE_STD = 0.2   # lower bound on the per-SNP standard deviation across the selected donors
N_SMALL_COHORT = 40      # number of leading samples in which every retained SNP must still vary

# 1) Restrict to variants on the configured chromosome.
on_chrom = G.chrom == str(settings.CHROM)
G_sel = G[:, on_chrom]

# 2) Frequency filter, computed over ALL samples in the genotype file (the donor
#    subset is only applied in step 3). Assuming 0/1/2 allele-count coding, this is
#    the frequency of the counted allele — TODO confirm coding.
#    NOTE(review): a SNP with any missing (NaN) genotype gets a NaN sum, fails the
#    comparison and is dropped — confirm this is intended.
n_alleles = 2 * G_sel.shape[0]
allele_freq = G_sel.values.sum(0) / n_alleles
G_sel = G_sel[:, allele_freq > MIN_ALLELE_FREQ]

# 3) Keep only the samples that belong to the selected donors. The boolean mask
#    preserves the sample order of the genotype file.
is_selected_donor = pd.Series(G_sel.sample).isin(donor_ids['donor_long_id'])
G_sel = G_sel[is_selected_donor, :]

# 4) Remove low-variance SNPs among the selected donors.
genotype_std = G_sel.values.std(0)
G_sel = G_sel[:, genotype_std > MIN_GENOTYPE_STD]

# 5) Remove SNPs that are constant within the first N_SMALL_COHORT samples, so that
#    simulations using only a small number of individuals still see variation.
small_cohort_std = G_sel[:N_SMALL_COHORT, :].values.std(0)
G_sel = G_sel[:, small_cohort_std > 0]

G_sel

Unnamed: 0,Array,Chunk
Bytes,35.11 MiB,365.62 kiB
Shape,"(100, 92037)","(100, 936)"
Count,39143 Tasks,123 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 35.11 MiB 365.62 kiB Shape (100, 92037) (100, 936) Count 39143 Tasks 123 Chunks Type float32 numpy.ndarray",92037  100,

Unnamed: 0,Array,Chunk
Bytes,35.11 MiB,365.62 kiB
Shape,"(100, 92037)","(100, 936)"
Count,39143 Tasks,123 Chunks
Type,float32,numpy.ndarray


In [11]:
# Write the filtered genotype subset to settings.FILTERED_GENO_PATH
# (the progress output below reports BED, FAM and BIM files being written).
write_plink1_bin(G_sel, settings.FILTERED_GENO_PATH)

Writing BED: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]

Writing FAM... done.
Writing BIM... 




done.


### Subset & save kinship matrix

In [12]:
# Load the tab-separated kinship matrix; the first column supplies the row labels.
kinship = pd.read_csv(
    settings.HIPSCI_KINSHIP_PATH,
    index_col=0,
    sep='\t',
)

In [13]:
# Restrict the kinship matrix to the selected donors on both axes.
# Label-based selection raises a KeyError if any selected ID is absent from the matrix.
# NOTE(review): rows/columns here follow the order of `donor_ids`, whereas the
# genotype subset was taken with a boolean mask and keeps the genotype file's
# sample order — confirm that downstream code aligns the two by ID.
selected_ids = donor_ids['donor_long_id']
kinship = kinship.loc[selected_ids, selected_ids]

In [14]:
# Save the donor-restricted kinship matrix to settings.FILTERED_KINSHIP_PATH.
# NOTE(review): the input was read with sep='\t', but this writes with to_csv's
# default delimiter (comma) — confirm downstream readers expect a comma-separated file.
kinship.to_csv(settings.FILTERED_KINSHIP_PATH)