In [1]:
# ALL imports
import pysam   # for VCF handling
import cyvcf2 # for VCF handling
from cyvcf2 import VCF # for VCF handling
import numpy as np  
import pandas as pd 

## ___FINAL FILE___ : HAPNEST SYNTHETIC DATA ##
#### Pipeline to create, clean and filter the synthetic dataset created using HAPNEST.

-----------------------------------------------------------------------------
### ___TERMINAL COMMANDS___ 
##### ___0. create dataset from HAPNEST (all chromosomes final file) using Docker container___
* genotype files will be in _.bed_, _.bim_ and _.fam_ plink format
##### ___1. create vcf.gz of each chromosome using plink___
* plink --bfile test_chr-* --recode vcf bgz --out vcf/vcf_chr-*
##### ___2. filter vcf.gz files to only contain EUR samples___
* bcftools view -S {EUR_samples.txt} vcf_chr-*.vcf.gz > filtered_chr-*.vcf.gz
##### ___3. create vcf.gz of all samples all chromosomes using bcftools___
* bcftools concat vcf_chr-*.vcf.gz -Oz -o input_files.vcf.gz
##### ___4. filter final vcf to only include cd related found mutations___
* vcftools --gzvcf input_files.vcf.gz --snps snps.txt --recode  
##### |__ ___4.1 gzip file___
* bcftools view synth_files.vcf -Oz -o synth_files.vcf.gz
##### |__ ___4.2 create index (tabix)___
* bcftools index -t synth_files.vcf.gz 

-----------------------------------------------------------------------------
## ___1 Million dataset cleaning and QC.___

##### ___a. filter vcf.gz files to only contain EUR samples___

In [20]:
# extract EUR samples list names (168000)
sampleFile = open("data/inputs/1mil/synt.sample", 'r')
sampleList = open("data/inputs/1mil/EUR_samples.txt", 'w')
cnt = 1
lines = []
for line in sampleFile.readlines():
    if str(line) == 'EUR\n': # check if sample ancestry is 'EUR' (european)
        s = 'syn'+str(cnt)+'_syn'+str(cnt)
        sampleList.write(str(s+'\n')) # write one sample name per row
    cnt += 1
sampleFile.close()
sampleList.close()

-----------------------------------------------------------------------------
## ___Phenotype dataset for Neural Network training and testing.___

In [6]:
# Setup to read data
input_prefix = "data/samples/5k/" #5k sample
output_prefix = "synth_data/5k/" #5k sample
# input_prefix = "data/samples/1_mil/" #1mil sample
# output_prefix = "synth_data/1_mil/" #1mil sample
synthetic_vcf = pysam.VariantFile(str(input_prefix+"input_files.vcf.gz"))
pheno_data = pd.read_csv(str(input_prefix+"synth.pheno1"), sep='\t')
snp_data = pd.read_csv("data/causal_beta.txt",sep=',').set_index("SNP")
samples_list = synthetic_vcf.header.samples   # get all sample names

# Create final dataset file with following format:
#    SAMPLE   |  rsXXXXXXX  |  rsYYYYYYY  |  ...  |  PHENO
#    syn001          1             0         ...       0
#    syn002          2             1         ...       1
#  ...
pheno_data_per_sample = [] 
prs_data_per_sample = [] 
i = 0
for sample in samples_list:   # iterate through each sample in record
    prs = 0 # reset prs for each sample
    pheno_row = {}
    prs_row = {}
    pheno_row['SAMPLE'] = prs_row['SAMPLE'] = sample # create dictionary 
    for rec in synthetic_vcf.fetch():   # iterate through each mutation 
        if rec.id is not None:  # get only non-null mutations
            (alt, ref) = rec.samples[sample]['GT'] # genotype x/y returned as (x,y)
            if  not (alt is None or ref is None):
                alt_freq = int(alt)+int(ref)    # sample's alt allele dosage from genotype field 
                                                #  - 0 for (0,0)
                                                #  - 1 for (1,0) or (0,1)
                                                #  - 2 for (1,1)
            else: alt_freq = -1 # to account for bad genotype reads (NOT a problem for HAPNEST data)
            rsid = snp_data.loc[rec.id]['ID']
            pheno_row[rsid] = alt_freq  # set dosage for mutation
            if alt_freq > -1:
                prs += (float(snp_data.loc[rec.id]['BETA']) * alt_freq) # weighted calulation for prs
            else: prs += 0
    pheno_row['PHENO'] = prs_row['PHENO'] = pheno_data.iloc[i]['Phenotype(binary)'] # get simulated phenotype
    prs_row['PRS'] = prs # store prs score
    pheno_data_per_sample.append(pheno_row)
    prs_data_per_sample.append(prs_row)
    i += 1


In [34]:
pheno_dataset = pd.DataFrame(pheno_data_per_sample)
pheno_dataset.set_index('SAMPLE', inplace=True)
pheno_dataset.to_csv(str(output_prefix+'140Snp_phenoDataset.txt'))
pheno_dataset.head()

Unnamed: 0_level_0,rs10917536,rs4652846,rs3816989,rs11805303,rs2201841,rs11209026,rs1495965,rs924080,rs4658360,rs6583061,...,rs3753115,rs11990425,rs4599795,rs13260300,rs3735887,rs3134295,rs13281279,rs2469507,rs853326,PHENO
SAMPLE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
syn1_syn1,1,1,0,1,1,0,1,1,1,0,...,2,1,1,0,2,2,2,0,1,0
syn2_syn2,0,0,0,0,1,0,0,2,0,0,...,1,0,1,1,2,1,0,0,0,1
syn3_syn3,0,1,0,1,2,0,1,1,0,1,...,1,1,1,2,0,2,0,0,1,0
syn4_syn4,2,0,0,0,0,0,1,1,1,0,...,1,0,1,1,1,1,1,2,1,0
syn5_syn5,2,2,1,0,0,1,0,2,0,1,...,1,0,1,2,1,1,0,1,0,0


In [35]:
prs_dataset = pd.DataFrame(prs_data_per_sample)
prs_dataset.set_index('SAMPLE', inplace=True)
prs_dataset.to_csv(str(output_prefix+'info_per_sample.txt'))
prs_dataset.head()

Unnamed: 0_level_0,PHENO,PRS
SAMPLE,Unnamed: 1_level_1,Unnamed: 2_level_1
syn1_syn1,0,0.564212
syn2_syn2,1,0.025081
syn3_syn3,0,0.335559
syn4_syn4,0,0.183843
syn5_syn5,0,0.44431


-----------------------------------------------------------------------------
## ___Get rsids for mutations not in causal list___

In [81]:
synthetic_txt = open("synth_data/5k/1kSnps_phenoDataset.txt", 'r')
synthetic_new_header = open("synth_data/5k/1kSnps_phenoDataset_2.txt", 'w')
header = synthetic_txt.readline().split(',')
new_header = ['SAMPLE']
hapnest_rsids = "data/mutations/HAPNEST/rsid_map_list_"
file = pd.read_csv(str(hapnest_rsids+'chr1.txt'), sep='\t').set_index('id_hg38')
curr = 'chr1'
for col in header[1:(len(header)-1)]:   # iterate through each sample in record
    next = col.split(':')[0]
    if curr != next:
        file = pd.read_csv(str(hapnest_rsids+next+'.txt'), sep='\t').set_index('id_hg38')
    new_header.append(file.loc[col]['rsid'])
    curr = next
new_header.append('PHENO') 

# rewrite header
l = 0
for line in synthetic_txt.readlines():
    if l==0:
        synthetic_new_header.write(','.join(new_header))
        l+=1
    else:
        synthetic_new_header.write(line)
synthetic_new_header.close()

-----------------------------------------------------------------------------
## ___Format training and testing dataset___

In [2]:
prefix = 'data/samples/1_ml/filtered_vcfs/chr_'
ext = '.txt'
chrs = []
for i in range(1,23):
    filename = str(prefix+str(i)+ext)
    chr_i = pd.read_csv(filename, sep='\t')
    chrs.append(chr_i)
    print(i,' done.')

1  done.
2  done.
3  done.
4  done.
5  done.
6  done.
7  done.
8  done.
9  done.
10  done.
11  done.
12  done.
13  done.
14  done.
15  done.
16  done.
17  done.
18  done.
19  done.
20  done.
21  done.
22  done.


In [3]:
ds = chrs[0]
for df in chrs[1:]:
    ds = pd.concat([ds, df])
    
ds = ds.drop('#CHROM', axis='columns')
ds = ds.drop('POS', axis='columns')
ds = ds.drop('REF', axis='columns')
ds = ds.drop('ALT', axis='columns')
ds = ds.drop('QUAL', axis='columns')
ds = ds.drop('FILTER', axis='columns')
ds = ds.drop('FORMAT', axis='columns')
ds = ds.drop('INFO', axis='columns')
ds = ds.reset_index(drop=True)
# ds.set_index('ID', inplace=True)
ds

Unnamed: 0,ID,syn504001_syn504001,syn504002_syn504002,syn504003_syn504003,syn504004_syn504004,syn504005_syn504005,syn504006_syn504006,syn504007_syn504007,syn504008_syn504008,syn504009_syn504009,...,syn671991_syn671991,syn671992_syn671992,syn671993_syn671993,syn671994_syn671994,syn671995_syn671995,syn671996_syn671996,syn671997_syn671997,syn671998_syn671998,syn671999_syn671999,syn672000_syn672000
0,chr1:12621562:G:A,0/1,0/1,0/1,1/1,0/1,0/1,0/0,1/1,1/1,...,0/0,1/1,0/0,0/1,1/1,0/0,1/1,0/0,0/1,0/1
1,chr1:14917261:T:C,1/1,0/0,0/1,0/1,1/1,0/1,1/1,0/0,0/1,...,0/0,1/1,1/1,0/1,1/1,0/1,1/1,0/0,1/1,1/1
2,chr1:19771448:G:T,0/1,1/1,0/0,0/1,0/1,0/0,0/1,1/1,0/1,...,0/0,0/1,0/1,0/1,0/0,0/0,0/1,0/0,0/0,0/0
3,chr1:34517708:C:T,0/1,0/1,0/1,0/1,0/0,0/1,0/1,1/1,0/1,...,0/0,0/1,1/1,1/1,0/1,1/1,0/0,0/1,1/1,0/0
4,chr1:66776404:G:A,0/0,0/0,1/1,0/0,1/1,0/0,0/0,0/1,0/0,...,0/1,0/0,0/1,0/0,0/0,0/1,0/0,0/1,0/0,0/1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,chr21:20845295:C:T,0/1,0/1,0/1,0/1,0/1,0/0,0/1,0/1,0/1,...,0/1,0/0,0/0,0/1,1/1,1/1,0/1,0/1,0/1,0/1
143,chr21:39058624:G:A,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/1,0/1,...,0/0,0/0,0/1,0/1,1/1,0/0,0/1,0/0,0/0,0/0
144,chr22:24813757:T:C,0/1,0/1,0/1,0/0,0/0,0/0,0/1,0/0,0/0,...,1/1,0/0,0/1,0/1,0/0,0/0,0/1,0/0,0/0,0/0
145,chr22:37185343:C:T,0/1,0/0,0/1,0/0,1/1,0/1,0/1,0/0,0/1,...,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0


In [4]:
grCh_ls = pd.read_csv('data/mutations/rsid_causal/causal_grCh_liftover.txt', sep='\t')
for i in range(ds.shape[0]):
    ds_id = ((ds.loc[i]['ID']).split(':')[:2])
    pos = ds_id[0] + ':' + ds_id[1]
    # print((pos))
    rsid = grCh_ls[grCh_ls['grCh38']==pos]['RSID']
    # print(str(list(rsid)[0]))
    ds.loc[i,'ID'] = str(list(rsid)[0])

ds.to_csv('data/samples/1_ml/filtered_vcfs/all_chr.txt')
ds

Unnamed: 0,ID,syn504001_syn504001,syn504002_syn504002,syn504003_syn504003,syn504004_syn504004,syn504005_syn504005,syn504006_syn504006,syn504007_syn504007,syn504008_syn504008,syn504009_syn504009,...,syn671991_syn671991,syn671992_syn671992,syn671993_syn671993,syn671994_syn671994,syn671995_syn671995,syn671996_syn671996,syn671997_syn671997,syn671998_syn671998,syn671999_syn671999,syn672000_syn672000
0,rs12117229,0/1,0/1,0/1,1/1,0/1,0/1,0/0,1/1,1/1,...,0/0,1/1,0/0,0/1,1/1,0/0,1/1,0/0,0/1,0/1
1,rs7547573,1/1,0/0,0/1,0/1,1/1,0/1,1/1,0/0,0/1,...,0/0,1/1,1/1,0/1,1/1,0/1,1/1,0/0,1/1,1/1
2,rs10917536,0/1,1/1,0/0,0/1,0/1,0/0,0/1,1/1,0/1,...,0/0,0/1,0/1,0/1,0/0,0/0,0/1,0/0,0/0,0/0
3,rs4652846,0/1,0/1,0/1,0/1,0/0,0/1,0/1,1/1,0/1,...,0/0,0/1,1/1,1/1,0/1,1/1,0/0,0/1,1/1,0/0
4,rs3816989,0/0,0/0,1/1,0/0,1/1,0/0,0/0,0/1,0/0,...,0/1,0/0,0/1,0/0,0/0,0/1,0/0,0/1,0/0,0/1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,rs1009164,0/1,0/1,0/1,0/1,0/1,0/0,0/1,0/1,0/1,...,0/1,0/0,0/0,0/1,1/1,1/1,0/1,0/1,0/1,0/1
143,rs406418,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/1,0/1,...,0/0,0/0,0/1,0/1,1/1,0/0,0/1,0/0,0/0,0/0
144,rs5996763,0/1,0/1,0/1,0/0,0/0,0/0,0/1,0/0,0/0,...,1/1,0/0,0/1,0/1,0/0,0/0,0/1,0/0,0/0,0/0
145,rs7290488,0/1,0/0,0/1,0/0,1/1,0/1,0/1,0/0,0/1,...,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0


In [137]:
phenotypes = pd.read_csv('data/samples/1_ml/synth.pheno1', sep='\t')
phenotypes = (phenotypes.loc[504000:671999]['Phenotype(binary)'])
phens = pd.DataFrame(phenotypes).transpose()
pc = list(phens.columns)
dsc = list( ds.columns[1:])
col_dict = {pc[i]: dsc[i] for i in range(len(dsc))}
phens = phens.rename(columns=col_dict)
phens = phens.rename(index={'Phenotype(binary)':'PHENO'})
phens = phens.reset_index(names='ID')
phens.index = [147]
phens

Unnamed: 0,ID,syn504001_syn504001,syn504002_syn504002,syn504003_syn504003,syn504004_syn504004,syn504005_syn504005,syn504006_syn504006,syn504007_syn504007,syn504008_syn504008,syn504009_syn504009,...,syn671991_syn671991,syn671992_syn671992,syn671993_syn671993,syn671994_syn671994,syn671995_syn671995,syn671996_syn671996,syn671997_syn671997,syn671998_syn671998,syn671999_syn671999,syn672000_syn672000
147,PHENO,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [138]:
final = pd.concat([ds, phens])
final.to_csv('data/samples/1_ml/all_chr.txt')
final

Unnamed: 0,ID,syn504001_syn504001,syn504002_syn504002,syn504003_syn504003,syn504004_syn504004,syn504005_syn504005,syn504006_syn504006,syn504007_syn504007,syn504008_syn504008,syn504009_syn504009,...,syn671991_syn671991,syn671992_syn671992,syn671993_syn671993,syn671994_syn671994,syn671995_syn671995,syn671996_syn671996,syn671997_syn671997,syn671998_syn671998,syn671999_syn671999,syn672000_syn672000
0,rs12117229,0/1,0/1,0/1,1/1,0/1,0/1,0/0,1/1,1/1,...,0/0,1/1,0/0,0/1,1/1,0/0,1/1,0/0,0/1,0/1
1,rs7547573,1/1,0/0,0/1,0/1,1/1,0/1,1/1,0/0,0/1,...,0/0,1/1,1/1,0/1,1/1,0/1,1/1,0/0,1/1,1/1
2,rs10917536,0/1,1/1,0/0,0/1,0/1,0/0,0/1,1/1,0/1,...,0/0,0/1,0/1,0/1,0/0,0/0,0/1,0/0,0/0,0/0
3,rs4652846,0/1,0/1,0/1,0/1,0/0,0/1,0/1,1/1,0/1,...,0/0,0/1,1/1,1/1,0/1,1/1,0/0,0/1,1/1,0/0
4,rs3816989,0/0,0/0,1/1,0/0,1/1,0/0,0/0,0/1,0/0,...,0/1,0/0,0/1,0/0,0/0,0/1,0/0,0/1,0/0,0/1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,rs406418,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/1,0/1,...,0/0,0/0,0/1,0/1,1/1,0/0,0/1,0/0,0/0,0/0
144,rs5996763,0/1,0/1,0/1,0/0,0/0,0/0,0/1,0/0,0/0,...,1/1,0/0,0/1,0/1,0/0,0/0,0/1,0/0,0/0,0/0
145,rs7290488,0/1,0/0,0/1,0/0,1/1,0/1,0/1,0/0,0/1,...,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0
146,rs2413583,0/0,0/0,0/0,0/1,0/1,0/0,0/1,0/0,0/0,...,0/0,0/1,0/0,0/0,0/1,0/1,0/0,0/0,0/1,0/0


-----------------------------------------------------------------------------
## ___Format user vcf data___

In [1]:
# ALL imports
import pysam   # for VCF handling
import cyvcf2 # for VCF handling
from cyvcf2 import VCF # for VCF handling
import numpy as np  
import pandas as pd 

grCh_ls = pd.read_csv('data/mutations/rsid_causal/causal_grCh_liftover.txt', sep='\t')

In [21]:
def format_txt(txt_filepath, grCh_ls):
    # Filtered mutations
    print('opening file:')
    file = open(txt_filepath, 'r')
    print('done.\nReading lines:')
    rows = file.readlines()
    print('done.')
    found = []
    data = {}
    snp_by_chrom = [0 for _ in range(22)]
    manual_prs = 0
    header = []
    i = 0
    gt_index = 0
    n = 0
    while n in range(len(rows)):
        row = rows[n]
        if not row.startswith('##'): # iterate through each mutation 
            row = row.split("\t", -1)
            if i==0:
                if row[0].startswith('#'):
                    print('Reading header:')
                    header = row 
                else: 
                    print('preset header')
                    header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","SAMPLE"]
                    n = n-1
                i=1
                gt_index = len(header)-1
                data['SAMPLE'] = header[gt_index].replace('\n', '')
                print('done.')
            else:
                user_pos = 'chr' + row[0] + ':' + row[1] 
                beta = 0
                try:
                    rsid = list(grCh_ls[grCh_ls['grCh37']==user_pos]['RSID'])[0] #grCh37
                    beta = list(grCh_ls[grCh_ls['grCh37']==user_pos]['BETA'])[0] #grCh37
                except:
                    try:
                        rsid = list(grCh_ls[grCh_ls['grCh38']==user_pos]['RSID'])[0] #grCh38
                        beta = list(grCh_ls[grCh_ls['grCh38']==user_pos]['BETA'])[0] #grCh38
                    except:
                        rsid = None
                if (rsid is not None):
                    #print('found:'+ str(rsid))
                    curr_snp_chr = int(row[0])
                    snp_by_chrom[curr_snp_chr-1] += 1
                    found.append(rsid)
                    gt = row[gt_index][:3]  # sample's alt allele count from genotype field (GT=x/y, with x=0/1/2/... 
                                            # and ref=0, alt=1/2/...)
                    alt_freq = int(gt[0]) + int(gt[2])
                    data[rsid] = [alt_freq]
                    manual_prs += (alt_freq*beta)
                if len(found)==147: break
        n += 1

    return (data, found, snp_by_chrom, manual_prs)

def format_vcf(vcf_filepath, grCh_ls):
    # Filtered mutations
    file = VCF(vcf_filepath)
    found = []
    snp_by_chrom = [0 for _ in range(22)]
    manual_prs = 0
    data = {}
    data['SAMPLE'] = file.samples[0]
    for rec in file:   # iterate through each mutation 
        curr_snp_chr = (str(rec.CHROM).replace('chr', ''))
        user_pos = 'chr' + str(curr_snp_chr)+':'+str(rec.end) #grCh37
        try:
            rsid = list(grCh_ls[grCh_ls['grCh37']==user_pos]['RSID'])[0] #grCh37
            beta = list(grCh_ls[grCh_ls['grCh37']==user_pos]['BETA'])[0] #grCh37
        except:
            try:
                rsid = list(grCh_ls[grCh_ls['grCh38']==user_pos]['RSID'])[0] #grCh38
                beta = list(grCh_ls[grCh_ls['grCh38']==user_pos]['BETA'])[0] #grCh38
            except:
                rsid = None
        if (rsid is not None):
            print('found ', rsid)
            found.append(rsid)
            snp_by_chrom[int(curr_snp_chr)-1] += 1
            gt = (str(rec.gt_types))   # sample's alt allele count from genotype field (GT=x/y, with x=0/1/2/... 
                                                    # and ref=0, alt=1/2/...)
            alt_freq = int(gt[1])
            if alt_freq == 3: 
                alt_freq=2
            data[rsid] = [alt_freq]
            manual_prs += (alt_freq*beta)
        if len(found)==147: break

    return (data, found, snp_by_chrom, manual_prs)

def fill_missing_mutations(vcf_dict, found, grCh_ls):
    mut_list = list(grCh_ls['RSID'])
    added = 0
    for snp in mut_list:
        if snp not in found:
            added += 1
            vcf_dict[snp]=[0]
    print(len(vcf_dict))
    return vcf_dict

def to_dataframe(vcf_dict, columns):
    ordered_dict = {}
    ordered_dict['SAMPLE'] = vcf_dict['SAMPLE']
    for col in columns:
        ordered_dict[col] = vcf_dict[col]
    dataset = pd.DataFrame(ordered_dict)

    return dataset

In [22]:
# test datafiile prom PGP
(user, found, snp_by_chrom, manual_prs) = format_vcf('/Users/asiabelfiore/Downloads/NG1N4ZH3KB.mm2.sortdup.bqsr.hc.vcf.gz', grCh_ls)
len(found)

found  rs12117229
found  rs7547573
found  rs4652846
found  rs11805303
found  rs2201841
found  rs4656077
found  rs6583061
found  rs4745
found  rs1110303
found  rs3024505
found  rs3024493
found  rs2066233
found  rs57657143
found  rs4665855
found  rs4665855
found  rs10198193
found  rs13006847
found  rs384507
found  rs2231898
found  rs11895657
found  rs13006529
found  rs2241880
found  rs2073495
found  rs6778655
found  rs2291039
found  rs12638625
found  rs13096767
found  rs13075089
found  rs4861358
found  rs6810921
found  rs13150477
found  rs10474906
found  rs2247870
found  rs1050152
found  rs2161368
found  rs7736084
found  rs30386
found  rs10946345
found  rs56062535
found  rs9690195
found  rs10276619
found  rs62494111
found  rs2128130
found  rs3753115
found  rs506121
found  rs10977417
found  rs10821128
found  rs10781510
found  rs2229760
found  rs10761659
found  rs11041288
found  rs4509745
found  rs2469887
found  rs35264875
found  rs1872765
found  rs15818
found  rs11564148
found  rs73116325

79

In [23]:
filled_data = fill_missing_mutations(user, found, grCh_ls)
mut_list = list(grCh_ls['RSID'])
df = to_dataframe(filled_data, mut_list)
df

148


Unnamed: 0,SAMPLE,rs12117229,rs7547573,rs10917536,rs4652846,rs3816989,rs2863210,rs11805303,rs2201841,rs11209026,...,rs10474906,rs7736084,rs56839110,rs1052248,rs56062535,rs2128130,rs3753115,rs636922,rs10970183,rs10781510
0,NG1N4ZH3KB,2,1,0,1,0,0,2,2,0,...,1,1,0,0,1,1,2,0,0,1
