In [1]:
import numpy as np 
import pickle
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import h5py
import scipy
import vcf
import allel
import sys
np.set_printoptions(threshold=sys.maxsize)

In [3]:
# Load the pickled dictionary keyed by population code (e.g. 'FIN', 'CHS').
# The file is opened inside a `with` block so the handle is closed as soon as
# unpickling finishes instead of being left open for the life of the kernel.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# populationcodes.pkl that comes from a trusted source.
with open("populationcodes.pkl", "rb") as population_file:
    population_dict = pickle.load(population_file)

# Materialise each population's entry as a plain list so it can be passed as
# the `samples` argument of allel.read_vcf.
FIN_samples = list(population_dict['FIN'])
CHS_samples = list(population_dict['CHS'])
GBR_samples = list(population_dict['GBR'])
PUR_samples = list(population_dict['PUR'])
CLM_samples = list(population_dict['CLM'])
MXL_samples = list(population_dict['MXL'])
TSI_samples = list(population_dict['TSI'])
LWK_samples = list(population_dict['LWK'])
JPT_samples = list(population_dict['JPT'])
IBS_samples = list(population_dict['IBS'])
PEL_samples = list(population_dict['PEL'])
CDX_samples = list(population_dict['CDX'])
YRI_samples = list(population_dict['YRI'])
KHV_samples = list(population_dict['KHV'])
ASW_samples = list(population_dict['ASW'])
ACB_samples = list(population_dict['ACB'])
CHB_samples = list(population_dict['CHB'])
GIH_samples = list(population_dict['GIH'])
GWD_samples = list(population_dict['GWD'])
PJL_samples = list(population_dict['PJL'])
MSL_samples = list(population_dict['MSL'])
BEB_samples = list(population_dict['BEB'])
ESN_samples = list(population_dict['ESN'])
STU_samples = list(population_dict['STU'])
ITU_samples = list(population_dict['ITU'])

# All per-population sample lists, in the same order as the assignments above,
# so each subpopulation's sample set can be iterated over when reading the VCF.
sample_names_ls = [
    FIN_samples, CHS_samples, GBR_samples, PUR_samples, CLM_samples,
    MXL_samples, TSI_samples, LWK_samples, JPT_samples, IBS_samples,
    PEL_samples, CDX_samples, YRI_samples, KHV_samples, ASW_samples,
    ACB_samples, CHB_samples, GIH_samples, GWD_samples, PJL_samples,
    MSL_samples, BEB_samples, ESN_samples, STU_samples, ITU_samples,
]

In [4]:
# Path of the chromosome 21 phased VCF, relative to the notebook's working directory.
VCF_PATH = 'ALL.chr21.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf'
# Number of leading variant rows kept in the "slice" arrays so they are easier to work with.
N_VARIANTS_SLICE = 1500

# Read the VCF restricted to the FIN sample list.
# The original calls passed `tabix=` pointing at a .tbi index file and at a
# tabix.c source file. That argument is the tabix *executable* and is only
# consulted when `region` is given, so it had no effect here and is dropped
# (which also removes the machine-specific absolute paths).
callset = allel.read_vcf(VCF_PATH, samples=FIN_samples)

# Wrap the FIN genotype calls in a GenotypeArray.
FIN_gt = allel.GenotypeArray(callset['calldata/GT'])
# Keep only the first N_VARIANTS_SLICE rows of the FIN genotypes.
FIN_gt_slice = FIN_gt[:N_VARIANTS_SLICE]


# Read the same VCF restricted to the CHS sample list.
callset_1 = allel.read_vcf(VCF_PATH, samples=CHS_samples)

# Wrap the CHS genotype calls in a GenotypeArray.
CHS_gt = allel.GenotypeArray(callset_1['calldata/GT'])

# Keep only the first N_VARIANTS_SLICE rows of the CHS genotypes.
CHS_gt_slice = CHS_gt[:N_VARIANTS_SLICE]

  ', '.join(map(repr, sorted(samples))))


In [6]:
# Sum the two allele calls of every sample at every position. The result holds
# one array per sample (samples x calls), not a (calls x samples) matrix.
def allele_counter(population_gt, population_size=None):
    """Add together the two allele calls of each sample at every position.

    Parameters
    ----------
    population_gt : array-like indexed as [call, sample, allele]
        Genotype calls; only allele indices 0 and 1 are read. Must support
        ``population_gt[:, i, k]`` indexing and, when ``population_size`` is
        omitted, expose ``.shape``.
        (assumes the layout of allel.GenotypeArray / a 3-D numpy array -- TODO confirm)
    population_size : int, optional
        Number of leading sample columns to process. Defaults to every sample
        in ``population_gt`` (``population_gt.shape[1]``), so callers no longer
        have to hard-code the population size.

    Returns
    -------
    list
        ``population_size`` arrays, one per sample; element ``i`` is
        ``population_gt[:, i, 0] + population_gt[:, i, 1]``.

    Raises
    ------
    IndexError
        If ``population_size`` exceeds the number of samples available.

    Notes
    -----
    NOTE(review): the sum equals the number of variant alleles only when the
    allele codes are 0/1; other codes (e.g. negative "missing" values or
    multi-allelic indices) are added as-is -- confirm upstream filtering.
    """
    if population_size is None:
        # Default to all samples actually present in the genotype array.
        population_size = population_gt.shape[1]

    return [
        population_gt[:, sample_idx, 0] + population_gt[:, sample_idx, 1]
        for sample_idx in range(population_size)
    ]

In [7]:
# Build the per-sample allele-sum arrays for the FIN and CHS genotype slices.
# population_size is the number of sample columns allele_counter walks over
# (100 for FIN, 92 for CHS).
FIN_new_gt = allele_counter(population_gt=FIN_gt_slice, population_size=100)
CHS_new_gt = allele_counter(population_gt=CHS_gt_slice, population_size=92)

In [8]:
# Display the list of per-sample allele-sum arrays built for the CHS slice
# (a bare last expression is rendered as the cell output).
# NOTE(review): the imports cell sets np.set_printoptions(threshold=sys.maxsize),
# so every element of every array is printed; consider showing a small slice instead.
CHS_new_gt

[array([0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 

In [9]:
# Display the list of per-sample allele-sum arrays built for the FIN slice
# (a bare last expression is rendered as the cell output).
# NOTE(review): the imports cell sets np.set_printoptions(threshold=sys.maxsize),
# so every element of every array is printed; consider showing a small slice instead.
FIN_new_gt

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 