## All the imports and settings

In [1]:
import os

**Note:** Python had to be downgraded to version 3.7:
```
conda install python=3.7
```

In [2]:
# Point the JVM-based tooling at the Java 8 JDK and expose its binaries on PATH.
JAVA_HOME = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
JAVA_BIN = f'{JAVA_HOME}/bin'
os.environ['JAVA_HOME'] = JAVA_HOME

# Append the JDK bin directory only once, so re-running this cell does not keep
# growing PATH; also tolerate an environment where PATH is unset.
current_path = os.environ.get('PATH', '')
if JAVA_BIN not in current_path.split(':'):
    os.environ['PATH'] = f'{current_path}:{JAVA_BIN}' if current_path else JAVA_BIN

# NOTE(review): 'JAVA_OPTION' is not one of the variable names the JVM reads on
# its own (e.g. JAVA_TOOL_OPTIONS) - confirm that something actually consumes it.
os.environ['JAVA_OPTION'] = '-Xmx48g'

In [3]:
import hail as hl
from bokeh.io import output_notebook, show



In [4]:
# Start the Hail/Spark session with an explicit driver memory setting.
spark_settings = {"spark.driver.memory": "100g"}
hl.init(spark_conf=spark_settings)

# Render Bokeh figures inline in the notebook.
output_notebook()
# NOTE(review): the original cell ended with a truncated reminder,
# "update-java-alternat" - presumably `update-java-alternatives`; confirm.

Running on Apache Spark version 2.4.1
SparkUI available at http://217.148.215.30:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.63-cb767a7507c8
LOGGING: writing to /media/array/guar_proj/production_callset/hail-20210505-1149-0.2.63-cb767a7507c8.log


## FILENAMES


In [5]:
# --- Reference assembly -------------------------------------------------------
REF_NAME = "GUAR_RG"
REF_DIR = "/media/array/guar_proj/reference_masurca"
REF_FILE = f"{REF_DIR}/assembly_guar_Masurca1.fasta"
REF_INDEX_FILE = f"{REF_FILE}.fai"  # index sits next to the FASTA

# --- Inputs: call set (VCF) and phenotype table (TSV) -------------------------
CALLSET_DIR = "/media/array/guar_proj/production_callset"
INPUT_VCF = f"{CALLSET_DIR}/GUAR_2of3_AF5pct.vcf"
INPUT_TSV = f"{CALLSET_DIR}/new_genius_pheno.tsv"

# --- Outputs (relative to the notebook's working directory) -------------------
# GWAS_EXPORT is a filename prefix: the phenotype name and '.tsv' are appended.
GWAS_EXPORT = 'output_pvals/filtered_gwas_LMM_'
PCA_SCORES = 'out_pc_scores/guar_filtered.pc_scores.csv'
OUT_VCF = 'out_vcf/guar_filtered_MAF_highconf.vcf'
LMM_OUTPUT = 'LMM_output'

### Import data from VCF

In [6]:
# Register the guar assembly as a custom Hail reference genome, built from the
# FASTA file and its .fai index.
ref_g = hl.genetics.ReferenceGenome.from_fasta_file(
    name=REF_NAME,
    fasta_file=REF_FILE,
    index_file=REF_INDEX_FILE,
)

In [7]:
# Load the call set against the custom reference genome.
mt = hl.import_vcf(INPUT_VCF, reference_genome=ref_g)

# Re-key the samples: copy the VCF sample field `s` into `genome_id`, make that
# the column key, then drop the now-redundant `s`.
mt = (
    mt.annotate_cols(genome_id=hl.str(mt.s))
      .key_cols_by("genome_id")
      .drop("s")
)
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    'genome_id': str
----------------------------------------
Row fields:
    'locus': locus<GUAR_RG>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 
        ReadPosRankSum: float64, 
        SOR: float64, 
        NGSEP_NHET: array<int32>, 
        NGSEP_NALT: array<int32>, 
        NGSEP_AF: array<float64>, 
        TASSEL_NHET: array<int32>, 
        TASSEL_NALT: array<int32>, 
        TAS

In [8]:
# Size of the freshly imported matrix table, displayed as the cell output.
mt.count()

2021-05-05 11:49:17 Hail: INFO: Coerced sorted dataset


(5234, 166)

## Annotation

In [9]:
# Trait columns of the phenotype TSV; they are read as strings and cast to float below.
PHENO_COLUMNS = [
    'height', 'blooming_time', 'maturation_time', 'branch_height',
    'bean_number', 'maturation_pct', 'mature_bean_weight',
]

# Empty cells are treated as missing values.
sa = hl.import_table(INPUT_TSV, key='id', missing='')

# If a column needs converting, this is how it is done: `genome_id` is a string
# copy of `id`, and every trait column is replaced by its float64 version.
sa = sa.annotate(
    genome_id=hl.str(sa['id']),
    **{column: hl.float(sa[column]) for column in PHENO_COLUMNS},
)

# Key by `genome_id` so the table can be indexed with the matrix table's column key.
sa = sa.key_by('genome_id')
sa.describe()
sa.height.show()

2021-05-05 11:49:18 Hail: INFO: Reading table without type imputation
  Loading field 'id' as type str (not specified)
  Loading field 'height' as type str (not specified)
  Loading field 'blooming_time' as type str (not specified)
  Loading field 'maturation_time' as type str (not specified)
  Loading field 'branch_height' as type str (not specified)
  Loading field 'bean_number' as type str (not specified)
  Loading field 'maturation_pct' as type str (not specified)
  Loading field 'mature_bean_weight' as type str (not specified)


----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'id': str 
    'height': float64 
    'blooming_time': float64 
    'maturation_time': float64 
    'branch_height': float64 
    'bean_number': float64 
    'maturation_pct': float64 
    'mature_bean_weight': float64 
    'genome_id': str 
----------------------------------------
Key: ['genome_id']
----------------------------------------


2021-05-05 11:49:19 Hail: INFO: Ordering unsorted dataset with network shuffle


genome_id,height
str,float64
"""S1""",-0.476
"""S10""",0.645
"""S100""",0.694
"""S101""",-1.64
"""S102""",-0.131
"""S104""",-0.105
"""S105""",0.348
"""S106""",-0.145
"""S107""",0.597
"""S109""",-1.06


In [10]:
# Look up each sample's phenotype row by `genome_id` and attach it to the
# matrix table as the `pheno` column struct.
pheno_by_sample = sa[mt.genome_id]
mt = mt.annotate_cols(pheno=pheno_by_sample)
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    'genome_id': str
    'pheno': struct {
        id: str, 
        height: float64, 
        blooming_time: float64, 
        maturation_time: float64, 
        branch_height: float64, 
        bean_number: float64, 
        maturation_pct: float64, 
        mature_bean_weight: float64
    }
----------------------------------------
Row fields:
    'locus': locus<GUAR_RG>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
       

In [11]:
# Sanity check - does it work? Show the first 3 rows of the flattened entries
# table (variant fields, sample/phenotype fields and genotype fields side by side).
mt.entries().show(3)

2021-05-05 11:49:19 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'
2021-05-05 11:49:20 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,Unnamed: 29_level_0,pheno,pheno,pheno,pheno,pheno,pheno,pheno,pheno,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0,Unnamed: 43_level_0,Unnamed: 44_level_0,Unnamed: 45_level_0,Unnamed: 46_level_0,Unnamed: 47_level_0,Unnamed: 48_level_0
locus,alleles,rsid,qual,filters,AC,AF,AN,BaseQRankSum,DP,DS,END,ExcessHet,FS,InbreedingCoeff,MLEAC,MLEAF,MQ,MQRankSum,QD,RAW_MQandDP,ReadPosRankSum,SOR,NGSEP_NHET,NGSEP_NALT,NGSEP_AF,TASSEL_NHET,TASSEL_NALT,TASSEL_AF,genome_id,id,height,blooming_time,maturation_time,branch_height,bean_number,maturation_pct,mature_bean_weight,AD,DP,GQ,GT,MIN_DP,PGT,PID,PL,PS,RGQ,SB
locus<GUAR_RG>,array<str>,str,float64,set<str>,array<int32>,array<float64>,int32,float64,int32,bool,int32,float64,float64,float64,array<int32>,array<float64>,float64,float64,float64,array<int32>,float64,float64,array<int32>,array<int32>,array<float64>,array<int32>,array<int32>,array<float64>,str,str,float64,float64,float64,float64,float64,float64,float64,array<int32>,int32,int32,call,int32,call,str,array<int32>,int32,int32,array<int32>
contig_1004:28957,"[""C"",""T""]",,20000.0,,[138],[3.73e-01],370,0.349,1079,False,,-0.0,0.0,0.467,[155],[4.19e-01],60.0,0.0,23.2,,0.0,3.97,[0],[0],[0.00e+00],[16],[69],[1.08e+00],"""S1""","""S1""",-0.476,-2.2,0.306,-1.23,0.827,0.279,0.0593,"[7,0]",7,21,0/0,,,,"[0,21,245]",,,
contig_1004:28957,"[""C"",""T""]",,20000.0,,[138],[3.73e-01],370,0.349,1079,False,,-0.0,0.0,0.467,[155],[4.19e-01],60.0,0.0,23.2,,0.0,3.97,[0],[0],[0.00e+00],[16],[69],[1.08e+00],"""S10""","""S10""",0.645,-2.07,-0.728,-1.15,0.38,0.476,0.457,"[0,3]",3,9,1/1,,,,"[132,9,0]",,,
contig_1004:28957,"[""C"",""T""]",,20000.0,,[138],[3.73e-01],370,0.349,1079,False,,-0.0,0.0,0.467,[155],[4.19e-01],60.0,0.0,23.2,,0.0,3.97,[0],[0],[0.00e+00],[16],[69],[1.08e+00],"""S100""","""S100""",0.694,0.408,0.39,-0.0796,0.187,0.39,0.0198,"[5,0]",5,15,0/0,,,,"[0,15,214]",,,


In [12]:
# Call rate before filtering: the fraction of entries whose genotype call is defined.
gt_is_called = hl.is_defined(mt.GT)
mt.aggregate_entries(hl.agg.fraction(gt_is_called))

2021-05-05 11:49:21 Hail: INFO: Coerced sorted dataset


0.9779972008784086

In [13]:
# Final variant and sample count, shown as a (n_variants, n_samples) tuple.
mt.count()

2021-05-05 11:49:22 Hail: INFO: Coerced sorted dataset


(5234, 166)

## PCAs

In [14]:
# PCA on the genotype calls; loadings are requested in addition to the
# eigenvalues and the per-sample scores.
pca_eigenvalues, pca_scores, pca_loadings = hl.hwe_normalized_pca(
    mt.GT,
    compute_loadings=True,
)

# Attach each sample's score vector to the matrix table as the `pca` column field.
mt = mt.annotate_cols(pca=pca_scores[mt.genome_id])

# PC1 vs PC2, labelled so the figure can be read on its own.
p = hl.plot.scatter(
    mt.pca.scores[0],
    mt.pca.scores[1],
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
)
show(p)

# Share of each eigenvalue in the sum of the *computed* eigenvalues. This is
# relative to the returned components only, not to the total genotypic variance.
eigenvalue_total = sum(pca_eigenvalues)
print([eigenvalue / eigenvalue_total for eigenvalue in pca_eigenvalues])

2021-05-05 11:49:23 Hail: INFO: Coerced sorted dataset
2021-05-05 11:49:23 Hail: INFO: hwe_normalized_pca: running PCA using 5234 variants.
2021-05-05 11:49:24 Hail: INFO: Coerced sorted dataset
2021-05-05 11:49:24 Hail: INFO: pca: running PCA with 10 components...
2021-05-05 11:49:27 Hail: INFO: Coerced sorted dataset
2021-05-05 11:49:28 Hail: INFO: Coerced sorted dataset


[0.22807032911449915, 0.15280873263213282, 0.13437122419114006, 0.12934191372146497, 0.07230963334679741, 0.06719911347681959, 0.061294231093550186, 0.05550329929387484, 0.05038304784505889, 0.04871847528466229]


In [15]:
# Disabled VCF export, kept for reference only (the first line is missing its
# closing parenthesis and would need fixing before re-enabling):
#mt = mt.annotate_cols(genome_id_frmt = hl.str(mt.genome_id)
#hl.export_vcf(mt, OUT_VCF)

# Bring the per-sample PCA scores table into a pandas DataFrame.
scores_df = pca_scores.to_pandas()

In [16]:
# Preview: at this point each row still carries its whole score vector in a
# single `scores` column.
scores_df.head()

Unnamed: 0,genome_id,scores
0,S1,"[-0.1392856289153399, 0.41817771532057174, -0...."
1,S10,"[-0.5943803992544161, -0.5740264022446521, 0.3..."
2,S100,"[0.006474338429093289, -0.08740784836494055, 0..."
3,S102,"[0.1306331394015496, -0.0656385561836958, 0.22..."
4,S104,"[-0.16780284691597294, 0.16876120371697279, 0...."


In [17]:
# Unpack the per-sample score vector into scalar columns PC1 ... PC10.
for pc_index in range(10):
    pc_column = f'PC{pc_index + 1}'
    scores_df[pc_column] = [sample_scores[pc_index] for sample_scores in scores_df['scores']]

In [18]:
# The list-valued `scores` column is redundant once PC1..PC10 exist.
# Rebind instead of mutating in place, and ignore a missing column, so that
# re-running this cell is a no-op rather than a KeyError.
scores_df = scores_df.drop(columns='scores', errors='ignore')
scores_df.head()

Unnamed: 0,genome_id,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,S1,-0.139286,0.418178,-0.170107,0.045901,0.314681,0.792907,-0.237359,0.113218,0.251693,0.283357
1,S10,-0.59438,-0.574026,0.323683,-1.037562,0.246027,0.148222,0.122373,0.044426,-0.167866,-0.061703
2,S100,0.006474,-0.087408,0.089134,0.251408,0.240211,-0.251115,-0.291133,0.088232,0.000784,-0.011808
3,S102,0.130633,-0.065639,0.22825,0.124664,0.112927,-0.177768,-0.158527,-0.109306,-0.365951,-0.318269
4,S104,-0.167803,0.168761,0.001225,0.237359,0.062603,0.146675,0.306907,-0.155303,-0.034751,0.030807


In [19]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (skipped when the path has no directory part).
scores_dir = os.path.dirname(PCA_SCORES)
if scores_dir:
    os.makedirs(scores_dir, exist_ok=True)
scores_df.to_csv(PCA_SCORES)

## LD

In [20]:
# Windowed variant-by-variant LD matrix computed from alternate-allele counts.
# NOTE(review): the radius is presumably in base pairs (i.e. 10 Mb), since the
# window is defined on `mt.locus` - confirm against the Hail docs.
LD_RADIUS = 10e6
alt_allele_count = mt.GT.n_alt_alleles()
ld = hl.ld_matrix(alt_allele_count, mt.locus, radius=LD_RADIUS)

2021-05-05 11:49:34 Hail: INFO: Coerced sorted dataset
2021-05-05 11:49:35 Hail: INFO: Wrote all 2 blocks of 5234 x 166 matrix with block size 4096.


In [21]:
# Materialise the LD matrix locally via to_numpy(). Despite the `pd_` prefix,
# this object is indexed as a 2-D array (pd_ld[i, j]), not as a pandas DataFrame.
pd_ld = ld.to_numpy()
# Shape is displayed as the cell output (variants x variants).
pd_ld.shape

2021-05-05 11:49:37 Hail: INFO: Coerced sorted dataset


(5234, 5234)

In [22]:
# Variant (row) annotations as a pandas DataFrame; nested fields come out as
# dotted column names such as `locus.contig`, `locus.position` and `info.AF`.
snp_data = mt.rows().to_pandas()
snp_data.head()

2021-05-05 11:49:47 Hail: INFO: Coerced sorted dataset


Unnamed: 0,locus.contig,locus.position,alleles,rsid,qual,filters,info.AC,info.AF,info.AN,info.BaseQRankSum,...,info.QD,info.RAW_MQandDP,info.ReadPosRankSum,info.SOR,info.NGSEP_NHET,info.NGSEP_NALT,info.NGSEP_AF,info.TASSEL_NHET,info.TASSEL_NALT,info.TASSEL_AF
0,contig_1004,28957,"[C, T]",,19975.2,,[138],[0.373],370,0.349,...,23.23,,0.0,3.967,[0],[0],[0.0],[16],[69],[1.07692]
1,contig_1004,42060,"[C, T]",,21962.31,,[107],[0.279],384,0.431,...,29.2,,0.0,4.174,[4],[56],[0.33],[0],[0],[0.0]
2,contig_1004,51238,"[C, T]",,27678.85,,[110],[0.286],384,-0.411,...,34.42,,0.0,3.979,[0],[0],[0.0],[7],[58],[0.664865]
3,contig_1004,147808,"[G, A]",,9465.62,,[60],[0.158],380,0.18,...,33.21,,0.0,4.454,[0],[0],[0.0],[3],[26],[0.335366]
4,contig_1004,266475,"[C, T]",,20300.58,,[132],[0.353],374,-0.21,...,30.54,,0.0,4.53,[12],[78],[0.4],[0],[0],[0.0]


In [25]:
# Dump the LD value for every ordered pair of variants that share a contig, as
# a headerless TSV with the columns:
#   contig_A, pos_A, AF_A, contig_B, pos_B, AF_B, LD
ld_tsv_path = '/media/array/guar_proj/production_callset/ld_data.tsv'

# Pull the needed columns out of pandas once; per-element Series lookups inside
# the double loop were the dominant cost of the original cell.
contigs = list(snp_data['locus.contig'])
positions = list(snp_data['locus.position'])
first_alt_af = [af[0] for af in snp_data['info.AF']]  # AF of the first listed allele

# Derive the size from the data instead of hard-coding it, and make sure the
# LD matrix has one row and one column per variant in `snp_data`.
n_variants = len(contigs)
assert pd_ld.shape == (n_variants, n_variants), (
    f'LD matrix shape {pd_ld.shape} does not match {n_variants} variants'
)

# `with` guarantees the file is flushed and closed even if the loop raises.
with open(ld_tsv_path, 'w') as out_file:
    for i in range(n_variants):
        for j in range(n_variants):
            # Only pairs located on the same contig are exported.
            if contigs[i] != contigs[j]:
                continue
            out_file.write(
                f'{contigs[i]}\t{positions[i]}\t{first_alt_af[i]}\t'
                f'{contigs[j]}\t{positions[j]}\t{first_alt_af[j]}\t{pd_ld[i, j]}\n'
            )

In [20]:
# Restrict to biallelic sites (exactly two alleles), then LD-prune them.
is_biallelic = hl.len(mt.alleles) == 2
biallelic_dataset = mt.filter_rows(is_biallelic)
tag_snp = hl.ld_prune(
    biallelic_dataset.GT,
    r2=0.8,
    bp_window_size=2_000_000,
)

2021-04-15 15:26:52 Hail: INFO: ld_prune: running local pruning stage with max queue size of 729445 variants
2021-04-15 15:26:52 Hail: INFO: Coerced sorted dataset
2021-04-15 15:26:54 Hail: INFO: wrote table with 4910 rows in 1 partition to /tmp/UTBrZNpodEfAsGrgF5ZCgX
    Total size: 147.88 KiB
    * Rows: 147.87 KiB
    * Globals: 11.00 B
    * Smallest partition: 4910 rows (147.87 KiB)
    * Largest partition:  4910 rows (147.87 KiB)
2021-04-15 15:26:54 Hail: INFO: Coerced sorted dataset
2021-04-15 15:26:55 Hail: INFO: Wrote all 2 blocks of 4910 x 166 matrix with block size 4096.
2021-04-15 15:27:07 Hail: INFO: wrote table with 0 rows in 3 partitions to /tmp/4nAmYu8DaZgwTBNiTkwqpZ
    Total size: 58.84 KiB
    * Rows: 63.00 B
    * Globals: 58.78 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  0 rows (21.00 B)


In [21]:
# Number of variants retained by the LD pruning step.
tag_snp.count()

4910

In [22]:
# Size of the biallelic subset that was fed into LD pruning.
biallelic_dataset.count()

2021-04-15 15:27:08 Hail: INFO: Coerced sorted dataset


(5735, 166)

## ADMIXTURE file formatting

In [10]:
# A separate reference genome for the ADMIXTURE input, built from its own
# FASTA and the .fai index stored next to it.
ADMIX_FASTA = '/media/array/guar_proj/production_callset/GUAR_to_admixture.fasta'
pseudoref = hl.genetics.ReferenceGenome.from_fasta_file(
    'pseudoreference_genome',
    ADMIX_FASTA,
    f'{ADMIX_FASTA}.fai',
)

In [11]:
# Load the ADMIXTURE call set against the pseudo-reference.
ADMIX_VCF = '/media/array/guar_proj/production_callset/GUAR_to_admixture.vcf'
mt_admix = hl.import_vcf(ADMIX_VCF, reference_genome=pseudoref)

# Same sample re-keying as for the main call set: `s` -> `genome_id`.
mt_admix = (
    mt_admix.annotate_cols(genome_id=hl.str(mt_admix.s))
            .key_cols_by("genome_id")
            .drop("s")
)

# Keep biallelic sites only.
mt_admix = mt_admix.filter_rows(hl.len(mt_admix.alleles) == 2)
mt_admix.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    'genome_id': str
----------------------------------------
Row fields:
    'locus': locus<pseudoreference_genome>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 
        ReadPosRankSum: float64, 
        SOR: float64, 
        NGSEP_NHET: array<int32>, 
        NGSEP_NALT: array<int32>, 
        NGSEP_AF: array<float64>, 
        TASSEL_NHET: array<int32>, 
        TASSEL_NALT: array<int32

In [15]:
# LD-prune the ADMIXTURE call set and keep only the surviving variants.
# NOTE: this rebinds `biallelic_dataset`, which an earlier cell derived from `mt`.
# The biallelic filter repeats the one already applied when `mt_admix` was built.
admix_is_biallelic = hl.len(mt_admix.alleles) == 2
biallelic_dataset = mt_admix.filter_rows(admix_is_biallelic)
pruned_vars = hl.ld_prune(
    biallelic_dataset.GT,
    r2=0.2,
    bp_window_size=2_000_000,
)

# A variant is kept when its row key is present in the pruned-variants table.
survives_pruning = hl.is_defined(pruned_vars[mt_admix.row_key])
mt_admix_pruned = mt_admix.filter_rows(survives_pruning)

2021-04-28 16:22:53 Hail: INFO: ld_prune: running local pruning stage with max queue size of 729445 variants
2021-04-28 16:22:53 Hail: INFO: Coerced sorted dataset
2021-04-28 16:22:55 Hail: INFO: wrote table with 2005 rows in 1 partition to /tmp/BFOfUR517KsePKI2rTyYHD
    Total size: 59.28 KiB
    * Rows: 59.27 KiB
    * Globals: 11.00 B
    * Smallest partition: 2005 rows (59.27 KiB)
    * Largest partition:  2005 rows (59.27 KiB)
2021-04-28 16:22:55 Hail: INFO: Coerced sorted dataset
2021-04-28 16:22:56 Hail: INFO: Wrote all 1 blocks of 2005 x 166 matrix with block size 4096.
2021-04-28 16:23:02 Hail: INFO: wrote table with 0 rows in 1 partition to /tmp/oyVGUgUbIoNBEbOwoUKivX
    Total size: 23.88 KiB
    * Rows: 21.00 B
    * Globals: 23.86 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  0 rows (21.00 B)


In [17]:
# Write the pruned call set as a PLINK fileset; `plink_prefix` is the common
# path prefix of the output files, and samples are identified by `genome_id`.
plink_prefix = '/media/array/guar_proj/production_callset/GUAR_2of3_AF5pct_hail'
hl.export_plink(
    mt_admix_pruned,
    output=plink_prefix,
    ind_id=mt_admix_pruned.genome_id,
)

2021-04-28 16:23:46 Hail: INFO: Coerced sorted dataset
2021-04-28 16:23:47 Hail: INFO: Coerced sorted dataset
2021-04-28 16:23:47 Hail: INFO: Coerced sorted dataset
2021-04-28 16:23:47 Hail: INFO: Coerced sorted dataset
2021-04-28 16:23:48 Hail: INFO: Coerced sorted dataset
2021-04-28 16:23:49 Hail: INFO: merging 2 files totalling 82.2K...
2021-04-28 16:23:49 Hail: INFO: while writing:
    /media/array/guar_proj/production_callset/GUAR_2of3_AF5pct_hail.bed
  merge time: 83.636ms
2021-04-28 16:23:49 Hail: INFO: merging 1 files totalling 67.6K...
2021-04-28 16:23:49 Hail: INFO: while writing:
    /media/array/guar_proj/production_callset/GUAR_2of3_AF5pct_hail.bim
  merge time: 4.040ms
2021-04-28 16:23:50 Hail: INFO: merging 40 files totalling 2.5K...
2021-04-28 16:23:50 Hail: INFO: while writing:
    /media/array/guar_proj/production_callset/GUAR_2of3_AF5pct_hail.fam
  merge time: 21.492ms
2021-04-28 16:23:50 Hail: INFO: wrote 2005 variants and 166 samples to '/media/array/guar_proj/prod

## GWAS

In [33]:
# rrm = hl.realized_relationship_matrix(mt.GT).to_numpy()  
# i ='height_18'

# model, p = hl.linear_mixed_model(  
#      y = mt.pheno[i],
#      x=[1.0, mt.pca.scores[0], mt.pca.scores[1], mt.sample_qc.r_het_hom_var],
#      k=rrm,
#      p_path=f'{LMM_OUTPUT}/{i}_p.bm',
#      overwrite=True)

In [55]:
# Add per-sample QC metrics as the `sample_qc` column field; its
# `r_het_hom_var` value is used as a covariate in the GWAS cell.
mt = hl.sample_qc(mt)

In [56]:
# Fields of `mt.pheno` to scan; one association run, export and plot pair each.
phenotypes = ['height', 'branch_height', 'blooming_time', 'maturation_time', 'maturation_pct',
              'bean_number', 'mature_bean_weight']

# Model inputs that do not depend on the phenotype are built once.
# Covariates: intercept, the first two principal components, and the per-sample
# het/hom-var ratio from sample_qc.
genotype_dosage = mt.GT.n_alt_alleles()
covariates = [1.0, mt.pca.scores[0], mt.pca.scores[1], mt.sample_qc.r_het_hom_var]

# p-value threshold drawn on the Manhattan plots.
SIGNIFICANCE_LINE = 1e-06

for pheno_name in phenotypes:
    print('\n\n=============')
    print(pheno_name)
    print('=============')

    # NOTE: the export prefix (GWAS_EXPORT) contains "LMM", but the model fitted
    # here is an ordinary per-variant linear regression.
    gwas = hl.linear_regression_rows(y=mt.pheno[pheno_name],
                                     x=genotype_dosage,
                                     covariates=covariates)
    gwas.export(f'{GWAS_EXPORT}{pheno_name}.tsv')

    # Title every figure with its phenotype: the loop emits 14 plots that are
    # otherwise indistinguishable.
    manhattan_plot = hl.plot.manhattan(gwas.p_value,
                                       title=f'{pheno_name}: Manhattan plot',
                                       significance_line=SIGNIFICANCE_LINE)
    show(manhattan_plot)

    qq_plot = hl.plot.qq(gwas.p_value,
                         title=f'{pheno_name}: Q-Q plot',
                         collect_all=True)
    show(qq_plot)



height


2021-04-28 16:41:00 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:01 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:02 Hail: WARN: 1 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:02 Hail: INFO: linear_regression_rows: running on 165 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:03 Hail: INFO: merging 1 files totalling 511.8K...
2021-04-28 16:41:03 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_height.tsv
  merge time: 6.768ms


2021-04-28 16:41:06 Hail: INFO: Ordering unsorted dataset with network shuffle




branch_height


2021-04-28 16:41:08 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:10 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:10 Hail: WARN: 1 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:10 Hail: INFO: linear_regression_rows: running on 165 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:11 Hail: INFO: merging 1 files totalling 510.8K...
2021-04-28 16:41:11 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_branch_height.tsv
  merge time: 16.455ms


2021-04-28 16:41:15 Hail: INFO: Ordering unsorted dataset with network shuffle




blooming_time


2021-04-28 16:41:17 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:18 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:18 Hail: WARN: 2 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:18 Hail: INFO: linear_regression_rows: running on 164 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:19 Hail: INFO: merging 1 files totalling 511.3K...
2021-04-28 16:41:19 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_blooming_time.tsv
  merge time: 6.648ms


2021-04-28 16:41:22 Hail: INFO: Ordering unsorted dataset with network shuffle




maturation_time


2021-04-28 16:41:25 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:26 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:26 Hail: WARN: 1 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:26 Hail: INFO: linear_regression_rows: running on 165 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:27 Hail: INFO: merging 1 files totalling 511.5K...
2021-04-28 16:41:27 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_maturation_time.tsv
  merge time: 9.146ms


2021-04-28 16:41:31 Hail: INFO: Ordering unsorted dataset with network shuffle




maturation_pct


2021-04-28 16:41:34 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:35 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:36 Hail: WARN: 1 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:36 Hail: INFO: linear_regression_rows: running on 165 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:37 Hail: INFO: merging 1 files totalling 512.4K...
2021-04-28 16:41:37 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_maturation_pct.tsv
  merge time: 24.732ms


2021-04-28 16:41:41 Hail: INFO: Ordering unsorted dataset with network shuffle




bean_number


2021-04-28 16:41:44 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:45 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:46 Hail: WARN: 2 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:46 Hail: INFO: linear_regression_rows: running on 164 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:47 Hail: INFO: merging 1 files totalling 511.6K...
2021-04-28 16:41:47 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_bean_number.tsv
  merge time: 8.025ms


2021-04-28 16:41:51 Hail: INFO: Ordering unsorted dataset with network shuffle




mature_bean_weight


2021-04-28 16:41:54 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:55 Hail: INFO: Coerced sorted dataset
2021-04-28 16:41:56 Hail: WARN: 2 of 166 samples have a missing phenotype or covariate.
2021-04-28 16:41:56 Hail: INFO: linear_regression_rows: running on 164 samples for 1 response variable y,
    with input variable x, and 4 additional covariates...
2021-04-28 16:41:57 Hail: INFO: merging 1 files totalling 512.1K...
2021-04-28 16:41:57 Hail: INFO: while writing:
    output_pvals/filtered_gwas_LMM_mature_bean_weight.tsv
  merge time: 13.018ms


2021-04-28 16:42:02 Hail: INFO: Ordering unsorted dataset with network shuffle
