This notebook runs Tractor (partially).

https://github.com/Atkinson-Lab/Tractor/wiki/

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import os
import gzip
import datetime
print('Last run:', datetime.datetime.now().strftime('%Y-%m-%d'))

Last run: 2023-02-08


# 1. Prepare files for tractor

## 1.1 Extract Lipidomic samples from genotype vcfs
Use bcftools, run in terminal

In [42]:
# Load the sample-ID mapping table and keep the samples that have BOTH
# lipidomic and genotype data, one row per RRID.
fn_id_mapping = '/data100t1/home/wanying/CCHC/doc/samples_IDs/202211_merged_RNA_lipid_protein_genotype_mapping_and_availability.txt'
df_id_mapping = pd.read_csv(fn_id_mapping, sep='\t')
display(df_id_mapping.head())
has_both = df_id_mapping['lipidomic'].notna() & df_id_mapping['genotype_ID'].notna()
# drop_duplicates returns a new frame, so re-running this cell never mutates
# a frame that an earlier step displayed.
df_keep = df_id_mapping[has_both].drop_duplicates(subset='RRID')
print(f'There are {len(df_keep)} unique samples (by RRID) with both genotype and lipidomic data')

# Get the genotype sample list from the original genotype file,
# because bcftools needs a list whose sample order reflects the order in the input file.
# The column-header line is located by its '#CHROM' prefix rather than by a
# fixed line number: the count of '##' meta lines is not the same in every VCF.
fn_vcf_header = '/vgipiper04/CCHC/TOPMed_postimpute_042022/chr22.dose.vcf.gz'
header_line = None
with gzip.open(fn_vcf_header, 'rt') as fh:
    for line in fh:
        if line.startswith('#CHROM'):
            header_line = line
            break
        if not line.startswith('#'):
            # Reached the data records without seeing a column header.
            break
if header_line is None:
    raise ValueError(f'No #CHROM header line found in {fn_vcf_header}')

# The first 9 columns of the header are the fixed VCF fields; sample IDs follow.
df_gt_id = pd.DataFrame(header_line.strip().split()[9:], columns=['genotype_ID'])
# Inner merge with the VCF samples on the left keeps the VCF sample order.
df_keep_merged = df_gt_id.merge(df_keep[['genotype_ID', 'lipidomic']], on='genotype_ID')

# Save retained samples in a list for bcftools to extract.
# An existing list is never overwritten, so delete the file to regenerate it.
fn_list_keep = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/sample_list_for_bcftools.txt'
if not os.path.isfile(fn_list_keep):
    df_keep_merged[['genotype_ID']].to_csv(fn_list_keep, sep='\t', index=False, header=False)
else:
    print(f'File already exist, skip saving: {fn_list_keep}')

Unnamed: 0,RRID,LABID,genotype_ID,RNAseq,lipidomic,proteomic
0,BD0001,,BD0001_BD4001,,,
1,BD0002,,BD0002_BD4002,,,
2,BD0003,,BD0003_BD4003,,,
3,BD0004,5Y0277,BD0004_BD4004,,,5Y0277
4,BD0004,BD4004,BD0004_BD4004,,,BD4004


There are 2289 unique samples (by RRID) with both genotype and lipidomic data


In [46]:
# Print one launch command per autosome (chr1-chr22). Each command creates a
# detached screen session named lip_chr<N> and types the subsetting script
# invocation into it; paste the printed lines into a terminal to run them.
screen_cmd_template = (
    'screen -dmS lip_chr{chrom}; screen -S lip_chr{chrom} -X stuff '
    '"./subset_vcfs.sh {chrom}\\n"'
)
for i in range(1, 23):
    cmd = screen_cmd_template.format(chrom=i)
    print(cmd)

screen -dmS lip_chr1; screen -S lip_chr1 -X stuff "./subset_vcfs.sh 1\n"
screen -dmS lip_chr2; screen -S lip_chr2 -X stuff "./subset_vcfs.sh 2\n"
screen -dmS lip_chr3; screen -S lip_chr3 -X stuff "./subset_vcfs.sh 3\n"
screen -dmS lip_chr4; screen -S lip_chr4 -X stuff "./subset_vcfs.sh 4\n"
screen -dmS lip_chr5; screen -S lip_chr5 -X stuff "./subset_vcfs.sh 5\n"
screen -dmS lip_chr6; screen -S lip_chr6 -X stuff "./subset_vcfs.sh 6\n"
screen -dmS lip_chr7; screen -S lip_chr7 -X stuff "./subset_vcfs.sh 7\n"
screen -dmS lip_chr8; screen -S lip_chr8 -X stuff "./subset_vcfs.sh 8\n"
screen -dmS lip_chr9; screen -S lip_chr9 -X stuff "./subset_vcfs.sh 9\n"
screen -dmS lip_chr10; screen -S lip_chr10 -X stuff "./subset_vcfs.sh 10\n"
screen -dmS lip_chr11; screen -S lip_chr11 -X stuff "./subset_vcfs.sh 11\n"
screen -dmS lip_chr12; screen -S lip_chr12 -X stuff "./subset_vcfs.sh 12\n"
screen -dmS lip_chr13; screen -S lip_chr13 -X stuff "./subset_vcfs.sh 13\n"
screen -dmS lip_chr14; screen -S lip_ch

In [None]:
# Reference copy of a bash script, wrapped in a Python string literal so that
# running this cell does not execute it.
# Presumably this is the content of subset_vcfs.sh, which the screen commands
# printed above invoke with a chromosome number -- TODO confirm.
# For the chromosome given as $1, the script subsets the VCF to the samples in
# sample_list_for_bcftools.txt with `bcftools view -S`, writes an uncompressed
# VCF, and then compresses it with bgzip.
'''
%%bash
# Run in terminal with above command
chr_num=$1
vcf=/vgipiper04/CCHC/TOPMed_postimpute_042022/chr${chr_num}.dose.vcf.gz
retained_samples=/data100t1/home/wanying/CCHC/lipidomics/input_docs/sample_list_for_bcftools.txt
output_fn=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/subset_chr${chr_num}.dose.vcf
echo "Processing chr${chr_num} with bcftools"
bcftools view -S ${retained_samples} -Ov ${vcf} > ${output_fn}

echo "bgzip file ${output_fn}"
bgzip ${output_fn}
'''

# 2. Local ancestry imputation using RFmix2

In [None]:
%%bash
# Run in terminal: RFmix2 local ancestry inference for a single chromosome.
# Usage: <this script> <chr_num>
set -euo pipefail

# Abort with a usage message when the chromosome argument is missing,
# instead of building paths around an empty chromosome number.
chr_num=${1:?usage: pass the chromosome number as the first argument}
vcf=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/subset_chr${chr_num}.dose.vcf.gz
# Placeholders: the reference panel, sample map and genetic map are still to be decided.
reference=tba
sample_map=tba
genetic_map=tba
# RFmix's -o option takes an output basename to which it appends its own
# suffixes, so this is a per-chromosome prefix and not a VCF file name.
output=/data100t1/home/wanying/CCHC/lipidomics/input_docs/rfmix_output/subset_chr${chr_num}
rfmix=/data100t1/home/wanying/downloaded_tools/rfmix/rfmix

# The output prefix must contain the chromosome number; the previous
# `-o cohort.rfmix.chr$i` expanded an undefined variable, so every chromosome
# wrote to the same files.
"${rfmix}" -f "${vcf}" \
-r "${reference}" \
--chromosome="${chr_num}" \
-m "${sample_map}" \
-g "${genetic_map}" \
-e 1 -n 5 -o "${output}" \
--n-threads 32


# 3. Tractor steps

## 3.0 Local ancestry imputation (to be redone)

## 3.1 Recovering tracts (Optional)
To fix switch errors.

In [3]:
# Directory holding the existing RFmix local ancestry output for CCHC.
# Files are per chromosome and share the stem CCHC_rfmix_chr<N>, with the
# suffixes .fb.tsv, .msp.tsv, .sis.tsv and .rfmix.Q (see the directory listing
# in the next cell). Step 3.2 passes that stem to ExtractTracts.py via --msp.
rfmix_output_dir = '/vgipiper04/CCHC/local_ancestry/rfmix/CCHC_rfmix/output/'

In [47]:
! ls /vgipiper04/CCHC/local_ancestry/rfmix/CCHC_rfmix/output/ | head -n

CCHC_rfmix_chr10.fb.tsv   CCHC_rfmix_chr17.rfmix.Q  CCHC_rfmix_chr3.fb.tsv
CCHC_rfmix_chr10.msp.tsv  CCHC_rfmix_chr17.sis.tsv  CCHC_rfmix_chr3.msp.tsv
CCHC_rfmix_chr10.rfmix.Q  CCHC_rfmix_chr18.fb.tsv   CCHC_rfmix_chr3.rfmix.Q
CCHC_rfmix_chr10.sis.tsv  CCHC_rfmix_chr18.msp.tsv  CCHC_rfmix_chr3.sis.tsv
CCHC_rfmix_chr11.fb.tsv   CCHC_rfmix_chr18.rfmix.Q  CCHC_rfmix_chr4.fb.tsv
CCHC_rfmix_chr11.msp.tsv  CCHC_rfmix_chr18.sis.tsv  CCHC_rfmix_chr4.msp.tsv
CCHC_rfmix_chr11.rfmix.Q  CCHC_rfmix_chr19.fb.tsv   CCHC_rfmix_chr4.rfmix.Q
CCHC_rfmix_chr11.sis.tsv  CCHC_rfmix_chr19.msp.tsv  CCHC_rfmix_chr4.sis.tsv
CCHC_rfmix_chr12.fb.tsv   CCHC_rfmix_chr19.rfmix.Q  CCHC_rfmix_chr5.fb.tsv
CCHC_rfmix_chr12.msp.tsv  CCHC_rfmix_chr19.sis.tsv  CCHC_rfmix_chr5.msp.tsv
CCHC_rfmix_chr12.rfmix.Q  CCHC_rfmix_chr1.fb.tsv    CCHC_rfmix_chr5.rfmix.Q
CCHC_rfmix_chr12.sis.tsv  CCHC_rfmix_chr1.msp.tsv   CCHC_rfmix_chr5.sis.tsv
CCHC_rfmix_chr13.fb.tsv   CCHC_rfmix_chr1.rfmix.Q   CCHC_rfmix_chr6.fb.tsv
CCHC_rfmix_chr13

## 3.2 Extracting tracts and ancestral dosages

In [None]:
%%bash
# Run in terminal: extract ancestry tracts and ancestral dosages for one chromosome.
# Usage: <this script> <chr_num>
set -euo pipefail

# Abort with a usage message when the chromosome argument is missing.
chr_num=${1:?usage: pass the chromosome number as the first argument}
tractor=/data100t1/home/wanying/downloaded_tools/Tractor/ExtractTracts.py
# Both inputs are path stems without their file extension.
MSP_FILE=/vgipiper04/CCHC/local_ancestry/rfmix/CCHC_rfmix/output/CCHC_rfmix_chr${chr_num}
VCF_FILE=/vgipiper04/CCHC/TOPMed_postimpute_042022/chr${chr_num}.dose
output_dir=/data100t1/home/wanying/CCHC/lipidomics/output/snp_la_dosage/

# --zipped: the genotype files in this directory are chr<N>.dose.vcf.gz, so the
#   script has to be told that the VCF is gzipped.
# --output-path: NOTE(review) the Tractor documentation describes this as an
#   output path *plus file prefix*; a bare directory would give every
#   chromosome the same output names. A per-chromosome prefix is passed here --
#   confirm against the installed ExtractTracts.py version.
python "${tractor}" \
--msp "${MSP_FILE}" \
--vcf "${VCF_FILE}" \
--zipped \
--output-path "${output_dir}chr${chr_num}.dose" \
--num-ancs 4


In [None]:
# Print one launch command per autosome (chr1-chr22). Each command creates a
# detached screen session named lip_chr<N> and types the tract-extraction
# script invocation into it; paste the printed lines into a terminal to run them.
screen_cmd_template = (
    'screen -dmS lip_chr{chrom}; screen -S lip_chr{chrom} -X stuff '
    '"./step2_extract_tracts.sh {chrom}\\n"'
)
for i in range(1, 23):
    cmd = screen_cmd_template.format(chrom=i)
    print(cmd)