In [1]:
#!/bin/bash
# Cantley Lab of Plant Evolution
# Chenopodium Project
# Example ipyrad code (command line) from https://ipyrad.readthedocs.io/en/master/pedicularis_.html
# Baker-Strader, R.; Bullock, M.; Cantley, J.

In [2]:
# ---------------------------------------------------------------------------
# User settings -- edit these three values for your own run.
# ---------------------------------------------------------------------------
# Root folder of the analysis. Kept as an absolute path because vcftools
# may not cope with relative directories (unconfirmed).
WORKING_DIR='/home/923643692/cheno_pop_tests'
# Full path to the VCF found in the ipyrad outfiles folder.
IPYRAD_OUT_VCF_PATH='/usr/nfs/923643692/cheno_popgen/ipyrad/tetraploid/chenopodium1_tetraploid/chenopodium_20250304b-tetraploid_outfiles/chenopodium_20250304b-tetraploid.vcf'
# Base name of the dataset.
BASE_NAME='chenopodium_20250304b-tetraploid'

# ---------------------------------------------------------------------------
# Derived locations -- no need to touch these; they only keep the commands
# further down short.
# ---------------------------------------------------------------------------
FILTER_PATH="${WORKING_DIR}/00_data"
IMISS_PATH="${FILTER_PATH}/00a_imiss"
WORKING_PATH="${FILTER_PATH}/00b_intermediate_files"
FINAL_VCF_OUTPUT_PATH="${FILTER_PATH}/00c_data_all"
SUBSETS_PATH="${FILTER_PATH}/00d_data_subsets"

Now we subset based on populations.

In [3]:
# Makes lists of populations from the per-individual missingness table
# ($IMISS_PATH/missingind.imiss).
#
# Globals written:
#   sample_list    newline-separated names of every sample in the table
#   XX_samples     one bash array per population code (AR CH IP MO PB PK)
#   cs_XX_samples  the same names joined with commas, WITH a trailing comma so
#                  several lists can be concatenated directly; strip the final
#                  comma before handing the result to bcftools.

# Joins its arguments with commas, keeping a comma after the last one.
# Prints nothing when called without arguments.
join_with_trailing_comma() {
  local joined="" name
  for name in "$@"; do
    joined+="${name},"
  done
  printf '%s' "$joined"
}

# Reads the .imiss table given as $1 and (re)fills the globals listed above.
# Returns 1, leaving every list empty, when the table cannot be read.
build_population_lists() {
  local imiss_file=$1 indv _rest status=0
  local -a all_names=()

  sample_list=""
  AR_samples=()
  CH_samples=()
  IP_samples=()
  MO_samples=()
  PB_samples=()
  PK_samples=()

  if [[ -r "$imiss_file" ]]; then
    # Take column 1 of every data row. The earlier `mawk '$5'` filter tested
    # the truthiness of F_MISS, which silently dropped samples with exactly 0
    # missing data and let the "INDV" header row through.
    while IFS=$' \t' read -r indv _rest || [[ -n "$indv" ]]; do
      if [[ -z "$indv" || "$indv" == "INDV" ]]; then
        continue
      fi
      all_names+=("$indv")
      # The population code is matched as a substring of the sample name;
      # the first matching pattern wins.
      case "$indv" in
        *AR*) AR_samples+=("$indv") ;;
        *CH*) CH_samples+=("$indv") ;;
        *IP*) IP_samples+=("$indv") ;;
        *MO*) MO_samples+=("$indv") ;;
        *PB*) PB_samples+=("$indv") ;;
        *PK*) PK_samples+=("$indv") ;;
      esac
    done < "$imiss_file"
  else
    printf 'Warning: cannot read %s; population lists are empty\n' \
      "$imiss_file" >&2
    status=1
  fi

  if (( ${#all_names[@]} > 0 )); then
    sample_list=$(printf '%s\n' "${all_names[@]}")
  fi

  # Comma-separated versions for use in bcftools subsetting.
  cs_AR_samples=$(join_with_trailing_comma "${AR_samples[@]}")
  cs_CH_samples=$(join_with_trailing_comma "${CH_samples[@]}")
  cs_IP_samples=$(join_with_trailing_comma "${IP_samples[@]}")
  cs_MO_samples=$(join_with_trailing_comma "${MO_samples[@]}")
  cs_PB_samples=$(join_with_trailing_comma "${PB_samples[@]}")
  cs_PK_samples=$(join_with_trailing_comma "${PK_samples[@]}")

  return "$status"
}

build_population_lists "$IMISS_PATH/missingind.imiss"

In [4]:
# Show which filtered VCFs are available for subsetting.
ls -- "$FINAL_VCF_OUTPUT_PATH/"
# File name (without the .vcf.gz extension) of the "indivmiss60" VCF used by
# the subsetting commands. The dataset prefix comes from BASE_NAME so it only
# has to be changed in one place; the literal is the fallback when BASE_NAME
# has not been set in this session.
vcf_miss60="${BASE_NAME:-chenopodium_20250304b-tetraploid}.indivmiss60.locimiss50.thin10.maf0.01.LD0.8-1000"

[0m[01;31mchenopodium_20250304b-tetraploid.indivmiss20.locimiss50.thin10.maf0.01.LD0.8-1000.vcf.gz[0m[K
[01;31mchenopodium_20250304b-tetraploid.indivmiss40.locimiss50.thin10.maf0.01.LD0.8-1000.vcf.gz[0m[K
[01;31mchenopodium_20250304b-tetraploid.indivmiss60.locimiss50.thin10.maf0.01.LD0.8-1000.vcf.gz[0m[K


In [5]:
# Subsetting by exclusion: a caret (^) in front of the sample list tells
# `bcftools view -s` to drop the listed samples instead of keeping them.

# Runs `bcftools view`, excluding the samples named in a comma-separated list.
# Defined in this cell so the cell can be run on its own.
#   $1  comma-separated sample names; one trailing comma is tolerated
#   $2  input VCF (.vcf.gz)
#   $3  output VCF (.vcf.gz, written bgzip-compressed via -Oz)
# Returns 1 without calling bcftools when the list is empty, otherwise
# bcftools' own exit status.
bcftools_exclude_samples() {
  # ${1%,} strips the trailing comma and, unlike ${var::-1}, is also safe on
  # an empty string.
  local exclude=${1%,} in_vcf=$2 out_vcf=$3
  if [[ -z "$exclude" ]]; then
    printf 'Error: no samples to exclude; not writing %s\n' "$out_vcf" >&2
    return 1
  fi
  # --force-samples: names that are not in the VCF header only produce a
  # warning and are skipped.
  bcftools view -s "^${exclude}" \
    "$in_vcf" \
    --force-samples -Oz -o \
    "$out_vcf"
}

# no CH, AR
if bcftools_exclude_samples "${cs_AR_samples}${cs_CH_samples}" \
  "$FINAL_VCF_OUTPUT_PATH/${vcf_miss60}.vcf.gz" \
  "$SUBSETS_PATH/${vcf_miss60}.no-CH-AR.vcf.gz"; then
  echo Done!
else
  echo "Subsetting failed: the no-CH-AR VCF was not (fully) written" >&2
fi

Warn: exclude called for sample that does not exist in header: "CO-CH-13"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-15"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-16"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-20"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-26"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-4"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-7"... skipping
Done!


In [6]:
# Makes lists of the CH samples grouped by island.
# As the script's own output shows, for each island it writes a comma-separated
# "<Island>_samples.txt" and a matching "<Island>-exclusive_samples.txt"; the
# "-exclusive" file appears to hold every CH sample that is NOT from that
# island (compare the Big_Island and Big_Island-exclusive lines in the output).
# NOTE(review): the script path is relative, so this cell has to be run from a
# directory that sits next to 99_misc_scripts.
python ../99_misc_scripts/make_lists_of_CH_samples_by_island.py

Written to Big_Island_samples.txt: CO-CH-4,CO-CH-5,CO-CH-3,CO-CH-6,CO-CH-7,CO-CH-8,CO-CH-9,CO-CH-10,CO-CH-11,CO-CH-12,CO-CH-46
Written to FFS_samples.txt: CO-CH-36,CO-CH-37
Written to Kauai_samples.txt: CO-CH-28,CO-CH-27
Written to Kaula_samples.txt: CO-CH-29
Written to Lisianski_samples.txt: CO-CH-38
Written to Maui_samples.txt: CO-CH-14,CO-CH-13,CO-CH-15
Written to Molokai_samples.txt: CO-CH-16,CO-CH-18,CO-CH-39,CO-CH-40,CO-CH-44,CO-CH-45
Written to Necker_samples.txt: CO-CH-32,CO-CH-33,CO-CH-34,CO-CH-35
Written to Nihoa_samples.txt: CO-CH-30,CO-CH-31
Written to Oahu_samples.txt: CO-CH-25,CO-CH-1,CO-CH-2,CO-CH-20,CO-CH-21,CO-CH-22,CO-CH-23,CO-CH-24,CO-CH-26
Written to Big_Island-exclusive_samples.txt: CO-CH-36,CO-CH-37,CO-CH-28,CO-CH-27,CO-CH-29,CO-CH-38,CO-CH-14,CO-CH-13,CO-CH-15,CO-CH-16,CO-CH-18,CO-CH-39,CO-CH-40,CO-CH-44,CO-CH-45,CO-CH-32,CO-CH-33,CO-CH-34,CO-CH-35,CO-CH-30,CO-CH-31,CO-CH-25,CO-CH-1,CO-CH-2,CO-CH-20,CO-CH-21,CO-CH-22,CO-CH-23,CO-CH-24,CO-CH-26
Written to FFS-excl

In [29]:
# no AR; no CH except OAHU
# The first line of Oahu-exclusive_samples.txt is used as a comma-separated
# list of samples to drop (presumably every CH sample that is not from Oahu --
# verify against the script that writes the file). Dropping them together with
# the AR samples is what leaves Oahu as the only CH population.

# Runs `bcftools view`, excluding the samples named in a comma-separated list.
# Defined in this cell so the cell can be run on its own.
#   $1  comma-separated sample names; one trailing comma is tolerated
#   $2  input VCF (.vcf.gz)
#   $3  output VCF (.vcf.gz, written bgzip-compressed via -Oz)
# Returns 1 without calling bcftools when the list is empty, otherwise
# bcftools' own exit status.
bcftools_exclude_samples() {
  local exclude=${1%,} in_vcf=$2 out_vcf=$3
  if [[ -z "$exclude" ]]; then
    printf 'Error: no samples to exclude; not writing %s\n' "$out_vcf" >&2
    return 1
  fi
  # --force-samples: names that are not in the VCF header only produce a
  # warning and are skipped.
  bcftools view -s "^${exclude}" \
    "$in_vcf" \
    --force-samples -Oz -o \
    "$out_vcf"
}

# NOTE(review): relative path -- this cell must be run from a directory that
# sits next to 00_data.
oahu_exclusive_list="../00_data/CH_pop_lists/Oahu-exclusive_samples.txt"
only_OA=""
if [[ -r "$oahu_exclusive_list" ]]; then
  # Only the first line of the file is used.
  IFS= read -r only_OA < "$oahu_exclusive_list"
fi

if [[ -z "$only_OA" ]]; then
  # Without this list only the AR samples would be removed and the output
  # file name would be misleading, so stop here instead.
  printf 'Error: %s is missing or empty; nothing was written\n' \
    "$oahu_exclusive_list" >&2
elif bcftools_exclude_samples "${cs_AR_samples}${only_OA}" \
  "$FINAL_VCF_OUTPUT_PATH/${vcf_miss60}.vcf.gz" \
  "$SUBSETS_PATH/${vcf_miss60}.no-AR.no-CH-except-OA.vcf.gz"; then
  echo Done!
else
  echo "Subsetting failed: the no-AR.no-CH-except-OA VCF was not (fully) written" >&2
fi

Warn: exclude called for sample that does not exist in header: "CO-CH-4"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-3"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-7"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-36"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-37"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-27"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-38"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-14"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-13"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-15"... skipping
Warn: exclude called for sample that does not exist in header: "CO-CH-16"... skipping
Warn: exclude called for sample that does not exist in he

In [13]:
# Sanity check of the no-CH-AR subset: called with no filtering or output
# options, vcftools just reports how many individuals and sites the file holds.
vcftools --gzvcf "$SUBSETS_PATH/${vcf_miss60}.no-CH-AR.vcf.gz"


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf /home/923643692/cheno_pop_tests/00_data/00d_data_subsets/chenopodium_20250304b-tetraploid.indivmiss60.locimiss50.thin10.maf0.01.LD0.8-1000.no-CH-AR.vcf.gz

Using zlib version: 1.2.13
After filtering, kept 65 out of 65 Individuals
After filtering, kept 18208 out of a possible 18208 Sites
Run Time = 1.00 seconds
