In [1]:
#!/bin/bash
# Cantley Lab of Plant Evolution
# Chenopodium Project
# Example ipyrad code (command line) from https://ipyrad.readthedocs.io/en/master/pedicularis_.html
# Baker-Strader, R.; Bullock, M.; Cantley, J.

In [2]:
# sets the cutoffs
indv_decimal_cutoffs=('0.2' '0.4' '0.6') # 20/40/60% cuts individuals missing > X% missing data 
loci_decimal_cutoffs=('0.5')             # 50%       cuts loci missing > X% missing data 
bp_proximity_cutoffs=('10')              # 10bp      removes sites within X bp of one another to acct for LD
MAF_thresholds=('0.01')                   # 0.01      minor allele frequency threshold
LDs=('0.8')
LD_windows=('1000')

In [3]:
# directories and file paths. You'll need to set these
WORKING_DIR="/home/923643692/cheno_pop_tests" # where the main analysis folder is stored; an absolute path, because vcftools doesn't like relative dirs?
IPYRAD_OUT_VCF_PATH="/usr/nfs/923643692/cheno_popgen/ipyrad/tetraploid/chenopodium1_tetraploid/chenopodium_20250304b-tetraploid_outfiles/chenopodium_20250304b-tetraploid.vcf"
BASE_NAME="chenopodium_20250304b-tetraploid"

# You can leave these alone, they're just here to make the follow-on code less clunky
FILTER_PATH="$WORKING_DIR/00_data"
IMISS_PATH="$FILTER_PATH/00a_imiss"
WORKING_PATH="$FILTER_PATH/00b_intermediate_files"
FINAL_VCF_OUTPUT_PATH="$FILTER_PATH/00c_data_all"
SUBSETS_PATH="$FILTER_PATH/00d_data_subsets"

In [4]:
# makes the missing data file
vcftools --vcf $IPYRAD_OUT_VCF_PATH \
    --missing-indv --out "$FILTER_PATH/missingind"

# Making a list of individuals (sample ID only) with 20, 40, or 60% or more missing data
for IND_DEC in ${indv_decimal_cutoffs[@]}; do
    IND_INT=$(echo "$IND_DEC*100/1" |bc) # multiplies those numbers by 100 for naming
    mawk -v IND_DEC="$IND_DEC" '$5 > IND_DEC' $FILTER_PATH/missingind.imiss | \
        cut -f1 > "$FILTER_PATH/individals_missing_$IND_INT.indv"

    # remove INDIVIDUALS missing more than 20/40/60% of data
    vcftools --vcf $IPYRAD_OUT_VCF_PATH \
        --remove "$FILTER_PATH/individals_missing_$IND_INT.indv" \
			--recode --recode-INFO-all \
			--out "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT"

    # remove LOCI missing more than 50% of data
    for LOC_DEC in ${loci_decimal_cutoffs[@]}; do
        LOC_INT=$(echo "$LOC_DEC*100/1" |bc) # multiplies those numbers by 100 for naming    
        vcftools --vcf "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.recode.vcf" \
            --max-missing $LOC_DEC --recode --recode-INFO-all \
    		--out "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT"

        # remove LOCI missing more than 50% of data
        for NBP in ${bp_proximity_cutoffs[@]}; do
            LOC_INT=$(echo "$LOC_DEC*100/1" |bc) # multiplies those numbers by 100 for naming    
            vcftools --vcf "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.recode.vcf" \
                --thin $NBP --recode --recode-INFO-all \
    			--out "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP"
            
                    # remove LOCI missing more than 50% of data
            for MAF in ${MAF_thresholds[@]}; do
                vcftools --vcf "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.recode.vcf" \
                    --maf $MAF --recode --recode-INFO-all \
        			--out "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF"

                out_file="$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF.recode.vcf"
                bgzip $out_file
                # zips and index for use with bcftools
                tabix -p vcf "$out_file.gz"

                # Does the LD trimming in bcftools
                for LD in ${LDs[@]}; do
                    for WNDW in ${LD_windows[@]}; do
                        bcftools +prune -m $LD -w $WNDW \
                            "$FILTER_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF.recode.vcf.gz" \
                            -Oz -o \
                            "$FINAL_VCF_OUTPUT_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF.LD$LD-$WNDW.vcf.gz"
                    done
                done
            done
        done
    done
done


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /usr/nfs/923643692/cheno_popgen/ipyrad/tetraploid/chenopodium1_tetraploid/chenopodium_20250304b-tetraploid_outfiles/chenopodium_20250304b-tetraploid.vcf
	--missing-indv
	--out /home/923643692/cheno_pop_tests/00_data/missingind

After filtering, kept 112 out of 112 Individuals
Outputting Individual Missingness
After filtering, kept 54661 out of a possible 54661 Sites
Run Time = 1.00 seconds

01a_filtering.ipynb   .ipynb_checkpoints/   
01b_subsetting.ipynb  out.log               

01a_filtering.ipynb   .ipynb_checkpoints/   
01b_subsetting.ipynb  out.log               

01a_filtering.ipynb   .ipynb_checkpoints/   
01b_subsetting.ipynb  out.log               

01a_filtering.ipynb   .ipynb_checkpoints/   
01b_subsetting.ipynb  out.log               

01a_filtering.ipynb   .ipynb_checkpoints/   
01b_subsetting.ipynb  out.log               

01a_filtering.ipynb   .ipynb_checkpoints/   
01b_subset

In [5]:
# get total numbers of individuals and sites
vcftools --vcf $IPYRAD_OUT_VCF_PATH


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /usr/nfs/923643692/cheno_popgen/ipyrad/tetraploid/chenopodium1_tetraploid/chenopodium_20250304b-tetraploid_outfiles/chenopodium_20250304b-tetraploid.vcf

After filtering, kept 112 out of 112 Individuals
After filtering, kept 54661 out of a possible 54661 Sites
Run Time = 0.00 seconds


In [8]:
mkdir $WORKING_PATH
mkdir $FINAL_VCF_OUTPUT_PATH
mkdir $SUBSETS_PATH
mkdir $IMISS_PATH

# makes the missing data file
vcftools --vcf $IPYRAD_OUT_VCF_PATH \
    --missing-indv --out "$IMISS_PATH/missingind"

# Making a list of individuals (sample ID only) with 20, 40, or 60% or more missing data
for IND_DEC in ${indv_decimal_cutoffs[@]}; do
    IND_INT=$(echo "$IND_DEC*100/1" |bc) # multiplies those numbers by 100 for naming
    echo "Making missingind file for $IND_INT cutoff..."
    mawk -v IND_DEC="$IND_DEC" '$5 > IND_DEC' "$IMISS_PATH/missingind.imiss" | \
        cut -f1 > "$IMISS_PATH/individuals_missing_$IND_INT.indv"

    # remove INDIVIDUALS missing more than 20/40/60% of data
    echo "Removing individuals missing $IND_INT% or more data..."
    vcftools --vcf $IPYRAD_OUT_VCF_PATH \
        --remove "$IMISS_PATH/individuals_missing_$IND_INT.indv" \
		--recode --recode-INFO-all \
        --out "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT"

    # remove LOCI missing more than 50% of data
    for LOC_DEC in ${loci_decimal_cutoffs[@]}; do
        LOC_INT=$(echo "$LOC_DEC*100/1" |bc) # multiplies those numbers by 100 for naming    
        echo "Removing loci $LOC_INT% or more data..."
        vcftools --vcf "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.recode.vcf" \
            --max-missing $LOC_DEC --recode --recode-INFO-all \
        	--out "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT"

        # remove LOCI within N base pairs of each other
        for NBP in ${bp_proximity_cutoffs[@]}; do
            echo "Removing sites within $NBP bp of one another..."
            vcftools --vcf "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.recode.vcf" \
                --thin $NBP --recode --recode-INFO-all \
                --out "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP"
            
            # filter for minor allele frequency below a certain (MAF) threshold (reduce very rare alleles/singletons)
            for MAF in ${MAF_thresholds[@]}; do
                echo "Removing alleles with MAF < $MAF..."
                vcftools --vcf "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.recode.vcf" \
                --maf $MAF --recode --recode-INFO-all \
                --out "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF" 

                # zips and indexes for use with bcftools
                out_file="$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF.recode.vcf"
                bgzip $out_file
                tabix -p vcf "$out_file.gz"

                # Does the LD trimming in bcftools
                for LD in ${LDs[@]}; do
                    for WNDW in ${LD_windows[@]}; do
                        echo "Pruning sites with LD > $LD in a $WNDW bp window..."
                        bcftools +prune -m $LD -w $WNDW \
                            "$WORKING_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF.recode.vcf.gz" \
                            -Oz -o \
                            "$FINAL_VCF_OUTPUT_PATH/$BASE_NAME.indivmiss$IND_INT.locimiss$LOC_INT.thin$NBP.maf$MAF.LD$LD-$WNDW.vcf.gz"
                   done
                done
            done
        done
    done
done

# compresses intermediate files to save space
cd $WORKING_PATH
bgzip *.vcf
cd ../


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /usr/nfs/923643692/cheno_popgen/ipyrad/tetraploid/chenopodium1_tetraploid/chenopodium_20250304b-tetraploid_outfiles/chenopodium_20250304b-tetraploid.vcf
	--missing-indv
	--out /home/923643692/cheno_pop_tests/00_data/00a_imiss/missingind

After filtering, kept 112 out of 112 Individuals
Outputting Individual Missingness
After filtering, kept 54661 out of a possible 54661 Sites
Run Time = 0.00 seconds

01_filtering.ipynb  .ipynb_checkpoints/ out.log
Making missingind file for 20 cutoff...
Removing individuals missing 20% or more data...

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /usr/nfs/923643692/cheno_popgen/ipyrad/tetraploid/chenopodium1_tetraploid/chenopodium_20250304b-tetraploid_outfiles/chenopodium_20250304b-tetraploid.vcf
	--remove /home/923643692/cheno_pop_tests/00_data/00a_imiss/individuals_missing_20.indv
	--recode-INFO-all
	--out 

Now we subset based on populations.

In [6]:
ls $FINAL_VCF_OUTPUT_PATH/
vcf_miss60=chenopodium_20250304b-tetraploid.indivmiss60.locimiss50.thin10.maf0.01.LD0.8-1000

[0m[01;31mchenopodium_20250304b-tetraploid.indivmiss20.locimiss50.thin10.maf0.01.LD0.8-1000.vcf.gz[0m[K
[01;31mchenopodium_20250304b-tetraploid.indivmiss40.locimiss50.thin10.maf0.01.LD0.8-1000.vcf.gz[0m[K
[01;31mchenopodium_20250304b-tetraploid.indivmiss60.locimiss50.thin10.maf0.01.LD0.8-1000.vcf.gz[0m[K
