vcf2allelePlot.pl

#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Std;

my $program = 'vcf2allelePlot.pl';	# name of script (shown in the usage message)

# Defaults. -q, -w and -H on the command line override $qual, $mwsize and $lowerH.
my %parameters;				# command-line switches, filled in by getopts below
my ($infile,$outfile,$outfile2,$gffile,$outprefix);
my $qual = 40;				# minimum phred-scaled QUAL for a site to count as high quality
my $mwsize = 5000;			# window size (bp) of the per-chromosome sliding windows in the plots
my $lowerH = 0.2;			# lower limit of the ALT proportion for a heterozygous call; the upper limit is 1-$lowerH
					# (problem: A,C .. better to look at pileup file?)
my $lwsize = 100000;			# window size (bp) used when looking for LOH regions
my $LOH = 0.001;			# windows whose filtered heterozygosity is below this are reported as LOH
my $df = 2;				# depth filter (e.g. 2 = 2x mean; 3=3x mean)
my $gffoutfile = "prefix.gff";		# not referenced anywhere else in this file


getopts('i:o:q:g:w:H:DmhI',\%parameters);

$infile = $parameters{"i"} if exists $parameters{"i"};
$qual   = $parameters{"q"} if exists $parameters{"q"};
$gffile = $parameters{"g"} if exists $parameters{"g"};
$lowerH = $parameters{"H"} if exists $parameters{"H"};
$mwsize = $parameters{"w"} if exists $parameters{"w"};

# Work out the output names: from -o if given, otherwise from the name of the vcf file.
if (exists $parameters{"o"}) {
	$outprefix = $parameters{"o"};
	$outfile   = "$outprefix.calls";
	$outfile2  = "$outprefix.all";
}
elsif (defined $infile) {
	if ($infile =~ /^(.+)\.\S+?$/m) {	# strip the last extension to get the prefix
		$outprefix = $1;
		$outfile   = "$outprefix.calls";
		$outfile2  = "$infile.all";	# NOTE(review): built from the full file name, unlike .calls - confirm this is intended
	}
	else {					# no extension: the file name itself is the prefix
		$outprefix = $infile;		# ($1 is not set here because the match above failed)
		$outfile   = "$infile.calls";
		$outfile2  = "$infile.all";
	}
}

# The usage message is printed after the switches are read so that it shows the values in effect.
unless (exists $parameters{"i"}) {
	print "\n USAGE: $program -i '<vcf file>'\n\n";
	print   "    -i\tvcf file from bcftools call\n";
	print   "    -q\tminimum phred-scaled quality [$qual]\n";
	print   "    -g\tgff file of annotations [none]\n";
	print   "    -m\tshow mode in non-overlapping sliding windows\n";
	print   "    -h\tshow heterozygosity in non-overlapping sliding windows\n";
	print   "    -H\tfilter heterozygous sites if < $lowerH [$lowerH]\n";
	print   "    -w\twindow size [$mwsize bp]\n";
	print   "    -D\tfilter positions > 2 x mean depth\n";
	print   "    -I\tplot indels instead of read depth at point subs\n";
	print 	"    -o\tprefix for outfiles [prefix of vcf file]\n\n";
	exit;
}

my $upperH = 1-$lowerH;				# upper limit of the ALT proportion for a heterozygous call
my $RcmdFile = "$outprefix.Rcmds";		# R commands generated by this script
my $summaryFile = "$outprefix.het.txt";		# heterozygosity summary table


# The GFF output receives a copy of the input annotations plus the LOH regions found later.
open GFFOUT, '>', "$outprefix.$mwsize.gff" or die "couldn't open $outprefix.$mwsize.gff : $!";
print "\nLOH annotations generated by this script will be output to $outprefix.$mwsize.gff\n\n";

# READ THE GFF OF ANNOTATIONS IF THERE IS ONE
#
# Every recognised line is copied to GFFOUT and its type, start and end are stored
# per chromosome (column 1) in parallel arrays. Comment lines (starting with #) and
# blank lines are skipped silently; any other line that does not parse is reported.

my (%atype,%astart,%aend,%seentype);		# chr will be the keys in these hashes of arrays
my (@chr, @types);				# sorted chromosome names / annotation types seen in the GFF
if (defined $gffile) { 
	open GFF, '<', $gffile or die "couldnt open $gffile : $!"; 
	print "\nReading in annotations from $gffile\n";
	while (my $line = <GFF>) {
		next if $line =~ /^\s*(?:#|$)/;		# header, comment or empty line: not an annotation

		# columns used: 1 = chromosome, 3 = type, 4 = start, 5 = end
		if ($line =~ /^(\S+)\s+\S+\s+(\S+)\s+(\d+)\s+(\d+)/) {
			my ($seqid,$type,$start,$end) = ($1,$2,$3,$4);
			print GFFOUT $line;
			$seentype{$type}++;
			push (@{$atype{$seqid}},$type);
			push (@{$astart{$seqid}},$start);
			push (@{$aend{$seqid}},$end);
		}
		else { warn "\n**Unrecognized format for $gffile.**\n\n"; }
	}
	close GFF;
	@chr = sort keys %atype;
	@types = sort keys %seentype;
	print "   Found annotations for ".@chr." chromosomes: @chr\n";
	print "   There are ".@types." types of annotation: @types\n";
	unless (defined $chr[0]) { warn "ERROR: $gffile is empty or has an unrecognized format!\n"; }

}


# Read the vcf file once and
#  - write every variant site to OUT (the .calls table used for the allele plot),
#  - remember quality, depth and annotation filter of every site (variant or not),
#  - count sites and heterozygous point substitutions genome-wide, in unannotated
#    sequence and in two fixed 200kb regions (chr1 200,000..400,000, chr3 900,000..1,100,000).
#
open DATA, '<', $infile or die "couldn't open $infile : $!";

open OUT, '>', $outfile or die "couldn't open $outfile : $!";
open OUT2, '>', $outfile2 or die "couldn't open $outfile2 : $!";


print OUT "chr\tpos\tREF\tALT\tQUAL\tREFfwd\tREFrev\tALTfwd\tALTrev\tpALT\ttype\tfilter\n";
print OUT2 "chr\tpos\tQUAL\tfilter\n";


print "\nReading and printing data from $infile to $outfile and $outfile2 ..\n";

my %chr;				# chromosome name => number of variant sites written to OUT
my ($H,$sH,$fH);			# estimate heterozygosity ($H) from the number of high quality point subs ($ps) and high quality length ($l)
my $ps = 0; my $sps = 0; my $fps = 0;	# heterozygous point subs: genome-wide, chr1 region, unannotated
my $l = 0; my $sl = 0; my $fl = 0;	# high quality length:     genome-wide, chr1 region, unannotated
my $H3; my $ps3 = 0; my $l3 =0;		# same counters for the chr3 region
my $T;					# total length of sequence (for estimating % of 100,000 to use in histogram of heterozygosity
my $triallelic = 0; my $diallelic = 0; my $totalps=0;

# Per-chromosome stores. Those indexed [$pos] are arrays indexed by position;
# %pos and %fl hold lists of positions (all sites / high quality unannotated sites).
my (%depth,%tdepth,%count,%filter,%fl,%variant,%pAlt,%q,%pos);

while (my $line = <DATA>) { 
	# data lines: CHROM POS . REF ALT QUAL . INFO-containing-DP4=a,b,c,d ; everything else is skipped
	next unless $line =~ /^(\S+)\s+(\d+)\s+\.\s+(\S+)\s+(\S+)\s+(\S+)\s+\.\s+\S+?DP4=(\d+),(\d+),(\d+),(\d+)/m;

	# copy the captures straight away so that no later match can clobber them
	my ($chr,$pos,$ref,$alt,$quality,$reffwd,$refrev,$altfwd,$altrev) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);

	$q{$chr}[$pos] = $quality;
	push(@{$pos{$chr}},$pos);

	$filter{$chr}[$pos] = "no";
	$T++;

	if ($quality >= $qual) {	 # keep count of high quality sequence (including invariant sites)
		$l++; 										# genome-wide
		if (($chr eq "chr1") && ($pos > 200000) && ($pos <= 400000)) { $sl++; } 	# 200kb on chr1
		if (($chr eq "chr3") && ($pos > 900000) && ($pos <= 1100000)) { $l3++; } 	# 200kb on chr3

		# Annotation filter (only evaluated for high quality sites). If several annotations
		# cover the site, the last one in file order gives the filter name.
		# NOTE(review): the start is treated as exclusive ($pos > start) - confirm against the GFF convention in use
		if (exists $astart{$chr}) {
			for my $i (0 .. $#{$astart{$chr}}) {
				if (($pos > $astart{$chr}[$i]) && ($pos <= $aend{$chr}[$i])) { 
					$filter{$chr}[$pos] = $atype{$chr}[$i]; 
				}
			}
		}
		if ($filter{$chr}[$pos] eq "no") { $fl++; push(@{$fl{$chr}},$pos); }
	}

	my $sitedepth = $reffwd+$refrev+$altfwd+$altrev;
	$depth{$chr}[$pos] = $sitedepth;		# store depth info for whole genome (to create filter later)
	$tdepth{$chr} += $sitedepth;			# estimate total depth (for average depth calculation)
	$count{$chr}++;

	next if $alt eq ".";				# invariant site

	if ($sitedepth == 0) {				# no reads counted in DP4: the ALT proportion is undefined
		warn "Skipping variant at $chr:$pos : DP4 depth is 0, cannot estimate the proportion of ALT base calls\n";
		next;
	}
	$pAlt{$chr}[$pos] = ($altfwd+$altrev)/$sitedepth;

	$chr{$chr}++;					# a hash storing chromosome names and the number of variant sites for each

	my $variant = "snp";				# SNP variant assumed by default
	unless ($ref =~ /^\S$/m) { $variant = "indel"; }		# REF longer than one character
	if ($alt =~ /[A-Z][A-Z]/mi) { $variant = "indel"; }	# ALT with two adjacent letters; indels are missed without this
							# NOTE: counting multiple hits as SNPs (e.g $alt = T,A)

	print OUT join("\t",$chr,$pos,$ref,$alt,$quality,$reffwd,$refrev,$altfwd,$altrev,
			$pAlt{$chr}[$pos],$variant,$filter{$chr}[$pos])."\n";

	# high quality point sub with an allele ratio inside [$lowerH, 1-$lowerH] = heterozygous
	if (($variant eq "snp") && ($quality>=$qual) && ($pAlt{$chr}[$pos]>=$lowerH) && ($pAlt{$chr}[$pos]<=(1-$lowerH)))	{
		$ps++; 										# genome-wide
		if (($chr eq "chr1") && ($pos > 200000) && ($pos <= 400000)) { $sps++; } 	# 200kb on chr1
		if (($chr eq "chr3") && ($pos > 900000) && ($pos <= 1100000)) { $ps3++; } 	# 200kb on chr3
		if ($filter{$chr}[$pos] eq "no") { $fps++; }
	}

	# all high quality point subs, split by the number of alternate alleles
	if (($variant eq "snp") && ($quality>=$qual)) {
		$totalps++;
		if ($alt =~ /[ACGT],[ACGT]/) { $triallelic++; }
		elsif (($ref =~ /[ACGT]/) && ($alt =~ /[ACGT]/)) { $diallelic++; }
		else { warn "unregognized \$ref: $ref or \$alt: $alt\n"; }
	}
	$variant{$chr}[$pos] = $variant;
}
close DATA;
close OUT;


my %meandepth;					# chromosome => mean DP4 depth over all of its sites
my ($fdps,$fdl) = (0,0);			# het point subs / length left after annotation + depth filter
my $want_depth_filter = exists $parameters{"D"};

for my $chr (sort keys %count) {		# estimate mean depth for each chr
	print "$chr\tlength: $count{$chr}\ttotal read nt: $tdepth{$chr}\n";
	$meandepth{$chr} = $tdepth{$chr}/$count{$chr};
	print "Mean depth for $chr : $meandepth{$chr}\n";

	next unless $want_depth_filter;		# filter high depth unannotated regions only if requested

	my $fdepth = $df * $meandepth{$chr};	# sites deeper than this get the depth_filter label
	unless (defined $fl{$chr}[0]) {
		print "There are 0 sites that are unfiltered after applying the $gffile filter on $chr ($infile)\n\n";
		next;
	}
	print "Will filter sites with > $fdepth depth on $chr\nAnnotation-filtered length for $chr pre-depth filter: ".@{$fl{$chr}}."\n\n"; 

	for my $site (@{$fl{$chr}}) {		# only high quality, unannotated sites are considered
		if ($depth{$chr}[$site] > $fdepth) {
			$filter{$chr}[$site] = "depth_filter";
			next;
		}
		$fdl++;

		# count the site as heterozygous with the same rules as the genome-wide count
		my $kind = $variant{$chr}[$site];
		next unless (defined $kind) && ($kind eq "snp");
		next unless $q{$chr}[$site] >= $qual;
		next unless $pAlt{$chr}[$site] >= $lowerH;
		$fdps++ if $pAlt{$chr}[$site] <= 1-$lowerH;
	}
}

# print all sites (variant or not) and their filter after applying meandepth filter
for my $chr (sort keys %pos) {
	for my $site (@{$pos{$chr}}) {
		print OUT2 join("\t",$chr,$site,$q{$chr}[$site],$filter{$chr}[$site])."\n";
	}
}
close OUT2;

# Write the heterozygosity summary to the screen and (a subset of it) to the summary file.
# Ratios whose denominator is 0 are reported as NA instead of aborting the run.
open OUT3, '>', $summaryFile or die "couldn't open $summaryFile : $!";

$T = 0 unless defined $T;			# no line of the vcf file was recognised

my $ratio   = sub { my ($num,$den) = @_; return $den ? $num/$den : "NA"; };
my $percent = sub { my ($num,$den) = @_; return $den ? ($num/$den)*100 : "NA"; };
my $annot   = defined $gffile ? $gffile : "";	# -D can be used without -g

print "$infile\t$T\t# Length of all sequence (any quality)\n";
print OUT3 "$infile\t$T\t# Length of all sequence (any quality)\n";


print "$infile\t$l\t# Length of high quality sequence (q>=$qual) ".$percent->($l,$T)."%\n";
#print "$infile\t$totalps\t# Number of ALL high quality point subs (q>=$qual)\n";
print "$infile\t$triallelic\t# Number of high quality triallelic point subs (q>=$qual)\n";
print "$infile\t$diallelic\t# Number of high quality diallelic point subs (q>=$qual)\n";
print "$infile\t$ps\t# Number of high quality heterozygous point subs (q>=$qual, allele ratio: $lowerH-".(1-$lowerH).")\n";
print "$infile\t".$ratio->($ps,$l)."\t( $ps / $l )\t# Genomewide heterozygosity (\$ps/\$l)\n";

print OUT3 "$infile\t$l\t# Length of high quality sequence (q>=$qual) ".$percent->($l,$T)."%\n";
print OUT3 "$infile\t$ps\t# Number of high quality heterozygous point subs (q>=$qual, allele ratio: $lowerH-$upperH)\n";
print OUT3 "$infile\t".$ratio->($ps,$l)."\t( $ps / $l )\t# Genomewide heterozygosity (\$ps/\$l)\n";
print OUT3 "$infile\t".$ratio->($triallelic,$l)."\t( $triallelic / $l )\t# Genome-wide proportion of triallelic point subs (q>=$qual)\n";
print OUT3 "$infile\t$diallelic\t# Number of high quality diallelic point subs (q>=$qual)\n";

# Unannotated sequence: only meaningful when a gff file was supplied
if (exists $parameters{"g"}) {
	print "\n$infile\t$fl\t# Length of unannotated sequence (q>=$qual) ".$percent->($fl,$T)."%\n";
	print "$infile\t$fps\t# Number of heterozygous point subs that are unannotated (q>=$qual)\n";
	print "$infile\t".$ratio->($fps,$fl)."\t( $fps / $fl )\t# Unannotated heterozygosity ($annot)\n";
	print OUT3 "$infile\t".$ratio->($fps,$fl)."\t( $fps / $fl )\t# Unannotated heterozygosity ($annot)\n";
}

# Unannotated sequence that also passed the depth filter
if ((exists $parameters{"D"}) && ($fdl > 0))  {
	print "\n$infile\t$fdl\t# Length of unannotated sequence <=$df"."xmean per chr (q>=$qual)  ".$percent->($fdl,$T)."%\n";
	print "$infile\t$fdps\t# Number of heterozygous point subs that are unannotated and <=$df"."xmean per chr (q>=$qual)\n";
	print "$infile\t".($fdps/$fdl)."\t( $fdps / $fdl )\t# Depth <=$df"."xmean per chr, Unannotated heterozygosity ($annot)\n";
	print OUT3 "$infile\t".($fdps/$fdl)."\t( $fdps / $fdl )\t# Depth <=$df"."xmean per chr, Unannotated heterozygosity ($annot)\n";
}

# The two fixed 200kb regions are reported only if they contain high quality sequence
if ($sl > 0) {
	print "\n$infile\t$sl\t# Length of sequence on chr1 200,000..400,000 (q>=$qual)\n";
	print "$infile\t$sps\t# Number of heterozygous point subs on chr1 200,000..400,000 (q>=$qual)\n";
	print "$infile\t".($sps/$sl)."\t# Chr1 200kb q$qual+ unfiltered heterozygosity (\$sps/\$sl)\n";
	print OUT3 "$infile\t".($sps/$sl)."\t# Chr1 200kb q$qual+ unfiltered heterozygosity (\$sps/\$sl)\n";
}

if ($l3 > 0) {
	print "\n$infile\t$l3\t# Length of sequence on chr3 900,000..1,100,000 (q>=$qual)\n";
	print "$infile\t$ps3\t# Number of heterozygous point subs on chr3 900,000..1,100,000 (q>=$qual)\n";
	print "$infile\t".($ps3/$l3)."\t# Chr3 200kb q$qual+ unfiltered heterozygosity (\$ps3/\$l3)\n\n";
	print OUT3 "$infile\t".($ps3/$l3)."\t# Chr3 200kb heterozygosity q$qual+ unfiltered (\$ps3/\$l3)\n\n";
}

close OUT3;

# Write the R commands (later run with R CMD BATCH). This first part loads the two
# tables, opens the pdf, defines Mode() and initialises the genome-wide vectors.
open RCMD, ">$RcmdFile" or die "couldn't open $RcmdFile : $!";

print RCMD <<"END_R_PREAMBLE";

rm(list=ls())
data<-read.table("$outfile",header=T)
attach(data)
head(data)
data2<-read.table("$outfile2",header=T)
head(data2)

pdf("$outprefix.$mwsize.pdf")

# CHECK R GETS THE SAME SNP TOTALS AS IN PERL STDOUT ABOVE:

#  Number of high quality point subs (q>=$qual)
length(pALT[type=="snp"&QUAL>=$qual&pALT>=$lowerH&pALT<=$upperH])

# Number of point subs that are unannotated (q>=40)
length(pALT[type=="snp"&QUAL>=$qual&pALT>=$lowerH&pALT<=$upperH&filter=="no"])

				# A function for estimating the mode
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

				# Visualise LOH threshold in whole-genome analysis
glx<-ceiling(max(pos[QUAL>=$qual])/100000)
glW<-1:((glx/($lwsize/1000))*100)*$lwsize-($lwsize/2)
head(glW)
tail(glW)
gldiff_snp<-0
glerr_snp<-0
glhet_snp<-0	# genome-wide heterozygous ($lowerH-$upperH) snps >= q40
fglhet_snp<-0	# genome-wide heterozygous ($lowerH-$upperH) snps >= q40 that are unannotated
glhet_length<-0	# genome-wide heterozygous length >= q40
fglhet_length<-0	# genome-wide heterozygous length >= q40 that are unannotated


END_R_PREAMBLE

# three plots per page when the heterozygosity plot (-h) was requested, otherwise two
my $plots_per_page = defined $parameters{"h"} ? 3 : 2;
print RCMD "par(mfrow=c($plots_per_page,1),cex=0.5)\n";

# For every chromosome with variant sites, write the R commands that
#  - compute per-window mode / counts of homozygous diffs, errors and heterozygous snps,
#  - draw the allele-frequency plot for SNPs,
#  - compute heterozygosity in LOH-sized windows and print the LOH window starts/ends
#    (these printed vectors are parsed back from the R output later in this script),
#  - add the optional mode, annotation, heterozygosity, indel or read depth plots.
my $firstchr;			# false until the first chromosome has been written

foreach my $chr (sort keys %chr) {

	print RCMD "

				# prepare a sliding window vector for estimating mode
x<-ceiling(max(pos[chr==\"$chr\"&QUAL>=$qual])/100000)
W<-1:((x/($mwsize/1000))*100)*$mwsize-($mwsize/2)
head(W)
tail(W)
modepALTsnp<-0
modepALTindel<-0
diff_snp<-0
err_snp<-0
het_snp<-0

for(i in 1:length(W)) { 
	modepALTsnp[i] <- Mode(pALT[pos>(W[i]-($mwsize/2))&pos<=(W[i]+($mwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]) 
	diff_snp[i] <- sum(pALT[pos>(W[i]-($mwsize/2))&pos<=(W[i]+($mwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]>(1-$lowerH)) 
	err_snp[i] <- sum(pALT[pos>(W[i]-($mwsize/2))&pos<=(W[i]+($mwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]<$lowerH) 
	het_snp[i] <- sum(pALT[pos>(W[i]-($mwsize/2))&pos<=(W[i]+($mwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]<=(1-$lowerH)&pALT[pos>(W[i]-($mwsize/2))&pos<=(W[i]+($mwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]>=$lowerH)
}
summary(modepALTsnp)
head(cbind(W,modepALTsnp))
tail(cbind(W,modepALTsnp))
head(cbind(W,diff_snp/$mwsize))
head(cbind(W,err_snp/$mwsize))
head(cbind(W,het_snp/$mwsize))


for(i in 1:length(W)) { modepALTindel[i] <- Mode(pALT[pos>(W[i]-($mwsize/2))&pos<=(W[i]+($mwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"indel\"]) }


				# MAKE SEPARATE STACKED PLOTS FOR SNPs AND INDELS
				# SNPs
plot(c(0,max(W)),c(0,1),main=\"$infile: $chr freq of alternate alleles\",sub=\"SNPs Q$qual+\",xlab=\"position\",xaxt=\"n\",ylab=\"Proportion of base calls differing from reference\",ylim=c(0,1),xlim=c(1,max(pos[chr==\"$chr\"])),type = \"n\")
points(pos[chr==\"$chr\"&QUAL>=$qual&type==\"snp\"],pALT[chr==\"$chr\"&QUAL>=$qual&type==\"snp\"],pch=20,col=\"black\")
axis(1, xaxp=c(0, signif(max(pos[chr==\"$chr\"]),3), 20))
abline(h=0.5)
abline(h=mean(pALT[chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]),col=\"orange\")


## ESTIMATE LOH regions using LOH window size ($lwsize)
				# prepare a sliding window vector for estimating LOH regions
lx<-ceiling(max(pos[chr==\"$chr\"&QUAL>=$qual])/100000)
lW<-1:((lx/($lwsize/1000))*100)*$lwsize-($lwsize/2)
head(lW)
tail(lW)
ldiff_snp<-0
lerr_snp<-0
lhet_snp<-0
lhet_length<-0

fldiff_snp<-0
flerr_snp<-0
flhet_snp<-0
flhet_length<-0

for(i in 1:length(lW)) { 
	ldiff_snp[i] <- sum(pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]>1-$lowerH) 
	lerr_snp[i] <- sum(pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]<$lowerH) 
	lhet_snp[i] <- sum(pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]<=$upperH&pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]>=$lowerH)
	lhet_length[i] <- length(pALT[data2\$pos>(lW[i]-($lwsize/2))&data2\$pos<=(lW[i]+($lwsize/2))&data2\$chr==\"$chr\"&data2\$QUAL>=$qual])

	fldiff_snp[i] <- sum(pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"&filter==\"no\"]>$upperH) 
	flerr_snp[i] <- sum(pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"&filter==\"no\"]<$lowerH) 
	flhet_snp[i] <- sum(pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"&filter==\"no\"]<=(1-$lowerH)&pALT[pos>(lW[i]-($lwsize/2))&pos<=(lW[i]+($lwsize/2))&chr==\"$chr\"&QUAL>=$qual&type==\"snp\"&filter==\"no\"]>=$lowerH)
	flhet_length[i] <- length(pALT[data2\$pos>(lW[i]-($lwsize/2))&data2\$pos<=(lW[i]+($lwsize/2))&data2\$chr==\"$chr\"&data2\$QUAL>=$qual&data2\$filter==\"no\"])

}
lhet<-lhet_snp/lhet_length
flhet<-flhet_snp/flhet_length

options(scipen=999)
#hqseq<-$lwsize*($l/$T)
#LOHstarts_$chr<-lW[lhet_snp/hqseq<$LOH]-($lwsize/2)
#LOHends_$chr<-lW[lhet_snp/hqseq<$LOH]+($lwsize/2)

LOHstarts_$chr<-lW[flhet<$LOH]-($lwsize/2)	# identify LOH after filtering
LOHends_$chr<-lW[flhet<$LOH]+($lwsize/2)


LOHstarts_$chr
LOHends_$chr

head(cbind(lW,lhet))
head(cbind(lW,flhet))

\n";

	# Collect the per-window counts of all chromosomes in the genome-wide vectors:
	# the first chromosome replaces the initial 0, later ones are appended.
	if (!$firstchr++) { 
		print RCMD "glhet_snp<-lhet_snp\n"; 
		print RCMD "fglhet_snp<-flhet_snp\n"; 
		print RCMD "glhet_length<-lhet_length\n"; 
		print RCMD "fglhet_length<-flhet_length\n"; 
	}
	else { 
		print RCMD "glhet_snp<-c(glhet_snp,lhet_snp)\n"; 
		print RCMD "fglhet_snp<-c(fglhet_snp,flhet_snp)\n"; 
		print RCMD "glhet_length<-c(glhet_length,lhet_length)\n"; 
		print RCMD "fglhet_length<-c(fglhet_length,flhet_length)\n"; 
	}


	if (defined $parameters{'m'}) { showmode($chr,"snp"); }	# show mode if requested
	if (defined $parameters{'g'}) { annotate($chr); }	# show annotations on SNP plot if requested 
	
	if (defined $parameters{'h'}) { 			# make a new plot of H if requested
								# legend format for 3 plots per page
		print RCMD "par(xpd=T)\n";	# do print legend outside the plot	
		print RCMD "legend(0,-0.16,c(round(mean(pALT[chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]),3),0.5),lty=c(1,1),col=c(\"orange\",\"black\"),title=\"mean\",bty=\"n\")\n";
		print RCMD "par(xpd=F)\n";	# don't print annotations outside the plot

		showH($chr,"snp");
		annotate($chr); 				# show annotations on H plot (annotate writes no rectangles when there are none for $chr)
	}			
	else { 							# legend format for 2 plots per page
		print RCMD "legend(0,0.15,c(round(mean(pALT[chr==\"$chr\"&QUAL>=$qual&type==\"snp\"]),3),0.5),lty=c(1,1),col=c(\"orange\",\"black\"),title=\"mean\",bty=\"n\")\n"; 
	}					

	if (exists $parameters{"I"}) {		# plot indels instead of read depth at point subs (not useful with the -I command of mpileup)
		print RCMD "
				# INDELs
plot(c(0,max(W)),c(0,1),main=\"$infile: $chr freq of alternate alleles\",sub=\"Indels Q$qual+\",xlab=\"position\",ylab=\"Proportion of base calls differing from reference\",xaxt=\"n\",ylim=c(0,1),xlim=c(1,max(pos[chr==\"$chr\"])),type = \"n\")	
points(pos[chr==\"$chr\"&QUAL>=$qual&type==\"indel\"],pALT[chr==\"$chr\"&QUAL>=$qual&type==\"indel\"],pch=20,col=\"black\")	
axis(1, xaxp=c(0, signif(max(pos[chr==\"$chr\"]),3), 20))
abline(h=0.5)
abline(h=mean(pALT[chr==\"$chr\"&QUAL>=$qual&type==\"indel\"]),col=\"orange\")
legend(0,-0.16,c(round(mean(pALT[chr==\"$chr\"&QUAL>=$qual&type==\"indel\"]),3),0.5),lty=c(1,1),col=c(\"orange\",\"black\"),title=\"mean\",bty=\"n\")
";

	if (defined $parameters{'g'}) { annotate($chr); }		# show annotations on INDEL plot if requested (in progress)
	if (defined $parameters{'m'}) { showmode($chr,"indel"); }	# show mode if requested

	}
	else {					# plot read depth for each point sub
		# col=\"blue\" belongs to abline(), not to mean(): the mean line is drawn in the colour the legend promises.
		# The red line is the 0.995 quantile, so the legend names it as such.
		print RCMD "
depth<-REFfwd+REFrev+ALTfwd+ALTrev
plot(c(0,max(W)),c(0,max(depth[chr==\"$chr\"])),type='n',ylab=\"Read depth\",xlab=\"position\", main=\"$chr: Depth at positions of point substitutions\",xlim=c(1,max(pos[chr==\"$chr\"])))
points(pos[chr==\"$chr\"],depth[chr==\"$chr\"],pch=20)
quantile(depth[chr==\"$chr\"],probs=c(0,0.005,0.025,0.5,0.975,0.995,1))
abline(h=mean(depth[chr==\"$chr\"]),col=\"blue\")
abline(h=as.numeric(quantile(depth[chr==\"$chr\"],0.995)),col=\"red\")
abline(h=($meandepth{$chr}*$df),col=\"purple\")
legend(\"topright\",c(\"mean\",\"99.5% quantile\",\"mean*$df\"),lty=c(1,1,1),col=c(\"blue\",\"red\",\"purple\"),bty=\"n\")

";
	}

	if (defined $parameters{'g'}) { annotate($chr); }		# show annotations on read depth plot if requested (in progress)

	print RCMD "rm(W,modepALTsnp,modepALTindel)\n";			# CLEAN UP


}

# Genome-wide checks and histograms of heterozygosity per LOH window (all chromosomes pooled).
# The quality in the plot titles and the LOH cut-off follow $qual and $LOH instead of
# being spelled out as 40 and 0.001.
print RCMD map { "$_\n" } (
	"sum(glhet_snp)",
	"length(glhet_snp)",
	"sum(glhet_length)",
	"length(glhet_length)",
	"summary((glhet_snp/glhet_length)[glhet_snp/glhet_length<$LOH])",	# windows below the LOH threshold

	"sum(fglhet_snp)",
	"length(fglhet_snp)",
	"sum(fglhet_length)",
	"length(fglhet_length)",

	"cbind(glhet_snp,glhet_length)",

	"par(mfrow=c(2,1))",

	"hist(glhet_snp/glhet_length,20,main=\"Unfiltered heterozygosity q$qual+\")",
	"abline(v=$LOH,col=\"red\")",						# mark the LOH threshold

	"hist(fglhet_snp/fglhet_length,20,main=\"Unannotated heterozygosity q$qual+\")",
);
#print RCMD "abline(v=$LOH,col=\"red\")\n";


close RCMD;

# RUN THE R COMMANDS
# R is started without a shell (list form of system) and the output file is named
# explicitly, so the file read back below is the one R wrote, whatever the prefix looks like.

my $RoutFile = "$RcmdFile.Rout";

print "Running $RcmdFile commands in R ..\n";
my $Rstatus = system('R', 'CMD', 'BATCH', $RcmdFile, $RoutFile);
if ($Rstatus != 0) {		# keep going: whatever R managed to write is still parsed below
	warn "R CMD BATCH $RcmdFile did not finish cleanly (status $Rstatus); $RoutFile may be incomplete\n";
}
#`mv vcf2allelePlot.Rcmds.Rout $outprefix.Rout`;		# save R input and output for future reference
#`mv vcf2allelePlot.Rcmds $outprefix.Rcmds`;		# save R input and output for future reference
print "Done. R output is in $RcmdFile.Rout and plots are in $outprefix.$mwsize.pdf\n\n";


# READ IN R OUTPUT TO EXTRACT THE LOH REGIONS TO PRINT IN GFF FORMAT

open ROUT, '<', $RoutFile or die "couldn't open $RcmdFile.Rout : $!";
my $results = do { local $/; <ROUT> };		# whole file at once: the pattern below spans several lines
close ROUT;
$results = "" unless defined $results;

# Report one LOH block on the screen and as a GFF line.
my $report_LOH = sub {
	my ($seq,$from,$to) = @_;
	print "$infile\tLOHblock: $seq\t$from\t$to\n";
	print GFFOUT "$seq\tvcf2allelePlot.pl\tLOH\t$from\t$to\t.\t+\t.\tLOHregion. Windowsize=$lwsize; snp heterozygosity < $LOH\n";
};

print "LOH regions:\n";
#print $results;

# Each match is the text between a 'LOHstarts_<chr>' and the next 'LOHends_<chr>'.
# The echoed assignment lines match as well; they contain no window starts and are skipped.
while ($results =~ /LOHstarts_(\S+).*?(\s+\S+\s+.*?)LOHends_(\S+).*?\s+/imsg) { 
	my $chr = $1;
	my $starts = $2;

	if ($starts =~ /numeric\(0\)/) { print "$chr\tNo LOH regions\n"; next; }	# there were no LOH regions
	unless ($starts =~ /[0-9]/) { next; }						# spurious match in R output

	# window starts are the whitespace-preceded integers; R's [n] index labels are not picked up
	my @starts;
	while ($starts =~/\s+(\d+)/g) { push (@starts,$1); }
	unless (@starts) { warn "no LOH regions found for $chr in $RcmdFile.Rout\n"; next; }

	# join windows that follow each other directly (end of one = start of the next) into blocks
	my $jstart = $starts[0];
	my $prevend = $jstart + $lwsize;
	foreach my $start (@starts[1 .. $#starts]) {
		if ($start != $prevend) {		# gap: the current block ends here, a new one begins
			$report_LOH->($chr,$jstart,$prevend);
			$jstart = $start;
		}
		$prevend = $start + $lwsize;
	}
	$report_LOH->($chr,$jstart,$prevend);		# last LOH block
}

close GFFOUT;

# SUBROUTINES 

sub annotate {
	# Write R commands that shade every annotation of chromosome $chr on the current
	# plot with a semi-transparent rectangle (one rainbow colour per annotation type)
	# and add one legend entry per type. Only 'par(xpd=F)' is written when there are
	# no annotations for $chr.
	my ($chr) = @_;

	my $height = 10000;			# top of the rectangles in y-axis units
	my $ntypes = @types;			# number of colours asked from rainbow()

	print RCMD "par(xpd=F)\n";		# do not print rectangles outside the plot

	my $starts = $astart{$chr};
	return unless (defined $starts) && (defined $starts->[0]);

	# colour number (1..n) of each annotation type, in the order of @types
	my %colour_of;
	foreach my $k (0 .. $#types) {
		$colour_of{$types[$k]} = $k + 1 unless exists $colour_of{$types[$k]};
	}

	foreach my $i (0 .. $#{$starts}) {
		my $type = $atype{$chr}[$i];
		my $j = exists $colour_of{$type} ? $colour_of{$type} : $ntypes;	# unknown type: falls back to the number of types
		print RCMD "rect($starts->[$i],0,$aend{$chr}[$i],$height,col=rainbow($ntypes,alpha=0.3)[$j],border=rainbow($ntypes,alpha=0.3)[$j])\n";
	}

	print RCMD "par(xpd=T)\n";		# do print legend outside the plot
	foreach my $j (1 .. $ntypes) {
		my $label = $types[$j-1];
		print RCMD "legend(W[100],-0.15-($j/20),lty=1,legend=\"$label\",col=rainbow($ntypes,alpha=0.3)[$j],bty=\"n\")\n";
	}
	print RCMD "par(xpd=F)\n";		# don't print annotations outside the plot
}	

sub showmode {
	# Write R commands that draw the per-window mode of pALT (vector modepALT<type>)
	# as a green line and add a legend with the chromosome-wide mode.
	# $chr: chromosome name; $type: "snp" or "indel".
	my ($chr, $type) = @_;

	my @rcmds = (
		"par(xpd=T)",			# do print legend outside the plot
		"lines(W,modepALT$type,col=\"green\")",
		"legend(W[60],-0.16,Mode(pALT[chr==\"$chr\"&QUAL>=$qual&type==\"$type\"]),lty=1,col=\"green\",title=\"mode\",bty=\"n\")",
		"par(xpd=F)",			# don't print annotations outside the plot
	);
	print RCMD map { "$_\n" } @rcmds;
}

sub showH {
	# Write R commands for a new plot with three lines per chromosome: the number of
	# homozygous differences (blue), heterozygous sites (red) and likely errors (grey)
	# per window, each divided by the window size. The counts come from the R vectors
	# diff_<type>, het_<type> and err_<type>.
	# $chr: chromosome name; $type: variant type used in the vector names (e.g. "snp").
	my ($chr, $type) = @_;

	# No. of sites per window size (ie does not control for missing data)
	print RCMD <<"END_R_HETPLOT";

my<-max((het_$type+diff_$type+err_$type)/$mwsize)
plot(c(0,max(W)),c(0,max((het_$type+diff_$type+err_$type)/$mwsize)),type='n',ylab="No.sites per $mwsize",xlab="position",main="$infile: $chr sliding window ($mwsize) of heterozygosity, homozygous diffs and error",sub="$type Q$qual+",xaxt="n")
axis(1, xaxp=c(0, signif(max(pos[chr=="$chr"]),3), 20))

lines(W,diff_$type/$mwsize,col="blue")
lines(W,het_$type/$mwsize,col="red")
lines(W,err_$type/$mwsize,col="grey")
END_R_HETPLOT

	# the legend is drawn with clipping switched off, then clipping is restored
	print RCMD <<"END_R_HETLEGEND";
par(xpd=T)
legend(W[60],my,c("diff_$type","het_$type","err_$type"),lty=c(1,1,1),col=c("blue","red","grey"),bty="n")
par(xpd=F)
END_R_HETLEGEND

}