diff --git a/commands.go b/commands.go index 7c382fff..9d893338 100644 --- a/commands.go +++ b/commands.go @@ -3,11 +3,13 @@ package main import ( "bufio" "bytes" + "crypto" "encoding/json" "fmt" "io" "os" "path/filepath" + "strings" "sync" "github.com/urfave/cli/v2" @@ -40,16 +42,7 @@ parse them, and then spit out a similiarly named file with the .json extension. func convert(c *cli.Context) error { if isPipe() { - var annotatedSequence AnnotatedSequence - - // logic for determining input format, then parses accordingly. - if c.String("i") == "json" { - json.Unmarshal([]byte(stdinToString(os.Stdin)), &annotatedSequence) - } else if c.String("i") == "gbk" || c.String("i") == "gb" { - annotatedSequence = ParseGbk(stdinToString(os.Stdin)) - } else if c.String("i") == "gff" { - annotatedSequence = ParseGff(stdinToString(os.Stdin)) - } + annotatedSequence := parseStdin(c) var output []byte @@ -66,16 +59,8 @@ func convert(c *cli.Context) error { // } else { - var matches []string - - //take all args and get their pattern matches. - for argIndex := 0; argIndex < c.Args().Len(); argIndex++ { - match, _ := filepath.Glob(c.Args().Get(argIndex)) - matches = append(matches, match...) - } - - //filtering pattern matches for duplicates. - matches = uniqueNonEmptyElementsOf(matches) + // gets glob pattern matches to determine which files to use. + matches := getMatches(c) // TODO write basic check to see if input flag or all paths have accepted file extensions. @@ -93,18 +78,7 @@ func convert(c *cli.Context) error { // executing Go routine. go func(match string) { extension := filepath.Ext(match) - var annotatedSequence AnnotatedSequence - - // determining which reader to use and parse into AnnotatedSequence struct. 
- if extension == ".gff" || c.String("i") == "gff" { - annotatedSequence = ReadGff(match) - } else if extension == ".gbk" || extension == ".gb" || c.String("i") == "gbk" || c.String("i") == "gb" { - annotatedSequence = ReadGbk(match) - } else if extension == ".json" || c.String("i") == "json" { - annotatedSequence = ReadJSON(match) - } else { - // TODO put default error handling here. - } + annotatedSequence := fileParser(c, match) // determining output format and name, then writing out to name. outputPath := match[0 : len(match)-len(extension)] @@ -130,6 +104,95 @@ func convert(c *cli.Context) error { return nil } +/****************************************************************************** + +hash currently has two modes. Pipe and fileio. + +The function isPipe() detects if input is coming from a pipe like: + + cat data/bsub.gbk | poly hash -i gbk -o json > test.json + +In this case the output goes directly to standard out and can be redirected +into a file. + +Without the "-o json" json flag only the hash is printed to stdout. + +to force all output to go to stdout use --stdout. + +If not from a pipe convert checks args for file patterns to find, then iterates +over each matched file pattern to read in a file, then spit out the desired +output. + +For example: + + poly hash -o json *.gbk *.gff + +will read all files in a directory with ".gbk" or ".gff" as their extension +parse them, and then spit out a similiarly named file with the .json extension along with their hashes. 
+
+******************************************************************************/
+func hash(c *cli.Context) {
+
+	if isPipe() {
+		annotatedSequence := parseStdin(c)
+		sequenceHash := flagSwitchHash(c, annotatedSequence)
+		if c.String("o") == "json" {
+			annotatedSequence.Sequence.Hash = sequenceHash
+			annotatedSequence.Sequence.HashFunction = strings.ToUpper(c.String("t"))
+			output, err := json.MarshalIndent(annotatedSequence, "", " ")
+			if err != nil {
+				fmt.Fprintln(os.Stderr, err) // report the failure instead of silently printing nothing.
+				return
+			}
+			fmt.Print(string(output))
+		} else {
+			fmt.Print(sequenceHash)
+		}
+	} else {
+		// gets glob pattern matches to determine which files to use.
+		matches := getMatches(c)
+
+		// declaring wait group outside loop
+		var wg sync.WaitGroup
+
+		// concurrently iterate through each pattern match, read the file, output to new format.
+		for _, match := range matches {
+			// incrementing wait group for Go routine
+			wg.Add(1)
+			// executing Go routine.
+			go func(match string) {
+				// decrementing wait group on every exit path, including the early returns below.
+				defer wg.Done()
+
+				annotatedSequence := fileParser(c, match)
+				sequenceHash := flagSwitchHash(c, annotatedSequence)
+				if c.String("o") != "json" {
+					fmt.Println(sequenceHash)
+					return
+				}
+				annotatedSequence.Sequence.Hash = sequenceHash
+				annotatedSequence.Sequence.HashFunction = strings.ToUpper(c.String("t"))
+
+				if c.Bool("stdout") {
+					output, err := json.MarshalIndent(annotatedSequence, "", " ")
+					if err != nil {
+						fmt.Fprintln(os.Stderr, err)
+						return
+					}
+					fmt.Print(string(output))
+					return
+				}
+				// swap the input extension for ".json" so the result sits next to the input file.
+				outputPath := strings.TrimSuffix(match, filepath.Ext(match))
+				WriteJSON(annotatedSequence, outputPath+".json")
+			}(match) // passing match to Go routine anonymous function.
+		}
+
+		// waiting outside for loop for Go routines so they can run concurrently.
+		wg.Wait()
+	}
+}
+
 // a simple helper function to convert an *os.File type into a string.
func stdinToString(file *os.File) string { var stringBuffer bytes.Buffer @@ -173,3 +236,100 @@ func isPipe() bool { } return flag } + +// a simple helper function to take stdin from a pipe and parse it into an annotated sequence +func parseStdin(c *cli.Context) AnnotatedSequence { + var annotatedSequence AnnotatedSequence + + // logic for determining input format, then parses accordingly. + if c.String("i") == "json" { + json.Unmarshal([]byte(stdinToString(os.Stdin)), &annotatedSequence) + } else if c.String("i") == "gbk" || c.String("i") == "gb" { + annotatedSequence = ParseGbk(stdinToString(os.Stdin)) + } else if c.String("i") == "gff" { + annotatedSequence = ParseGff(stdinToString(os.Stdin)) + } + return annotatedSequence +} + +// helper function to hash sequence based on flag using generic hash. +func flagSwitchHash(c *cli.Context, annotatedSequence AnnotatedSequence) string { + + var hashString string + switch strings.ToUpper(c.String("t")) { + case "MD5": + hashString = annotatedSequence.hash(crypto.MD5) + case "SHA1": + hashString = annotatedSequence.hash(crypto.SHA1) + case "SHA244": + hashString = annotatedSequence.hash(crypto.SHA224) + case "SHA256": + hashString = annotatedSequence.hash(crypto.SHA256) + case "SHA384": + hashString = annotatedSequence.hash(crypto.SHA384) + case "SHA512": + hashString = annotatedSequence.hash(crypto.SHA512) + case "RIPEMD160": + hashString = annotatedSequence.hash(crypto.RIPEMD160) + case "SHA3_224": + hashString = annotatedSequence.hash(crypto.SHA3_224) + case "SHA3_256": + hashString = annotatedSequence.hash(crypto.SHA3_256) + case "SHA3_384": + hashString = annotatedSequence.hash(crypto.SHA3_384) + case "SHA3_512": + hashString = annotatedSequence.hash(crypto.SHA3_512) + case "SHA512_224": + hashString = annotatedSequence.hash(crypto.SHA512_224) + case "SHA512_256": + hashString = annotatedSequence.hash(crypto.SHA512_256) + case "BLAKE2s_256": + hashString = annotatedSequence.hash(crypto.BLAKE2s_256) + case 
"BLAKE2b_256": + hashString = annotatedSequence.hash(crypto.BLAKE2b_256) + case "BLAKE2b_384": + hashString = annotatedSequence.hash(crypto.BLAKE2b_384) + case "BLAKE2b_512": + hashString = annotatedSequence.hash(crypto.BLAKE2b_512) + case "BLAKE3": + hashString = annotatedSequence.blake3Hash() + default: + break + } + return hashString +} + +// helper function to get unique glob patterns from cli.context +func getMatches(c *cli.Context) []string { + var matches []string + + //take all args and get their pattern matches. + for argIndex := 0; argIndex < c.Args().Len(); argIndex++ { + match, _ := filepath.Glob(c.Args().Get(argIndex)) + matches = append(matches, match...) + } + + //filtering pattern matches for duplicates. + matches = uniqueNonEmptyElementsOf(matches) + + return matches + +} + +// function to parse whatever file is at a matched path. +func fileParser(c *cli.Context, match string) AnnotatedSequence { + extension := filepath.Ext(match) + var annotatedSequence AnnotatedSequence + + // determining which reader to use and parse into AnnotatedSequence struct. + if extension == ".gff" || c.String("i") == "gff" { + annotatedSequence = ReadGff(match) + } else if extension == ".gbk" || extension == ".gb" || c.String("i") == "gbk" || c.String("i") == "gb" { + annotatedSequence = ReadGbk(match) + } else if extension == ".json" || c.String("i") == "json" { + annotatedSequence = ReadJSON(match) + } else { + // TODO put default error handling here. 
+ } + return annotatedSequence +} diff --git a/commands_test.go b/commands_test.go index 62438d3b..cc2361e5 100644 --- a/commands_test.go +++ b/commands_test.go @@ -3,6 +3,7 @@ package main import ( "os" "os/exec" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -58,3 +59,37 @@ func TestConvert(t *testing.T) { } } + +func TestHash(t *testing.T) { + + puc19GbkBlake3Hash := "4031e1971acc8ff1bf0aa4ed623bc58beefc15e043075866a0854d592d80b28b" + + // testing pipe input + command := "cat data/puc19.gbk | poly hash -i gbk" + hashOutput, _ := exec.Command("bash", "-c", command).Output() + hashOutputString := strings.TrimSpace(string(hashOutput)) + + if hashOutputString != puc19GbkBlake3Hash { + t.Errorf("TestHash for piped input has failed. Returned %q, want %q", hashOutputString, puc19GbkBlake3Hash) + } + + // testing regular input + command = "poly hash data/puc19.gbk" + hashOutput, _ = exec.Command("bash", "-c", command).Output() + hashOutputString = strings.TrimSpace(string(hashOutput)) + + if hashOutputString != puc19GbkBlake3Hash { + t.Errorf("TestHash for regular input has failed. Returned %q, want %q", hashOutputString, puc19GbkBlake3Hash) + } + + // testing json write output + command = "poly hash -o json data/puc19.gbk" + exec.Command("bash", "-c", command).Output() + hashOutputString = ReadJSON("data/puc19.json").Sequence.Hash + os.Remove("data/puc19.json") + + if hashOutputString != puc19GbkBlake3Hash { + t.Errorf("TestHash for json write output has failed. 
Returned %q, want %q", hashOutputString, puc19GbkBlake3Hash) + } + +} diff --git a/go.mod b/go.mod index 3dae1a0d..f7be6f38 100644 --- a/go.mod +++ b/go.mod @@ -6,5 +6,8 @@ require ( github.com/PuerkitoBio/goquery v1.5.1 github.com/google/go-cmp v0.4.1 github.com/pmezard/go-difflib v1.0.0 + github.com/sergi/go-diff v1.1.0 github.com/urfave/cli/v2 v2.2.0 + golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9 + lukechampine.com/blake3 v1.0.0 ) diff --git a/go.sum b/go.sum index 05b332d3..7e18c671 100644 --- a/go.sum +++ b/go.sum @@ -5,23 +5,47 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/go-cmp v0.4.1 h1:/exdXoGamhu5ONeUJH0deniYLWYvQwW66yvlfiiKTu0= github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/urfave/cli/v2 v2.2.0 h1:JTTnM6wKzdA0Jqodd966MVj4vWbbquZykeX1sKbe2C4= github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9 h1:vEg9joUBmeBcK9iSJftGNf3coIG4HqZElCPehJsfAYM= +golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= +golang.org/x/sys 
v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +lukechampine.com/blake3 v1.0.0 h1:dNj1NVD7SLgkU7dykKjmmOSOTTx7ZmxnDyUyvxnQP2Q= +lukechampine.com/blake3 v1.0.0/go.mod h1:e0XQzEQp6LtbXBhzYxRoh6s3kcmX+fMMg8sC9VgWloQ= diff --git a/hash.go b/hash.go new file mode 100644 index 00000000..6dde73dd --- /dev/null +++ b/hash.go @@ -0,0 +1,133 @@ +package main + +import ( + "crypto" + _ "crypto/md5" + _ "crypto/sha1" + _ "crypto/sha256" + _ "crypto/sha512" + "encoding/hex" + "errors" + + _ "golang.org/x/crypto/blake2b" + _ "golang.org/x/crypto/blake2s" + _ "golang.org/x/crypto/ripemd160" + _ "golang.org/x/crypto/sha3" + "lukechampine.com/blake3" +) + +// Where each hash function comes from. 
+// MD5         // import crypto/md5
+// SHA1        // import crypto/sha1
+// SHA224      // import crypto/sha256
+// SHA256      // import crypto/sha256
+// SHA384      // import crypto/sha512
+// SHA512      // import crypto/sha512
+// MD5SHA1     // no implementation; MD5+SHA1 used for TLS RSA
+// RIPEMD160   // import golang.org/x/crypto/ripemd160
+// SHA3_224    // import golang.org/x/crypto/sha3
+// SHA3_256    // import golang.org/x/crypto/sha3
+// SHA3_384    // import golang.org/x/crypto/sha3
+// SHA3_512    // import golang.org/x/crypto/sha3
+// SHA512_224  // import crypto/sha512
+// SHA512_256  // import crypto/sha512
+// BLAKE2s_256 // import golang.org/x/crypto/blake2s
+// BLAKE2b_256 // import golang.org/x/crypto/blake2b
+// BLAKE2b_384 // import golang.org/x/crypto/blake2b
+// BLAKE2b_512 // import golang.org/x/crypto/blake2b
+
+// BoothLeastRotation returns the index at which the lexicographically least rotation of a circular string starts.
+// https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation
+// this is generally over commented but I'm keeping it this way for now. - Tim
+func BoothLeastRotation(sequence string) int {
+
+	// first concatenate the sequence to itself to avoid modular arithmetic.
+	sequence += sequence // maybe do this as a buffer just for speed? May get annoying with larger sequences.
+	leastRotationIndex := 0
+
+	// initializing failure slice; -1 means no prefix of the candidate rotation has been matched.
+	failureSlice := make([]int, len(sequence))
+	for i := range failureSlice {
+		failureSlice[i] = -1
+	}
+	// iterate through each character in the doubled over sequence
+	for characterIndex := 1; characterIndex < len(sequence); characterIndex++ {
+		// get character
+		character := sequence[characterIndex]
+		// get failure
+		failure := failureSlice[characterIndex-leastRotationIndex-1]
+		// failure+1 characters of the candidate rotation already match, so leastRotationIndex+failure+1 is the next one to compare.
+		for failure != -1 && character != sequence[leastRotationIndex+failure+1] {
+
+			// if character is lexically less than the candidate's next character, a smaller rotation starts at characterIndex-failure-1.
+			if character < sequence[leastRotationIndex+failure+1] {
+				leastRotationIndex = characterIndex - failure - 1
+			}
+			// fall back to the next shorter matched prefix recorded in the failure slice.
+			failure = failureSlice[failure]
+		}
+
+		// if character still differs here the loop ended with failure == -1, so it is compared with the candidate's first character.
+		if character != sequence[leastRotationIndex+failure+1] {
+
+			// if character is lexically less than the first character of the least rotation, the least rotation now starts at characterIndex.
+			if character < sequence[leastRotationIndex] {
+				leastRotationIndex = characterIndex
+			}
+			// assign -1 to whatever is at the index of difference between character and rotation indices.
+			failureSlice[characterIndex-leastRotationIndex] = -1
+
+			// if character does equal whatever character is at leastRotationIndex plus failure plus one.
+		} else {
+			// assign failure + 1 at the index of difference between character and rotation indices.
+			failureSlice[characterIndex-leastRotationIndex] = failure + 1
+		}
+	} // end loop
+
+	return leastRotationIndex
+}
+
+// RotateSequence rotates a circular sequence to start at its least rotation, so every rotation of it gives the same string.
+func RotateSequence(sequence string) string {
+	rotationIndex := BoothLeastRotation(sequence)
+	// a rotation is the tail from rotationIndex followed by the head before it,
+	// so no second doubled copy of the sequence has to be built.
+	return sequence[rotationIndex:] + sequence[:rotationIndex]
+}
+
+// GenericSequenceHash takes an AnnotatedSequence and a hash function and hashes its sequence.
+// from https://stackoverflow.com/questions/32620290/how-to-dynamically-switch-between-hash-algorithms-in-golang
+func GenericSequenceHash(annotatedSequence AnnotatedSequence, hash crypto.Hash) (string, error) {
+	if !hash.Available() {
+		return "", errors.New("hash unavailable")
+	}
+	if annotatedSequence.Meta.Locus.Circular {
+		annotatedSequence.Sequence.Sequence = RotateSequence(annotatedSequence.Sequence.Sequence)
+	}
+	h := hash.New()
+	h.Write([]byte(annotatedSequence.Sequence.Sequence)) // Sum(b) only appends the digest to b, so the data must go through Write (which never returns an error).
+	return hex.EncodeToString(h.Sum(nil)), nil
+}
+
+// method wrapper for hashing annotatedSequence structs; the error is dropped, so an unavailable hash function yields "".
+func (annotatedSequence AnnotatedSequence) hash(hash crypto.Hash) string {
+	seqHash, _ := GenericSequenceHash(annotatedSequence, hash)
+	return seqHash
+}
+
+// Blake3SequenceHash Blake3 function doesn't use standard golang hash interface
+// so we couldn't use it with the generic sequence hash.
+func Blake3SequenceHash(annotatedSequence AnnotatedSequence) string {
+
+	if annotatedSequence.Meta.Locus.Circular {
+		annotatedSequence.Sequence.Sequence = RotateSequence(annotatedSequence.Sequence.Sequence)
+	}
+
+	b := blake3.Sum256([]byte(annotatedSequence.Sequence.Sequence))
+	return hex.EncodeToString(b[:])
+}
+
+// method wrapper for hashing annotatedSequence structs with Blake3.
+func (annotatedSequence AnnotatedSequence) blake3Hash() string {
+	return Blake3SequenceHash(annotatedSequence)
+}
diff --git a/hash_test.go b/hash_test.go
new file mode 100644
index 00000000..2f9b7797
--- /dev/null
+++ b/hash_test.go
@@ -0,0 +1,43 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+
+	"github.com/sergi/go-diff/diffmatchpatch"
+)
+
+func TestBlake3HashRegression(t *testing.T) {
+	puc19GbkBlake3Hash := "4031e1971acc8ff1bf0aa4ed623bc58beefc15e043075866a0854d592d80b28b"
+	puc19 := ReadGbk("data/puc19.gbk")
+	if got := puc19.blake3Hash(); got != puc19GbkBlake3Hash {
+		t.Errorf("TestBlake3HashRegression has failed. 
Blake3 has returned %q, want %q", got, puc19GbkBlake3Hash) + } +} + +func TestLeastRotation(t *testing.T) { + annotatedSequence := ReadGbk("data/puc19.gbk") + var sequenceBuffer bytes.Buffer + + sequenceBuffer.WriteString(annotatedSequence.Sequence.Sequence) + bufferLength := sequenceBuffer.Len() + + var rotatedSequence string + for elementIndex := 0; elementIndex < bufferLength; elementIndex++ { + bufferElement, _, _ := sequenceBuffer.ReadRune() + sequenceBuffer.WriteRune(bufferElement) + if elementIndex == 0 { + rotatedSequence = RotateSequence(sequenceBuffer.String()) + } else { + newRotatedSequence := RotateSequence(sequenceBuffer.String()) + if rotatedSequence != newRotatedSequence { + dmp := diffmatchpatch.New() + diffs := dmp.DiffMain(rotatedSequence, newRotatedSequence, false) + t.Errorf("TestLeastRotation() has failed. rotationSequence mutated.") + fmt.Println(dmp.DiffPrettyText(diffs)) + } + } + } + +} diff --git a/io.go b/io.go index 50feb7db..c2206b41 100644 --- a/io.go +++ b/io.go @@ -98,8 +98,10 @@ type Feature struct { // Sequence holds raw sequence information in an AnnotatedSequence struct. type Sequence struct { - Description string - Sequence string + Description string + Hash string + HashFunction string + Sequence string } // AnnotatedSequence holds all sequence information in a single struct. diff --git a/main.go b/main.go index 8193902c..8157832c 100644 --- a/main.go +++ b/main.go @@ -38,6 +38,37 @@ func main() { convert(c) return nil }}, + { + Name: "hash", + Aliases: []string{"ha"}, + Usage: "Hash a sequence while accounting for circularity.", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "t", + Value: "blake3", + Usage: "Specify hash function type. Has many options. Blake3 is probably fastest.", + }, + &cli.StringFlag{ + Name: "i", + Value: "json", + Usage: "Specify file input type. Options are Gff, gbk/gb, and json.", + }, + &cli.StringFlag{ + Name: "o", + Value: "string", + Usage: "Specify output type. Options are string and json. 
Defaults to string.", + }, + &cli.BoolFlag{ + Name: "stdout", + Value: false, + Usage: "Will write to standard out whenever applicable. Defaults to false.", + }, + }, + Action: func(c *cli.Context) error { + hash(c) + return nil + }, + }, }} err := app.Run(os.Args)