# Rosalind Solutions

There's no better way to learn a new language (F# in this case) than to use it!

### Counting Nucleotides

Not too hard!

In [None]:
// http://rosalind.info/problems/dna/

/// Tallies how often each character occurs in `str`.
/// Returns an array of (character, count) pairs ordered by character.
let CountDNAString (str: string) =
    let tallies =
        str.ToCharArray()
        |> Array.groupBy id
        |> Array.map (fun (symbol, hits) -> symbol, hits.Length)
    Array.sortBy fst tallies

"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC" |> CountDNAString

index,Item1,Item2
0,A,20
1,C,12
2,G,17
3,T,21


### Converting DNA to RNA

In [None]:
// http://rosalind.info/problems/rna/

/// Transcribes a DNA string into RNA: every 'T' becomes 'U', all other characters are kept.
let DNA2RNA (dna: string) =
    let transcribed =
        dna.ToCharArray()
        |> Array.map (fun nucleotide -> if nucleotide = 'T' then 'U' else nucleotide)
    System.String(transcribed)

"GATGGAACTTGACTACGTAAATT" |> DNA2RNA

GAUGGAACUUGACUACGUAAAUU

### Reverse Complement of a DNA sequence

In [None]:
// http://rosalind.info/problems/revc/

/// Returns the complementary base of `cval` (A<->T, C<->G).
/// Any other character is returned unchanged.
let GetComplement (cval: char) =
    // The two strings are aligned: the character at index i in `bases`
    // pairs with the character at index i in `partners`.
    let bases = "ATCG"
    let partners = "TAGC"
    match bases.IndexOf cval with
    | -1 -> cval
    | position -> partners.[position]

/// Builds the reverse complement of `dna`: the string is read back to front
/// and every character is passed through GetComplement.
let ReverseComplement (dna: string) =
    let builder = System.Text.StringBuilder(dna.Length)
    for i = dna.Length - 1 downto 0 do
        builder.Append(GetComplement dna.[i]) |> ignore
    builder.ToString()

"AAAACCCGGT" |> ReverseComplement

ACCGGGTTTT

### Hamming distance of 2 DNA sequences

Taking another look at this, I could have gone with a more functional solution by [zipping](https://fsharp.github.io/fsharp-core-docs/reference/fsharp-collections-arraymodule.html#zip) both strings' respective char arrays and comparing them that way.

But an imperative solution works just as well in this case.

In [None]:
// http://rosalind.info/problems/hamm/

/// Counts the positions at which `seq1` and `seq2` differ (their Hamming distance).
/// Raises System.ArgumentException when the two strings differ in length,
/// because the distance is only defined for sequences of equal length.
let HammingDistance (seq1: string) (seq2: string) =
    // Without this check a longer seq2 is silently truncated, and a shorter one
    // fails with an index-out-of-range error part-way through the loop.
    if seq1.Length <> seq2.Length then
        invalidArg "seq2" "Both sequences must have the same length."

    let mutable distance = 0
    for i = 0 to seq1.Length - 1 do
        if seq1.[i] <> seq2.[i] then
            distance <- distance + 1

    distance

HammingDistance "GAGCCTACTAACGGGAT" "CATCGTAATGACGGCCT"

(Update: Here's the functional solution.)

In [None]:
// http://rosalind.info/problems/hamm/

/// Functional variant of the Hamming distance: walks both character arrays
/// in lock-step and counts the positions where they disagree.
/// Arrays of different lengths are rejected by Array.fold2 with an ArgumentException.
let HammingDistanceFunc (seq1: string) (seq2: string) =
    let countMismatch total left right =
        if left = right then total else total + 1
    Array.fold2 countMismatch 0 (seq1.ToCharArray()) (seq2.ToCharArray())

HammingDistanceFunc "GAGCCTACTAACGGGAT" "CATCGTAATGACGGCCT"

### Finding a motif within a DNA sequence

In [None]:
// http://rosalind.info/problems/subs/

/// Finds every occurrence of `motif` inside `sequence`, overlapping ones included.
/// Returns the 1-based start positions joined by single spaces; the result is ""
/// when nothing matches or when `motif` is empty.
let FindMotif (motif: string) (sequence: string) =
    if motif.Length = 0 then
        // An empty motif would "match" at every index; report no positions instead.
        ""
    else
        let positions = System.Collections.Generic.List<int>()
        // The last window that still fits starts at Length - motif.Length, so that
        // index must be part of the range; stopping one earlier misses a match that
        // sits at the very end of the sequence.
        for i = 0 to sequence.Length - motif.Length do
            if sequence.Substring(i, motif.Length) = motif then
                positions.Add(i + 1)

        System.String.Join(" ", positions)

printf "%s" (FindMotif "ATAT" "GATATATGCATATACTT")


2 4 10

### Converting RNA to protein

This one was a tad painful, mainly due to ensuring all the cases in the `RNA2Protein` function were correct.

In [None]:
// http://rosalind.info/problems/prot/

/// Translates one RNA codon into its one-letter amino-acid code.
/// Stop codons and any string that is not a known codon both translate to "".
/// Arms are listed alphabetically by amino-acid letter.
let RNA2Protein rna =
    match rna with
    | "GCU" | "GCC" | "GCA" | "GCG" -> "A"
    | "UGU" | "UGC" -> "C"
    | "GAU" | "GAC" -> "D"
    | "GAA" | "GAG" -> "E"
    | "UUU" | "UUC" -> "F"
    | "GGU" | "GGC" | "GGA" | "GGG" -> "G"
    | "CAU" | "CAC" -> "H"
    | "AUU" | "AUC" | "AUA" -> "I"
    | "AAA" | "AAG" -> "K"
    | "UUA" | "UUG" | "CUU" | "CUC" | "CUA" | "CUG" -> "L"
    | "AUG" -> "M"
    | "AAU" | "AAC" -> "N"
    | "CCU" | "CCC" | "CCA" | "CCG" -> "P"
    | "CAA" | "CAG" -> "Q"
    | "CGU" | "CGC" | "CGA" | "CGG" | "AGA" | "AGG" -> "R"
    | "UCU" | "UCC" | "UCA" | "UCG" | "AGU" | "AGC" -> "S"
    | "ACU" | "ACC" | "ACA" | "ACG" -> "T"
    | "GUU" | "GUC" | "GUA" | "GUG" -> "V"
    | "UGG" -> "W"
    | "UAU" | "UAC" -> "Y"
    // Stop codons carry no amino acid.
    | "UAA" | "UAG" | "UGA" -> ""
    // Anything that is not a codon is dropped as well.
    | _ -> ""

/// Translates an RNA sequence into a protein string, codon by codon from index 0.
/// Each three-character window is passed to RNA2Protein and the results are
/// concatenated; one or two leftover characters at the end are ignored.
let RNASequence2Protein (sequence: string) =
    let protein = System.Text.StringBuilder()

    // The last complete codon starts at Length - 3. Ending the range at Length - 4
    // skipped it, which went unnoticed only because the sample ends in a stop codon
    // that translates to "" anyway.
    for start in 0 .. 3 .. sequence.Length - 3 do
        protein.Append(RNA2Protein (sequence.Substring(start, 3))) |> ignore

    protein.ToString()

RNASequence2Protein "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"


MAMAPRTEINSTRING

### Computing GC Content

Honestly, the hardest part was dealing with the weird formatting Rosalind uses for the datasets.

In [None]:
// https://rosalind.info/problems/gc/

/// Returns the fraction (0.0 - 1.0) of characters in `seq` that are 'G' or 'C'.
let ComputeGC (seq: string) =
    let isGC nucleotide =
        match nucleotide with
        | 'G' | 'C' -> true
        | _ -> false
    let gcCount = seq |> String.filter isGC |> String.length
    double gcCount / double seq.Length

/// Takes (label, sequence) pairs, scores each sequence with ComputeGC expressed
/// as a percentage, and returns the (label, percentage) pair with the highest score.
let GetLargestGCValue (values: (string * string) []) =
    let scored =
        values
        |> Array.map (fun (label, dna) -> label, ComputeGC dna * 100.)
    Array.maxBy snd scored

/// Reads the FASTA-style file at path `gc` and reports the record with the highest GC content.
/// Records are separated by '>'; the first line of a record is its label and the
/// remaining lines form its sequence.
/// Returns "<label>\n<GC percentage with six decimals>".
/// Records without any sequence characters are skipped; if none remain, the
/// Array.maxBy inside GetLargestGCValue raises on the empty array.
let ProcessGCFile (gc: string) =
    // Fully qualified so the function does not depend on an `open System.IO` elsewhere.
    let text = System.IO.File.ReadAllText(gc)
    let label, gcPercent =
        text.Split('>')
        |> Array.map (fun record ->
            let recordLines = record.Split('\n')
            // Trim so that a '\r' left behind by CRLF line endings does not end up in the label.
            let label = recordLines.[0].Trim()
            let dna =
                Array.tail recordLines
                |> String.concat ""
                |> String.filter (fun c -> not (System.Char.IsWhiteSpace c))
            label, dna)
        // A record with no sequence has an undefined (NaN) GC ratio; drop it rather than
        // let it take part in the maximum. This also removes the empty chunk that
        // precedes the first '>'.
        |> Array.filter (fun (_, dna) -> dna <> "")
        |> GetLargestGCValue

    sprintf "%s\n%.6f" label gcPercent

(__SOURCE_DIRECTORY__) + @"/test.txt" |> ProcessGCFile

Rosalind_0808
60.919540

### Calculating Protein Mass

In [None]:
// https://rosalind.info/problems/prtm/

// get a table of mass values
// https://rosalind.info/glossary/monoisotopic-mass-table/
/// Lookup from a one-letter residue code to its mass, loaded once from
/// monoisotopic_mass_table.txt in this source file's directory.
/// Each line is expected to hold a letter and a number separated by one or more spaces.
let massTable =
    let parseEntry (line: string) =
        // RemoveEmptyEntries collapses runs of spaces between the two columns.
        let fields = line.Split([| ' ' |], System.StringSplitOptions.RemoveEmptyEntries)
        char fields.[0], float fields.[1]

    File.ReadAllLines(__SOURCE_DIRECTORY__ + @"/monoisotopic_mass_table.txt")
    |> Array.map parseEntry
    |> Map.ofArray

/// Sums the massTable entries for every character of `sequence`.
/// Characters that have no entry in the table contribute 0.
let GetProteinWeight (sequence: string) =
    let massOf residue =
        massTable |> Map.tryFind residue |> Option.defaultValue 0.
    sequence.ToCharArray() |> Array.sumBy massOf

sprintf "%.3f" ("SKADYEK" |> GetProteinWeight)

821.392