## Read file

In [None]:
#!fsharp
open System.IO
// Load the raw bytes of the passport scan from disk.
let imageBuffer = "./images/passport-1.jpg" |> File.ReadAllBytes

## Convert image to base64 string 

In [None]:
#!fsharp
open System
// Base64-encode the image bytes so they can be carried as text.
let image64 = Convert.ToBase64String(imageBuffer)

## Use the Yandex API to extract text from the image

### Get a Yandex IAM token first

Your current folder has been set to 'default' (id = b1g3aavp9tndrdr9q148).


[ya-ocr docs](https://cloud.yandex.com/en-ru/docs/vision/operations/ocr/text-detection)



In [None]:
#!powershell
# Ask the Yandex Cloud CLI (`yc`) for an IAM token; the printed value is the
# token the request cell uses as YA_TOKEN.
yc iam create-token

t1.9euelZrIzpiTk8mYl4rPipCdlsfNye3rnpWal8ebi5aJmJ2Ky4yVkc2dmJrl8_coZHl3-e9iERVa_N3z92gSd3f572IRFVr8.JoEPzqiPReSaCIxa2XzaijQle3TABxa4e4S4rsEcmdrBlLtSdpc6mSnQ4Y8IndLp5Re_iUskvuF9vObQYeIQAg


### Extract data from the image using the Yandex API

In [None]:
// Record types mirroring the JSON returned by the OCR endpoint.
// Initially generated with https://app.quicktype.io/ (TypeScript output) and
// then adjusted by hand.

/// One corner of a bounding box; both coordinates are kept as strings.
type Vertex =
    { x: string
      y: string }

/// A language guess for a word together with its confidence.
type Language =
    { languageCode: string
      confidence: float }

/// Polygon, given as an array of vertices, that encloses a piece of text.
type BoundingBox =
    { vertices: Vertex[] }

/// A single recognized word.
type Word =
    { boundingBox: BoundingBox
      languages: Language[]
      text: string
      confidence: float }

/// A line of text made of words.
type Line =
    { boundingBox: BoundingBox
      words: Word[]
      confidence: float }

/// A block of text made of lines.
type Block =
    { boundingBox: BoundingBox
      lines: Line[] }

/// A page with its blocks; width and height are kept as strings.
type Page =
    { blocks: Block[]
      width: string
      height: string }

/// Text-detection payload: the recognized pages.
type TextDetection =
    { pages: Page[] }

/// Innermost result entry wrapping a text-detection payload.
type ResultResult =
    { textDetection: TextDetection }

/// Second-level result entry holding the innermost results.
type YaOcrResult =
    { results: ResultResult[] }

/// Root object of the OCR response.
type YaOcr =
    { results: YaOcrResult[] }



In [None]:
#!fsharp

#r "nuget:Oryx"
#r "nuget:Oryx.SystemTextJson"

// Create your Yandex Cloud account first; the folder id below is the one
// reported by the `yc` CLI.
let YA_FOLDER = "b1g3aavp9tndrdr9q148"
// The IAM token is a credential, so it is read from the YA_TOKEN environment
// variable instead of being pasted into the notebook source.
// Obtain a value with `yc iam create-token` and export it as YA_TOKEN before
// starting the notebook kernel (or set it with
// System.Environment.SetEnvironmentVariable before running this cell).
let YA_TOKEN =
    match System.Environment.GetEnvironmentVariable "YA_TOKEN" with
    | token when not (System.String.IsNullOrWhiteSpace token) ->
        // CLI output copied from a terminal often carries a trailing newline.
        token.Trim()
    | _ ->
        failwith "YA_TOKEN is not set: run `yc iam create-token` and export the result as the YA_TOKEN environment variable"
let YA_URL="https://vision.api.cloud.yandex.net/vision/v1/batchAnalyze"

open Oryx
open Oryx.SystemTextJson.ResponseReader
open System.Net.Http
open System.Threading.Tasks
open System.Text.Json

// Default System.Text.Json options; passed to the response decoder.
let options = JsonSerializerOptions()

// The single analysis feature requested for the image: text detection.
// NOTE(review): "*" presumably asks the service to pick the language itself —
// confirm against the Yandex Vision docs.
let bodyFeatures = 
    {|
        // NOTE(review): expected to be serialized as "type" because
        // JsonContent.Create falls back to camelCase web defaults when no
        // options are given — confirm if custom options are ever passed.
        Type = "TEXT_DETECTION"
        text_detection_config = {| language_codes = [|"*"|] |}
    |}
    
// Request payload: one analyze spec carrying the base64-encoded image.
let body = {|
    folderId = YA_FOLDER
    analyze_specs = [|
      {|
        content = image64
        // `features` is documented as an array in the batchAnalyze request
        // schema, so it is sent as a JSON array even for a single feature.
        features = [| bodyFeatures |]
      |}
    |]
|}


// JSON-encoded HTTP body built from the payload above.
let bodyContent = Json.JsonContent.Create body :> HttpContent

// Oryx handler pipeline: POST the prepared body to YA_URL and decode the
// response as YaOcr.
let request =
    POST >=> withUrl YA_URL >=> withContent (fun _ -> bodyContent) >=> fetch >=> json<YaOcr> options

// HTTP client plus an Oryx context that carries the bearer-token header.
let client = new HttpClient ()
let ctx =
    let withAuth =
        HttpContext.withHeader ("Authorization", $"Bearer {YA_TOKEN}") HttpContext.defaultContext
    HttpContext.withHttpClient client withAuth

// Run the request and block until it completes; the outcome is a Result value.
let yaOcrJsonResult = runAsync ctx request |> Async.AwaitTask |> Async.RunSynchronously




In [None]:
// Extract the recognized words from the OCR result.

// Unwrap the request outcome; a failed request re-raises the exception carried
// by the Error case.
let yaOcrJson = 
    match yaOcrJsonResult with 
    | Ok parsed -> parsed
    | Error err -> raise err

// Text blocks of every page in the response.
// The response is traversed with Array.collect instead of hard-coded `.[0]`
// indexing, so an empty or partial response produces a descriptive error
// rather than an IndexOutOfRangeException, and additional pages are included.
let blocks =
    // A property missing from the JSON may be deserialized as null, so treat
    // null arrays as empty before traversing them.
    let orEmpty (items: 'a[]) = if isNull items then [||] else items

    let pages =
        orEmpty yaOcrJson.results
        |> Array.collect (fun spec -> orEmpty spec.results)
        |> Array.collect (fun feature ->
            if isNull (box feature.textDetection) then [||]
            else orEmpty feature.textDetection.pages)

    if Array.isEmpty pages then
        failwith "Yandex OCR response contains no text-detection pages"

    pages |> Array.collect (fun page -> orEmpty page.blocks)

// Flatten blocks -> lines -> words into the plain text of every word, in order.
let words =
    [| for block in blocks do
        for line in block.lines do
            for word in line.words do
                yield word.text |]

words

index,value
0,с
1,и
2,й
3,с
4,к
5,а
6,я
7,ф
8,е
9,д


### Prepare data and utilities to clean up words

In [None]:
open System.IO

// Persist one recognized word per line. File.WriteAllLines does not create
// missing directories, so make sure the output folder exists first
// (CreateDirectory is a no-op when it already does).
Directory.CreateDirectory "./out" |> ignore
File.WriteAllLines("./out/passport-1-words.txt", words)