/
utf8.carp
174 lines (134 loc) · 4.68 KB
/
utf8.carp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
;; A rune: the raw bytes that together encode one UTF-8 character, kept in the
;; `bytes` field as an array of `Char`s.
(deftype Rune [bytes (Array Char)])
;; Derive the `=` interface for `Rune`.
(derive Rune =)
;; Helper functions for working with a single `Rune`.
(defmodule Rune
  ;; Builds a regular string out of the bytes stored in the rune `r`.
  (defn str [r]
    (String.from-chars (Rune.bytes r)))
  (implements str Rune.str)

  ;; Printing a rune yields the same text as `Rune.str`.
  (defn prn [r]
    (Rune.str (the (Ref Rune) r)))
  (implements prn Rune.prn)

  ;; Returns a copy of the rune `r` whose byte array has `c` appended to it.
  (defn add-byte [r c]
    (let [extended (Array.push-back @(Rune.bytes r) c)]
      (set-bytes @r extended)))

  ;; Number of bytes stored in the rune `r`.
  (defn length [r]
    (Array.length (Rune.bytes r)))
)
;; Equality for `Rune` values that are passed by value rather than by
;; reference; it simply delegates to `Rune.=` on references to both arguments.
(defmodule RuneCopy
  (sig = (Fn [Rune Rune] Bool))
  (defn = [a b]
    (Rune.= &a &b))
  (implements = RuneCopy.=)
)
(doc UTF8 "is a simple UTF-8 package for Carp. This nascent string replacement
type allows you to use many of the functions you know from Carp strings while
respecting unicode runes instead of just having bytes.
## Installation
You can obtain this library like so:
```clojure
(load \"git@github.com:carpentry-org/utf8.carp@0.0.5\")
```
## Usage
First, let’s define a UTF-8 string to work with!
```clojure
(let [s (UTF8.from-string \"hεllö\")]
```
That’s a cute, short string! Hm, I wonder how long it is!
```clojure
(length &s) ; => 5
```
That’s surprising! So the length is the actual number of runes, and not the
number of bytes, you say? Most curious!
You know what, I want to see this second character there up close! It somehow
looks all Greek to me!
```clojure
(nth &s 1) ; => ε
```
Hm, so this is what that looks like, huh? Interesting. And what’s its type?
```clojure
(type UTF8.nth) ; => UTF8.nth : (λ [(Ref UTF8), Int] Rune)
```
So, it’s called a `Rune`, huh? Hm, they don’t seem to be super interesting, but
I seem to be able to compare them and stringify them, and even take their length
in bytes! Quite delicious!
I wonder what else I can do with these functions?")
;; A UTF-8 string, stored as an array of `Rune`s in the `runes` field.
(deftype UTF8 [runes (Array Rune)])
;; Operations on `UTF8` values. A `UTF8` stores its text as an array of `Rune`s,
;; so length, indexing and slicing below all count runes rather than bytes.
(defmodule UTF8
  ;; Two UTF-8 strings are equal when their rune arrays are equal.
  (defn = [a b]
    (= (runes a) (runes b)))
  (implements = UTF8.=)

  (hidden utf8-cont?)
  (private utf8-cont?)
  ;; Checks whether the byte `c` is a UTF-8 continuation byte, i.e. whether its
  ;; two highest bits are `10` (mask 192 = 0b11000000, expected 128 = 0b10000000).
  (defn utf8-cont? [c]
    (let [i (Char.to-int c)]
      (= 128 (Int.bit-and 192 i))))

  (hidden to-runes)
  (private to-runes)
  ;; Splits the bytes of the string `s` into an array of runes. Every byte that
  ;; is not a continuation byte starts a new rune; continuation bytes are
  ;; appended to the rune that is currently being collected.
  (defn to-runes [s]
    (let-do [chs (chars s)
             rune (Rune.init [])
             res []]
      (for [i 0 (Array.length &chs)]
        (let-do [cur @(Array.unsafe-nth &chs i)]
          ;; A non-continuation byte closes the rune collected so far (if any).
          (when (and (not (utf8-cont? cur)) (> (Rune.length &rune) 0))
            (do
              (set! res (Array.push-back res @&rune))
              (set! rune (Rune.init []))))
          (set! rune (Rune.add-byte &rune cur))))
      ;; Flush the last rune, unless the input contained no bytes at all.
      (when (> (Rune.length &rune) 0)
        (set! res (Array.push-back res rune)))
      res))

  (hidden from-runes)
  (private from-runes)
  ;; Concatenates the bytes of every rune in `r` into one regular string.
  ;; `String.from-chars` is qualified here, matching its use in `Rune.str`.
  (defn from-runes [r]
    (Array.reduce &(fn [x y] (String.append &x &(String.from-chars (Rune.bytes y))))
                  @""
                  r))

  (doc from-string "creates a UTF-8 string from a regular string.")
  (defn from-string [s]
    (init (to-runes s)))
  (implements from-string UTF8.from-string)

  (doc str "creates a regular string from a UTF-8 string")
  (defn str [u]
    (from-runes (runes u)))
  (implements str UTF8.str)

  (doc nth "returns the `n`th rune from a UTF-8-encoded string.")
  (defn nth [u n]
    (Array.nth (runes u) n))

  (doc unsafe-nth "returns the `n`th rune from a UTF-8-encoded string unsafely.")
  (defn unsafe-nth [u n]
    @(Array.unsafe-nth (runes u) n))

  (doc length "returns the length of a UTF-8-encoded string.")
  (defn length [u]
    (Array.length (runes u)))

  (doc append "appends two UTF-8-encoded strings.")
  (defn append [a b]
    (UTF8.init (Array.concat &[@(runes a) @(runes b)])))

  (doc reverse "reverses a UTF-8-encoded string.")
  (defn reverse [u]
    (UTF8.init (Array.reverse @(runes u))))

  (doc empty? "checks whether a UTF-8-encoded string is empty.")
  (defn empty? [u]
    (= (length (the (Ref UTF8) u)) 0))

  (doc slice "returns a substring of the string from the index `a` to the index
`b`.")
  (defn slice [u a b]
    (UTF8.init (Array.slice (runes u) a b)))
  (implements slice UTF8.slice)

  (doc prefix "returns the first `n` characters of the string `u`.")
  (defn prefix [u n]
    (UTF8.init (Array.prefix (runes u) n)))

  ;; NOTE: `n` is handed to `Array.suffix` unchanged and is therefore a start
  ;; index, not a count; `ends-with?` relies on this by passing `(- lu ls)`.
  (doc suffix "returns the end of the string `u`, starting at the index `n`.")
  (defn suffix [u n]
    (UTF8.init (Array.suffix (runes u) n)))

  (doc starts-with? "checks if the string `u` begins with the string `sub`.")
  (sig starts-with? (Fn [(Ref UTF8) (Ref UTF8)] Bool))
  (defn starts-with? [u sub]
    (let [ls (length sub)]
      ;; The length check comes first so that `prefix` is only asked for `ls`
      ;; runes when `u` is at least that long.
      (and (>= (length u) ls) (= sub &(UTF8.prefix u ls)))))

  (doc ends-with? "checks if the string `u` ends with the string `sub`.")
  (sig ends-with? (Fn [(Ref UTF8) (Ref UTF8)] Bool))
  (defn ends-with? [u sub]
    (let [lu (length u)
          ls (length sub)]
      ;; `(- lu ls)` is the index at which a trailing `sub` would have to start.
      (and (>= lu ls) (= sub &(UTF8.suffix u (- lu ls))))))

  (doc zero "Returns the empty string.")
  (defn zero [] (the UTF8 (from-string "")))
  (implements zero UTF8.zero)
)