In [1]:
# Define character types. Types adapted from Haruechaiyasak et al. 2008.

# Character that can be the final consonant in a word
chartype_c = (
    "\u0e01\u0e02\u0e03\u0e04\u0e06\u0e07\u0e08\u0e0a\u0e0b\u0e0d\u0e0e\u0e0f\u0e10"
    + "\u0e11\u0e12\u0e13\u0e14\u0e15\u0e16\u0e17\u0e18\u0e19\u0e1a\u0e1b\u0e1e\u0e1f"
    + "\u0e20\u0e21\u0e22\u0e23\u0e24\u0e25\u0e26\u0e27\u0e28\u0e29\u0e2a\u0e2c\u0e2d"
)

# Character that cannot be the final consonant in a word
chartype_n = "\u0e05\u0e09\u0e0c\u0e1c\u0e1d\u0e2b\u0e2e"

# Vowel that cannot begin a word
chartype_v = "\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39\u0e45\u0e47"

# Vowel that can begin a word
chartype_w = "\u0e40\u0e41\u0e42\u0e43\u0e44"

# Combining symbol
chartype_s = "\u0e3a\u0e4c\u0e4d\u0e4e"

# Standalone symbol
chartype_a = "\u0e2f\u0e46\u0e4f\u0e5a\u0e5b"

# Tone marks
chartype_t = "\u0e48\u0e49\u0e4a\u0e4b"

# Digit character
chartype_d = "0123456789\u0e50\u0e51\u0e52\u0e53\u0e54\u0e55\u0e56\u0e57\u0e58\u0e59"

# Currency character
chartype_b = "$฿"

# Quote character
chartype_q = "'\""

# Other character
chartype_o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# Space character inside a word
# chartype_p

# Space character
chartype_z = " \u00a0\n"

# Undefined
# chartype_x

tags = [
    ("c", chartype_c),
    ("n", chartype_n),
    ("v", chartype_v),
    ("w", chartype_w),
    ("s", chartype_s),
    ("a", chartype_a),
    ("t", chartype_t),
    ("d", chartype_d),
    ("b", chartype_b),
    ("q", chartype_q),
    ("o", chartype_o),
    ("z", chartype_z),
]

generic_chartypes = ["d", "b", "q", "o", "z", "x"]

# Character type
def get_chartype(c: str) -> str:
    for tag in tags:
        if c in tag[1]:
            return tag[0]
    return "x"

In [8]:
NGRAM = 21

def extract_features(doc: str):
    len_doc = len(doc)
    look_range = list(range(-int(NGRAM / 2), int(NGRAM / 2) + 1))

    doc_features = []
    for i, char in enumerate(doc):
        ct = get_chartype(char)
        char_features = ["bias"]

        if i == 0:
            char_features.append("BOS")  # Beginning of string
        elif i == len_doc - 1:
            char_features.append("EOS")  # End of string

        # Look forward
        for j in look_range:
            if i >= j and i < len_doc - j:
                c = doc[i + j]
                ct = get_chartype(c)
                char_features.append("t{}={}".format(j, ct))
                #if ct not in generic_chartypes:
                char_features.append("c{}={}".format(j, c))
            else:
                break

        doc_features.append(char_features)

    return doc_features

feats = extract_features("ทดสอบ 1 การทำงาน 123 Af ของระบบ gram แบบใหม่")
feats

[['bias',
  'BOS',
  't-10=o',
  'c-10=a',
  't-9=o',
  'c-9=m',
  't-8=z',
  'c-8= ',
  't-7=w',
  'c-7=แ',
  't-6=c',
  'c-6=บ',
  't-5=c',
  'c-5=บ',
  't-4=w',
  'c-4=ใ',
  't-3=n',
  'c-3=ห',
  't-2=c',
  'c-2=ม',
  't-1=t',
  'c-1=่',
  't0=c',
  'c0=ท'],
 ['bias',
  't-10=o',
  'c-10=m',
  't-9=z',
  'c-9= ',
  't-8=w',
  'c-8=แ',
  't-7=c',
  'c-7=บ',
  't-6=c',
  'c-6=บ',
  't-5=w',
  'c-5=ใ',
  't-4=n',
  'c-4=ห',
  't-3=c',
  'c-3=ม',
  't-2=t',
  'c-2=่',
  't-1=c',
  'c-1=ท',
  't0=c',
  'c0=ด',
  't1=c',
  'c1=ส'],
 ['bias',
  't-10=z',
  'c-10= ',
  't-9=w',
  'c-9=แ',
  't-8=c',
  'c-8=บ',
  't-7=c',
  'c-7=บ',
  't-6=w',
  'c-6=ใ',
  't-5=n',
  'c-5=ห',
  't-4=c',
  'c-4=ม',
  't-3=t',
  'c-3=่',
  't-2=c',
  'c-2=ท',
  't-1=c',
  'c-1=ด',
  't0=c',
  'c0=ส',
  't1=c',
  'c1=อ',
  't2=c',
  'c2=บ'],
 ['bias',
  't-10=w',
  'c-10=แ',
  't-9=c',
  'c-9=บ',
  't-8=c',
  'c-8=บ',
  't-7=w',
  'c-7=ใ',
  't-6=n',
  'c-6=ห',
  't-5=c',
  'c-5=ม',
  't-4=t',
  'c-4=่',
  't-3

In [27]:
gram = 11

look_around = int(gram/2)
for j in range(look_around, 0, -1):
    print(j)

5
4
3
2
1
