-
Notifications
You must be signed in to change notification settings - Fork 0
/
txt_vec.py
111 lines (88 loc) · 3.15 KB
/
txt_vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
"""Snippets to process text in a dataframe with numpy vectorized functions."""
# standard library imports
import os, sys, math
from pathlib import Path
import ntpath
import pprint as pp
import itertools
import functools
import operator
import re
import collections
import unicodedata
import string
from IPython.core.display import HTML
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# commonly installed imports
import numpy as np
import pandas as pd
import tqdm
import toolz
import pdvega
# other third-party package imports
import validators
import tldextract
# Show which interpreter binary and Python version are running this module.
print(sys.executable, sys.version, sep='\n')
def pd_np_str(d):
    """Return the values of ``d`` as a NumPy unicode-string array.

    ``d`` must expose ``.values`` (e.g. a pandas Series); every element is
    converted with ``astype(dtype=np.str_)``.
    """
    raw_values = d.values
    return raw_values.astype(dtype=np.str_)
def make_trans_table(tolower=True, toupper=False, repl_num=False, repl_punc=True, bad_chars=''):
    """Build a translation table usable with ``str.translate``.

    Whitespace characters are always mapped to a plain space.

    Args:
        tolower: map ASCII uppercase letters to their lowercase form.
        toupper: map ASCII lowercase letters to their uppercase form.
        repl_num: map ASCII digits to a space.
        repl_punc: map ASCII punctuation and the en/em dashes to a space,
            and delete the registered-trademark sign.
        bad_chars: characters to delete; deletion takes precedence over
            any replacement defined for the same character.

    Returns:
        The mapping produced by ``str.maketrans``.
    """
    blank = ' '
    # NOTE(review): the indentation of the original source was lost; the
    # dash and '®' handling is assumed to belong to the punctuation
    # branch -- confirm against the upstream file.
    # Each rule is (enabled, characters to replace, their replacements).
    rules = (
        (True, string.whitespace, len(string.whitespace) * blank),
        (tolower, string.ascii_uppercase, string.ascii_lowercase),
        (toupper, string.ascii_lowercase, string.ascii_uppercase),
        (repl_num, string.digits, len(string.digits) * blank),
        (repl_punc, string.punctuation, len(string.punctuation) * blank),
        (repl_punc, '–' + '—', ' ' + ' '),
    )
    active = [(src, dst) for enabled, src, dst in rules if enabled]
    sources = ''.join(src for src, _ in active)
    targets = ''.join(dst for _, dst in active)
    if repl_punc:
        bad_chars += '®'
    return str.maketrans(sources, targets, bad_chars)
# Table that lower-cases, blanks out digits and punctuation, and deletes
# apostrophes, hyphens and backticks.
s_trans_table = make_trans_table(
    tolower=True,
    toupper=False,
    repl_num=True,
    repl_punc=True,
    bad_chars="'-`",
)
# Table that lower-cases and blanks out punctuation while keeping digits.
TRANS_TABLE = make_trans_table(
    tolower=True,
    toupper=False,
    repl_num=False,
    repl_punc=True,
    bad_chars='',
)
def clean_np_str(a):
    """Translate every string in ``a`` through the module-level ``TRANS_TABLE``.

    Uses ``np.char.translate``, i.e. an element-wise ``str.translate``.
    """
    translated = np.char.translate(a, TRANS_TABLE)
    return translated
# Captures an uppercase letter that either follows a lowercase letter, or
# is not at the start of the string and is followed by a lowercase letter.
camelEx = re.compile(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))')


def camelSplitStr(s):
    """Insert a space in front of every camel-case boundary found in ``s``."""
    return camelEx.sub(r' \1', s)


# Element-wise version of camelSplitStr returning a unicode-string array.
camelNP = np.vectorize(camelSplitStr, otypes=[np.str_])
def uni_norm(s):
    """Return ``s`` in Unicode NFKD (compatibility-decomposed) form."""
    return unicodedata.normalize('NFKD', s)


# Element-wise version of uni_norm returning a unicode-string array.
uni_norm_np = np.vectorize(uni_norm, otypes=[np.str_])
# Minimum number of characters a token needs in order to be kept.
WORD_LENGTH = 3


def filter_words(s):
    """Tell whether token ``s`` should be kept.

    A token is kept when it is non-empty, has at least ``WORD_LENGTH``
    characters, is not numeric (``str.isnumeric``) and is not a member of
    ``stop_all``.

    Returns ``True`` for kept tokens and a falsy value otherwise (``s``
    itself when ``s`` is empty, ``False`` in the other cases).

    NOTE(review): ``stop_all`` is not defined in the visible source; it
    must exist as a global before this function is called.
    """
    if not s:
        return s
    if len(s) < WORD_LENGTH:
        return False
    if s.isnumeric():
        return False
    return s not in stop_all
def clean_words(l):
    """Return, as a list, the tokens of ``l`` accepted by ``filter_words``."""
    return [token for token in l if filter_words(token)]


# Element-wise version; object dtype because every result is a list.
clean_words_np = np.vectorize(clean_words, otypes=[object])  #[np.str_])
def rejoin_np(a):
    """Join each element of ``a`` with single spaces via ``np.char.join``."""
    separator = ' '
    return np.char.join(separator, a)
# Text-cleaning pipeline for a pandas string column.
#
# The steps are written in execution order and then reversed, because
# ``compose`` applies its right-most function first.  The reversed sequence
# is materialized as a tuple: a bare ``reversed(...)`` iterator would be
# exhausted by the ``*procs`` unpacking below, leaving ``procs`` empty for
# any later inspection or reuse.
#
# NOTE: the ``set`` step removes duplicate words, so the word order of the
# rejoined string is not guaranteed.
procs = tuple(reversed([
    pd_np_str,                           # values -> NumPy unicode array
    #camelNP,
    uni_norm_np,                         # NFKD normalization
    clean_np_str,                        # translate through TRANS_TABLE
    np.char.split,                       # split on whitespace
    clean_words_np,                      # keep tokens passing filter_words
    np.vectorize(set, otypes=[object]),  # drop duplicate tokens
    rejoin_np,                           # join tokens with single spaces
]))
pd_proc = toolz.functoolz.compose(*procs)
# Pipeline for '/'-separated strings; the result holds a list of cleaned
# words per input element.
#
# The steps are written in execution order and then reversed, because
# ``compose`` applies its right-most function first.  The reversed sequence
# is materialized as a tuple: a bare ``reversed(...)`` iterator would be
# exhausted by the ``*s_proc_list`` unpacking below, leaving ``s_proc_list``
# empty for any later inspection or reuse.
#
# NOTE(review): ``filter_toks`` is not defined in the visible source; it
# must exist as a global before ``s_proc`` is called.
s_proc_list = tuple(reversed([
    pd_np_str,                                      # values -> NumPy unicode array
    uni_norm_np,                                    # NFKD normalization
    lambda a: np.char.split(a, '/'),                # split on '/'
    lambda l: list(map(filter_toks, l)),            # apply filter_toks to each split result
    rejoin_np,                                      # join pieces with single spaces
    #camelNP,
    lambda a: np.char.translate(a, s_trans_table),  # translate through s_trans_table
    np.char.split,                                  # split on whitespace
    clean_words_np,                                 # keep tokens passing filter_words
]))
s_proc = toolz.functoolz.compose(*s_proc_list)